# Task 1 (Detecting pathological gamblers)
### Using word embeddings

In [1]:
P_FILE = "../posts.csv"
TRAIN_TOKEN="../train_df.csv"
TEST_TOKEN="../test_df.csv"
GENERAL_MODELS="../Models"
ROLLING_WINDOW_SIZE=10
BASELINE_COMP=False
TASK=1
METHOD='mean'
EMBEDDINGS = 'GLOVE_CC'
WORD_EMBEDDINGS = "../../word_embeddings/"
MODEL_PATH =f"{GENERAL_MODELS}/WE/{EMBEDDINGS}_{ROLLING_WINDOW_SIZE}" 
seed=23


In [2]:
from pathlib import Path
Path(MODEL_PATH).mkdir(parents=True, exist_ok=True)

In [3]:
import gensim
import fasttext
we = {"FASTTEXT_CC":"crawl-300d-2M-subword.bin","GLOVE_TT":"glove-twitter-200.bin", "GLOVE_CC":"glove_cc_300d.bin"}


In [4]:
if EMBEDDINGS=='GLOVE_TT':
    embeddings = gensim.models.KeyedVectors.load_word2vec_format(WORD_EMBEDDINGS + "glove-twitter-200.bin", binary=True)
    
elif EMBEDDINGS=='GLOVE_CC':
    embeddings = gensim.models.KeyedVectors.load_word2vec_format(WORD_EMBEDDINGS + "glove_cc_300d.bin", binary=True)

elif EMBEDDINGS=='FASTTEXT_CC':
    embeddings = fasttext.load_model(WORD_EMBEDDINGS + "crawl-300d-2M-subword.bin")


### opening resulting dataset with pandas

In [5]:
import pandas as pd
import numpy as np
import os
np.random.seed(seed)

train_df = pd.read_csv(TRAIN_TOKEN, sep='\t')
test_df = pd.read_csv(TEST_TOKEN, sep='\t')
train_df

Unnamed: 0,User,Post_Nr,Raw,Stemmed,Lemmatized,Label
0,3450,0,"sports betting number k in debt, feeling very ...","sport bet number k in debt , feel veri depress...","sports betting number k in debt , feeling very...",1
1,3450,1,finally accepted that you cannot win gambling ...,final accept that you can not win gambl relaps...,finally accepted that you can not win gambling...,1
2,3450,2,blocking software betfilter has anybody used t...,"block softwar betfilt has anybodi use this , w...",blocking software betfilter has anybody used t...,1
3,3450,3,prone to relapse when in debt? i find that whe...,prone to relaps when in debt ? i find that whe...,prone to relapse when in debt ? i find that wh...,1
4,3450,4,down to my last number on credit card i am num...,down to my last number on credit card i am num...,down to my last number on credit card i am num...,1
...,...,...,...,...,...,...
97853,162,670,you sick fuck,you sick fuck,you sick fuck,0
97854,162,671,reeeeeeeeeeeeeeeeeeeeeeeeeee,reeeeeeeeeeeeeeeeeeeeeeeeeee,reeeeeeeeeeeeeeeeeeeeeeeeeee,0
97855,162,672,this is so clearly satire! i really do not wan...,this is so clear satir ! i realli do not want ...,this is so clearly satire ! i really do not wa...,0
97856,162,673,he cannot write a story in number minutes,he can not write a stori in number minut,he can not write a story in number minutes,0


In [6]:
def rolling_window(df, window_size,stride, field):
    res_map={}
    for user in df['User'].unique():
        user_df = df[df['User']==user]
        res_map[user]=(user_df['Label'].values[0],{})
        posts = user_df[field].values
        iteration=0
        for i in range(0,len(posts),stride):
            res_map[user][1][iteration]=' '.join((posts[i:i+window_size]))
            iteration+=1
    result_df = pd.DataFrame([(k,k1,v1,v[0]) for k,v in res_map.items() for k1,v1 in v[1].items()], columns = ['User','Window_id','Text','Label'])
    
    return result_df

In [7]:
train_df = rolling_window(train_df,ROLLING_WINDOW_SIZE,1,'Lemmatized')
test_df = rolling_window(test_df,ROLLING_WINDOW_SIZE,1,'Lemmatized')

train_df

Unnamed: 0,User,Window_id,Text,Label
0,3450,0,"sports betting number k in debt , feeling very...",1
1,3450,1,finally accepted that you can not win gambling...,1
2,3450,2,blocking software betfilter has anybody used t...,1
3,3450,3,prone to relapse when in debt ? i find that wh...,1
4,3450,4,down to my last number on credit card i am num...,1
...,...,...,...,...
97853,162,670,you sick fuck reeeeeeeeeeeeeeeeeeeeeeeeeee thi...,0
97854,162,671,reeeeeeeeeeeeeeeeeeeeeeeeeee this is so clearl...,0
97855,162,672,this is so clearly satire ! i really do not wa...,0
97856,162,673,he can not write a story in number minutes pay...,0


In [8]:
train_df = pd.concat([train_df,test_df])
train_df = train_df.sample(frac=1, random_state=seed).reset_index(drop=True) 


In [9]:
import pickle
def save_model(model, name):
    with open(f"{MODEL_PATH}/{name}",'wb') as f:
        pickle.dump( model,f)

## Model part

In [10]:
from tqdm import tqdm
import re
import operator

def build_vocab(sentences):
    """
    :param sentences: list of list of words
    :return: dictionary of words and their count
    """
    vocab = {}
    for sentence in tqdm(sentences):
        for word in re.findall(r"\w+|[^\w\s]", sentence, re.UNICODE):            
            try:
                vocab[word] += 1
            except KeyError:
                vocab[word] = 1
    vocab = dict(sorted(vocab.items(), key=operator.itemgetter(1))[::-1])
    return vocab

In [11]:
vocab_dict = build_vocab(train_df['Text'])
vocab_dict

100%|██████████| 120181/120181 [00:18<00:00, 6360.24it/s]


{'.': 2569175,
 'the': 1436450,
 'i': 1429780,
 ',': 1241392,
 'to': 1145950,
 'a': 971341,
 'and': 964002,
 'number': 795759,
 'is': 760436,
 'you': 747516,
 'it': 691554,
 'of': 645210,
 'not': 613042,
 'in': 543362,
 'that': 523118,
 'for': 455940,
 'have': 395204,
 'my': 349983,
 'on': 326974,
 'this': 317034,
 'are': 304883,
 'but': 292858,
 'do': 290925,
 'was': 284248,
 '?': 281535,
 '!': 265067,
 'with': 245741,
 'be': 236984,
 'can': 228418,
 'am': 221957,
 'your': 212399,
 'if': 206356,
 'so': 205715,
 'me': 203830,
 'just': 201635,
 'will': 190976,
 'as': 183993,
 'they': 182990,
 'at': 181626,
 'like': 172142,
 'what': 163027,
 'or': 160803,
 'all': 158486,
 'would': 153632,
 'out': 149127,
 'from': 140373,
 'get': 138576,
 'he': 135401,
 'we': 133144,
 'there': 132309,
 'about': 128621,
 'up': 128574,
 'one': 125160,
 'money': 121444,
 'more': 118994,
 'time': 117777,
 'url': 117181,
 'when': 113499,
 's': 107261,
 'how': 103989,
 'an': 100886,
 'no': 99491,
 'had': 96923,

In [12]:
import operator

def check_embedding_coverage(token_vocab,embeddings):
    a = {}
    oov = {}
    k = 0
    i = 0
    for word in tqdm(token_vocab):
        try:
            a[word] = embeddings[word]
            k += token_vocab[word]
        except:

            oov[word] = token_vocab[word]
            i += token_vocab[word]
            pass

    print('Found embeddings for {:.2%} of vocab'.format(len(a) / len(token_vocab)))
    print('Found embeddings for  {:.2%} of all text'.format(k / (k + i)))
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]

    return sorted_x
    


In [14]:
check_embedding_coverage(vocab_dict, embeddings)


100%|██████████| 64182/64182 [00:00<00:00, 501250.93it/s]

Found embeddings for 88.04% of vocab
Found embeddings for  99.61% of all text





[('thefunnytweeter', 9962),
 ('kwgt', 9128),
 ('bkex', 6972),
 ('covid', 6954),
 ('emojiay', 3260),
 ('faceapp', 2776),
 ('oaycw', 1980),
 ('hadimukri', 1970),
 ('cjqzgo', 1970),
 ('ryzen', 1286),
 ('binance', 1025),
 ('benemon', 730),
 ('megathreads', 680),
 ('patreon', 560),
 ('trulieve', 548),
 ('chasun', 541),
 ('continant', 510),
 ('oneplus', 494),
 ('mihoyo', 490),
 ('elastos', 454),
 ('homescreensetup', 404),
 ('novalaunchersetup', 404),
 ('androidcustomization', 404),
 ('androidsetup', 404),
 ('kwgtsetup', 404),
 ('kwgtwidgets', 404),
 ('androidusers', 404),
 ('androidui', 404),
 ('phonesetup', 404),
 ('novasetup', 404),
 ('androidapp', 404),
 ('instatech', 404),
 ('androidhomescreen', 404),
 ('klwp', 404),
 ('doordash', 392),
 ('emojirivesdk', 380),
 ('finesta', 370),
 ('veromos', 370),
 ('arknights', 370),
 ('gamban', 360),
 ('wolfdoggpr', 330),
 ('vaxxers', 310),
 ('lushen', 310),
 ('verad', 310),
 ('vidacann', 300),
 ('brexit', 295),
 ('chromecast', 280),
 ('theomars', 280)

In [15]:
if 'GLOVE' in EMBEDDINGS:
    embeddings.add_vector('<pad>',np.zeros((embeddings.vector_size)))
    embeddings.add_vector('<unk>',np.mean(embeddings.vectors,axis=0, keepdims=True)[0])
elif EMBEDDINGS=='FASTTEXT':
    pass



In [16]:

if 'GLOVE' in EMBEDDINGS:
    
    train_df['Tokens'] = train_df['Text'].apply(lambda s:[embeddings.key_to_index.get(tok,embeddings.key_to_index['<unk>']) for tok in re.findall(r"\w+|[^\w\s]", s, re.UNICODE)])
    


In [17]:
import numpy as np
def vectorize_post(post, embeddings,method='mean'):
    #Create document vectors by averaging word vectors. Remove out-of-vocabulary words.
    
    vecs = []
    for token in post:
       
        vecs+=[embeddings[token]]
        
        
    if method=='mean':
        return np.mean(vecs, axis=0)
    elif method=='max':
        return np.max(vecs, axis=0)
    elif method=='min':
        return np.min(vecs, axis=0)
        
    


In [18]:
import re

if 'GLOVE' in EMBEDDINGS:
    train_df['Vector'] = train_df['Tokens'].apply(lambda post:vectorize_post(post,embeddings,METHOD))
    #test_df['Vector'] = test_df['Tokens'].apply(lambda post:vectorize_post(post,embeddings,METHOD))

elif 'FASTTEXT' in EMBEDDINGS:
    train_df['Vector'] = train_df['Text'].apply(lambda post:vectorize_post(re.findall(r"\w+|[^\w\s]", post, re.UNICODE),embeddings,METHOD))
    #test_df['Vector'] = test_df['Text'].apply(lambda post:vectorize_post(post.split(" "),embeddings,METHOD))





"\nX_train = pd.DataFrame(train_df['Vector'].values.tolist(), index = train_df.index)\ny_train = train_df['Label']\nX_test = pd.DataFrame(test_df['Vector'].values.tolist(), index = test_df.index)\n\n#X_test = test_df[['User','Window_id','Vector']]\ny_test = test_df['Label']\nX_train\n"

In [21]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
#from sklearn.preprocessing import MinMaxScaler


def custom_cv(model, df, n_folds=5,sent =False ,threshold=None):
    skf = StratifiedKFold(n_splits=n_folds)
    user_label_df =df.drop_duplicates('User')
    users = user_label_df['User'].to_numpy()
    
    labels = user_label_df['Label'].to_numpy()
    #print(labels)
    #print(users.shape,labels.shape)
    
    f1_scores = []
    for train_index, test_index in skf.split(users, labels):
        train_users = [users[f] for f in train_index]
        test_users = [users[f] for f in test_index]

        train_folds = df[df['User'].isin(train_users)].copy()
        test_folds = df[df['User'].isin(test_users)].copy()
        #train_folds['Tokens'] = train_folds['Text'].apply(lambda s:[glove_embeddings.key_to_index.get(tok,glove_embeddings.key_to_index['<unk>']) for tok in s.split(" ")])
        #test_folds['Tokens'] = test_folds['Text'].apply(lambda s:[glove_embeddings.key_to_index.get(tok,glove_embeddings.key_to_index['<unk>']) for tok in s.split(" ")])
        #train_folds['Vector'] = train_folds['Tokens'].apply(lambda post:vectorize_post(post,glove_embeddings,METHOD))
        #test_folds['Vector'] = test_folds['Tokens'].apply(lambda post:vectorize_post(post,glove_embeddings,METHOD))
        X_train = pd.DataFrame(train_folds['Vector'].values.tolist(), index = train_folds.index)
        y_train = train_folds['Label']
        X_test = pd.DataFrame(test_folds['Vector'].values.tolist(), index = test_folds.index)
        y_test = test_folds['Label']
        if sent:
            #scaler = MinMaxScaler()
            #train_folds[['polarity','subjectivity','negativity','positivity','neutrality','compound']] = scaler.fit_transform(train_folds[['polarity','subjectivity','negativity','positivity','neutrality','compound']])
            #test_folds[['polarity','subjectivity','negativity','positivity','neutrality','compound']] = scaler.transform(test_folds[['polarity','subjectivity','negativity','positivity','neutrality','compound']])
            X_train = np.c_[X_train,train_folds['polarity'],train_folds['subjectivity'],train_folds['negativity'],train_folds['positivity'],train_folds['neutrality'], train_folds['compound']] 
            X_test = np.c_[X_test,test_folds['polarity'],test_folds['subjectivity'],test_folds['negativity'],test_folds['positivity'],test_folds['neutrality'], test_folds['compound']] 
            

        
        
        model.fit(X_train, y_train)
        f1_scores.append(f1_score(y_test,model.predict(X_test)))

    return f1_scores

In [22]:
#X_train = X_train.apply(lambda x: vectorize_post(x, glove_embeddings))
#X_train
#train_df

In [23]:

#X_train[0].shape
#if sentiment:
#    X_train = np.c_[X_train,train_df['polarity'],train_df['subjectivity'],train_df['negativity'],train_df['positivity'],train_df['neutrality'], train_df['compound']] 
#    X_test = np.c_[X_test,test_df['polarity'],test_df['subjectivity'],test_df['negativity'],test_df['positivity'],test_df['neutrality'], test_df['compound']] 



In [24]:
train_df['Vector']

0         [-0.0067410273, 0.022994056, -0.06323666, -0.1...
1         [-0.062390674, 0.05457047, 0.026383927, -0.110...
2         [-0.035383556, 0.027473597, -0.036821682, -0.1...
3         [-0.035688918, 0.056273665, -0.033792444, -0.1...
4         [-0.017818723, 0.03752326, -0.10130639, -0.101...
                                ...                        
120176    [-0.008870341, 0.035673823, -0.041841637, -0.1...
120177    [-0.025309974, 0.04168036, -0.052140344, -0.10...
120178    [-0.050140724, 0.02908735, -0.025017094, -0.17...
120179    [-0.0031832538, 0.02555157, -0.041190375, -0.1...
120180    [-0.01036954, 0.00635363, -0.048583932, -0.117...
Name: Vector, Length: 120181, dtype: object

In [25]:
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.naive_bayes import MultinomialNB
if BASELINE_COMP:
        models = {
                'sgdLR':SGDClassifier(random_state=seed,loss='log'),
                'sgdlSVM':SGDClassifier(random_state=seed,loss='hinge'),
                'ExtraTrees':ExtraTreesClassifier(random_state=seed,n_jobs=-1),\
                'Perceptron':Perceptron(random_state=seed)}
        report=""
        best_model_name = ""
        best_model=None
        best_f1=0
        for model_name, model in models.items():
                res = custom_cv(model,train_df)
                if np.mean(res) > best_f1:
                        best_f1=np.mean(res)
                        best_model_name=model_name
                        best_model=model
                report+=f"{model_name} f1: {round(np.mean(res),3)}\n"
                print(f"{model_name} f1: {round(np.mean(res),3)}")
        with open(f"{MODEL_PATH}/baseline_report_{EMBEDDINGS}.txt",'w') as f:
                f.write(report)


In [26]:
#if BASELINE_COMP:
from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()
train_df['TB'] = train_df['Text'].apply(lambda text: TextBlob(text).sentiment)
train_df['VADER'] = train_df['Text'].apply(lambda text: sia.polarity_scores(text))


train_df['polarity'] = train_df['TB'].apply(lambda tb: (tb[0]+1/2))
train_df['subjectivity'] = train_df['TB'].apply(lambda tb: tb[1])
train_df['negativity'] = train_df['VADER'].apply(lambda v: v['neg'])
train_df['positivity'] = train_df['VADER'].apply(lambda v: v['pos'])
train_df['neutrality'] = train_df['VADER'].apply(lambda v: v['neu'])
train_df['compound'] = train_df['VADER'].apply(lambda v: (v['compound']+1)/2)



train_df.drop(['VADER','TB'], inplace=True, axis=1)

In [27]:
if BASELINE_COMP:
        res = custom_cv(best_model,train_df, sent=True)
        print(f"{best_model_name} f1: {round(np.mean(res),3)}")
        #cv
        with open(f"{MODEL_PATH}/baseline_report_{EMBEDDINGS}.txt",'a') as f:
                f.write(f"\nBest model with sent:\n{best_model_name} f1: {round(np.mean(res),3)}")

# Hyperparameter tuning

#### Best performing model was sgd Logistic Regression for window size 10, GLOVE_CC with SA features, having achieved a 0.818 F1 score

In [28]:
from tqdm import tqdm
import optuna
import joblib

def train_eval_tuning(trial,params, df, sent=False):

    skf = StratifiedKFold(n_splits=5)
    user_label_df =df.drop_duplicates('User')
    users = user_label_df['User'].to_numpy()
    
    labels = user_label_df['Label'].to_numpy()

    
    f1_scores = []
    for fold,(train_index, test_index) in enumerate(skf.split(users, labels)):
        train_users = [users[f] for f in train_index]
        test_users = [users[f] for f in test_index]

        train_folds = df[df['User'].isin(train_users)]
        X_train = pd.DataFrame(train_folds['Vector'].values.tolist(), index = train_folds.index)

        test_folds = df[df['User'].isin(test_users)]
        X_test = pd.DataFrame(test_folds['Vector'].values.tolist(), index = test_folds.index)

        model = SGDClassifier(**params)
        if sent:
            X_train = np.c_[X_train,train_folds['polarity'],train_folds['subjectivity'],train_folds['negativity'],train_folds['positivity'],train_folds['neutrality'], train_folds['compound']] 
            X_test = np.c_[X_test,test_folds['polarity'],test_folds['subjectivity'],test_folds['negativity'],test_folds['positivity'],test_folds['neutrality'], test_folds['compound']] 
            

        model.fit(X_train, train_folds['Label'])
        f1_scores.append(f1_score(test_folds['Label'],model.predict(X_test)))
    
        trial.report(np.mean(f1_scores), fold)

        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return f1_scores


In [29]:
def tuning_objective(trial):
    parameters = {
        'max_iter':trial.suggest_int('max_iter',1000,2500,step=500),
        'loss':trial.suggest_categorical('loss',['log']),
        'penalty':trial.suggest_categorical('penalty',['l2','l1','elasticnet']),
        'alpha': trial.suggest_float('alpha',0.00001,0.1,log=True),
        'random_state':trial.suggest_int('random_state',seed,seed)
        
        
    }
    
  
    
    avg_f1 = train_eval_tuning(trial,parameters,train_df, sent=True)
    return np.mean(avg_f1)

In [30]:
study = optuna.create_study(
        study_name=f"t{TASK}_tuning_{ROLLING_WINDOW_SIZE}",
        direction='maximize')
study.optimize(tuning_objective, n_trials=50, timeout=(60*60*3))
joblib.dump(study,f"t{TASK}_tuning_{ROLLING_WINDOW_SIZE}.pkl")
    #study = optuna.create_study(study_name=f"tfidfvectorizer_cv_{ROLLING_WINDOW_SIZE}",direction="maximize", sampler=optuna.samplers.TPESampler(), pruner=optuna.pruners.NopPruner())
study = joblib.load(f"t{TASK}_tuning_{ROLLING_WINDOW_SIZE}.pkl")

[32m[I 2022-06-15 18:06:59,501][0m A new study created in memory with name: t1_tuning_10[0m
[32m[I 2022-06-15 18:07:56,284][0m Trial 0 finished with value: 0.6703370650807627 and parameters: {'max_iter': 1000, 'loss': 'log', 'penalty': 'l1', 'alpha': 0.010493375115403552, 'random_state': 23}. Best is trial 0 with value: 0.6703370650807627.[0m
[32m[I 2022-06-15 18:08:48,069][0m Trial 1 finished with value: 0.8028821871916568 and parameters: {'max_iter': 1000, 'loss': 'log', 'penalty': 'elasticnet', 'alpha': 0.000543173554137824, 'random_state': 23}. Best is trial 1 with value: 0.8028821871916568.[0m
[32m[I 2022-06-15 18:09:42,532][0m Trial 2 finished with value: 0.7554254504117268 and parameters: {'max_iter': 2500, 'loss': 'log', 'penalty': 'l1', 'alpha': 0.002478740563155032, 'random_state': 23}. Best is trial 1 with value: 0.8028821871916568.[0m
[32m[I 2022-06-15 18:10:49,994][0m Trial 3 finished with value: 0.8010223875891327 and parameters: {'max_iter': 1000, 'loss': '

In [31]:
study.best_trial.params, study.best_value

({'max_iter': 2000,
  'loss': 'log',
  'penalty': 'l2',
  'alpha': 5.6567497467221377e-05,
  'random_state': 23},
 0.8186599122555993)

In [32]:
with open(f"{MODEL_PATH}/baseline_report_{EMBEDDINGS}.txt",'a') as f:
                f.write(f"\nOptimized model f1: {round(study.best_value,3)}\nparams: {study.best_trial.params}")

In [33]:
sgdLR_params = study.best_trial.params

final_model = SGDClassifier(**sgdLR_params)
X_train = pd.DataFrame(train_df['Vector'].values.tolist(), index = train_df.index)

X_train = np.c_[X_train,train_df['polarity'],train_df['subjectivity'],train_df['negativity'],train_df['positivity'],train_df['neutrality'], train_df['compound']] 


final_model.fit(X_train, train_df['Label'])
save_model(final_model, "optimized_sgdLR.pkl")
