# Task 1 (Detecting pathological gamblers)
### Using contextualized language models

In [1]:
# --- Input / output locations -------------------------------------------
P_FILE = "../posts.csv"          # raw posts file (not read by the cells shown here -- TODO confirm it is still needed)
TRAIN_TOKEN = "../train_df.csv"  # tab-separated training posts, read when the sentiment cache is rebuilt
TEST_TOKEN = "../test_df.csv"    # tab-separated test posts, read when the sentiment cache is rebuilt
GENERAL_MODELS = "../Models"     # root directory under which MODEL_PATH is created

# --- Experiment settings ------------------------------------------------
ROLLING_WINDOW_SIZE = 10         # number of consecutive posts joined into one text window
# Sentence-transformer checkpoint; alternatives tried: 'all-MiniLM-L6-v2', 'all-mpnet-base-v2'
LM_MODEL = 'all-mpnet-base-v2'
TASK = 1                         # task number, used in the Optuna study / pickle names

# --- Cache switches -----------------------------------------------------
CONVERTED = True                 # True: load pickled sentence embeddings instead of re-encoding
SENT_MEASURED = True             # True: load pickled sentiment frames instead of recomputing them
BASELINE_COMP = False            # True: run the baseline model comparison cells

# --- Derived values -----------------------------------------------------
MODEL_PATH = f"{GENERAL_MODELS}/LM/win_{ROLLING_WINDOW_SIZE}_{LM_MODEL}"
# Window size -> value assigned to the encoder's max_seq_length
max_lengths = {1: 64, 3: 128, 5: 256, 10: 512}



In [2]:
from pathlib import Path
# Create the model output directory (including parents) up front; a no-op when it already exists.
Path(MODEL_PATH).mkdir(parents=True, exist_ok=True)

### Opening the resulting dataset with pandas

In [3]:
def rolling_window(df, window_size, stride, field):
    """Join each user's consecutive posts into overlapping text windows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'User', 'Label' and `field`. Rows of one
        user are taken in the order in which they appear in `df`.
    window_size : int
        Maximum number of posts joined (space-separated) into one window.
    stride : int
        Number of posts to advance between the starts of two windows.
    field : str
        Name of the column holding the post text.

    Returns
    -------
    pandas.DataFrame
        One row per window with columns ['User', 'Window_id', 'Text', 'Label'].
        'Window_id' restarts at 0 for every user and 'Label' is the label of
        the user's first row. Users appear in order of first occurrence.

    Raises
    ------
    ValueError
        If `window_size` or `stride` is not a positive integer.

    Notes
    -----
    A window starts at every `stride`-th post, so the last windows of a user
    hold fewer than `window_size` posts. Rows whose 'User' is missing are
    ignored (pandas groupby default).
    """
    if window_size <= 0:
        raise ValueError(f"window_size must be positive, got {window_size}")
    if stride <= 0:
        raise ValueError(f"stride must be positive, got {stride}")

    records = []
    # A single groupby pass replaces one full-frame boolean mask per user;
    # sort=False keeps users in order of first appearance.
    for user, user_df in df.groupby('User', sort=False):
        label = user_df['Label'].iloc[0]
        posts = user_df[field].tolist()
        for window_id, start in enumerate(range(0, len(posts), stride)):
            text = ' '.join(posts[start:start + window_size])
            records.append((user, window_id, text, label))

    return pd.DataFrame(records, columns=['User', 'Window_id', 'Text', 'Label'])

In [4]:
import pickle
# Persist embeddings to disk
def save_embeddings(filepath, embeddings):
    """Pickle `embeddings` to `filepath`, wrapped in a dict under the key 'embeddings'."""
    payload = {'embeddings': embeddings}
    with open(filepath, "wb") as sink:
        pickle.dump(payload, sink, protocol=pickle.HIGHEST_PROTOCOL)

# Read embeddings back from disk
def load_embeddings(filepath):
    """Return the object stored under 'embeddings' in the pickle at `filepath`.

    Counterpart of `save_embeddings`; only use on files this notebook wrote,
    since unpickling executes whatever the file contains.
    """
    with open(filepath, "rb") as source:
        return pickle.load(source)['embeddings']

In [5]:
import pandas as pd
import numpy as np
import os
seed = 23
np.random.seed(seed)

# Sentiment scoring is expensive, so its result is cached as one pickle per split.
# With SENT_MEASURED=True the cache is loaded as-is; set it to False to rebuild.
# NOTE: pickles written before the polarity fix below hold `polarity + 0.5`
# (range [-0.5, 1.5]) rather than a [0, 1] rescale -- rebuild them once.
if not SENT_MEASURED:
    # Imported lazily: only required when the cache has to be rebuilt.
    from textblob import TextBlob
    from nltk.sentiment.vader import SentimentIntensityAnalyzer

    sia = SentimentIntensityAnalyzer()

    def _windowed_with_sentiment(token_path):
        """Window the posts of one split and append the six sentiment columns."""
        frame = rolling_window(pd.read_csv(token_path, sep='\t'), ROLLING_WINDOW_SIZE, 1, 'Raw')

        blob_scores = frame['Text'].apply(lambda text: TextBlob(text).sentiment)
        vader_scores = frame['Text'].apply(lambda text: sia.polarity_scores(text))

        # TextBlob polarity and VADER compound both lie in [-1, 1]; rescale them to
        # [0, 1]. The previous `tb[0]+1/2` only added 0.5 (operator precedence).
        frame['polarity'] = blob_scores.apply(lambda tb: (tb[0] + 1) / 2)
        frame['subjectivity'] = blob_scores.apply(lambda tb: tb[1])
        frame['negativity'] = vader_scores.apply(lambda v: v['neg'])
        frame['positivity'] = vader_scores.apply(lambda v: v['pos'])
        frame['neutrality'] = vader_scores.apply(lambda v: v['neu'])
        frame['compound'] = vader_scores.apply(lambda v: (v['compound'] + 1) / 2)
        return frame

    train_df = _windowed_with_sentiment(TRAIN_TOKEN)
    test_df = _windowed_with_sentiment(TEST_TOKEN)

    train_df.to_pickle(f"train_df_{ROLLING_WINDOW_SIZE}_sent.pkl")
    test_df.to_pickle(f"test_df_{ROLLING_WINDOW_SIZE}_sent.pkl")

# Always read back from the cache so both code paths yield identical frames.
train_df = pd.read_pickle(f"train_df_{ROLLING_WINDOW_SIZE}_sent.pkl")
test_df = pd.read_pickle(f"test_df_{ROLLING_WINDOW_SIZE}_sent.pkl")


In [6]:
# Inspect the windowed training frame with its sentiment columns (pandas truncates the display).
train_df

Unnamed: 0,User,Window_id,Text,Label,polarity,subjectivity,negativity,positivity,neutrality,compound
0,3450,0,"sports betting number k in debt, feeling very ...",1,0.480570,0.443716,0.215,0.168,0.617,0.00215
1,3450,1,finally accepted that you cannot win gambling ...,1,0.470277,0.503152,0.220,0.179,0.601,0.00430
2,3450,2,blocking software betfilter has anybody used t...,1,0.462110,0.516745,0.218,0.193,0.589,0.00975
3,3450,3,prone to relapse when in debt? i find that whe...,1,0.480744,0.531990,0.194,0.221,0.585,0.40110
4,3450,4,down to my last number on credit card i am num...,1,0.444203,0.531141,0.206,0.223,0.571,0.09575
...,...,...,...,...,...,...,...,...,...,...
97853,162,670,you sick fuck reeeeeeeeeeeeeeeeeeeeeeeeeee thi...,0,0.221429,0.510119,0.227,0.107,0.666,0.25895
97854,162,671,reeeeeeeeeeeeeeeeeeeeeeeeeee this is so clearl...,0,0.500000,0.291667,0.098,0.147,0.755,0.67390
97855,162,672,this is so clearly satire! i really do not wan...,0,0.500000,0.291667,0.101,0.152,0.747,0.67390
97856,162,673,he cannot write a story in number minutes pay ...,0,0.500000,0.000000,0.141,0.108,0.751,0.44965


In [7]:

# Cache files for the sentence embeddings of each split (keyed by model and window size).
train_embeddings_path = f"./train_sentence_embeddings_{LM_MODEL}_{ROLLING_WINDOW_SIZE}.pkl"
val_embeddings_path = f"./val_sentence_embeddings_{LM_MODEL}_{ROLLING_WINDOW_SIZE}.pkl"

if not CONVERTED:
    # Imported lazily: the encoder is only needed when the cache is rebuilt.
    from sentence_transformers import SentenceTransformer
    model = SentenceTransformer(LM_MODEL)

    # Token budget per window; raises KeyError for window sizes missing from max_lengths.
    model.max_seq_length = max_lengths[ROLLING_WINDOW_SIZE]

    encode_options = dict(show_progress_bar=True, output_value='sentence_embedding',
                          batch_size=64, convert_to_numpy=True)
    # Pass plain lists: encode() indexes its input by position, which on a pandas
    # Series is label-based and only works for a clean 0..n-1 index.
    train_sentence_embeddings = model.encode(train_df['Text'].tolist(), **encode_options)
    val_sentence_embeddings = model.encode(test_df['Text'].tolist(), **encode_options)

    save_embeddings(train_embeddings_path, train_sentence_embeddings)
    save_embeddings(val_embeddings_path, val_sentence_embeddings)
else:
    train_sentence_embeddings = load_embeddings(train_embeddings_path)
    val_sentence_embeddings = load_embeddings(val_embeddings_path)

# A cache produced from different data would otherwise fail with an opaque
# length error on assignment; fail early with a message that names the cause.
for split_name, split_df, split_embeddings in (
        ('train', train_df, train_sentence_embeddings),
        ('test', test_df, val_sentence_embeddings)):
    if len(split_embeddings) != len(split_df):
        raise ValueError(
            f"{split_name}: {len(split_embeddings)} cached embeddings for {len(split_df)} windows; "
            "set CONVERTED=False to re-encode")

# Store each embedding as a plain list of floats in a single 'Vector' column.
test_df['Vector'] = np.asarray(val_sentence_embeddings).tolist()
train_df['Vector'] = np.asarray(train_sentence_embeddings).tolist()

In [8]:

# Pool both splits and shuffle the rows reproducibly; cross-validation later
# re-splits this frame by user.
full_df = (
    pd.concat([train_df, test_df])
    .sample(frac=1, random_state=seed)
    .reset_index(drop=True)
)

In [9]:

#import ast
#full_df['Vector'] = full_df['Vector'].apply(lambda x: ast.literal_eval(x))
#full_df['Vector']

In [10]:
# Peek at one embedding to confirm that 'Vector' holds a plain list of floats.
full_df['Vector'].iloc[0]

[-0.004810236860066652,
 0.09786351025104523,
 0.01352181937545538,
 -0.027487656101584435,
 0.02000696212053299,
 -0.014911876991391182,
 -0.01871095597743988,
 -0.007118774112313986,
 0.04303239658474922,
 0.02504473552107811,
 0.05597524717450142,
 0.040071628987789154,
 -0.0868198350071907,
 0.07660741358995438,
 0.019378187134861946,
 -0.022471707314252853,
 0.006449539680033922,
 -0.008858040906488895,
 -0.019853604957461357,
 -0.02931733801960945,
 0.044363025575876236,
 -0.02347172424197197,
 0.034911420196294785,
 0.024648107588291168,
 0.08573662489652634,
 0.0067977155558764935,
 0.034997548907995224,
 0.0020897609647363424,
 0.052580028772354126,
 -0.011788729578256607,
 0.040029577910900116,
 0.028584441170096397,
 -0.003798366989940405,
 0.019353466108441353,
 2.112770971507416e-06,
 -0.029993761330842972,
 -0.013102305121719837,
 0.0005886050057597458,
 -0.00562269426882267,
 -0.021098123863339424,
 0.03521820530295372,
 -0.10440979897975922,
 -0.0007471283315680921,
 0.

In [11]:
import pickle
# Persist embeddings to disk
# NOTE: this re-defines the helper already declared in cell In [4] with the same behaviour.
def save_embeddings(filepath, embeddings):
    """Pickle `embeddings` to `filepath`, wrapped in a dict under the key 'embeddings'."""
    payload = {'embeddings': embeddings}
    with open(filepath, "wb") as sink:
        pickle.dump(payload, sink, protocol=pickle.HIGHEST_PROTOCOL)

# Read embeddings back from disk
# NOTE: this re-defines the helper already declared in cell In [4] with the same behaviour.
def load_embeddings(filepath):
    """Return the object stored under 'embeddings' in the pickle at `filepath`.

    Only use on files this notebook wrote, since unpickling executes whatever
    the file contains.
    """
    with open(filepath, "rb") as source:
        return pickle.load(source)['embeddings']

In [12]:
import pickle
def save_model(model, name):
    """Pickle `model` into the file `name` inside the MODEL_PATH directory."""
    destination = f"{MODEL_PATH}/{name}"
    with open(destination, 'wb') as sink:
        pickle.dump(model, sink)

In [13]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
def custom_cv(model, df, n_folds=5, sent=False):
    """Cross-validate `model` with folds built from users rather than from rows.

    Users (one row per user after de-duplication) are split into `n_folds`
    stratified folds by their label, so every window of a user lands on the
    same side of a split. The model is re-fitted on each training part and
    scored with `f1_score` on the windows of the held-out users.

    Parameters
    ----------
    model : estimator exposing `fit` and `predict`
    df : pandas.DataFrame with 'User', 'Label' and 'Vector' columns (plus the
        six sentiment columns when `sent` is True)
    n_folds : int, number of stratified folds
    sent : bool, append the sentiment columns to the embedding features

    Returns
    -------
    list of float, one F1 score per fold.
    """
    sentiment_columns = ['polarity', 'subjectivity', 'negativity',
                         'positivity', 'neutrality', 'compound']

    def build_features(fold_df):
        # Expand the per-row embedding lists into one numeric column per dimension.
        features = pd.DataFrame(fold_df['Vector'].values.tolist(), index=fold_df.index)
        if sent:
            features = np.column_stack([features] + [fold_df[name] for name in sentiment_columns])
        return features

    per_user = df.drop_duplicates('User')
    users = per_user['User'].to_numpy()
    labels = per_user['Label'].to_numpy()

    splitter = StratifiedKFold(n_splits=n_folds)
    f1_scores = []
    for train_index, test_index in splitter.split(users, labels):
        train_folds = df[df['User'].isin(users[train_index])]
        test_folds = df[df['User'].isin(users[test_index])]

        model.fit(build_features(train_folds), train_folds['Label'])
        predictions = model.predict(build_features(test_folds))
        f1_scores.append(f1_score(test_folds['Label'], predictions))

    return f1_scores

## Model part

In [14]:
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.naive_bayes import MultinomialNB
import re
import sklearn

# scikit-learn 1.1 renamed SGDClassifier's logistic loss from 'log' to 'log_loss'
# and 1.3 removed the old spelling; pick whichever the installed version accepts.
_sk_major, _sk_minor = (int(part) for part in re.match(r'(\d+)\.(\d+)', sklearn.__version__).groups())
LOG_LOSS = 'log_loss' if (_sk_major, _sk_minor) >= (1, 1) else 'log'

# Candidate baseline classifiers, all seeded for reproducibility.
models = {
    'sgdLR': SGDClassifier(random_state=seed, loss=LOG_LOSS),
    # 'NB': MultinomialNB(),  # left disabled, as in the original comparison
    'sgdlSVM': SGDClassifier(random_state=seed, loss='hinge'),
    'ExtraTrees': ExtraTreesClassifier(random_state=seed, n_jobs=-1),
    'Perceptron': Perceptron(random_state=seed),
}

if BASELINE_COMP:
    report = ""
    best_model_name = ""
    best_model = None
    # Start below any attainable score so a model is always selected, even if
    # every candidate scores an F1 of 0 (the next cell needs a non-None best_model).
    best_f1 = float('-inf')
    for model_name, model in models.items():
        res = custom_cv(model, full_df)
        mean_f1 = np.mean(res)
        if mean_f1 > best_f1:
            best_f1 = mean_f1
            best_model_name = model_name
            best_model = model
        summary_line = f"{model_name} f1: {round(mean_f1,3)}"
        report += summary_line + "\n"
        print(summary_line)
    # 'w': each comparison run starts a fresh report; later cells append to it.
    with open(f"{MODEL_PATH}/baseline_report_{LM_MODEL}.txt", 'w') as f:
        f.write(report)


In [15]:
# Re-score the best baseline with the six sentiment columns appended to the
# embeddings, and append the result to the baseline report.
if BASELINE_COMP:
    res = custom_cv(best_model, full_df, sent=True)
    sent_summary = f"{best_model_name} f1: {round(np.mean(res),3)}"
    print(sent_summary)

    with open(f"{MODEL_PATH}/baseline_report_{LM_MODEL}.txt", 'a') as f:
        f.write("\nBest model with sent:\n" + sent_summary)

# Hyperparameter tuning

#### The best-performing baseline was SGD logistic regression (window size 10, MPNet embeddings, no sentiment-analysis features), with an F1 score of 0.823

In [16]:
from tqdm import tqdm
import optuna
import joblib

def train_eval_tuning(trial, params, df, sent=False):
    """Score one Optuna trial with 5-fold, user-level stratified cross-validation.

    A fresh `SGDClassifier(**params)` is fitted per fold. After each fold the
    running mean F1 is reported to `trial`, and the trial is aborted with
    `optuna.exceptions.TrialPruned` when the study's pruner asks for it.

    Parameters
    ----------
    trial : optuna trial, used for intermediate reporting and pruning
    params : dict of keyword arguments for `SGDClassifier`
    df : pandas.DataFrame with 'User', 'Label' and 'Vector' columns (plus the
        six sentiment columns when `sent` is True)
    sent : bool, append the sentiment columns to the embedding features

    Returns
    -------
    list of float, one F1 score per completed fold.
    """
    # Sentiment columns are appended unscaled (a MinMaxScaler variant was tried
    # earlier and left disabled).
    sentiment_columns = ['polarity', 'subjectivity', 'negativity',
                         'positivity', 'neutrality', 'compound']

    def to_matrix(fold_df):
        # Expand the per-row embedding lists into one numeric column per dimension.
        matrix = pd.DataFrame(fold_df['Vector'].values.tolist(), index=fold_df.index)
        if sent:
            matrix = np.column_stack([matrix] + [fold_df[name] for name in sentiment_columns])
        return matrix

    # Folds are drawn over users so all windows of a user stay on one side.
    per_user = df.drop_duplicates('User')
    users = per_user['User'].to_numpy()
    labels = per_user['Label'].to_numpy()

    f1_scores = []
    fold_splits = StratifiedKFold(n_splits=5).split(users, labels)
    for fold, (train_index, test_index) in enumerate(fold_splits):
        train_folds = df[df['User'].isin(users[train_index])]
        test_folds = df[df['User'].isin(users[test_index])]

        model = SGDClassifier(**params)
        model.fit(to_matrix(train_folds), train_folds['Label'])
        fold_predictions = model.predict(to_matrix(test_folds))
        f1_scores.append(f1_score(test_folds['Label'], fold_predictions))

        # Intermediate value = mean F1 over the folds finished so far.
        trial.report(np.mean(f1_scores), fold)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return f1_scores


In [17]:
def tuning_objective(trial):
    """Optuna objective: mean cross-validated F1 of an SGD logistic regression.

    Samples `max_iter`, `penalty` and `alpha`; `loss` and `random_state` are
    single-valued suggestions so they are recorded among the trial's params
    and can be passed straight back to `SGDClassifier`.

    Returns
    -------
    float, mean F1 over the folds evaluated by `train_eval_tuning` on
    `full_df` (embedding features only, `sent=False`).
    """
    import re
    import sklearn

    # scikit-learn 1.1 renamed the logistic loss from 'log' to 'log_loss' and
    # 1.3 removed the old spelling; use the name the installed version accepts.
    sk_major, sk_minor = (int(part) for part in re.match(r'(\d+)\.(\d+)', sklearn.__version__).groups())
    log_loss_name = 'log_loss' if (sk_major, sk_minor) >= (1, 1) else 'log'

    parameters = {
        'max_iter': trial.suggest_int('max_iter', 1000, 2500, step=500),
        'loss': trial.suggest_categorical('loss', [log_loss_name]),
        'penalty': trial.suggest_categorical('penalty', ['l2', 'l1', 'elasticnet']),
        'alpha': trial.suggest_float('alpha', 0.00001, 0.1, log=True),
        'random_state': trial.suggest_int('random_state', seed, seed),
    }

    fold_f1_scores = train_eval_tuning(trial, parameters, full_df, sent=False)
    return np.mean(fold_f1_scores)

In [18]:
# Seed the sampler explicitly: np.random.seed does not reach Optuna's own RNG,
# so without this the sequence of suggested hyperparameters differs on every run.
# TPESampler is the sampler create_study uses by default, so only the seeding changes.
# (The wall-clock timeout below can still cut a run short on slower machines.)
study = optuna.create_study(
    study_name=f"t{TASK}_tuning_{ROLLING_WINDOW_SIZE}",
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=seed),
)
# At most 50 trials or 3 hours, whichever comes first.
study.optimize(tuning_objective, n_trials=50, timeout=(60*60*3))

# Persist the finished study, then read it back so later cells work from the
# on-disk copy.
joblib.dump(study, f"t{TASK}_tuning_{ROLLING_WINDOW_SIZE}.pkl")
study = joblib.load(f"t{TASK}_tuning_{ROLLING_WINDOW_SIZE}.pkl")

[32m[I 2022-06-15 20:01:29,761][0m A new study created in memory with name: t1_tuning_10[0m
[32m[I 2022-06-15 20:03:46,844][0m Trial 0 finished with value: 0.813897147575471 and parameters: {'max_iter': 2500, 'loss': 'log', 'penalty': 'elasticnet', 'alpha': 1.8629722287576724e-05, 'random_state': 23}. Best is trial 0 with value: 0.813897147575471.[0m
[32m[I 2022-06-15 20:05:55,720][0m Trial 1 finished with value: 0.8091352749827484 and parameters: {'max_iter': 2000, 'loss': 'log', 'penalty': 'elasticnet', 'alpha': 0.0050733694052943114, 'random_state': 23}. Best is trial 0 with value: 0.813897147575471.[0m
[32m[I 2022-06-15 20:08:17,855][0m Trial 2 finished with value: 0.790641379369261 and parameters: {'max_iter': 1000, 'loss': 'log', 'penalty': 'l1', 'alpha': 0.004308190784212204, 'random_state': 23}. Best is trial 0 with value: 0.813897147575471.[0m
[32m[I 2022-06-15 20:10:30,752][0m Trial 3 finished with value: 0.8189108063290986 and parameters: {'max_iter': 2000, 'lo

In [20]:
# Best hyperparameters found by the study and the mean cross-validated F1 they reached.
study.best_trial.params, study.best_value

({'max_iter': 1000,
  'loss': 'log',
  'penalty': 'l2',
  'alpha': 0.0006803245759335398,
  'random_state': 23},
 0.8262780689985009)

In [21]:
# Append the tuned model's score and hyperparameters to the baseline report.
tuning_report_path = f"{MODEL_PATH}/baseline_report_{LM_MODEL}.txt"
with open(tuning_report_path, 'a') as report_file:
    report_file.write(
        f"\nOptimized model f1: {round(study.best_value,3)}\nparams: {study.best_trial.params}"
    )

In [22]:
# Refit the tuned SGD classifier on every window (train and test pooled) and persist it.
sgdLR_params = study.best_trial.params

# Expand the per-row embedding lists into one numeric column per dimension.
full_train = pd.DataFrame(full_df['Vector'].values.tolist(), index=full_df.index)

final_model = SGDClassifier(**sgdLR_params).fit(full_train, full_df['Label'])
save_model(final_model, "optimized_sgdLR.pkl")
