# Task 2 (Detecting depressed subjects)
### Using contextualized language models

In [1]:
# --- Input / output locations -------------------------------------------
P_FILE = "../posts.csv"            # posts CSV
TRAIN_TOKEN = "../train_df.csv"    # training split, read when sentiment features are recomputed
TEST_TOKEN = "../test_df.csv"      # test split, read when sentiment features are recomputed
GENERAL_MODELS = "../Models"       # root folder for saved models and reports

# --- Experiment settings ------------------------------------------------
TASK = 2
ROLLING_WINDOW_SIZE = 10           # number of consecutive posts joined into one text window
LM_MODEL = 'all-mpnet-base-v2'
# alternatives: 'all-MiniLM-L6-v2'  'all-mpnet-base-v2'

# --- Cache / stage switches ---------------------------------------------
CONVERTED = True        # True: load pickled sentence embeddings instead of encoding again
SENT_MEASURED = True    # True: load pickled sentiment features instead of recomputing them
BASELINE_COMP = False   # True: run the baseline model comparison cells

# --- Derived values -----------------------------------------------------
MODEL_PATH = f"{GENERAL_MODELS}/LM/win_{ROLLING_WINDOW_SIZE}_{LM_MODEL}"
# window size -> max_seq_length handed to the sentence-transformers model
max_lengths = {1: 64, 3: 128, 5: 256, 10: 512}



In [2]:
from pathlib import Path
# Create the model output directory up front (including missing parents; no error if
# it already exists) so later writes under MODEL_PATH do not fail on a missing folder.
Path(MODEL_PATH).mkdir(parents=True, exist_ok=True)

### opening resulting dataset with pandas

In [3]:
def rolling_window(df, window_size, stride, field):
    """Join each user's posts into (possibly overlapping) text windows.

    Users are processed in order of first appearance in ``df`` and each user's
    posts keep their row order. For every user, windows start at offsets
    0, stride, 2*stride, ... and each window joins up to ``window_size``
    consecutive posts with a single space. Windows near the end of a user's
    history therefore contain fewer than ``window_size`` posts.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'User', 'Label' and ``field``.
    window_size : int
        Maximum number of posts per window (>= 1).
    stride : int
        Step between consecutive window starts (>= 1).
    field : str
        Name of the column holding the post text to concatenate.

    Returns
    -------
    pandas.DataFrame
        One row per window with columns 'User', 'Window_id', 'Text', 'Label'.
        'Label' is the label of the user's first row.

    Raises
    ------
    ValueError
        If ``window_size`` or ``stride`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError("window_size must be a positive integer")
    if stride < 1:
        raise ValueError("stride must be a positive integer")

    records = []
    # A single groupby pass replaces re-filtering the whole frame once per user;
    # sort=False keeps users in first-appearance order.
    for user, user_df in df.groupby('User', sort=False):
        label = user_df['Label'].iloc[0]
        posts = user_df[field].tolist()
        for window_id, start in enumerate(range(0, len(posts), stride)):
            text = ' '.join(posts[start:start + window_size])
            records.append((user, window_id, text, label))

    return pd.DataFrame(records, columns=['User', 'Window_id', 'Text', 'Label'])

In [4]:
import pickle
# Persist embeddings on disk
def save_embeddings(filepath, embeddings):
    """Pickle ``embeddings`` to ``filepath`` wrapped in a dict under the key 'embeddings'."""
    payload = {'embeddings': embeddings}
    with open(filepath, "wb") as sink:
        pickle.dump(payload, sink, protocol=pickle.HIGHEST_PROTOCOL)

# Read embeddings back from disk
def load_embeddings(filepath):
    """Return the object stored under 'embeddings' in the pickle at ``filepath``.

    NOTE: unpickling can execute arbitrary code; only load files produced by this notebook.
    """
    with open(filepath, "rb") as source:
        return pickle.load(source)['embeddings']

In [5]:
import pandas as pd
import numpy as np
import os
seed=23
np.random.seed(seed)


def _add_sentiment_features(df, analyzer):
    """Add TextBlob and VADER sentiment columns computed from ``df['Text']``.

    Adds, in this order: 'polarity', 'subjectivity' (TextBlob) and
    'negativity', 'positivity', 'neutrality', 'compound' (VADER).
    'polarity' and 'compound' are rescaled with ``(x + 1) / 2``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Text' column; modified in place.
    analyzer : object
        Object exposing ``polarity_scores(text)`` that returns a mapping with
        the keys 'neg', 'pos', 'neu' and 'compound'.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the six columns added.
    """
    # Local import: textblob is only needed when the features are recomputed.
    from textblob import TextBlob

    tb_scores = df['Text'].apply(lambda text: TextBlob(text).sentiment)
    vader_scores = df['Text'].apply(lambda text: analyzer.polarity_scores(text))

    # Same rescaling as 'compound' below. The previous expression `tb[0]+1/2`
    # only added 0.5 because of operator precedence.
    df['polarity'] = tb_scores.apply(lambda tb: (tb[0] + 1) / 2)
    df['subjectivity'] = tb_scores.apply(lambda tb: tb[1])
    df['negativity'] = vader_scores.apply(lambda v: v['neg'])
    df['positivity'] = vader_scores.apply(lambda v: v['pos'])
    df['neutrality'] = vader_scores.apply(lambda v: v['neu'])
    df['compound'] = vader_scores.apply(lambda v: (v['compound'] + 1) / 2)
    return df


# With SENT_MEASURED set, the pickles written by an earlier run are loaded as they are.
# Pickles produced before the polarity fix still hold the old `polarity + 0.5` values;
# set SENT_MEASURED = False once to regenerate them.
if not SENT_MEASURED:
    from nltk.sentiment.vader import SentimentIntensityAnalyzer

    sia = SentimentIntensityAnalyzer()

    train_df = pd.read_csv(TRAIN_TOKEN, sep='\t')
    test_df = pd.read_csv(TEST_TOKEN, sep='\t')
    train_df = rolling_window(train_df,ROLLING_WINDOW_SIZE,1,'Raw')
    test_df = rolling_window(test_df,ROLLING_WINDOW_SIZE,1,'Raw')

    train_df = _add_sentiment_features(train_df, sia)
    test_df = _add_sentiment_features(test_df, sia)

    train_df.to_pickle(f"train_df_{ROLLING_WINDOW_SIZE}_sent.pkl")
    test_df.to_pickle(f"test_df_{ROLLING_WINDOW_SIZE}_sent.pkl")

# Always continue from the pickled frames so both code paths yield the same objects.
train_df = pd.read_pickle(f"train_df_{ROLLING_WINDOW_SIZE}_sent.pkl")
test_df = pd.read_pickle(f"test_df_{ROLLING_WINDOW_SIZE}_sent.pkl")


In [6]:
# Preview the windowed training frame together with its sentiment columns.
train_df

Unnamed: 0,User,Window_id,Text,Label,polarity,subjectivity,negativity,positivity,neutrality,compound
0,test_subject1345,0,so many unwanted smith fadeaways. mid range j...,1,0.663051,0.516980,0.053,0.145,0.802,0.98315
1,test_subject1345,1,"mid range jumpers hey guys, celtics fan here p...",1,0.650092,0.517633,0.048,0.147,0.805,0.98440
2,test_subject1345,2,well he got number tonight so maybe he will b...,1,0.588214,0.594881,0.082,0.121,0.797,0.75595
3,test_subject1345,3,i mean he will get pinch hits and an occasion...,1,0.573929,0.588929,0.088,0.107,0.805,0.60815
4,test_subject1345,4,yeah you are probably right. oh well. i gues...,1,0.626531,0.626531,0.091,0.118,0.792,0.63660
...,...,...,...,...,...,...,...,...,...,...
174168,subject9959,627,nothing like that clean house feeling there i...,0,0.834343,0.551515,0.097,0.033,0.869,0.28280
174169,subject9959,628,there is always that one coworker... there is...,0,0.818182,0.477273,0.000,0.000,1.000,0.50000
174170,subject9959,629,there is always that one coworker you just can...,0,0.818182,0.477273,0.000,0.000,1.000,0.50000
174171,subject9959,630,that moment when you realize you need a new jo...,0,0.818182,0.477273,0.000,0.000,1.000,0.50000


In [7]:

# Sentence-embedding step: encode the window texts with the sentence-transformers
# model, or reuse the pickled embeddings from a previous run when CONVERTED is set.
train_embeddings_path = f"./train_sentence_embeddings_{LM_MODEL}_{ROLLING_WINDOW_SIZE}.pkl"
val_embeddings_path = f"./val_sentence_embeddings_{LM_MODEL}_{ROLLING_WINDOW_SIZE}.pkl"

if not CONVERTED:
    from sentence_transformers import SentenceTransformer
    model = SentenceTransformer(LM_MODEL)

    # Token budget per window depends on how many posts were joined together.
    model.max_seq_length = max_lengths[ROLLING_WINDOW_SIZE]
    encode_options = dict(show_progress_bar=True, output_value='sentence_embedding',
                          batch_size=64, convert_to_numpy=True)

    # encode() is documented for a str or a list of str; pass plain lists so the
    # result does not depend on the DataFrame's index labels.
    train_sentence_embeddings = model.encode(train_df['Text'].tolist(), **encode_options)
    val_sentence_embeddings = model.encode(test_df['Text'].tolist(), **encode_options)

    save_embeddings(train_embeddings_path, train_sentence_embeddings)
    save_embeddings(val_embeddings_path, val_sentence_embeddings)
else:
    train_sentence_embeddings = load_embeddings(train_embeddings_path)
    val_sentence_embeddings = load_embeddings(val_embeddings_path)

# Fail early with a clear message if a stale cache does not match the current frames.
if len(train_sentence_embeddings) != len(train_df) or len(val_sentence_embeddings) != len(test_df):
    raise ValueError("Cached sentence embeddings do not match the number of text windows; "
                     "set CONVERTED = False to re-encode.")

# One list of floats per row, stored in a 'Vector' column.
test_df['Vector'] = np.asarray(val_sentence_embeddings).tolist()
train_df['Vector'] = np.asarray(train_sentence_embeddings).tolist()

In [8]:

# Merge both splits and shuffle the rows reproducibly; the index is rebuilt as 0..n-1.
full_df = (
    pd.concat([train_df, test_df])
    .sample(frac=1, random_state=seed)
    .reset_index(drop=True)
)

In [11]:
import pickle
# Write embeddings to disk.
# NOTE(review): an earlier cell already defines save_embeddings with the same behaviour;
# one of the two definitions can be removed.
def save_embeddings(filepath, embeddings):
    """Pickle ``{'embeddings': embeddings}`` to ``filepath`` using the highest pickle protocol."""
    wrapped = {'embeddings': embeddings}
    with open(filepath, "wb") as out_file:
        pickle.dump(wrapped, out_file, protocol=pickle.HIGHEST_PROTOCOL)

# Read embeddings from disk.
# NOTE(review): an earlier cell already defines load_embeddings with the same behaviour;
# one of the two definitions can be removed.
def load_embeddings(filepath):
    """Unpickle ``filepath`` and return the value stored under the 'embeddings' key."""
    with open(filepath, "rb") as in_file:
        contents = pickle.load(in_file)
    return contents['embeddings']

In [12]:
import pickle
def save_model(model, name):
    """Pickle ``model`` into the file ``name`` inside the MODEL_PATH directory."""
    target = f"{MODEL_PATH}/{name}"
    with open(target, 'wb') as handle:
        pickle.dump(model, handle)

In [13]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
def custom_cv(model, df, n_folds=5, sent=False):
    """Cross-validate ``model`` with folds split by user rather than by row.

    Users (one row per distinct 'User', with that row's 'Label') are split with a
    stratified K-fold; all windows of a user land in the same fold. The model
    is fit on the training users' rows and scored with F1 on the held-out rows.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
    df : pandas.DataFrame
        Needs 'User', 'Label' and 'Vector' columns, plus the six sentiment
        columns when ``sent`` is true.
    n_folds : int
        Number of stratified folds.
    sent : bool
        When true, the sentiment columns are appended to the embedding features.

    Returns
    -------
    list of float
        One F1 score per fold.
    """
    sentiment_columns = ['polarity', 'subjectivity', 'negativity',
                         'positivity', 'neutrality', 'compound']

    def build_features(rows):
        # Expand the per-row embedding lists into one column per dimension.
        features = pd.DataFrame(rows['Vector'].values.tolist(), index=rows.index)
        if sent:
            features = np.c_[tuple([features] + [rows[col] for col in sentiment_columns])]
        return features

    per_user = df.drop_duplicates('User')
    users = per_user['User'].to_numpy()
    labels = per_user['Label'].to_numpy()

    splitter = StratifiedKFold(n_splits=n_folds)
    f1_scores = []
    for train_index, test_index in splitter.split(users, labels):
        train_rows = df[df['User'].isin(list(users[train_index]))]
        test_rows = df[df['User'].isin(list(users[test_index]))]

        model.fit(build_features(train_rows), train_rows['Label'])
        predictions = model.predict(build_features(test_rows))
        f1_scores.append(f1_score(test_rows['Label'], predictions))

    return f1_scores

## Model part

In [14]:
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.naive_bayes import MultinomialNB
# Candidate baseline classifiers, all seeded for reproducibility.
models = {
    'sgdLR': SGDClassifier(random_state=seed, loss='log'),
    'sgdlSVM': SGDClassifier(random_state=seed, loss='hinge'),
    'ExtraTrees': ExtraTreesClassifier(random_state=seed, n_jobs=-1),
    'Perceptron': Perceptron(random_state=seed),
}

if BASELINE_COMP:
    # Cross-validate every candidate, remember the best mean F1 and write a text report.
    report = ""
    best_model_name, best_model, best_f1 = "", None, 0
    for model_name, model in models.items():
        res = custom_cv(model, full_df)
        mean_f1 = np.mean(res)
        if mean_f1 > best_f1:
            best_f1, best_model_name, best_model = mean_f1, model_name, model
        summary_line = f"{model_name} f1: {round(mean_f1,3)}"
        report += summary_line + "\n"
        print(summary_line)

    with open(f"{MODEL_PATH}/baseline_report_{LM_MODEL}.txt", 'w') as f:
        f.write(report)


In [15]:
if BASELINE_COMP:
    # Re-run the best baseline with the sentiment columns appended and log the result.
    res = custom_cv(best_model, full_df, sent=True)
    sent_mean_f1 = round(np.mean(res), 3)
    print(f"{best_model_name} f1: {sent_mean_f1}")

    report_file = f"{MODEL_PATH}/baseline_report_{LM_MODEL}.txt"
    with open(report_file, 'a') as f:
        f.write(f"\nBest model with sent:\n{best_model_name} f1: {sent_mean_f1}")

# Hyperparameter tuning

#### Best performing model was sgd Logistic Regression for window size 10, using MPNet without SA features, having achieved a 0.667 F1 score.
#### However, we did not consider the extra training time required by MPNet features worthwhile, since MiniLM achieved very similar performance with roughly half as many features (F1 = 0.663 with sgdLR, no SA).

In [16]:
from tqdm import tqdm
import optuna
import joblib

def train_eval_tuning(trial, params, df, sent=False):
    """Evaluate one Optuna trial: 5-fold, user-level stratified CV of an SGDClassifier.

    After every fold the running mean F1 is reported to ``trial`` so the study's
    pruner can stop unpromising trials early.

    Parameters
    ----------
    trial : optuna trial object used for ``report`` / ``should_prune``
    params : dict
        Keyword arguments passed to ``SGDClassifier``.
    df : pandas.DataFrame
        Needs 'User', 'Label' and 'Vector' columns, plus the six sentiment
        columns when ``sent`` is true.
    sent : bool
        When true, the sentiment columns are appended to the embedding features.

    Returns
    -------
    list of float
        One F1 score per fold.

    Raises
    ------
    optuna.exceptions.TrialPruned
        When the trial reports that it should be pruned.
    """
    sentiment_columns = ['polarity', 'subjectivity', 'negativity',
                         'positivity', 'neutrality', 'compound']

    def build_features(rows):
        # Expand the per-row embedding lists into one column per dimension.
        features = pd.DataFrame(rows['Vector'].values.tolist(), index=rows.index)
        if sent:
            features = np.c_[tuple([features] + [rows[col] for col in sentiment_columns])]
        return features

    per_user = df.drop_duplicates('User')
    users = per_user['User'].to_numpy()
    labels = per_user['Label'].to_numpy()

    f1_scores = []
    splits = StratifiedKFold(n_splits=5).split(users, labels)
    for fold, (train_index, test_index) in enumerate(splits):
        train_rows = df[df['User'].isin(list(users[train_index]))]
        test_rows = df[df['User'].isin(list(users[test_index]))]
        X_train = build_features(train_rows)
        X_test = build_features(test_rows)

        # A fresh classifier per fold, built from the trial's parameters.
        model = SGDClassifier(**params)
        model.fit(X_train, train_rows['Label'])
        f1_scores.append(f1_score(test_rows['Label'], model.predict(X_test)))

        trial.report(np.mean(f1_scores), fold)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return f1_scores


In [17]:
def tuning_objective(trial):
    """Optuna objective: sample SGDClassifier settings and return their mean CV F1.

    'loss' and 'random_state' are sampled from single-value spaces, so they are
    fixed but still recorded with the trial's parameters.
    """
    search_space = dict(
        max_iter=trial.suggest_int('max_iter', 1000, 2500, step=500),
        loss=trial.suggest_categorical('loss', ['log']),
        penalty=trial.suggest_categorical('penalty', ['l2', 'l1', 'elasticnet']),
        alpha=trial.suggest_float('alpha', 0.00001, 0.1, log=True),
        random_state=trial.suggest_int('random_state', seed, seed),
    )

    fold_scores = train_eval_tuning(trial, search_space, full_df, sent=False)
    return np.mean(fold_scores)

In [22]:
# Resume the saved study when one exists, otherwise start a new one. The previous
# version had the create_study call commented out, so the cell only worked if
# `study` was already in the kernel (NameError on Restart & Run All).
study_file = f"t{TASK}_tuning_{ROLLING_WINDOW_SIZE}.pkl"
if Path(study_file).exists():
    study = joblib.load(study_file)
else:
    study = optuna.create_study(
        study_name=f"t{TASK}_tuning_{ROLLING_WINDOW_SIZE}",
        direction='maximize')

# Each execution adds up to 5 more trials (or stops after 12 hours) and persists the study.
study.optimize(tuning_objective, n_trials=5, timeout=(60*60*12))
joblib.dump(study, study_file)

[32m[I 2022-07-14 09:42:35,769][0m Trial 45 pruned. [0m
[32m[I 2022-07-14 09:46:43,832][0m Trial 46 pruned. [0m
[32m[I 2022-07-14 09:50:13,205][0m Trial 47 pruned. [0m
[32m[I 2022-07-14 10:24:38,583][0m Trial 48 finished with value: 0.6655177122889699 and parameters: {'max_iter': 2000, 'loss': 'log', 'penalty': 'l1', 'alpha': 8.093351246200499e-05, 'random_state': 23}. Best is trial 7 with value: 0.6699096264182774.[0m
[32m[I 2022-07-14 10:28:07,488][0m Trial 49 pruned. [0m


In [23]:
# Show the best trial's parameters and its objective value (mean F1 over the CV folds).
study.best_trial.params, study.best_value

({'max_iter': 2500,
  'loss': 'log',
  'penalty': 'l1',
  'alpha': 0.00011514817252370237,
  'random_state': 23},
 0.6699096264182774)

In [24]:
# Append the tuned model's score and parameters to the baseline report file.
optimized_f1 = round(study.best_value, 3)
with open(f"{MODEL_PATH}/baseline_report_{LM_MODEL}.txt", 'a') as f:
    f.write(f"\nOptimized model f1: {optimized_f1}\nparams: {study.best_trial.params}")

In [25]:
# Refit the tuned SGD classifier on every available row and persist it.
sgdLR_params = study.best_trial.params

# One feature column per embedding dimension, aligned with full_df's index.
full_train = pd.DataFrame(full_df['Vector'].values.tolist(), index=full_df.index)

final_model = SGDClassifier(**sgdLR_params).fit(full_train, full_df['Label'])
save_model(final_model, "optimized_sgdLR.pkl")
