# Task 2 (Detecting depressed subjects)
### Using contextualized language models

In [1]:
P_FILE = "../posts.csv"
TRAIN_TOKEN="../train_df.csv"
TEST_TOKEN="../test_df.csv"
GENERAL_MODELS="../Models"
ROLLING_WINDOW_SIZE=10
LM_MODEL="all-mpnet-base-v2"#"all-MiniLM-L6-v2"
CONVERTED=True
MODEL_PATH =f"{GENERAL_MODELS}/LM/NN_win_{ROLLING_WINDOW_SIZE}" 



In [2]:
from pathlib import Path
Path(MODEL_PATH).mkdir(parents=True, exist_ok=True)

### opening resulting dataset with pandas

In [3]:
import pandas as pd
import numpy as np
import os
seed=23
np.random.seed(seed)

train_df = pd.read_csv(TRAIN_TOKEN, sep='\t')
test_df = pd.read_csv(TEST_TOKEN, sep='\t')
train_df

Unnamed: 0,User,Post_Nr,Raw,Stemmed,Lemmatized,Label
0,test_subject1345,0,so many unwanted smith fadeaways.,so mani unwant smith fadeaway .,so many unwanted smith fadeaways .,1
1,test_subject1345,1,"mid range jumpers hey guys, celtics fan here p...","mid rang jumper hey guy , celtic fan here pull...","mid range jumpers hey guys , celtics fan here ...",1
2,test_subject1345,2,well he got number tonight so maybe he will b...,well he got number tonight so mayb he will be ...,well he got number tonight so maybe he will ...,1
3,test_subject1345,3,i mean he will get pinch hits and an occasion...,i mean he will get pinch hit and an occasion d...,i mean he will get pinch hits and an occasio...,1
4,test_subject1345,4,yeah you are probably right. oh well.,yeah you are probabl right . oh well .,yeah you are probably right . oh well .,1
...,...,...,...,...,...,...
174168,subject9959,627,nothing like that clean house feeling,noth like that clean hous feel,nothing like that clean house feeling,0
174169,subject9959,628,there is always that one coworker...,there is alway that one cowork ...,there is always that one coworker ...,0
174170,subject9959,629,there is always that one coworker you just can...,there is alway that one cowork you just can no...,there is always that one coworker you just can...,0
174171,subject9959,630,that moment when you realize you need a new job,that moment when you realiz you need a new job,that moment when you realize you need a new job,0


In [4]:
def rolling_window(df, window_size,stride, field):
    res_map={}
    for user in df['User'].unique():
        user_df = df[df['User']==user]
        res_map[user]=(user_df['Label'].values[0],{})
        posts = user_df[field].values
        iteration=0
        for i in range(0,len(posts),stride):
            res_map[user][1][iteration]=' '.join((posts[i:i+window_size]))
            iteration+=1
    result_df = pd.DataFrame([(k,k1,v1,v[0]) for k,v in res_map.items() for k1,v1 in v[1].items()], columns = ['User','Window_id','Text','Label'])
    
    return result_df

In [5]:
train_df = rolling_window(train_df,ROLLING_WINDOW_SIZE,1,'Raw')
test_df = rolling_window(test_df,ROLLING_WINDOW_SIZE,1,'Raw')

train_df

Unnamed: 0,User,Window_id,Text,Label
0,test_subject1345,0,so many unwanted smith fadeaways. mid range j...,1
1,test_subject1345,1,"mid range jumpers hey guys, celtics fan here p...",1
2,test_subject1345,2,well he got number tonight so maybe he will b...,1
3,test_subject1345,3,i mean he will get pinch hits and an occasion...,1
4,test_subject1345,4,yeah you are probably right. oh well. i gues...,1
...,...,...,...,...
174168,subject9959,627,nothing like that clean house feeling there i...,0
174169,subject9959,628,there is always that one coworker... there is...,0
174170,subject9959,629,there is always that one coworker you just can...,0
174171,subject9959,630,that moment when you realize you need a new jo...,0


In [6]:
import pickle
#Store sentences & embeddings on disc
def save_embeddings(filepath, embeddings):
    with open(filepath, "wb") as fOut:
        pickle.dump({ 'embeddings': embeddings}, fOut, protocol=pickle.HIGHEST_PROTOCOL)

#Load sentences & embeddings from disc
def load_embeddings(filepath):
    with open(filepath, "rb") as fIn:
        stored_data = pickle.load(fIn)
        stored_embeddings = stored_data['embeddings']
    return stored_embeddings   

In [7]:
if not CONVERTED:
    from sentence_transformers import SentenceTransformer
    model = SentenceTransformer(LM_MODEL)

    model.max_seq_length = 512
    train_sentence_embeddings = model.encode(train_df['Text'],show_progress_bar=True,\
                output_value='sentence_embedding', batch_size=64,convert_to_numpy=True)

    val_sentence_embeddings = model.encode(test_df['Text'],show_progress_bar=True,\
                output_value='sentence_embedding', batch_size=64,convert_to_numpy=True)



In [8]:
#if not CONVERTED:
#    save_embeddings(f"./NN_train_sentence_embeddings_{LM_MODEL}_{ROLLING_WINDOW_SIZE}.pkl",train_sentence_embeddings)
#    save_embeddings(f"./NN_val_sentence_embeddings_{LM_MODEL}_{ROLLING_WINDOW_SIZE}.pkl",val_sentence_embeddings)

train_sentence_embeddings = load_embeddings(f"./train_sentence_embeddings_{LM_MODEL}_{ROLLING_WINDOW_SIZE}.pkl")
val_sentence_embeddings = load_embeddings(f"./val_sentence_embeddings_{LM_MODEL}_{ROLLING_WINDOW_SIZE}.pkl")



In [9]:

test_df['Vector'] = pd.DataFrame(data=val_sentence_embeddings).values.tolist()
train_df['Vector'] = pd.DataFrame(data=train_sentence_embeddings).values.tolist()


In [11]:
train_df

Unnamed: 0,User,Window_id,Text,Label,Vector
0,test_subject1345,0,so many unwanted smith fadeaways. mid range j...,1,"[-0.054277509450912476, 0.0824999287724495, 0...."
1,test_subject1345,1,"mid range jumpers hey guys, celtics fan here p...",1,"[-0.054094888269901276, 0.06312193721532822, 0..."
2,test_subject1345,2,well he got number tonight so maybe he will b...,1,"[-0.047845326364040375, 0.0940595269203186, -0..."
3,test_subject1345,3,i mean he will get pinch hits and an occasion...,1,"[-0.04452856630086899, 0.09207145869731903, -0..."
4,test_subject1345,4,yeah you are probably right. oh well. i gues...,1,"[-0.03719930350780487, 0.08828624337911606, -0..."
...,...,...,...,...,...
174168,subject9959,627,nothing like that clean house feeling there i...,0,"[-0.014173840172588825, 0.08119028061628342, 0..."
174169,subject9959,628,there is always that one coworker... there is...,0,"[0.006969124544411898, 0.08165775239467621, -0..."
174170,subject9959,629,there is always that one coworker you just can...,0,"[-0.014388704672455788, 0.09135324507951736, -..."
174171,subject9959,630,that moment when you realize you need a new jo...,0,"[-0.0011959681287407875, 0.09174152463674545, ..."


## Model part

In [12]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

torch.manual_seed(seed)


class WritingWindowDataset(Dataset):
    def __init__(self, vectors, labels):

        self.labels = [label for label in labels]
        self.vectors = [vector for vector in vectors]
    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_vectors(self, idx):
        # Fetch a batch of inputs
        return self.vectors[idx]

    def __getitem__(self, idx):

        batch_vectors = self.get_batch_vectors(idx)
        batch_y = self.get_batch_labels(idx)
        

        return batch_vectors, batch_y

In [13]:
full_ds = WritingWindowDataset(np.concatenate((train_sentence_embeddings, val_sentence_embeddings), axis=0), pd.concat([train_df['Label'], test_df['Label']]))



In [14]:
class LmNeuralNetwork(nn.Module):
    def __init__(self, trial):
        super(LmNeuralNetwork, self).__init__()
        self.layers=[]
        n_layers = trial.suggest_int("n_layers", 1, 3)

        in_features = 768
        for i in range(n_layers):
            out_features = trial.suggest_int("n_units_l{}".format(i), 4, min(in_features,128))
            self.layers.append(nn.Linear(in_features, out_features))
            self.layers.append(nn.ReLU())
            p = trial.suggest_float("dropout_l{}".format(i), 0.2, 0.5, step=0.05)
            self.layers.append(nn.Dropout(p))

            in_features = out_features
        self.layers.append(nn.Linear(in_features, 1))
        self.cls_layers = torch.nn.ModuleList(self.layers)
    def forward(self, x):

        for layer in self.cls_layers:
            x = layer(x)
        return torch.sigmoid(x)


In [15]:

trainloader = torch.utils.data.DataLoader(
                full_ds, 
                batch_size=32, 
                shuffle=True)
       

In [16]:
import optuna 
import joblib
study = joblib.load(f"NN_hyperparameterstudy{ROLLING_WINDOW_SIZE}.pkl")
params = optuna.trial.FixedTrial(study.best_trial.params)
model= LmNeuralNetwork(params)

In [17]:
params.params

{'n_layers': 3,
 'n_units_l0': 6,
 'dropout_l0': 0.5,
 'n_units_l1': 6,
 'dropout_l1': 0.30000000000000004,
 'n_units_l2': 5,
 'dropout_l2': 0.5}

In [18]:
from torch.optim import AdamW
from tqdm import tqdm
from sklearn.metrics import f1_score
BATCH_SIZE = 32
epochs=10


model = model.cuda()
optimizer = AdamW(model.parameters(), lr=study.best_trial.params['learning_rate'])#, lr= learning_rate)

use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

criterion = nn.BCELoss()

        

for epoch_num in range(epochs):
    
    total_loss_train = 0
    train_preds = np.array([])
    train_targets = np.array([])
    model.train()
    for train_input, train_label in tqdm(trainloader):

        train_label = train_label.to(device)
        train_label = train_label.float()
        train_label = train_label.unsqueeze(1)
        features = train_input.to(device)
        

        output = model(features)
        batch_loss = criterion(output, train_label)
        total_loss_train += float(batch_loss.item())
        
        train_preds = np.concatenate((train_preds,torch.round(output).detach().cpu().numpy().flatten()),axis=0)
        train_targets = np.concatenate((train_targets,train_label.detach().cpu().numpy().flatten()),axis=0)

        
        model.zero_grad()
        batch_loss.backward()
        optimizer.step()
    train_loss=total_loss_train/len(train_preds)
    train_f1 = f1_score(train_targets,train_preds)
        
    print(
            f'Epochs: {epoch_num + 1} | Train Loss: {train_loss: .4f} \
            | Train F1: {train_f1: .4f}')

    


100%|██████████| 6592/6592 [00:47<00:00, 139.34it/s]


Epochs: 1 | Train Loss:  0.0210             | Train F1:  0.1347


100%|██████████| 6592/6592 [00:23<00:00, 280.13it/s]


Epochs: 2 | Train Loss:  0.0195             | Train F1:  0.4437


100%|██████████| 6592/6592 [00:23<00:00, 275.55it/s]


Epochs: 3 | Train Loss:  0.0177             | Train F1:  0.5821


100%|██████████| 6592/6592 [00:23<00:00, 276.16it/s]


Epochs: 4 | Train Loss:  0.0171             | Train F1:  0.6231


100%|██████████| 6592/6592 [00:24<00:00, 273.90it/s]


Epochs: 5 | Train Loss:  0.0167             | Train F1:  0.6390


100%|██████████| 6592/6592 [00:23<00:00, 277.07it/s]


Epochs: 6 | Train Loss:  0.0166             | Train F1:  0.6486


100%|██████████| 6592/6592 [00:23<00:00, 276.68it/s]


Epochs: 7 | Train Loss:  0.0164             | Train F1:  0.6540


100%|██████████| 6592/6592 [00:23<00:00, 276.45it/s]


Epochs: 8 | Train Loss:  0.0163             | Train F1:  0.6583


100%|██████████| 6592/6592 [00:23<00:00, 277.50it/s]


Epochs: 9 | Train Loss:  0.0162             | Train F1:  0.6605


100%|██████████| 6592/6592 [00:23<00:00, 276.04it/s]


Epochs: 10 | Train Loss:  0.0162             | Train F1:  0.6590


In [19]:
torch.save(model, f"{MODEL_PATH}/final_NN.pt")