# Task 1 (Detecting pathological gamblers)
### Using contextualized language models

In [1]:
P_FILE = "../posts.csv"
TRAIN_TOKEN="../train_df.csv"
TEST_TOKEN="../test_df.csv"
GENERAL_MODELS="../Models"
ROLLING_WINDOW_SIZE=10
LM_MODEL="all-mpnet-base-v2"
CONVERTED=True
MODEL_PATH =f"{GENERAL_MODELS}/LM/NN_win_{ROLLING_WINDOW_SIZE}" 



In [2]:
from pathlib import Path
Path(MODEL_PATH).mkdir(parents=True, exist_ok=True)

### opening resulting dataset with pandas

In [3]:
import pandas as pd
import numpy as np
import os
seed=23
np.random.seed(seed)

train_df = pd.read_csv(TRAIN_TOKEN, sep='\t')
test_df = pd.read_csv(TEST_TOKEN, sep='\t')
train_df

Unnamed: 0,User,Post_Nr,Raw,Stemmed,Lemmatized,Label
0,3450,0,"sports betting number k in debt, feeling very ...","sport bet number k in debt , feel veri depress...","sports betting number k in debt , feeling very...",1
1,3450,1,finally accepted that you cannot win gambling ...,final accept that you can not win gambl relaps...,finally accepted that you can not win gambling...,1
2,3450,2,blocking software betfilter has anybody used t...,"block softwar betfilt has anybodi use this , w...",blocking software betfilter has anybody used t...,1
3,3450,3,prone to relapse when in debt? i find that whe...,prone to relaps when in debt ? i find that whe...,prone to relapse when in debt ? i find that wh...,1
4,3450,4,down to my last number on credit card i am num...,down to my last number on credit card i am num...,down to my last number on credit card i am num...,1
...,...,...,...,...,...,...
97853,162,670,you sick fuck,you sick fuck,you sick fuck,0
97854,162,671,reeeeeeeeeeeeeeeeeeeeeeeeeee,reeeeeeeeeeeeeeeeeeeeeeeeeee,reeeeeeeeeeeeeeeeeeeeeeeeeee,0
97855,162,672,this is so clearly satire! i really do not wan...,this is so clear satir ! i realli do not want ...,this is so clearly satire ! i really do not wa...,0
97856,162,673,he cannot write a story in number minutes,he can not write a stori in number minut,he can not write a story in number minutes,0


In [4]:
def rolling_window(df, window_size,stride, field):
    res_map={}
    for user in df['User'].unique():
        user_df = df[df['User']==user]
        res_map[user]=(user_df['Label'].values[0],{})
        posts = user_df[field].values
        iteration=0
        for i in range(0,len(posts),stride):
            res_map[user][1][iteration]=' '.join((posts[i:i+window_size]))
            iteration+=1
    result_df = pd.DataFrame([(k,k1,v1,v[0]) for k,v in res_map.items() for k1,v1 in v[1].items()], columns = ['User','Window_id','Text','Label'])
    
    return result_df

In [5]:
train_df = rolling_window(train_df,ROLLING_WINDOW_SIZE,1,'Raw')
test_df = rolling_window(test_df,ROLLING_WINDOW_SIZE,1,'Raw')

train_df

Unnamed: 0,User,Window_id,Text,Label
0,3450,0,"sports betting number k in debt, feeling very ...",1
1,3450,1,finally accepted that you cannot win gambling ...,1
2,3450,2,blocking software betfilter has anybody used t...,1
3,3450,3,prone to relapse when in debt? i find that whe...,1
4,3450,4,down to my last number on credit card i am num...,1
...,...,...,...,...
97853,162,670,you sick fuck reeeeeeeeeeeeeeeeeeeeeeeeeee thi...,0
97854,162,671,reeeeeeeeeeeeeeeeeeeeeeeeeee this is so clearl...,0
97855,162,672,this is so clearly satire! i really do not wan...,0
97856,162,673,he cannot write a story in number minutes pay ...,0


In [6]:
import pickle
#Store sentences & embeddings on disc
def save_embeddings(filepath, embeddings):
    with open(filepath, "wb") as fOut:
        pickle.dump({ 'embeddings': embeddings}, fOut, protocol=pickle.HIGHEST_PROTOCOL)

#Load sentences & embeddings from disc
def load_embeddings(filepath):
    with open(filepath, "rb") as fIn:
        stored_data = pickle.load(fIn)
        stored_embeddings = stored_data['embeddings']
    return stored_embeddings   

In [7]:
if not CONVERTED:
    from sentence_transformers import SentenceTransformer
    model = SentenceTransformer(LM_MODEL)

    model.max_seq_length = 512
    train_sentence_embeddings = model.encode(train_df['Text'],show_progress_bar=True,\
                output_value='sentence_embedding', batch_size=64,convert_to_numpy=True)

    val_sentence_embeddings = model.encode(test_df['Text'],show_progress_bar=True,\
                output_value='sentence_embedding', batch_size=64,convert_to_numpy=True)



In [8]:
#if not CONVERTED:
#    save_embeddings(f"./NN_train_sentence_embeddings_{LM_MODEL}_{ROLLING_WINDOW_SIZE}.pkl",train_sentence_embeddings)
#    save_embeddings(f"./NN_val_sentence_embeddings_{LM_MODEL}_{ROLLING_WINDOW_SIZE}.pkl",val_sentence_embeddings)

train_sentence_embeddings = load_embeddings(f"./train_sentence_embeddings_{LM_MODEL}_{ROLLING_WINDOW_SIZE}.pkl")
val_sentence_embeddings = load_embeddings(f"./val_sentence_embeddings_{LM_MODEL}_{ROLLING_WINDOW_SIZE}.pkl")



In [9]:

test_df['Vector'] = pd.DataFrame(data=val_sentence_embeddings).values.tolist()
train_df['Vector'] = pd.DataFrame(data=train_sentence_embeddings).values.tolist()


In [11]:
train_df

Unnamed: 0,User,Window_id,Text,Label,Vector
0,3450,0,"sports betting number k in debt, feeling very ...",1,"[0.001481963787227869, 0.08722537010908127, -0..."
1,3450,1,finally accepted that you cannot win gambling ...,1,"[0.0011301715858280659, 0.09679407626390457, -..."
2,3450,2,blocking software betfilter has anybody used t...,1,"[0.0012601062189787626, 0.08274412155151367, -..."
3,3450,3,prone to relapse when in debt? i find that whe...,1,"[0.018009161576628685, 0.05251404270529747, -0..."
4,3450,4,down to my last number on credit card i am num...,1,"[-0.006508147809654474, 0.08554978668689728, -..."
...,...,...,...,...,...
97853,162,670,you sick fuck reeeeeeeeeeeeeeeeeeeeeeeeeee thi...,0,"[0.04130152612924576, 0.0463968887925148, -0.0..."
97854,162,671,reeeeeeeeeeeeeeeeeeeeeeeeeee this is so clearl...,0,"[0.044361602514982224, 0.03786471486091614, -0..."
97855,162,672,this is so clearly satire! i really do not wan...,0,"[0.04709811136126518, 0.04260839894413948, -0...."
97856,162,673,he cannot write a story in number minutes pay ...,0,"[-0.017870843410491943, 0.004002168774604797, ..."


## Model part

In [12]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

torch.manual_seed(seed)


class WritingWindowDataset(Dataset):
    def __init__(self, vectors, labels):

        self.labels = [label for label in labels]
        self.vectors = [vector for vector in vectors]
    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_vectors(self, idx):
        # Fetch a batch of inputs
        return self.vectors[idx]

    def __getitem__(self, idx):

        batch_vectors = self.get_batch_vectors(idx)
        batch_y = self.get_batch_labels(idx)
        

        return batch_vectors, batch_y

In [13]:
full_ds = WritingWindowDataset(np.concatenate((train_sentence_embeddings, val_sentence_embeddings), axis=0), pd.concat([train_df['Label'], test_df['Label']]))

In [14]:
class LmNeuralNetwork(nn.Module):
    def __init__(self, trial):
        super(LmNeuralNetwork, self).__init__()
        self.layers=[]
        n_layers = trial.suggest_int("n_layers", 1, 3)

        in_features = 768
        for i in range(n_layers):
            out_features = trial.suggest_int("n_units_l{}".format(i), 4, min(in_features,128))
            self.layers.append(nn.Linear(in_features, out_features))
            self.layers.append(nn.ReLU())
            p = trial.suggest_float("dropout_l{}".format(i), 0.2, 0.5, step=0.05)
            self.layers.append(nn.Dropout(p))

            in_features = out_features
        self.layers.append(nn.Linear(in_features, 1))
        self.cls_layers = torch.nn.ModuleList(self.layers)
    def forward(self, x):

        for layer in self.cls_layers:
            x = layer(x)
        return torch.sigmoid(x)


In [15]:

trainloader = torch.utils.data.DataLoader(
                full_ds, 
                batch_size=32, 
                shuffle=True)
       

In [23]:
import optuna 
import joblib
study = joblib.load(f"NN_hyperparameterstudy{ROLLING_WINDOW_SIZE}.pkl")
params = optuna.trial.FixedTrial(study.best_trial.params)
model= LmNeuralNetwork(params)

In [24]:
params.params

{'n_layers': 2,
 'n_units_l0': 28,
 'dropout_l0': 0.45,
 'n_units_l1': 17,
 'dropout_l1': 0.4}

In [25]:
from torch.optim import AdamW
from tqdm import tqdm
from sklearn.metrics import f1_score
BATCH_SIZE = 32
epochs=10


model = model.cuda()
optimizer = AdamW(model.parameters(), lr=study.best_trial.params['learning_rate'])#, lr= learning_rate)

use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

criterion = nn.BCELoss()

        

for epoch_num in range(epochs):
    
    total_loss_train = 0
    train_preds = np.array([])
    train_targets = np.array([])
    model.train()
    for train_input, train_label in tqdm(trainloader):

        train_label = train_label.to(device)
        train_label = train_label.float()
        train_label = train_label.unsqueeze(1)
        features = train_input.to(device)
        

        output = model(features)
        batch_loss = criterion(output, train_label)
        total_loss_train += float(batch_loss.item())
        
        train_preds = np.concatenate((train_preds,torch.round(output).detach().cpu().numpy().flatten()),axis=0)
        train_targets = np.concatenate((train_targets,train_label.detach().cpu().numpy().flatten()),axis=0)

        
        model.zero_grad()
        batch_loss.backward()
        optimizer.step()
    train_loss=total_loss_train/len(train_preds)
    train_f1 = f1_score(train_targets,train_preds)
        
    print(
            f'Epochs: {epoch_num + 1} | Train Loss: {train_loss: .4f} \
            | Train F1: {train_f1: .4f}')

    


100%|██████████| 3756/3756 [00:27<00:00, 137.71it/s]


Epochs: 1 | Train Loss:  0.0215             | Train F1:  0.6350


100%|██████████| 3756/3756 [00:12<00:00, 309.97it/s]


Epochs: 2 | Train Loss:  0.0181             | Train F1:  0.7627


100%|██████████| 3756/3756 [00:12<00:00, 312.17it/s]


Epochs: 3 | Train Loss:  0.0147             | Train F1:  0.8098


100%|██████████| 3756/3756 [00:12<00:00, 312.75it/s]


Epochs: 4 | Train Loss:  0.0126             | Train F1:  0.8216


100%|██████████| 3756/3756 [00:12<00:00, 310.54it/s]


Epochs: 5 | Train Loss:  0.0117             | Train F1:  0.8310


100%|██████████| 3756/3756 [00:12<00:00, 306.32it/s]


Epochs: 6 | Train Loss:  0.0111             | Train F1:  0.8370


100%|██████████| 3756/3756 [00:12<00:00, 306.51it/s]


Epochs: 7 | Train Loss:  0.0108             | Train F1:  0.8420


100%|██████████| 3756/3756 [00:12<00:00, 306.37it/s]


Epochs: 8 | Train Loss:  0.0106             | Train F1:  0.8450


100%|██████████| 3756/3756 [00:12<00:00, 310.05it/s]


Epochs: 9 | Train Loss:  0.0104             | Train F1:  0.8498


100%|██████████| 3756/3756 [00:12<00:00, 307.25it/s]


Epochs: 10 | Train Loss:  0.0103             | Train F1:  0.8512


In [26]:
torch.save(model, f"{MODEL_PATH}/final_NN.pt")