# Task 2 (Detecting depressed subjects)
### Using word embeddings

In [1]:
# --- Input / output locations ---------------------------------------------
P_FILE = "../posts.csv"
TRAIN_TOKEN = "../train_df.csv"
TEST_TOKEN = "../test_df.csv"
WORD_EMBEDDINGS = "../../word_embeddings/"
GENERAL_MODELS = "../Models"

# --- Experiment settings --------------------------------------------------
TASK = 2
seed = 23
ROLLING_WINDOW_SIZE = 10   # window size handed to rolling_window()
METHOD = 'mean'            # pooling method handed to vectorize_post()
EMBEDDINGS = 'GLOVE_CC'    # selects which embedding file is loaded
BASELINE_COMP = False      # enables the baseline-comparison cells when True

# Derived: output directory, one per embedding / window-size combination.
MODEL_PATH = f"{GENERAL_MODELS}/WE/{EMBEDDINGS}_{ROLLING_WINDOW_SIZE}"


In [2]:
from pathlib import Path
# Make sure the model output directory exists (missing parents are created,
# and an already existing directory is not an error).
Path(MODEL_PATH).mkdir(parents=True, exist_ok=True)

In [3]:
import gensim
import fasttext
# Registry of supported embeddings: EMBEDDINGS key -> binary file name.
we = {
    "FASTTEXT_CC": "crawl-300d-2M-subword.bin",
    "GLOVE_TT": "glove-twitter-200.bin",
    "GLOVE_CC": "glove_cc_300d.bin",
}


In [4]:
# Resolve the file name through the `we` registry instead of repeating it per
# branch, and fail loudly on an unsupported key (previously `embeddings` was
# silently left undefined and the error only surfaced in a later cell).
if EMBEDDINGS not in we:
    raise ValueError(f"Unknown EMBEDDINGS {EMBEDDINGS!r}; expected one of {sorted(we)}")

embeddings_file = WORD_EMBEDDINGS + we[EMBEDDINGS]
if EMBEDDINGS.startswith('GLOVE'):
    # Both GloVe variants are stored in binary word2vec format.
    embeddings = gensim.models.KeyedVectors.load_word2vec_format(embeddings_file, binary=True)
elif EMBEDDINGS.startswith('FASTTEXT'):
    embeddings = fasttext.load_model(embeddings_file)
else:
    raise ValueError(f"No loader defined for EMBEDDINGS {EMBEDDINGS!r}")


### Opening the resulting dataset with pandas

In [5]:
import pandas as pd
import numpy as np
import os
# Seed NumPy's global random state.
np.random.seed(seed)

# Both splits are read with the same tab separator.
train_df, test_df = (
    pd.read_csv(csv_path, sep='\t') for csv_path in (TRAIN_TOKEN, TEST_TOKEN)
)
train_df

Unnamed: 0,User,Post_Nr,Raw,Stemmed,Lemmatized,Label
0,test_subject1345,0,so many unwanted smith fadeaways.,so mani unwant smith fadeaway .,so many unwanted smith fadeaways .,1
1,test_subject1345,1,"mid range jumpers hey guys, celtics fan here p...","mid rang jumper hey guy , celtic fan here pull...","mid range jumpers hey guys , celtics fan here ...",1
2,test_subject1345,2,well he got number tonight so maybe he will b...,well he got number tonight so mayb he will be ...,well he got number tonight so maybe he will ...,1
3,test_subject1345,3,i mean he will get pinch hits and an occasion...,i mean he will get pinch hit and an occasion d...,i mean he will get pinch hits and an occasio...,1
4,test_subject1345,4,yeah you are probably right. oh well.,yeah you are probabl right . oh well .,yeah you are probably right . oh well .,1
...,...,...,...,...,...,...
174168,subject9959,627,nothing like that clean house feeling,noth like that clean hous feel,nothing like that clean house feeling,0
174169,subject9959,628,there is always that one coworker...,there is alway that one cowork ...,there is always that one coworker ...,0
174170,subject9959,629,there is always that one coworker you just can...,there is alway that one cowork you just can no...,there is always that one coworker you just can...,0
174171,subject9959,630,that moment when you realize you need a new job,that moment when you realiz you need a new job,that moment when you realize you need a new job,0


In [6]:
def rolling_window(df, window_size,stride, field):
    """
    Concatenate each user's consecutive posts into sliding text windows.

    For every user, windows start at positions 0, stride, 2*stride, ... of that
    user's posts (in their original row order). Each window joins up to
    `window_size` posts with a single space; windows near the end of a user's
    history contain fewer posts.

    :param df: frame with a 'User' column, a 'Label' column and the `field` column
    :param window_size: maximum number of posts joined into one window
    :param stride: step between the start positions of consecutive windows
    :param field: name of the text column whose values are joined
    :return: DataFrame with columns ['User', 'Window_id', 'Text', 'Label'], one
             row per window; 'Label' is the label of the user's first row
    """
    records = []
    # A single groupby pass replaces the per-user boolean mask over the whole
    # frame. sort=False keeps users in order of first appearance and rows in
    # their original order, so the output matches the previous implementation.
    for user, user_df in df.groupby('User', sort=False):
        label = user_df['Label'].values[0]
        posts = user_df[field].values
        for window_id, start in enumerate(range(0, len(posts), stride)):
            text = ' '.join(posts[start:start + window_size])
            records.append((user, window_id, text, label))

    return pd.DataFrame(records, columns=['User', 'Window_id', 'Text', 'Label'])

In [7]:
# Replace the per-post frames with per-window frames built from the
# 'Lemmatized' column, using a stride of one post.
train_df, test_df = (
    rolling_window(posts_df, ROLLING_WINDOW_SIZE, 1, 'Lemmatized')
    for posts_df in (train_df, test_df)
)

train_df

Unnamed: 0,User,Window_id,Text,Label
0,test_subject1345,0,so many unwanted smith fadeaways . mid range...,1
1,test_subject1345,1,"mid range jumpers hey guys , celtics fan here ...",1
2,test_subject1345,2,well he got number tonight so maybe he will ...,1
3,test_subject1345,3,i mean he will get pinch hits and an occasio...,1
4,test_subject1345,4,yeah you are probably right . oh well . i ...,1
...,...,...,...,...
174168,subject9959,627,nothing like that clean house feeling there is...,0
174169,subject9959,628,there is always that one coworker ... there is...,0
174170,subject9959,629,there is always that one coworker you just can...,0
174171,subject9959,630,that moment when you realize you need a new jo...,0


In [8]:
# Pool both window frames into one and shuffle the rows reproducibly; the
# combined frame is what the cross-validation cells below operate on.
train_df = (
    pd.concat([train_df, test_df])
    .sample(frac=1, random_state=seed)
    .reset_index(drop=True)
)


In [9]:
import pickle
def save_model(model, name):
    """
    Pickle `model` into the MODEL_PATH directory.

    :param model: object to serialize with pickle
    :param name: file name (relative to MODEL_PATH) to write to
    """
    target = f"{MODEL_PATH}/{name}"
    with open(target, 'wb') as handle:
        pickle.dump(model, handle)

## Model part

In [10]:
from tqdm import tqdm
import re
import operator

def build_vocab(sentences):
    """
    Count token occurrences over a collection of texts.

    Every text is tokenized with a regular expression that yields runs of word
    characters and individual non-space, non-word characters (punctuation).

    :param sentences: iterable of strings
    :return: dictionary mapping each token to its count, ordered from the most
             to the least frequent token
    """
    token_pattern = re.compile(r"\w+|[^\w\s]", re.UNICODE)
    counts = {}
    for text in tqdm(sentences):
        for token in token_pattern.findall(text):
            counts[token] = counts.get(token, 0) + 1
    # Stable ascending sort followed by a reversal: descending counts.
    ascending = sorted(counts.items(), key=lambda item: item[1])
    return dict(reversed(ascending))

In [11]:
# Token frequencies over all window texts, ordered from most to least frequent.
vocab_dict = build_vocab(train_df['Text'])
vocab_dict

100%|██████████| 210927/210927 [00:34<00:00, 6031.22it/s]


{'.': 4516700,
 'the': 2716145,
 ',': 2668221,
 'i': 2416414,
 'to': 1963788,
 'a': 1738277,
 'and': 1637573,
 'is': 1447022,
 'number': 1251359,
 'of': 1245039,
 'it': 1212082,
 'not': 1112011,
 'you': 1035802,
 'that': 1025429,
 'in': 942184,
 'for': 717307,
 'have': 655485,
 'my': 592988,
 '?': 578793,
 'are': 549075,
 'on': 537574,
 'but': 523223,
 'was': 509307,
 'this': 508326,
 'do': 496124,
 'with': 485730,
 'be': 437934,
 '!': 387048,
 'am': 386628,
 'they': 373301,
 'as': 364101,
 'so': 351858,
 'if': 349533,
 'would': 341663,
 'just': 335168,
 'me': 334830,
 'can': 333336,
 'like': 332804,
 'he': 323522,
 'or': 312890,
 'at': 303988,
 'what': 300534,
 'will': 280931,
 'your': 260970,
 'about': 259490,
 'out': 248961,
 'all': 248371,
 'from': 247054,
 'we': 228046,
 'one': 226518,
 'there': 225753,
 's': 219821,
 'up': 215318,
 'get': 211755,
 'an': 209485,
 'people': 203490,
 'when': 199618,
 'more': 188703,
 'how': 186413,
 'no': 173545,
 'because': 171486,
 'did': 168562,


In [12]:
import operator

def check_embedding_coverage(token_vocab,embeddings):
    """
    Report how much of a vocabulary is covered by an embedding lookup.

    Prints the share of distinct tokens and the share of all token occurrences
    for which `embeddings[token]` succeeds.

    :param token_vocab: dictionary mapping token -> occurrence count
    :param embeddings: object supporting `embeddings[token]` that raises
                       KeyError for unknown tokens
    :return: list of (token, count) pairs for out-of-vocabulary tokens, ordered
             by descending count
    """
    oov = {}
    covered_types = 0
    covered_tokens = 0
    oov_tokens = 0
    for word in tqdm(token_vocab):
        count = token_vocab[word]
        try:
            # Only the success of the lookup matters; keeping every vector
            # around just to count them would waste memory.
            embeddings[word]
        except KeyError:
            # Catch only the "missing key" error so that interrupts and
            # genuine bugs are no longer swallowed by a bare except.
            oov[word] = count
            oov_tokens += count
        else:
            covered_types += 1
            covered_tokens += count

    # Guard both ratios so an empty vocabulary reports 0% instead of raising
    # ZeroDivisionError.
    total_tokens = covered_tokens + oov_tokens
    vocab_share = covered_types / len(token_vocab) if token_vocab else 0.0
    text_share = covered_tokens / total_tokens if total_tokens else 0.0

    print('Found embeddings for {:.2%} of vocab'.format(vocab_share))
    print('Found embeddings for  {:.2%} of all text'.format(text_share))
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]

    return sorted_x
    


In [13]:
#fasttext_embeddings['dsfjsdkjfh']

In [14]:
#check_embedding_coverage(vocab_dict, embeddings)


In [15]:
if 'GLOVE' in EMBEDDINGS:
    # Take the mean BEFORE '<pad>' is added, so the all-zero padding row does
    # not take part in the average that represents unknown tokens.
    unk_vector = np.mean(embeddings.vectors, axis=0)
    embeddings.add_vector('<pad>', np.zeros(embeddings.vector_size))
    embeddings.add_vector('<unk>', unk_vector)
# No special tokens are added for the fastText model. The former
# `elif EMBEDDINGS=='FASTTEXT': pass` branch could never match the
# 'FASTTEXT_CC' key and did nothing, so it was removed.



In [16]:

if 'GLOVE' in EMBEDDINGS:
    def _to_vocab_indices(text):
        """Map every regex token of `text` to its embedding index, using the '<unk>' index for unknown tokens."""
        vocab_index = embeddings.key_to_index
        return [
            vocab_index.get(token, vocab_index['<unk>'])
            for token in re.findall(r"\w+|[^\w\s]", text, re.UNICODE)
        ]

    train_df['Tokens'] = train_df['Text'].apply(_to_vocab_indices)
    #test_df['Tokens'] = test_df['Text'].apply(lambda s:[embeddings.key_to_index.get(tok,glove_embeddings.key_to_index['<unk>']) for tok in s.split(" ")])
#elif EMBEDDINGS=='FASTTEXT':
#    train_df['Tokens'] = train_df['Text'].apply(lambda s:[fasttext_embeddings[tok] for tok in s.split(" ")])
#    test_df['Tokens'] = test_df['Text'].apply(lambda s:[fasttext_embeddings[tok] for tok in s.split(" ")])
#X_train = X_train.apply(lambda s:np.mean([glove_embeddings.get(tok,glove_embeddings['<unk>']) for tok in s.split(" ")]))
    #train_df['Tokens']


In [17]:
import numpy as np
def vectorize_post(post, embeddings,method='mean'):
    """
    Build a document vector by pooling the embedding vectors of its tokens.

    Every token is looked up with `embeddings[token]`; nothing is filtered
    here, so unknown tokens must already have been mapped to a valid key by
    the caller.

    :param post: sequence of tokens (or indices) accepted by `embeddings[...]`
    :param embeddings: object supporting `embeddings[token]` -> vector
    :param method: element-wise pooling over the token vectors:
                   'mean', 'max' or 'min'
    :return: pooled vector; for an empty `post`, a zero vector with the
             embedding dimensionality
    :raises ValueError: if `method` is not supported, or if `post` is empty and
                        the embedding dimensionality cannot be determined
    """
    reducers = {'mean': np.mean, 'max': np.max, 'min': np.min}
    if method not in reducers:
        # Previously an unsupported method silently returned None.
        raise ValueError(f"Unsupported pooling method {method!r}; expected one of {sorted(reducers)}")

    vecs = [embeddings[token] for token in post]

    if not vecs:
        # Pooling an empty list gives a scalar NaN for 'mean' and raises an
        # opaque error for 'max'/'min'; return a well-shaped zero vector.
        dim = getattr(embeddings, 'vector_size', None)
        if dim is None and hasattr(embeddings, 'get_dimension'):
            dim = embeddings.get_dimension()
        if dim is None:
            raise ValueError("Cannot vectorize an empty post: embedding dimensionality is unknown")
        return np.zeros(dim, dtype=np.float32)

    return reducers[method](vecs, axis=0)


In [18]:
import re

def _pool_tokens(tokens):
    """Pool the embedding vectors of one tokenized window with the configured METHOD."""
    return vectorize_post(tokens, embeddings, METHOD)


if 'GLOVE' in EMBEDDINGS:
    # GloVe windows were already converted to embedding indices.
    train_df['Vector'] = train_df['Tokens'].apply(_pool_tokens)
elif 'FASTTEXT' in EMBEDDINGS:
    # The fastText branch tokenizes the raw window text on the fly.
    train_df['Vector'] = train_df['Text'].apply(
        lambda text: _pool_tokens(re.findall(r"\w+|[^\w\s]", text, re.UNICODE))
    )
    #test_df['Vector'] = test_df['Text'].apply(lambda post:vectorize_post(post.split(" "),embeddings,METHOD))

#X_train_df = pd.DataFrame(train_df['Vectors'].to_list(), columns = [i for i in range(200)])
#X_train_df
"""
X_train = pd.DataFrame(train_df['Vector'].values.tolist(), index = train_df.index)
y_train = train_df['Label']
X_test = pd.DataFrame(test_df['Vector'].values.tolist(), index = test_df.index)

#X_test = test_df[['User','Window_id','Vector']]
y_test = test_df['Label']
X_train
"""




"\nX_train = pd.DataFrame(train_df['Vector'].values.tolist(), index = train_df.index)\ny_train = train_df['Label']\nX_test = pd.DataFrame(test_df['Vector'].values.tolist(), index = test_df.index)\n\n#X_test = test_df[['User','Window_id','Vector']]\ny_test = test_df['Label']\nX_train\n"

In [19]:
"""
def objective(trial):
    parameters = {
        'embeddings':trial.suggest_categorical('embeddings',list(we.keys())),
        'model':trial.suggest_categorical('model',list(models.keys())),
        
        
    }
    
  
    
    avg_f1 = train_eval(trial,parameters,5,train_df)
    return np.mean(avg_f1)
"""

"\ndef objective(trial):\n    parameters = {\n        'embeddings':trial.suggest_categorical('embeddings',list(we.keys())),\n        'model':trial.suggest_categorical('model',list(models.keys())),\n        \n        \n    }\n    \n  \n    \n    avg_f1 = train_eval(trial,parameters,5,train_df)\n    return np.mean(avg_f1)\n"

In [20]:
"""
from tqdm import tqdm
from sklearn.metrics import f1_score

def train_eval(trial,params, n_folds, df):

    skf = StratifiedKFold(n_splits=n_folds)
    user_label_df =df.drop_duplicates('User')
    users = user_label_df['User'].to_numpy()
    
    labels = user_label_df['Label'].to_numpy()

    #format the vectorizer can accept
    params['stop_words'] = sw_lists[params['stop_words']]
    params['ngram_range'] = (1,params['ngram_range'])
    
    f1_scores = []
    for fold,(train_index, test_index) in enumerate(skf.split(users, labels)):
        transformer = TfidfVectorizer(**params)
        train_users = [users[f] for f in train_index]
        test_users = [users[f] for f in test_index]

        train_folds = df[df['User'].isin(train_users)]
        X_train = transformer.fit_transform(train_folds['Text'])
        test_folds = df[df['User'].isin(test_users)]
        X_test = transformer.transform(test_folds['Text'])
    
        model = MultinomialNB()
        model.fit(X_train, train_folds['Label'])
        f1_scores.append(f1_score(test_folds['Label'],model.predict(X_test)))
    
        trial.report(np.mean(f1_scores), fold)

        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return f1_scores
"""

"\nfrom tqdm import tqdm\nfrom sklearn.metrics import f1_score\n\ndef train_eval(trial,params, n_folds, df):\n\n    skf = StratifiedKFold(n_splits=n_folds)\n    user_label_df =df.drop_duplicates('User')\n    users = user_label_df['User'].to_numpy()\n    \n    labels = user_label_df['Label'].to_numpy()\n\n    #format the vectorizer can accept\n    params['stop_words'] = sw_lists[params['stop_words']]\n    params['ngram_range'] = (1,params['ngram_range'])\n    \n    f1_scores = []\n    for fold,(train_index, test_index) in enumerate(skf.split(users, labels)):\n        transformer = TfidfVectorizer(**params)\n        train_users = [users[f] for f in train_index]\n        test_users = [users[f] for f in test_index]\n\n        train_folds = df[df['User'].isin(train_users)]\n        X_train = transformer.fit_transform(train_folds['Text'])\n        test_folds = df[df['User'].isin(test_users)]\n        X_test = transformer.transform(test_folds['Text'])\n    \n        model = MultinomialNB()\n 

In [21]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
#from sklearn.preprocessing import MinMaxScaler


def custom_cv(model, df, n_folds=5,sent =False ,threshold=None):
    """
    Cross-validate `model` with folds that are split by user.

    Users (not rows) are assigned to stratified folds, using the label of each
    user's first row, so all windows of a user end up on the same side of a
    split. The F1 score of every fold is computed over the window rows of the
    held-out users.

    :param model: estimator exposing `fit(X, y)` and `predict(X)`; it is re-fit
                  on every fold
    :param df: frame with 'User', 'Label' and 'Vector' columns (plus the six
               sentiment columns when `sent` is True)
    :param n_folds: number of stratified folds
    :param sent: when True, append the sentiment columns to the vector features
    :param threshold: currently unused
    :return: list with one F1 score per fold
    """
    sentiment_columns = ['polarity', 'subjectivity', 'negativity', 'positivity', 'neutrality', 'compound']

    per_user = df.drop_duplicates('User')
    users = per_user['User'].to_numpy()
    labels = per_user['Label'].to_numpy()

    def design_matrix(fold_df):
        # One column per embedding dimension, optionally followed by the
        # sentiment columns in a fixed order.
        features = pd.DataFrame(fold_df['Vector'].values.tolist(), index = fold_df.index)
        if sent:
            features = np.c_[(features, *(fold_df[column] for column in sentiment_columns))]
        return features

    splitter = StratifiedKFold(n_splits=n_folds)
    f1_scores = []
    for train_index, test_index in splitter.split(users, labels):
        train_folds = df[df['User'].isin(users[train_index])].copy()
        test_folds = df[df['User'].isin(users[test_index])].copy()

        model.fit(design_matrix(train_folds), train_folds['Label'])
        predictions = model.predict(design_matrix(test_folds))
        f1_scores.append(f1_score(test_folds['Label'], predictions))

    return f1_scores

In [22]:
#X_train = X_train.apply(lambda x: vectorize_post(x, glove_embeddings))
#X_train
#train_df

In [23]:

#X_train[0].shape
#if sentiment:
#    X_train = np.c_[X_train,train_df['polarity'],train_df['subjectivity'],train_df['negativity'],train_df['positivity'],train_df['neutrality'], train_df['compound']] 
#    X_test = np.c_[X_test,test_df['polarity'],test_df['subjectivity'],test_df['negativity'],test_df['positivity'],test_df['neutrality'], test_df['compound']] 



In [24]:
train_df['Vector']

0         [4.671169e-05, -0.034060024, -0.021581221, -0....
1         [0.007912025, -0.043180943, -0.08589363, -0.12...
2         [0.018356469, 0.012983114, -0.0352655, -0.0779...
3         [-0.021287752, 0.0005717801, -0.08342346, -0.2...
4         [-0.01959065, 0.013299365, -0.057640214, -0.09...
                                ...                        
210922    [-0.050597902, 0.023176132, -0.09026915, -0.07...
210923    [-0.024048736, 0.014176603, -0.045604512, -0.1...
210924    [-0.04207734, -0.010719161, -0.09014639, -0.07...
210925    [-0.023640716, 0.0012010762, -0.01621502, -0.1...
210926    [-0.025706569, -0.043355275, -0.020992147, -0....
Name: Vector, Length: 210927, dtype: object

In [25]:
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.naive_bayes import MultinomialNB
# Candidate baseline classifiers, all seeded identically.
models = {
    'sgdLR': SGDClassifier(random_state=seed, loss='log'),
    'sgdlSVM': SGDClassifier(random_state=seed, loss='hinge'),
    'ExtraTrees': ExtraTreesClassifier(random_state=seed, n_jobs=-1),
    'Perceptron': Perceptron(random_state=seed),
}

if BASELINE_COMP:
    best_model_name = ""
    best_model = None
    best_f1 = 0
    report_lines = []
    for model_name, model in models.items():
        res = custom_cv(model, train_df)
        mean_f1 = np.mean(res)
        # Keep the model with the highest mean F1 across folds.
        if mean_f1 > best_f1:
            best_f1, best_model_name, best_model = mean_f1, model_name, model
        line = f"{model_name} f1: {round(mean_f1,3)}"
        report_lines.append(line + "\n")
        print(line)
    report = "".join(report_lines)
    with open(f"{MODEL_PATH}/baseline_report_{EMBEDDINGS}.txt", 'w') as report_file:
        report_file.write(report)


In [26]:
if BASELINE_COMP:
    from textblob import TextBlob
    from nltk.sentiment.vader import SentimentIntensityAnalyzer

    sia = SentimentIntensityAnalyzer()
    # Temporary columns holding the raw analyzer outputs; dropped again below.
    train_df['TB'] = train_df['Text'].apply(lambda text: TextBlob(text).sentiment)
    train_df['VADER'] = train_df['Text'].apply(lambda text: sia.polarity_scores(text))

    # Rescale polarity with (p + 1) / 2, the same transformation applied to
    # 'compound' below. The previous expression `tb[0]+1/2` only added 0.5
    # because division binds tighter than addition.
    train_df['polarity'] = train_df['TB'].apply(lambda tb: (tb[0]+1)/2)
    train_df['subjectivity'] = train_df['TB'].apply(lambda tb: tb[1])
    train_df['negativity'] = train_df['VADER'].apply(lambda v: v['neg'])
    train_df['positivity'] = train_df['VADER'].apply(lambda v: v['pos'])
    train_df['neutrality'] = train_df['VADER'].apply(lambda v: v['neu'])
    train_df['compound'] = train_df['VADER'].apply(lambda v: (v['compound']+1)/2)

    train_df.drop(['VADER','TB'], inplace=True, axis=1)

In [27]:
#best_model_name = "sgdLR"
#best_model = models[best_model_name]

In [28]:
if BASELINE_COMP:
    # Re-run the cross-validation of the best baseline model with the
    # sentiment columns appended, and add the result to the report file.
    res = custom_cv(best_model, train_df, sent=True)
    mean_f1 = round(np.mean(res), 3)
    print(f"{best_model_name} f1: {mean_f1}")
    #ADD CV WHEN CHANGING
    with open(f"{MODEL_PATH}/baseline_report_{EMBEDDINGS}.txt", 'a') as report_file:
        report_file.write(f"\nBest model with sent:\n{best_model_name} f1: {mean_f1}")

# Hyperparameter tuning

#### The best-performing baseline was SGD logistic regression (`sgdLR`) with window size 10 and GLOVE_CC embeddings, without sentiment-analysis (SA) features; it achieved an F1 score of 0.664.

In [29]:
from tqdm import tqdm
import optuna
import joblib

def train_eval_tuning(trial,params, df, sent=False):
    """
    Evaluate one hyperparameter configuration with 5-fold, user-level CV.

    Users are assigned to stratified folds using the label of each user's first
    row. A fresh SGDClassifier is built from `params` for every fold. After
    each fold the running mean F1 is reported to the trial, which may then be
    pruned.

    :param trial: Optuna trial used for reporting and pruning
    :param params: keyword arguments passed to SGDClassifier
    :param df: frame with 'User', 'Label' and 'Vector' columns (plus the six
               sentiment columns when `sent` is True)
    :param sent: when True, append the sentiment columns to the vector features
    :return: list with one F1 score per completed fold
    :raises optuna.exceptions.TrialPruned: when the trial reports it should be pruned
    """
    sentiment_columns = ['polarity', 'subjectivity', 'negativity', 'positivity', 'neutrality', 'compound']

    per_user = df.drop_duplicates('User')
    users = per_user['User'].to_numpy()
    labels = per_user['Label'].to_numpy()

    def design_matrix(fold_df):
        # One column per embedding dimension, optionally followed by the
        # sentiment columns in a fixed order.
        features = pd.DataFrame(fold_df['Vector'].values.tolist(), index = fold_df.index)
        if sent:
            features = np.c_[(features, *(fold_df[column] for column in sentiment_columns))]
        return features

    splitter = StratifiedKFold(n_splits=5)
    f1_scores = []
    for fold, (train_index, test_index) in enumerate(splitter.split(users, labels)):
        train_folds = df[df['User'].isin(users[train_index])]
        test_folds = df[df['User'].isin(users[test_index])]

        model = SGDClassifier(**params)
        model.fit(design_matrix(train_folds), train_folds['Label'])
        predictions = model.predict(design_matrix(test_folds))
        f1_scores.append(f1_score(test_folds['Label'], predictions))

        # Report the running mean so the pruner can stop weak trials early.
        trial.report(np.mean(f1_scores), fold)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return f1_scores


In [30]:
def tuning_objective(trial):
    """
    Optuna objective: sample SGDClassifier settings and score them.

    :param trial: Optuna trial used to sample the hyperparameters
    :return: mean F1 over the folds evaluated by train_eval_tuning
    """
    # 'loss' and 'random_state' each have a single possible value; they are
    # still sampled through the trial so they are recorded in its parameters.
    search_space = {
        'max_iter': trial.suggest_int('max_iter', 1000, 2500, step=500),
        'loss': trial.suggest_categorical('loss', ['log']),
        'penalty': trial.suggest_categorical('penalty', ['l2', 'l1', 'elasticnet']),
        'alpha': trial.suggest_float('alpha', 0.00001, 0.1, log=True),
        'random_state': trial.suggest_int('random_state', seed, seed),
    }

    fold_scores = train_eval_tuning(trial, search_space, train_df, sent=False)
    return np.mean(fold_scores)

In [31]:
study_name = f"t{TASK}_tuning_{ROLLING_WINDOW_SIZE}"
study_file = f"{study_name}.pkl"

study = optuna.create_study(study_name=study_name, direction='maximize')
# Budget: at most 50 trials and a three-hour timeout.
study.optimize(tuning_objective, n_trials=50, timeout=3 * 60 * 60)

# Persist the finished study, then continue with the copy read back from disk.
joblib.dump(study, study_file)
study = joblib.load(study_file)

[32m[I 2022-06-15 18:57:15,757][0m A new study created in memory with name: t2_tuning_10[0m
[32m[I 2022-06-15 18:58:47,004][0m Trial 0 finished with value: 0.6463409252051037 and parameters: {'max_iter': 2000, 'loss': 'log', 'penalty': 'l2', 'alpha': 0.0020111304744450066, 'random_state': 23}. Best is trial 0 with value: 0.6463409252051037.[0m
[32m[I 2022-06-15 19:00:22,923][0m Trial 1 finished with value: 0.48546192282948475 and parameters: {'max_iter': 2500, 'loss': 'log', 'penalty': 'l1', 'alpha': 0.011087888556563064, 'random_state': 23}. Best is trial 0 with value: 0.6463409252051037.[0m
[32m[I 2022-06-15 19:01:47,260][0m Trial 2 finished with value: 0.4488368514406533 and parameters: {'max_iter': 1000, 'loss': 'log', 'penalty': 'l2', 'alpha': 0.053122523966708794, 'random_state': 23}. Best is trial 0 with value: 0.6463409252051037.[0m
[32m[I 2022-06-15 19:03:11,626][0m Trial 3 finished with value: 0.46664696951417384 and parameters: {'max_iter': 2500, 'loss': 'log',

In [33]:
study.best_trial.params, study.best_value

({'max_iter': 1000,
  'loss': 'log',
  'penalty': 'l2',
  'alpha': 4.289226463460141e-05,
  'random_state': 23},
 0.6832959236146376)

In [34]:
# Append the tuned result and its parameters to the baseline report file.
report_path = f"{MODEL_PATH}/baseline_report_{EMBEDDINGS}.txt"
with open(report_path, 'a') as report_file:
    report_file.write(f"\nOptimized model f1: {round(study.best_value,3)}\nparams: {study.best_trial.params}")

In [35]:
# Refit the tuned classifier on every window vector and store it on disk.
sgdLR_params = study.best_trial.params

full_train = pd.DataFrame(train_df['Vector'].values.tolist(), index = train_df.index)
final_model = SGDClassifier(**sgdLR_params)
final_model = final_model.fit(full_train, train_df['Label'])

save_model(final_model, "optimized_sgdLR.pkl")
