In [88]:
import torch
from torch import nn
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

from datetime import datetime
import pandas as pd
import numpy as np
import statistics
import re

import csv
import time


In [89]:
MAX_LEN = 128
BATCH_SIZE = 32
TEST_BATCH_SIZE = 128
TRAIN_VAL_SIZE = 0.2
VAL_TEST_SIZE = 0.5
EPOCH = 3

In [90]:
def get_metrics(true_labels, predictions):
  f1 = f1_score(true_labels, predictions, average="macro")
  precision = precision_score(true_labels, predictions, average="macro")
  recall = recall_score(true_labels, predictions, average="macro")
  accuracy = accuracy_score(true_labels,predictions)
  return f1, precision, recall, accuracy

# Prepare the data

In [92]:
bbc_df = pd.read_csv("../datasets/bbc/bbc-text.csv")
bbc_df["category"] = pd.Categorical(bbc_df['category']).codes
labels = bbc_df["category"].unique()


In [48]:
def clean_text(text):
    text=re.sub('<br \/>','',text) 
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text) 
    text = re.sub('\[[^]]*\]', '', text)
    return text

In [54]:
movies_df = pd.read_csv("../datasets/movies/sampled.csv")
movies_df["sentiment"] = pd.Categorical(movies_df['sentiment']).codes
movies_df

Unnamed: 0,text,sentiment
0,A female vampire kills young women and paints ...,0
1,Personally I think this show looks pretty chea...,0
2,I grew up watching Inspector Gadget It was and...,0
3,This movie is awful Im SORRY I bought this to ...,0
4,This is a great example of a good dumb movie N...,1
...,...,...
4995,After watching this on the MST3K episode I hav...,0
4996,Upon completing this infernal piece of trash a...,0
4997,Maybe Im biased because the F16 is my favorite...,1
4998,The Best Movie of the 90s The Welsh Trainspott...,0


In [62]:
twitter_df = pd.read_csv("../datasets/twitter_sampled.csv")
labels = twitter_df["label"].unique()
twitter_df["label"] = twitter_df["label"].apply(lambda x: 1 if x==4 else 0)

In [63]:
twitter_df

Unnamed: 0,label,text
0,1,why and she screaming ahaha this song is funny
1,0,the_trini_bajan work as usual
2,0,desi_f pack me in your luggage I wanna go
3,1,elm8 Thanks I enjoy talking to you too
4,1,watchin the season finale of The Office lets h...
...,...,...
4795,0,So sleepy this morning
4796,0,bakespace do you archive your newsletters some...
4797,1,santyadh hope that will soon change though bo...
4798,0,I think I should do my homework


# BERT Embeddings

In [93]:
bert_model = AutoModel.from_pretrained("bert-base-uncased", output_hidden_states = True)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [94]:
bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [95]:
def bert_text_preparation(text):
    marked_text = "[CLS] " + text + " [SEP]"
    tokenized_text = bert_tokenizer.tokenize(marked_text, truncation=True)
    indexed_tokens = bert_tokenizer.convert_tokens_to_ids(tokenized_text)
    segments_ids = [1]*len(indexed_tokens)
    # convert inputs to tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensor = torch.tensor([segments_ids])
    return tokenized_text, tokens_tensor, segments_tensor

In [96]:
bert_model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [97]:
def get_sentence_word_embeddings(tokens_tensor, segments_tensor):
    with torch.no_grad():
      # obtain hidden states
      outputs = bert_model(tokens_tensor, segments_tensor)
      hidden_states = outputs[2]
    # concatenate the tensors for all layers
    # use "stack" to create new dimension in tensor
    token_embeddings = torch.stack(hidden_states, dim=0)
    # remove dimension 1, the "batches"
    token_embeddings = torch.squeeze(token_embeddings, dim=1)
    # swap dimensions 0 and 1 so we can loop over tokens
    token_embeddings = token_embeddings.permute(1,0,2)
    # intialized list to store embeddings
    token_vecs_sum = []
    # "token_embeddings" is a [Y x 12 x 768] tensor
    # where Y is the number of tokens in the sentence
    # loop over tokens in sentence
    for token in token_embeddings:
    # "token" is a [12 x 768] tensor
    # sum the vectors from the last four layers
        sum_vec = torch.sum(token[-4:], dim=0)
        token_vecs_sum.append(sum_vec)
    return token_vecs_sum


In [98]:
text = "This is a test sentence"
tokenized_text, tokens_tensor, segments_tensor = bert_text_preparation(text)
print(tokenized_text)
embeddings  = get_sentence_word_embeddings(tokens_tensor, segments_tensor)
print(embeddings)
print(len(embeddings))

['[CLS]', 'this', 'is', 'a', 'test', 'sentence', '[SEP]']
[tensor([-1.2918e+00, -3.4288e-01, -8.7811e-01, -2.8823e+00, -1.6881e+00,
        -1.9190e+00,  1.4351e+00,  1.0858e+00,  1.2602e+00, -2.0788e+00,
        -1.1483e+00, -8.2739e-01, -1.2627e+00,  6.1478e-01, -7.9215e-01,
        -8.9600e-01, -7.8040e-01,  8.7334e-01,  4.3941e-01, -2.4041e+00,
        -7.1802e-01, -8.9348e-01, -2.0710e+00, -2.0920e+00,  9.2590e-01,
        -1.3476e+00, -9.0515e-01, -2.8579e-01, -1.9834e-02,  1.5233e+00,
         6.3897e-01,  1.8940e+00, -1.7642e+00,  1.0027e+00,  2.3684e+00,
         9.7974e-01, -3.8899e-01, -1.5129e-01, -1.5964e-01,  1.6982e+00,
        -1.7430e+00, -1.5724e-01,  1.0982e+00, -1.0403e+00, -1.3274e+00,
        -8.5357e-01, -1.6697e+01,  3.5604e-01,  3.7055e-04, -3.6944e+00,
        -1.2191e+00,  6.5086e-01, -3.4823e-02,  3.2367e+00,  1.4870e+00,
         4.6398e-01, -1.9974e+00, -6.2199e-01, -6.4532e-01, -4.1324e-01,
        -1.0280e+00,  2.4617e-01, -4.5789e-01,  6.0787e-02,  3.33

# FastText word embeddings

In [99]:
from gensim.models import FastText

In [100]:
def get_tokens(sentences):
    sentence_tokens = []
    for sentence in sentences:
        tokenized_sentence = bert_tokenizer.tokenize(sentence.lower())
        sentence_tokens.append([token.replace("##", "") for token in tokenized_sentence])
    return sentence_tokens



In [101]:
sentences =  get_tokens(bbc_df["text"])

Token indices sequence length is longer than the specified maximum sequence length for this model (846 > 512). Running this sequence through the model will result in indexing errors


In [103]:
model = FastText(sentences, vector_size= 768, window = 5, min_count= 1, workers= 4, epochs = 4, sg=1)

In [104]:
model.save("../model_outputs/fasttext_embeddings_bbc.ft")

In [105]:
fasttext_model = model.wv

# Build the classifier

In [106]:
from sklearn.linear_model import LogisticRegression


In [107]:
class LRClassifier():
    def __init__(self):
        self.model = LogisticRegression(verbose=1, max_iter=2)

    def train(self, train_X, train_Y, test_X, test_Y):
        start_time = datetime.now()
        self.model.fit(train_X,train_Y)
        end_time = datetime.now()
        training_time = (end_time - start_time).total_seconds()
        predicitons = self.model.predict(test_X)
        test_f1, test_precision, test_recall, test_accuracy = get_metrics(predicitons, test_Y)
        print('Testing: Accuracy: {:.3%}, Recall: {:.3%}, Precision: {:.3%}, f1: {:.3%}'.format(test_accuracy,test_recall, test_precision, test_f1))
        print('Training time: {:.2f}s'.format(training_time))
        return test_f1, test_precision, test_recall, test_accuracy 

# Concatenate the embeddings

In [108]:
def get_combined_sentence_embeddings(sentence, word_level_combination, sentence_level_combination):
    tokenized_text, tokens_tensor, segments_tensor = bert_text_preparation(sentence)
    bert_embeddings  = get_sentence_word_embeddings(tokens_tensor, segments_tensor)[1:-1]
    fastext_embeddings = [fasttext_model[word] for word in tokenized_text[1:-1]]
    word_embeddings = []
    for bert_embedding, fastext_embedding in zip(bert_embeddings, fastext_embeddings):
        fastext_embedding = torch.from_numpy(fastext_embedding)
        if word_level_combination == "cat":
            result = torch.cat((bert_embedding, fastext_embedding),dim=0) 
        elif word_level_combination == "sum":
            result = torch.sum(torch.stack((bert_embedding, fastext_embedding),dim=0),dim=0)
        word_embeddings.append(result)

    if sentence_level_combination == "cat":
        sentence_embeddings = torch.cat((word_embeddings[:]) ,dim=0)
    elif sentence_level_combination == "sum":
        sentence_embeddings = torch.sum(torch.stack((word_embeddings[:]),dim=0),dim=0)
    return sentence_embeddings.numpy()

In [109]:
def combine_sentence_embeddings(sentence, sentence_level_combination, inter_sentence_combination):
    tokenized_text, tokens_tensor, segments_tensor = bert_text_preparation(sentence)
    bert_embeddings  = get_sentence_word_embeddings(tokens_tensor, segments_tensor)[1:-1]
    fastext_embeddings = [fasttext_model[word] for word in tokenized_text[1:-1]]
    fastext_embeddings = [torch.from_numpy(x) for x in fastext_embeddings]
    if sentence_level_combination == "cat":
        bert_sentence_embeddings = torch.cat((bert_embeddings[:]) ,dim=0)
        fasttext_sentence_embeddings = torch.cat((fastext_embeddings[:]) ,dim=0)
    elif sentence_level_combination == "sum":
        bert_sentence_embeddings = torch.sum(torch.stack((bert_embeddings[:]),dim=0),dim=0)
        fasttext_sentence_embeddings = torch.sum(torch.stack((fastext_embeddings[:]),dim=0),dim=0)

    if inter_sentence_combination == "cat":
        sentence_embeddings = torch.cat((bert_sentence_embeddings, fasttext_sentence_embeddings) ,dim=0)
    elif inter_sentence_combination == "sum":
        sentence_embeddings = torch.sum(torch.stack((bert_sentence_embeddings, fasttext_sentence_embeddings),dim=0),dim=0)

    return sentence_embeddings.numpy()

In [110]:
text = "this is a test sentence"
sentence_embeddings = combine_sentence_embeddings(text, sentence_level_combination = "sum", inter_sentence_combination= "cat")
print(len(sentence_embeddings))
print(type(sentence_embeddings))
print(sentence_embeddings[0])

1536
<class 'numpy.ndarray'>
-3.939239


In [111]:
text = "this is a test sentence"
sentence_embeddings = get_combined_sentence_embeddings(text, word_level_combination = "sum", sentence_level_combination= "cat")
len(sentence_embeddings)
type(sentence_embeddings)

numpy.ndarray

# Train BBC data

In [None]:
f1s = []
recalls = []
precisions = []
word_level_combination = "cat"
sentence_level_combination = "sum"
for i in range(5):
    train_X, val_X, train_Y, val_Y = train_test_split(bbc_df, bbc_df["category"], test_size = TRAIN_VAL_SIZE)
    val_X, test_X, val_Y, test_Y = train_test_split(val_X, val_Y, test_size = VAL_TEST_SIZE)
    train_embeddings = [get_combined_sentence_embeddings(x, word_level_combination, sentence_level_combination) for x in train_X["text"]]
    test_embeddings = [get_combined_sentence_embeddings(x, word_level_combination, sentence_level_combination) for x in test_X["text"]]
    model = LRClassifier()
    test_f1, test_precision, test_recall, test_accuracy  = model.train(train_embeddings, train_Y, test_embeddings, test_Y)
    f1s.append(test_f1)
    recalls.append(test_recall)
    precisions.append(test_precision)

In [25]:
print("Precision values:", precisions)
print("Precision avg: %0.4f (+/- %0.4f)" % (statistics.mean(precisions), statistics.stdev(precisions) * 2))
print("Recall values:", recalls)
print("Recall avg: %0.4f (+/- %0.4f)" % (statistics.mean(recalls), statistics.stdev(recalls) * 2))
print("F1 values:", f1s)
print("F1 avg: %0.4f (+/- %0.4f)" % (statistics.mean(f1s), statistics.stdev(f1s) * 2))


Precision values: [0.6849393964993584, 0.6875924282756581, 0.617622276413093, 0.6781569069069069, 0.6152517289178456]
Precision avg: 0.6567 (+/- 0.0739)
Recall values: [0.6012301587301587, 0.601656314699793, 0.5572657817564439, 0.5942920251104395, 0.5825491016919588]
Recall avg: 0.5874 (+/- 0.0371)
F1 values: [0.6390857768790023, 0.6337924158558806, 0.5683749534193782, 0.6151859981212778, 0.5759507952340742]
F1 avg: 0.6065 (+/- 0.0653)


In [31]:
f1s = []
recalls = []
precisions = []
word_level_combination = "sum"
sentence_level_combination = "sum"
for i in range(5):
    train_X, val_X, train_Y, val_Y = train_test_split(bbc_df, bbc_df["category"], test_size = TRAIN_VAL_SIZE)
    val_X, test_X, val_Y, test_Y = train_test_split(val_X, val_Y, test_size = VAL_TEST_SIZE)
    train_embeddings = [get_combined_sentence_embeddings(x, word_level_combination, sentence_level_combination) for x in train_X["text"]]
    test_embeddings = [get_combined_sentence_embeddings(x, word_level_combination, sentence_level_combination) for x in test_X["text"]]
    model = LRClassifier()
    test_f1, test_precision, test_recall, test_accuracy  = model.train(train_embeddings, train_Y, test_embeddings, test_Y)
    f1s.append(test_f1)
    recalls.append(test_recall)
    precisions.append(test_precision)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s finished
  _warn_prf(average, modifier, msg_start, len(result))


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =         3845     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  2.86480D+03    |proj g|=  1.35634D+06
Testing: Accuracy: 65.471%, Recall: 54.417%, Precision: 66.421%, f1: 58.360%
Training time: 0.17s

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
 3845      2      8      1     0     0   1.288D+06   2.621D+03
  F =   2621.1324985046263     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT                 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
  _warn_prf(average, modifier, msg_start, len(result))


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =         3845     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  2.86480D+03    |proj g|=  1.37872D+06

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
 3845      2      8      1     0     0   1.466D+06   2.595D+03
  F =   2595.4732407946008     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT                 
Testing: Accuracy: 69.058%, Recall: 56.788%, Precision: 67.088%, f1: 60.641%
Training time: 0.04s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
  _warn_prf(average, modifier, msg_start, len(result))


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =         3845     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  2.86480D+03    |proj g|=  1.41559D+06

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
 3845      2      8      1     0     0   1.424D+06   2.601D+03
  F =   2601.4762697074398     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT                 
Testing: Accuracy: 65.919%, Recall: 55.051%, Precision: 65.927%, f1: 59.179%
Training time: 0.03s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
  _warn_prf(average, modifier, msg_start, len(result))


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =         3845     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  2.86480D+03    |proj g|=  1.30226D+06

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
 3845      2      8      1     0     0   1.340D+06   2.620D+03
  F =   2619.6573304305903     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT                 
Testing: Accuracy: 70.852%, Recall: 59.445%, Precision: 70.707%, f1: 63.587%
Training time: 0.03s
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =         3845    

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
  _warn_prf(average, modifier, msg_start, len(result))


In [32]:
print("Precision values:", precisions)
print("Precision avg: %0.4f (+/- %0.4f)" % (statistics.mean(precisions), statistics.stdev(precisions) * 2))
print("Recall values:", recalls)
print("Recall avg: %0.4f (+/- %0.4f)" % (statistics.mean(recalls), statistics.stdev(recalls) * 2))
print("F1 values:", f1s)
print("F1 avg: %0.4f (+/- %0.4f)" % (statistics.mean(f1s), statistics.stdev(f1s) * 2))


Precision values: [0.6642079423349438, 0.6708779061474826, 0.6592733367789358, 0.7070713505267281, 0.6905040841305923]
Precision avg: 0.6784 (+/- 0.0399)
Recall values: [0.5441729512317748, 0.5678756674294432, 0.5505064440224078, 0.5944536529389264, 0.5801923456952769]
Recall avg: 0.5674 (+/- 0.0415)
F1 values: [0.5836032388663968, 0.6064050444732046, 0.5917886897701867, 0.6358734960697848, 0.6274595263984535]
F1 avg: 0.6090 (+/- 0.0448)


In [112]:
f1s = []
recalls = []
precisions = []

inter_sentence_combination = "sum"
sentence_level_combination = "sum"
for i in range(5):
    train_X, val_X, train_Y, val_Y = train_test_split(bbc_df, bbc_df["category"], test_size = TRAIN_VAL_SIZE)
    val_X, test_X, val_Y, test_Y = train_test_split(val_X, val_Y, test_size = VAL_TEST_SIZE)
    train_embeddings = [combine_sentence_embeddings(x, sentence_level_combination, inter_sentence_combination) for x in train_X["text"]]
    test_embeddings = [combine_sentence_embeddings(x, sentence_level_combination, inter_sentence_combination) for x in test_X["text"]]
    model = LRClassifier()
    test_f1, test_precision, test_recall, test_accuracy  = model.train(train_embeddings, train_Y, test_embeddings, test_Y)
    f1s.append(test_f1)
    recalls.append(test_recall)
    precisions.append(test_precision)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished
  _warn_prf(average, modifier, msg_start, len(result))


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =         3845     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  2.86480D+03    |proj g|=  1.36123D+06

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
 3845      2      8      1     0     0   1.442D+06   2.602D+03
  F =   2602.3579899918996     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT                 
Testing: Accuracy: 72.197%, Recall: 57.666%, Precision: 67.950%, f1: 62.180%
Training time: 0.06s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
  _warn_prf(average, modifier, msg_start, len(result))


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =         3845     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  2.86480D+03    |proj g|=  1.36513D+06
Testing: Accuracy: 75.336%, Recall: 61.707%, Precision: 70.440%, f1: 65.227%
Training time: 0.02s

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
 3845      2      7      1     0     0   1.872D+06   2.628D+03
  F =   2627.7401174062511     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT                 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished
  _warn_prf(average, modifier, msg_start, len(result))


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =         3845     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  2.86480D+03    |proj g|=  1.23741D+06

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
 3845      2      7      1     0     0   2.198D+06   2.606D+03
  F =   2605.9363198215183     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT                 
Testing: Accuracy: 65.919%, Recall: 62.606%, Precision: 59.910%, f1: 56.072%
Training time: 0.07s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
  _warn_prf(average, modifier, msg_start, len(result))


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =         3845     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  2.86480D+03    |proj g|=  1.07204D+06

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
 3845      2      8      1     0     0   1.911D+06   2.519D+03
  F =   2519.1130292136709     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT                 
Testing: Accuracy: 73.543%, Recall: 63.583%, Precision: 65.959%, f1: 63.222%
Training time: 0.04s
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =         3845    

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
  _warn_prf(average, modifier, msg_start, len(result))


In [114]:
print("Precision values:", precisions)
print("Precision avg: %0.4f (+/- %0.4f)" % (statistics.mean(precisions), statistics.stdev(precisions) * 2))
print("Recall values:", recalls)
print("Recall avg: %0.4f (+/- %0.4f)" % (statistics.mean(recalls), statistics.stdev(recalls) * 2))
print("F1 values:", f1s)
print("F1 avg: %0.4f (+/- %0.4f)" % (statistics.mean(f1s), statistics.stdev(f1s) * 2))

Precision values: [0.6795019553715207, 0.7044032813839184, 0.5990950226244344, 0.6595875112089346, 0.6176051820119617]
Precision avg: 0.6520 (+/- 0.0868)
Recall values: [0.576656360588564, 0.6170673421045636, 0.6260606060606061, 0.635827067669173, 0.5498681096758122]
Recall avg: 0.6011 (+/- 0.0728)
F1 values: [0.6218024494727468, 0.6522721794332885, 0.560715569938871, 0.6322174461287087, 0.5688081786795453]
F1 avg: 0.6072 (+/- 0.0807)


# Train Movies data

In [55]:
f1s = []
recalls = []
precisions = []
word_level_combination = "sum"
sentence_level_combination = "sum"
for i in range(5):
    train_X, val_X, train_Y, val_Y = train_test_split(movies_df, movies_df["sentiment"], test_size = TRAIN_VAL_SIZE)
    val_X, test_X, val_Y, test_Y = train_test_split(val_X, val_Y, test_size = VAL_TEST_SIZE)
    train_embeddings = [get_combined_sentence_embeddings(x, word_level_combination, sentence_level_combination) for x in train_X["text"]]
    test_embeddings = [get_combined_sentence_embeddings(x, word_level_combination, sentence_level_combination) for x in test_X["text"]]
    model = LRClassifier()
    test_f1, test_precision, test_recall, test_accuracy  = model.train(train_embeddings, train_Y, test_embeddings, test_Y)
    f1s.append(test_f1)
    recalls.append(test_recall)
    precisions.append(test_precision)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =          769     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  2.77259D+03    |proj g|=  1.04385D+05

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
  769      2      6      1     0     0   2.511D+06   2.432D+03
  F =   2431.7653500072593     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT                 
Testing: Accuracy: 66.200%, Recall: 75.068%, Precision: 64.952%, f1: 61.841%
Training time: 0.20s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =          769     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  2.77259D+03    |proj g|=  9.99521D+04

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
  769      2      7      1     0     0   1.513D+06   2.364D+03
  F =   2363.6313772374606     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT                 
Testing: Accuracy: 65.000%, Recall: 70.026%, Precision: 65.099%, f1: 62.746%
Training time: 0.25s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finish

RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =          769     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  2.77259D+03    |proj g|=  1.01797D+05

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
  769      2      6      1     0     0   2.317D+06   2.542D+03
  F =   2542.0750166572511     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT                 
Testing: Accuracy: 63.000%, Recall: 75.852%, Precision: 59.226%, f1: 53.605%
Training time: 0.02s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =          769     M =           10

At X0         0 variables are exactly at the bounds
Testing: Accuracy: 65.000%, Recall: 74.749%, Precision: 64.489%, f1: 60.723%
Training time: 0.02s

At iterate    0    f=  2.77259D+03    |proj g|=  1.29531D+05

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
  769      2      6      1     0     0   2.057D+06   2.653D+03
  F =   2653.1261635736259     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT                 
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =          769    

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


In [56]:
print("Precision values:", precisions)
print("Precision avg: %0.4f (+/- %0.4f)" % (statistics.mean(precisions), statistics.stdev(precisions) * 2))
print("Recall values:", recalls)
print("Recall avg: %0.4f (+/- %0.4f)" % (statistics.mean(recalls), statistics.stdev(recalls) * 2))
print("F1 values:", f1s)
print("F1 avg: %0.4f (+/- %0.4f)" % (statistics.mean(f1s), statistics.stdev(f1s) * 2))

Precision values: [0.6495192307692308, 0.6509944159106545, 0.5922582520509011, 0.6448850905831893, 0.5019230769230769]
Precision avg: 0.6079 (+/- 0.1282)
Recall values: [0.7506784169375856, 0.700258891435362, 0.7585184849993212, 0.7474917302277263, 0.7404809619238477]
Recall avg: 0.7395 (+/- 0.0457)
F1 values: [0.6184050686645082, 0.6274550710815725, 0.5360501567398119, 0.6072270227808327, 0.32859461112925714]
F1 avg: 0.5435 (+/- 0.2509)


In [57]:
f1s = []
recalls = []
precisions = []
word_level_combination = "cat"
sentence_level_combination = "sum"
for i in range(5):
    train_X, val_X, train_Y, val_Y = train_test_split(movies_df, movies_df["sentiment"], test_size = TRAIN_VAL_SIZE)
    val_X, test_X, val_Y, test_Y = train_test_split(val_X, val_Y, test_size = VAL_TEST_SIZE)
    train_embeddings = [get_combined_sentence_embeddings(x, word_level_combination, sentence_level_combination) for x in train_X["text"]]
    test_embeddings = [get_combined_sentence_embeddings(x, word_level_combination, sentence_level_combination) for x in test_X["text"]]
    model = LRClassifier()
    test_f1, test_precision, test_recall, test_accuracy  = model.train(train_embeddings, train_Y, test_embeddings, test_Y)
    f1s.append(test_f1)
    recalls.append(test_recall)
    precisions.append(test_precision)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =         1537     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  2.77259D+03    |proj g|=  9.79442D+04

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
 1537      2      6      1     0     0   2.124D+06   2.628D+03
  F =   2627.6467544104758     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT                 
Testing: Accuracy: 52.600%, Recall: 75.000%, Precision: 54.943%, f1: 42.330%
Training time: 0.03s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =         1537     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  2.77259D+03    |proj g|=  1.07893D+05
Testing: Accuracy: 75.400%, Recall: 77.236%, Precision: 75.452%, f1: 75.003%
Training time: 0.05s

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
 1537      2      7      1     0     0   2.576D+05   2.202D+03
  F =   2202.2136232423345     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT                 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =         1537     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  2.77259D+03    |proj g|=  1.61941D+05

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
 1537      2      6      1     0     0   1.673D+06   2.693D+03
  F =   2692.8504385550414     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT                 
Testing: Accuracy: 51.600%, Recall: 72.602%, Precision: 56.429%, f1: 44.019%
Training time: 0.03s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =         1537     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  2.77259D+03    |proj g|=  1.14664D+05
Testing: Accuracy: 71.000%, Recall: 72.793%, Precision: 70.642%, f1: 70.178%
Training time: 0.04s

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
 1537      2      7      1     0     0   8.923D+05   2.382D+03
  F =   2381.9890035424933     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT                 
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =         1537    

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


In [58]:
print("Precision values:", precisions)
print("Precision avg: %0.4f (+/- %0.4f)" % (statistics.mean(precisions), statistics.stdev(precisions) * 2))
print("Recall values:", recalls)
print("Recall avg: %0.4f (+/- %0.4f)" % (statistics.mean(recalls), statistics.stdev(recalls) * 2))
print("F1 values:", f1s)
print("F1 avg: %0.4f (+/- %0.4f)" % (statistics.mean(f1s), statistics.stdev(f1s) * 2))

Precision values: [0.5494296577946768, 0.7545160722571561, 0.5642945103376759, 0.7064228995901639, 0.7375238003808061]
Precision avg: 0.6624 (+/- 0.1961)
Recall values: [0.75, 0.7723614820902678, 0.7260195944406471, 0.7279259691361298, 0.752037351443124]
Recall avg: 0.7457 (+/- 0.0384)
F1 values: [0.42329873125720874, 0.7500315000955163, 0.4401880227996151, 0.7017823134288452, 0.734041473288425]
F1 avg: 0.6099 (+/- 0.3273)


In [60]:
f1s = []
recalls = []
precisions = []

inter_sentence_combination = "sum"
sentence_level_combination = "sum"
for i in range(5):
    train_X, val_X, train_Y, val_Y = train_test_split(movies_df, movies_df["sentiment"], test_size = TRAIN_VAL_SIZE)
    val_X, test_X, val_Y, test_Y = train_test_split(val_X, val_Y, test_size = VAL_TEST_SIZE)
    train_embeddings = [combine_sentence_embeddings(x, sentence_level_combination, inter_sentence_combination) for x in train_X["text"]]
    test_embeddings = [combine_sentence_embeddings(x, sentence_level_combination, inter_sentence_combination) for x in test_X["text"]]
    model = LRClassifier()
    test_f1, test_precision, test_recall, test_accuracy  = model.train(train_embeddings, train_Y, test_embeddings, test_Y)
    f1s.append(test_f1)
    recalls.append(test_recall)
    precisions.append(test_precision)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =          769     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  2.77259D+03    |proj g|=  1.00235D+05
Testing: Accuracy: 65.800%, Recall: 75.202%, Precision: 64.385%, f1: 61.026%
Training time: 0.09s

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
  769      2      6      1     0     0   2.288D+06   2.435D+03
  F =   2434.7074694471848     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT                 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =          769     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  2.77259D+03    |proj g|=  9.83710D+04

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
  769      2      6      1     0     0   2.036D+06   2.628D+03
  F =   2628.4925802333491     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT                 
Testing: Accuracy: 58.400%, Recall: 76.294%, Precision: 57.213%, f1: 48.413%
Training time: 0.04s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =          769     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  2.77259D+03    |proj g|=  1.35635D+05

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
  769      2      6      1     0     0   1.786D+06   2.679D+03
  F =   2679.1956670379268     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT                 
Testing: Accuracy: 52.800%, Recall: 76.210%, Precision: 50.833%, f1: 36.031%
Training time: 0.05s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =          769     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  2.77259D+03    |proj g|=  1.11423D+05

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
  769      2      7      1     0     0   2.148D+06   2.478D+03
  F =   2477.9899760344770     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT                 
Testing: Accuracy: 74.200%, Recall: 74.359%, Precision: 74.106%, f1: 74.100%
Training time: 0.06s
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =          769    

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


In [61]:
print("Precision values:", precisions)
print("Precision avg: %0.4f (+/- %0.4f)" % (statistics.mean(precisions), statistics.stdev(precisions) * 2))
print("Recall values:", recalls)
print("Recall avg: %0.4f (+/- %0.4f)" % (statistics.mean(recalls), statistics.stdev(recalls) * 2))
print("F1 values:", f1s)
print("F1 avg: %0.4f (+/- %0.4f)" % (statistics.mean(f1s), statistics.stdev(f1s) * 2))

Precision values: [0.6438464867984418, 0.5721285487822453, 0.5083333333333333, 0.7410564225690276, 0.5619162640901771]
Precision avg: 0.6055 (+/- 0.1797)
Recall values: [0.7520222446916077, 0.7629443698558169, 0.7620967741935484, 0.7435947204968945, 0.6622979190409861]
Recall avg: 0.7366 (+/- 0.0846)
F1 values: [0.6102564102564103, 0.48412698412698413, 0.36030878653829473, 0.7410044209943021, 0.46266308746930324]
F1 avg: 0.5317 (+/- 0.2939)


# Train using Twitter data

In [82]:
f1s = []
recalls = []
precisions = []
word_level_combination = "sum"
sentence_level_combination = "sum"
for i in range(5):
    train_X, val_X, train_Y, val_Y = train_test_split(twitter_df, twitter_df["label"], test_size = TRAIN_VAL_SIZE)
    val_X, test_X, val_Y, test_Y = train_test_split(val_X, val_Y, test_size = VAL_TEST_SIZE)
    train_embeddings = [get_combined_sentence_embeddings(x, word_level_combination, sentence_level_combination) for x in train_X["text"]]
    test_embeddings = [get_combined_sentence_embeddings(x, word_level_combination, sentence_level_combination) for x in test_X["text"]]
    model = LRClassifier()
    test_f1, test_precision, test_recall, test_accuracy  = model.train(train_embeddings, train_Y, test_embeddings, test_Y)
    f1s.append(test_f1)
    recalls.append(test_recall)
    precisions.append(test_precision)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =          769     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  2.66169D+03    |proj g|=  1.00302D+04

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
  769      2      7      1     0     0   5.677D+03   2.321D+03
  F =   2320.6847934734442     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT                 
Testing: Accuracy: 64.375%, Recall: 64.392%, Precision: 64.411%, f1: 64.367%
Training time: 0.04s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =          769     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  2.66169D+03    |proj g|=  9.83035D+03

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
  769      2      6      1     0     0   4.855D+03   2.337D+03
  F =   2336.7772247535427     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT                 
Testing: Accuracy: 68.542%, Recall: 68.565%, Precision: 68.533%, f1: 68.525%
Training time: 0.03s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s finished


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =          769     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  2.66169D+03    |proj g|=  1.60719D+04

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
  769      2      6      1     0     0   1.167D+05   2.526D+03
  F =   2526.0510596905860     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT                 
Testing: Accuracy: 58.958%, Recall: 64.745%, Precision: 59.214%, f1: 54.837%
Training time: 0.17s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =          769     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  2.66169D+03    |proj g|=  1.02043D+04

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
  769      2      6      1     0     0   8.623D+04   2.376D+03
  F =   2375.7083622911077     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT                 
Testing: Accuracy: 67.500%, Recall: 68.765%, Precision: 67.553%, f1: 66.984%
Training time: 0.06s
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =          769    

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


In [83]:
print("Precision values:", precisions)
print("Precision avg: %0.4f (+/- %0.4f)" % (statistics.mean(precisions), statistics.stdev(precisions) * 2))
print("Recall values:", recalls)
print("Recall avg: %0.4f (+/- %0.4f)" % (statistics.mean(recalls), statistics.stdev(recalls) * 2))
print("F1 values:", f1s)
print("F1 avg: %0.4f (+/- %0.4f)" % (statistics.mean(f1s), statistics.stdev(f1s) * 2))

Precision values: [0.6441089031450478, 0.6853330786992831, 0.5921418154038475, 0.6755325613291898, 0.675696338899411]
Precision avg: 0.6546 (+/- 0.0764)
Recall values: [0.6439162441836239, 0.6856521739130435, 0.6474535300491789, 0.6876519608752947, 0.6757940854326396]
Recall avg: 0.6681 (+/- 0.0420)
F1 values: [0.6436742189094035, 0.6852513689915276, 0.5483701492537313, 0.6698412698412698, 0.6749943575409296]
F1 avg: 0.6444 (+/- 0.1117)


In [84]:
f1s = []
recalls = []
precisions = []
word_level_combination = "cat"
sentence_level_combination = "sum"
for i in range(5):
    train_X, val_X, train_Y, val_Y = train_test_split(twitter_df, twitter_df["label"], test_size = TRAIN_VAL_SIZE)
    val_X, test_X, val_Y, test_Y = train_test_split(val_X, val_Y, test_size = VAL_TEST_SIZE)
    train_embeddings = [get_combined_sentence_embeddings(x, word_level_combination, sentence_level_combination) for x in train_X["text"]]
    test_embeddings = [get_combined_sentence_embeddings(x, word_level_combination, sentence_level_combination) for x in test_X["text"]]
    model = LRClassifier()
    test_f1, test_precision, test_recall, test_accuracy  = model.train(train_embeddings, train_Y, test_embeddings, test_Y)
    f1s.append(test_f1)
    recalls.append(test_recall)
    precisions.append(test_precision)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =         1537     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  2.66169D+03    |proj g|=  1.03291D+04

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
 1537      2      7      1     0     0   1.014D+04   2.319D+03
  F =   2319.2513221936870     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT                 
Testing: Accuracy: 69.167%, Recall: 69.178%, Precision: 69.141%, f1: 69.140%
Training time: 0.03s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =         1537     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  2.66169D+03    |proj g|=  1.01292D+04

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
 1537      2      6      1     0     0   4.711D+04   2.325D+03
  F =   2325.4043589732933     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT                 
Testing: Accuracy: 70.208%, Recall: 70.213%, Precision: 70.116%, f1: 70.127%
Training time: 0.09s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =         1537     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  2.66169D+03    |proj g|=  1.04949D+04

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
 1537      2      6      1     0     0   1.121D+05   2.465D+03
  F =   2465.3828619820811     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT                 
Testing: Accuracy: 63.542%, Recall: 68.340%, Precision: 64.954%, f1: 62.244%
Training time: 0.05s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =         1537     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  2.66169D+03    |proj g|=  1.00288D+04

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
 1537      2      6      1     0     0   1.833D+04   2.322D+03
  F =   2322.3345779912074     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT                 
Testing: Accuracy: 61.667%, Recall: 61.681%, Precision: 61.624%, f1: 61.600%
Training time: 0.04s
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =         1537    

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


In [85]:
print("Precision values:", precisions)
print("Precision avg: %0.4f (+/- %0.4f)" % (statistics.mean(precisions), statistics.stdev(precisions) * 2))
print("Recall values:", recalls)
print("Recall avg: %0.4f (+/- %0.4f)" % (statistics.mean(recalls), statistics.stdev(recalls) * 2))
print("F1 values:", f1s)
print("F1 avg: %0.4f (+/- %0.4f)" % (statistics.mean(f1s), statistics.stdev(f1s) * 2))

Precision values: [0.691410116163984, 0.7011607096314574, 0.6495424836601307, 0.6162421211647653, 0.6824944382647387]
Precision avg: 0.6682 (+/- 0.0698)
Recall values: [0.6917830859966249, 0.702126545149801, 0.6833960328317373, 0.6168100364676938, 0.6830065359477124]
Recall avg: 0.6754 (+/- 0.0674)
F1 values: [0.6914041458879949, 0.7012729844413013, 0.6224362748843711, 0.6160000000000001, 0.6811821958663083]
F1 avg: 0.6625 (+/- 0.0803)


In [86]:
f1s = []
recalls = []
precisions = []

inter_sentence_combination = "sum"
sentence_level_combination = "sum"
for i in range(5):
    train_X, val_X, train_Y, val_Y = train_test_split(twitter_df, twitter_df["label"], test_size = TRAIN_VAL_SIZE)
    val_X, test_X, val_Y, test_Y = train_test_split(val_X, val_Y, test_size = VAL_TEST_SIZE)
    train_embeddings = [combine_sentence_embeddings(x, sentence_level_combination, inter_sentence_combination) for x in train_X["text"]]
    test_embeddings = [combine_sentence_embeddings(x, sentence_level_combination, inter_sentence_combination) for x in test_X["text"]]
    model = LRClassifier()
    test_f1, test_precision, test_recall, test_accuracy  = model.train(train_embeddings, train_Y, test_embeddings, test_Y)
    f1s.append(test_f1)
    recalls.append(test_recall)
    precisions.append(test_precision)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =          769     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  2.66169D+03    |proj g|=  1.02370D+04

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
  769      2      7      1     0     0   8.597D+03   2.333D+03
  F =   2332.7650694024878     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT                 
Testing: Accuracy: 72.917%, Recall: 72.974%, Precision: 72.927%, f1: 72.905%
Training time: 0.03s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =          769     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  2.66169D+03    |proj g|=  9.32068D+03

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
  769      2      7      1     0     0   6.475D+03   2.346D+03
  F =   2345.7067247537211     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT                 
Testing: Accuracy: 71.667%, Recall: 71.687%, Precision: 71.687%, f1: 71.667%
Training time: 0.02s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =          769     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  2.66169D+03    |proj g|=  1.02227D+04

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
  769      2      6      1     0     0   9.279D+04   2.344D+03
  F =   2344.3330772948498     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT                 
Testing: Accuracy: 66.250%, Recall: 67.325%, Precision: 66.487%, f1: 65.909%
Training time: 0.02s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =          769     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  2.66169D+03    |proj g|=  9.98811D+03

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
  769      2      6      1     0     0   7.996D+04   2.382D+03
  F =   2382.1923274032656     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT                 
Testing: Accuracy: 65.417%, Recall: 65.963%, Precision: 65.614%, f1: 65.281%
Training time: 0.02s
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =          769    

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


In [87]:
print("Precision values:", precisions)
print("Precision avg: %0.4f (+/- %0.4f)" % (statistics.mean(precisions), statistics.stdev(precisions) * 2))
print("Recall values:", recalls)
print("Recall avg: %0.4f (+/- %0.4f)" % (statistics.mean(recalls), statistics.stdev(recalls) * 2))
print("F1 values:", f1s)
print("F1 avg: %0.4f (+/- %0.4f)" % (statistics.mean(f1s), statistics.stdev(f1s) * 2))

Precision values: [0.7292661330925885, 0.7168657960544595, 0.6648719062092923, 0.6561392537002293, 0.6959572103362045]
Precision avg: 0.6926 (+/- 0.0636)
Recall values: [0.7297447763531029, 0.7168657960544595, 0.673249256264715, 0.6596334185848252, 0.6959163830821585]
Recall avg: 0.6951 (+/- 0.0584)
F1 values: [0.7290490664350846, 0.7166666666666666, 0.6590909090909092, 0.6528104575163398, 0.6958280525703572]
F1 avg: 0.6907 (+/- 0.0679)
