In [16]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from sklearn.svm import LinearSVC
import pandas as pd
import numpy as np
from joblib import load, dump
import torch

In [43]:
# Load the running log of evaluated models; log_model() further down appends to this object.
# NOTE(review): torch.load unpickles arbitrary Python objects — only load a log file you trust.
# NOTE(review): recent PyTorch releases reportedly default to weights_only=True, which may refuse
# entries that hold whole model objects — confirm against the installed version.
model_log = torch.load("Objects/Models/model_log.pt")

# NLTK VADER Scores

In [2]:
# Load the precomputed inputs from disk: the VADER sentiment frame, the transformer
# features (presumably embeddings — confirm against the notebook that wrote them) and the labels.
# NOTE(review): joblib.load and torch.load both unpickle their input — only load trusted files.
vader_sentiments = load("Objects/Data/sentiments.joblib")
transformers = load("Objects/Data/transformers.joblib")
y = torch.load("Objects/Data/labels.pt")

In [3]:
# Replace missing VADER scores with the mean of their own column.
#
# The previous form, `vader_sentiments.pos.fillna(..., inplace=True)`, mutates a Series that was
# pulled out of the frame (chained assignment). pandas 2.x emits a FutureWarning for it and, with
# copy-on-write enabled, the frame is left untouched. Filling through the frame itself behaves
# the same on every pandas version and keeps the cell idempotent on re-run.
#
# NOTE(review): the means are taken over all rows, i.e. before the train/test split below.
vader_score_columns = ["pos", "neg", "neu", "compound"]
vader_sentiments = vader_sentiments.fillna(
    {column: vader_sentiments[column].mean() for column in vader_score_columns}
)

In [4]:
# Convert the label tensor to a NumPy array for scikit-learn.
# Guarded so the cell can be re-run: once `y` is already an array, calling `.numpy()` on it
# again would raise AttributeError.
y = y.numpy() if isinstance(y, torch.Tensor) else y

In [5]:
# Baseline: logistic regression on the VADER scores alone.
# An integer test_size is an absolute number of held-out rows, not a fraction.
x_train, x_test, y_train, y_test = train_test_split(
    vader_sentiments, y, test_size=846, random_state=1
)

# `fit` returns the estimator itself, so construction and training chain into one statement
# (this also avoids rebinding IPython's `_` just to silence the repr).
lr = LogisticRegression().fit(x_train, y_train)
y_pred = lr.predict(x_test)
# scikit-learn metrics take (y_true, y_pred); accuracy is symmetric, so the score is unchanged.
accuracy_score(y_test, y_pred)

0.6028368794326241

# Hugging Face Transformers

In [6]:
# Hold out 846 rows of the transformer features for evaluation; the seed fixes the split.
x_train, x_test, y_train, y_test = train_test_split(
    transformers,
    y,
    test_size=846,
    random_state=10,
)

In [7]:
# Logistic regression on the transformer features alone.
# NOTE(review): this fit stops at the solver's iteration limit (see the warning in the cell
# output), so the score comes from a model that has not converged. Raising `max_iter` or scaling
# the features would address that but also changes the reported number; hyperparameters are
# deliberately left untouched here so the recorded output stays valid.
lr = LogisticRegression().fit(x_train, y_train)
y_pred = lr.predict(x_test)
# scikit-learn metrics take (y_true, y_pred); accuracy is symmetric, so the score is unchanged.
accuracy_score(y_test, y_pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.8534278959810875

# Hugging Face Transformers + NLTK VADER Scores

In [32]:
# Wrap the transformer features in a DataFrame so they can be concatenated with the VADER frame.
transformers_df = pd.DataFrame(transformers)

In [33]:
# Join the VADER scores and the transformer features column-wise into one feature frame.
# NOTE(review): concat(axis=1) aligns rows on the index — assumes both inputs share the same
# row index; confirm, otherwise unmatched rows are padded with NaN.
df = pd.concat([vader_sentiments, transformers_df], axis=1)
# The VADER columns have string names, while a frame built from a bare array presumably carries
# integer column labels. Recent scikit-learn releases refuse to fit on frames whose column names
# mix types, so normalise every label to str (values and column order are unchanged).
df.columns = [str(label) for label in df.columns]

In [34]:
# Hold out 846 rows of the combined feature frame, using the same seed as the
# transformer-only split.
x_train, x_test, y_train, y_test = train_test_split(
    df,
    y,
    test_size=846,
    random_state=10,
)

In [41]:
# Logistic regression on the combined VADER + transformer features.
# NOTE(review): this fit also stops at the solver's iteration limit (see the warning in the cell
# output), so the metrics below describe a model that has not converged. Hyperparameters are left
# untouched so the recorded outputs and the saved log entry stay valid.
lg = LogisticRegression().fit(x_train, y_train)
y_pred = lg.predict(x_test)
y_proba = lg.predict_proba(x_test)  # per-class probabilities, consumed by the ROC AUC cell
confusion = confusion_matrix(y_test, y_pred)
# Same (y_true, y_pred) order as confusion_matrix above; accuracy is symmetric, so the value
# is unchanged.
acc = accuracy_score(y_test, y_pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [36]:
# Multiclass ROC AUC from the predicted class probabilities, using the one-vs-rest scheme.
auc = roc_auc_score(y_test, y_proba, multi_class='ovr')

In [37]:
# Display the ROC AUC computed in the previous cell.
auc

0.9404138413989361

In [38]:
# Display the confusion matrix of the combined-features model.
confusion

array([[ 83,  17,   5],
       [ 13, 466,  35],
       [  4,  47, 176]])

# Log Model

In [39]:
def log_model(model, acc, **params):
    """Append one evaluation record to the global ``model_log``.

    The record stores ``model`` under ``"model"`` and ``acc`` under ``"accuracy"``;
    every extra keyword argument is stored under its own name, in the order given.

    Args:
        model: The model object to record.
        acc: Accuracy value to record.
        **params: Additional fields to store in the same record.

    Returns:
        None. ``model_log`` is modified in place via ``append``.
    """
    record = {"model": model, "accuracy": acc}
    record.update(params)
    model_log.append(record)

In [31]:
# Extra fields recorded with the combined-features model; they are passed to log_model()
# as keyword arguments, so each key becomes a field of the stored record.
log_entry = dict(
    features='huggingface transformers + vader scores',
    model_type='logistic_regression',
    confusion_matrix=confusion,
    roc_auc=auc,
    notes='Hugging Face Transformers',
)

In [44]:
# Record the combined-features model, its accuracy and the extra fields in model_log.
# NOTE(review): log_model appends unconditionally, so re-running this cell adds a duplicate entry.
log_model(lg, acc, **log_entry)

In [47]:
# Confusion matrix stored in the most recently appended log entry.
model_log[-1]['confusion_matrix']

array([[ 83,  17,   5],
       [ 13, 466,  35],
       [  4,  47, 176]])

In [48]:
# Confusion matrix stored in the first log entry, shown for comparison with the latest one.
model_log[0]['confusion_matrix']

array([[ 59,  29,  10],
       [ 10, 446,  39],
       [  6,  87, 160]])

In [50]:
# Write the updated log back to the same file it was loaded from at the top of the notebook.
# NOTE(review): log_model() appends unconditionally, so re-running the logging cell before this
# save persists duplicate entries.
torch.save(model_log, 'Objects/Models/model_log.pt')