In [27]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from sklearn.linear_model import LogisticRegression

import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
import pandas as pd
import numpy as np
import torch

In [28]:
from joblib import load, dump

In [29]:
import warnings
warnings.filterwarnings('ignore')

# BOW model

In [30]:
model_log = torch.load("Objects/Models/model_log.pt")

## Logistic Regression

In [31]:
df = pd.read_csv("Data/FinancialNewsData.csv", encoding="Windows-1252", names=["label", "headline"])

In [32]:
X = df.headline
y = df.label

In [33]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=846, random_state=1)

In [34]:
#tunable params
# slover: ‘newton-cg’, ‘sag’, ‘saga’ and ‘lbfgs’
# penalty: 'l1', 'l2', 'elasticnet'
# l1_ratio: 0.5 (only for 'elasticnet')
# C

pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', LogisticRegression(penalty='l2', solver='lbfgs', C=1, max_iter=500))
])

_ = pipeline.fit(x_train, y_train)
y_pred = pipeline.predict(x_test)
y_proba = pipeline.predict_proba(x_test)
acc = accuracy_score(y_pred, y_test)
confusion = confusion_matrix(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba, multi_class='ovr')
print(acc)

0.7860520094562647


In [35]:
auc

0.8970899128900728

## Model Log Entry

In [37]:
def log_model(model, acc, **params):
    log_entry = {
        "model": model,
        "accuracy": acc,
    }
    for i in params:
        log_entry[i] = params[i]
    
    model_log.insert(0, log_entry)

In [38]:
model_params = {
    "model_name": "logistic regression", 
    "penalty": 'l2', "C": 1, 
    "solver": 'lbfgs',
    'auc': auc,
    "confusion_matrix": confusion
}

In [39]:
log_model(pipeline, acc, **model_params)

In [40]:
model_log

[{'model': Pipeline(steps=[('vectorizer', CountVectorizer()),
                  ('classifier', LogisticRegression(C=1, max_iter=500))]),
  'accuracy': 0.7860520094562647,
  'model_name': 'logistic regression',
  'penalty': 'l2',
  'C': 1,
  'solver': 'lbfgs',
  'auc': 0.8970899128900728,
  'confusion_matrix': array([[ 59,  29,  10],
         [ 10, 446,  39],
         [  6,  87, 160]])},
 {'model': Pipeline(steps=[('vectorizer', CountVectorizer()),
                  ('classifier',
                   LogisticRegression(C=0.55, max_iter=500, solver='sag'))]),
  'accuracy': 0.7855670103092783,
  'model_name': 'logistic regression',
  'penalty': 'l2',
  'C': 0.55,
  'solver': 'sag',
  'confusion_matrix': array([[ 71,  10,   6],
         [ 40, 516,  97],
         [ 11,  44, 175]])},
 {'model': Pipeline(steps=[('vectorizer', CountVectorizer()),
                  ('classifier',
                   XGBClassifier(base_score=0.5, booster='gbtree',
                                 colsample_bylevel

## Hyperparameter Tuning for Logistic Regression

In [13]:
param_grid = {
    'classifier__penalty': ['l2'],
    'classifier__C': np.logspace(-5,5,20),
    'classifier__solver': ['newton-cg', 'sag','saga','lbfgs']
}

In [14]:
grid_search = GridSearchCV(pipeline, param_grid, cv=2)

In [15]:
classifier = grid_search.fit(X, y)

In [16]:
print(f'Best Score: {classifier.best_score_}')
print(f'Best Parameters: {classifier.best_params_}')

Best Score: 0.6510524143623607
Best Parameters: {'classifier__C': 0.5455594781168515, 'classifier__penalty': 'l2', 'classifier__solver': 'sag'}


In [17]:
logistic_tuned = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', LogisticRegression(penalty='l2', solver='sag', C=0.55, max_iter=500))
])
_ = pipeline.fit(x_train, y_train)
y_pred = pipeline.predict(x_test)
acc2 = accuracy_score(y_pred, y_test)
confusion2 = confusion_matrix(y_pred, y_test)
print(acc)

0.7855670103092783


In [18]:
model_params = {
    "model_name": "logistic regression", 
    "penalty": 'l2', "C": 0.55, 
    "solver": 'sag',
    "confusion_matrix": confusion2
}

In [19]:
log_model(logistic_tuned, acc, **model_params)

## XGB model

In [20]:
xgb_pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', xgb.XGBClassifier())
])
    

In [21]:
_ = xgb_pipeline.fit(x_train, y_train)
y_pred = xgb_pipeline.predict(x_test)
acc = accuracy_score(y_pred, y_test)
confusion3 = confusion_matrix(y_pred, y_test)
acc



0.7845360824742268

In [22]:
xgb_params = {
    "model_name": "XGBClassifier",
    "confusion_matrix": confusion3
}

In [23]:
log_model(xgb_pipeline, acc, **xgb_params)

 we can tune the XGB, but I don't see it being necessary. It achieves very similar results as Logistic regression. I just wanted to test out a tree based model to see if there were any major improvements

# Save Model Log

In [41]:
pd.DataFrame().from_dict(model_log)

Unnamed: 0,model,accuracy,model_name,penalty,C,solver,auc,confusion_matrix,layers,hidden_sizes,activation,dropout,optimizer,learning_rate,epochs,notes,features,model_type,num_layers,roc_auc
0,"(CountVectorizer(), LogisticRegression(C=1, ma...",0.786052,logistic regression,l2,1.0,lbfgs,0.89709,"[[59, 29, 10], [10, 446, 39], [6, 87, 160]]",,,,,,,,,,,,
1,"(CountVectorizer(), LogisticRegression(C=0.55,...",0.785567,logistic regression,l2,0.55,sag,,"[[71, 10, 6], [40, 516, 97], [11, 44, 175]]",,,,,,,,,,,,
2,"(CountVectorizer(), XGBClassifier(base_score=0...",0.784536,XGBClassifier,,,,,"[[70, 9, 5], [41, 525, 107], [11, 36, 166]]",,,,,,,,,,,,
3,"{'conv1.weight': [[tensor([[-1.3640e-01, 5.65...",0.669031,CNN,,,,,,"[Conv, MaxPool, Linear]","[None, None, None]","[None, None, None]","[None, None, None]",SGD,0.001,50.0,Baseline CNN Overfits and doesn't backpropagat...,,,,
4,"{'conv1.weight': [[tensor([[ 2.8236e-01, -3.37...",0.683215,CNN,,,,,,"[Conv, MaxPool, Linear]","[None, None, None]","[None, None, None]","[None, None, None]",SGD,0.003,50.0,Increased learning rate due to slow training,,,,
5,"{'conv1.weight': [[tensor([[-0.3670, -0.0061, ...",0.658392,CNN,,,,,,"[Conv, MaxPool, Linear]","[None, None, None]","[None, None, None]","[None, None, None]",SGD,0.05,50.0,Increased learning rate again after small impr...,,,,
6,"{'conv1.weight': [[tensor([[ 1.4144e+00, 4.39...",0.666667,CNN,,,,,,"[Conv, MaxPool, Linear]","[None, None, None]","[None, None, None]","[None, None, None]",Adam,0.05,50.0,Changed optimizer to Adam,,,,
7,"{'conv1.weight': [[tensor([[ 0.3316, 0.0229, ...",0.669031,CNN,,,,,,"[Conv, MaxPool, Linear, Linear]","[None, None, 64, 3]","[None, None, None, None]","[None, None, None, None]",SGD,0.05,50.0,Added Linear layer,,,,
8,"{'conv1.weight': [[tensor([[ 3.4375e-01, 1.50...",0.670213,CNN,,,,,,"[Conv, MaxPool, Linear, Linear]","[None, None, 64, 3]","[None, None, None, None]","[None, None, None, None]",SGD,0.05,50.0,Added Linear layer,,,,
9,"{'conv1.weight': [[tensor([[ 0.3271, 0.0599, ...",0.667849,CNN,,,,,"[[11, 1, 11], [40, 434, 113], [67, 49, 120]]","[Conv, MaxPool, Linear, Linear, Linear]","[None, None, 64, 64, 3]","[None, None, None, None, None]","[None, None, 0.2, None, None]",SGD,0.05,50.0,Added Dropout layer after the first linear layer,,,,


In [42]:
torch.save(model_log, "Objects/Models/model_log.pt")