In [1]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression

import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
import pandas as pd
import numpy as np

In [2]:
from joblib import load, dump

In [3]:
import warnings
warnings.filterwarnings('ignore')

# BOW model

In [4]:
model_log = []

## Logistic Regression

In [5]:
df = pd.read_csv("Data/FinancialNewsData.csv", encoding="Windows-1252", names=["label", "headline"])

In [6]:
X = df.headline
y = df.label

In [7]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [8]:
#tunable params
# slover: ‘newton-cg’, ‘sag’, ‘saga’ and ‘lbfgs’
# penalty: 'l1', 'l2', 'elasticnet'
# l1_ratio: 0.5 (only for 'elasticnet')
# C

pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', LogisticRegression(penalty='l2', solver='lbfgs', C=1, max_iter=500))
])

_ = pipeline.fit(x_train, y_train)
y_pred = pipeline.predict(x_test)
acc = accuracy_score(y_pred, y_test)
print(confusion_matrix(y_pred, y_test))
print(acc)

[[ 71  10   6]
 [ 40 516  97]
 [ 11  44 175]]
0.7855670103092783


## Model Log Entry

In [9]:
def log_model(model, acc, **params):
    log_entry = {
        "model": model,
        "accuracy": acc,
    }
    for i in params:
        log_entry[i] = params[i]
    
    model_log.append(log_entry)

In [10]:
model_params = {
    "model_name": "logistic regression", 
    "penalty": 'l2', "C": 1, 
    "solver": 'lbfgs'
}

In [11]:
log_model(pipeline, acc, **model_params)

## Hyperparameter Tuning for Logistic Regression

In [12]:
param_grid = {
    'classifier__penalty': ['l2'],
    'classifier__C': np.logspace(-5,5,20),
    'classifier__solver': ['newton-cg', 'sag','saga','lbfgs']
}

In [13]:
grid_search = GridSearchCV(pipeline, param_grid, cv=2)

In [14]:
classifier = grid_search.fit(X, y)

In [15]:
print(f'Best Score: {classifier.best_score_}')
print(f'Best Parameters: {classifier.best_params_}')

Best Score: 0.6510524143623607
Best Parameters: {'classifier__C': 0.5455594781168515, 'classifier__penalty': 'l2', 'classifier__solver': 'sag'}


In [16]:
logistic_tuned = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', LogisticRegression(penalty='l2', solver='sag', C=0.55, max_iter=500))
])
_ = pipeline.fit(x_train, y_train)
y_pred = pipeline.predict(x_test)
acc = accuracy_score(y_pred, y_test)
print(confusion_matrix(y_pred, y_test))
print(acc)

[[ 71  10   6]
 [ 40 516  97]
 [ 11  44 175]]
0.7855670103092783


In [17]:
model_params = {
    "model_name": "logistic regression", 
    "penalty": 'l2', "C": 0.55, 
    "solver": 'sag'
}

In [18]:
log_model(logistic_tuned, acc, **model_params)

## XGB model

In [19]:
xgb_pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', xgb.XGBClassifier())
])
    

In [20]:
_ = xgb_pipeline.fit(x_train, y_train)
y_pred = xgb_pipeline.predict(x_test)
acc = accuracy_score(y_pred, y_test)
acc



0.7845360824742268

In [21]:
xgb_params = {
    "model_name": "XGBClassifier",
}

In [22]:
log_model(xgb_pipeline, acc, **xgb_params)

 we can tune the XGB, but I don't see it being necessary. It achieves very similar results as Logistic regression. I just wanted to test out a tree based model to see if there were any major improvements

# Save Model Log

In [23]:
pd.DataFrame().from_dict(model_log)

Unnamed: 0,model,accuracy,model_name,penalty,C,solver
0,"(CountVectorizer(), LogisticRegression(C=1, ma...",0.785567,logistic regression,l2,1.0,lbfgs
1,"(CountVectorizer(), LogisticRegression(C=0.55,...",0.785567,logistic regression,l2,0.55,sag
2,"(CountVectorizer(), XGBClassifier(base_score=0...",0.784536,XGBClassifier,,,


In [24]:
dump(model_log, "Objects/Models/model_log.joblib")

['Objects/Models/model_log.joblib']