# <span style="color:red"> Main Modelling - Logistic Regression - Gridsearch </span>

* read in pickle v02
* keep 3 translators
* try different feature sets and save model each time
* compare results at the end
* all available features, i.e. normalised basic counts and POS counts (with some columns dropped), plus unique words, adjectives and adverbs

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import textacy
import re
import pickle
import os

from collections import Counter

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import confusion_matrix, classification_report, average_precision_score, roc_auc_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import label_binarize

import scikitplot as skplt
from matplotlib.colors import ListedColormap
cmap = ListedColormap(sns.color_palette("husl", 3))

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

plt.style.use('ggplot')

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

# extend limit of number of rows and columns to display in cell
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100


Bad key "text.kerning_factor" on line 4 in
/Users/Steven/opt/anaconda3/envs/textacy/lib/python3.8/site-packages/matplotlib/mpl-data/stylelib/_classic_test_patch.mplstyle.
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.1.3/matplotlibrc.template
or from the matplotlib source distribution


### Load the dataframe containing text chunks and related variables

In [2]:
# Location of the pickled dataframe (v02) holding the text chunks and features.
in_full_path = '../../../../Documents/murakami/pkl3/df_all_v02.pkl'
# read back pickle
# NOTE(review): pickle.load can execute arbitrary code stored in the file -
# only load pickles that were produced locally or come from a trusted source.
with open (in_full_path, 'rb') as fp:
    df = pickle.load(fp)

### Check the dataframe is as expected

In [3]:
# Show the first row to confirm the expected columns were loaded.
df.head(1)

Unnamed: 0,book_chunk_no,number,title,text,fstop_indices,split_indices,chunks,translator,book_title,parsed,n_sents,n_words,n_chars,n_syllables,n_unique_words,n_long_words,n_monosyllable_words,n_polysyllable_words,trans_code,chunk_length,n_sents_norm,n_words_norm,n_chars_norm,n_syllables_norm,n_unique_words_norm,n_long_words_norm,n_monosyllable_words_norm,n_polysyllable_words_norm,vader_compound,vader_neg,vader_neu,vader_pos,pron_count,verb_count,det_count,adj_count,num_count,punct_count,noun_count,adp_count,cconj_count,sconj_count,adv_count,aux_count,part_count,propn_count,space_count,intj_count,sym_count,x_count,...,then_adv,more_adv,even_adv,why_adv,maybe_adv,again_adv,now_adv,just_adv,how_adv,where_adv,very_adv,only_adv,there_adv,still_adv,so_adv,too_adv,when_adv,all_adv,here_adv,never_adv,as_adv,new_adj,other_adj,more_adj,small_adj,deep_adj,whole_adj,first_adj,bad_adj,little_adj,next_adj,much_adj,own_adj,hard_adj,last_adj,only_adj,big_adj,right_adj,long_adj,old_adj,strange_adj,same_adj,young_adj,sure_adj,able_adj,real_adj,different_adj,good_adj,few_adj,vlong_words_count
0,0,1,Wednesday Afternoon Picnic,IT WAS A short one-paragraph item in the morn...,"[57, 97, 115, 196, 318, 385, 420, 445, 504, 65...","[967, 1924, 2998, 3982, 4935, 5975, 6995, 7961...",IT WAS A short one-paragraph item in the morni...,Alfred Birnbaum,A Wild Sheep Chase,"(IT, WAS, A, short, one, -, paragraph, item, i...",15,174,742,240,116,33,128,17,0,944,15.889831,184.322034,786.016949,254.237288,122.881356,34.957627,135.59322,18.008475,-0.4798,0.075,0.862,0.064,18.0,20.0,31.0,9.0,2.0,33.0,51.0,19.0,6.0,3.0,5.0,5.0,3.0,2.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [4]:
# (number of rows, number of columns) of the loaded dataframe.
df.shape

(5212, 142)

In [5]:
# List the column names; feature_select() below picks predictors by their
# prefixes/suffixes.
df.columns

Index(['book_chunk_no', 'number', 'title', 'text', 'fstop_indices',
       'split_indices', 'chunks', 'translator', 'book_title', 'parsed',
       ...
       'strange_adj', 'same_adj', 'young_adj', 'sure_adj', 'able_adj',
       'real_adj', 'different_adj', 'good_adj', 'few_adj',
       'vlong_words_count'],
      dtype='object', length=142)

### Check the baseline

In [6]:
# Baseline accuracy = proportion of the most frequent 'trans_code' value,
# i.e. the score obtained by always predicting the majority class.
baseline_acc = df['trans_code'].value_counts(normalize=True).max()
baseline_acc

0.39927091327705294

## Features = Basic Counts

### Choose features to include in modelling

In [None]:
def feature_select(basic_counts=True, vader=False, pos_counts=False,
                   words=False, adv=False, adj=False, available=None):
    '''Create the column list for the feature groups to include in the modelling.

    Each flag switches one group of columns on or off; groups are matched
    purely on the column name:

    * basic_counts - names starting with 'n_' and ending with '_norm'
    * vader        - names starting with 'vader_'
    * pos_counts   - names ending with '_count_norm'
    * words        - names ending with '_w'
    * adj          - names ending with '_adj'
    * adv          - names ending with '_adv'

    available : optional iterable of column names to select from. When left
        as None the columns of the notebook-level dataframe `df` are used,
        which is what every existing call relies on.

    Returns a list of column names. Groups are appended in the order listed
    above, and within a group the order of `available` is kept.
    '''
    if available is None:
        # fall back to the global dataframe, as before
        available = df.columns
    available = list(available)

    # (flag, name test) pairs in the order the groups are appended.
    # Plain boolean `and` is used rather than the bitwise `&` operator.
    groups = [
        (basic_counts, lambda c: c.startswith('n_') and c.endswith('_norm')),
        (vader, lambda c: c.startswith('vader_')),
        (pos_counts, lambda c: c.endswith('_count_norm')),
        (words, lambda c: c.endswith('_w')),
        (adj, lambda c: c.endswith('_adj')),
        (adv, lambda c: c.endswith('_adv')),
    ]

    columns = []
    for wanted, matches in groups:
        if wanted:
            columns += [c for c in available if matches(c)]
    return columns

In [None]:
# Default arguments: only the basic-count group (n_*_norm columns) is selected.
predictor_cols = feature_select()

### Drop some columns which may be linked to page formatting

In [None]:
# Normalised POS-count columns excluded from every feature set because they
# may be linked to page formatting rather than to the text itself.
cols_to_drop = ['num_count_norm', 'punct_count_norm','space_count_norm', 'sym_count_norm', 'x_count_norm']

In [None]:
# Remove the excluded columns, keeping the remaining order. All excluded names
# end in '_count_norm', so this only has an effect when POS counts are selected.
predictor_cols = [x for x in predictor_cols if x not in cols_to_drop]

### Prepare Target, Predictors
* set X, y based on selected columns
* perform train test split
* normalise predictor variables

In [None]:
def modelling_prep(df, predictor_cols, target_col):
    '''Build standardised train/test predictors and targets from df.

    df             : dataframe holding predictors and target
    predictor_cols : list of column names used as predictors
    target_col     : name of the target column

    Returns X_train, X_test, y_train, y_test, idx_train, idx_test where the
    idx_* lists hold the original df index labels of the rows in each split.

    NOTE(review): the scaled X frames are rebuilt from arrays, so they carry a
    fresh 0..n-1 index while y_train / y_test keep the original labels - use
    idx_train / idx_test to map rows back to df.
    '''
    features = df[predictor_cols]
    target = df[target_col]
    row_labels = list(df.index)

    # stratified 80/20 split with a fixed seed; the row labels are split
    # alongside so the shuffled rows can still be traced back
    split = train_test_split(
        features, target, row_labels,
        test_size=0.2, stratify=target, random_state=1)
    X_train, X_test, y_train, y_test, idx_train, idx_test = split

    # the scaler is fitted on the training rows only and then applied to both
    scaler = StandardScaler().fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), columns=features.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), columns=features.columns)

    return X_train, X_test, y_train, y_test, idx_train, idx_test

In [None]:
# Split and standardise the basic-count features; 'trans_code' is the target.
X_train, X_test, y_train, y_test, idx_train, idx_test = modelling_prep(df, predictor_cols, 'trans_code')

###  Gridsearch Logistic Regression 

In [None]:
def lr_mc_gridsearch(X_train, y_train, params=None, cv=5, scoring='accuracy'):
    '''Grid-search a one-vs-rest liblinear logistic regression.

    X_train, y_train : training predictors and target
    params  : optional parameter grid passed to GridSearchCV. Defaults to
              C in [0.01, 0.1, 1, 10, 100] with 'l1' and 'l2' penalties.
              A wider grid can be passed instead, e.g.
              {'C': np.logspace(-4, 4, 10), 'penalty': ['l1', 'l2'],
               'fit_intercept': [True, False]}.
    cv      : number of cross-validation folds (default 5)
    scoring : GridSearchCV scoring metric (default 'accuracy')

    Returns the fitted GridSearchCV object (train scores are recorded).
    '''
    # set model
    model = LogisticRegression(solver='liblinear', multi_class='ovr', max_iter=1000)

    if params is None:
        # default grid used for every model in this notebook
        params = {'C': [0.01, 0.1, 1, 10, 100],
                  'penalty': ['l1', 'l2']}

    # instantiate the grid search
    model_gs = GridSearchCV(estimator=model,
                            param_grid=params,
                            cv=cv,
                            scoring=scoring,
                            return_train_score=True)
    # fit the model
    model_gs.fit(X_train, y_train)
    return model_gs

In [None]:
def gridsearch_score(model, X_train, y_train, X_test, y_test):
    '''Print the grid search results and return them as a dictionary.

    model            : fitted GridSearchCV object
    X_train, y_train : training data, scored with the refitted best estimator
    X_test, y_test   : held-out data used for the test score, per-class
                       ROC-AUC and the confusion matrix

    Returns a dict with the keys 'Best_Parameters', 'Best_CV_Score',
    'Best_Train_Score', 'Best_Test_Score', one 'AUC_Class_<i>' entry per
    class position (AUC_Class_0..2 for the three-class target) and 'conmat'
    (confusion matrix for the labels 0, 1, 2).
    '''
    results_dict = {
        'Best_Parameters': model.best_params_,
        'Best_CV_Score': model.best_score_,
        'Best_Train_Score': model.score(X_train, y_train),
        'Best_Test_Score': model.score(X_test, y_test),
    }

    print('Best Parameters:')
    print(results_dict['Best_Parameters'])
    print('Best estimator mean cross validated training score:')
    print(results_dict['Best_CV_Score'])
    print('Best estimator score on the full training set:')
    print(results_dict['Best_Train_Score'])
    print('Best estimator score on the test set:')
    print(results_dict['Best_Test_Score'])
    print('ROC-AUC score on the test set:')

    # `classes` must be passed by keyword: it is keyword-only in current
    # scikit-learn releases and the keyword form also works on older ones
    y_bin = label_binarize(y_test, classes=model.classes_)
    # score the test set once instead of once per metric call
    probabilities = model.predict_proba(X_test)
    for i, class_ in enumerate(model.classes_):
        # one-vs-rest AUC: column i of the binarised target against the
        # predicted probability of the class in position i
        auc = roc_auc_score(y_bin[:, i], probabilities[:, i])
        print('Class {}:'.format(class_), round(auc, 2))
        results_dict['AUC_Class_{}'.format(i)] = auc

    predictions = model.predict(X_test)
    results_dict['conmat'] = confusion_matrix(
        y_test, predictions, labels=[0, 1, 2])

    return results_dict

In [None]:
# Model 1: grid search on the basic-count features.
lreg_gs_01 = lr_mc_gridsearch(X_train, y_train)

In [None]:
# Print and store the scores of model 1 for the comparison further down.
lreg_gs_01_results = gridsearch_score(lreg_gs_01, X_train, y_train, X_test, y_test)

In [None]:
# Display the stored results dictionary for model 1.
lreg_gs_01_results

#### There is a very slight improvement over the basic logistic regression - optimizing parameters with gridsearch does not improve accuracy significantly

### Save the model

In [None]:
# Persist the fitted grid search for model 1 as a pickle file.
out_path = '../../../../Documents/murakami/pkl_models/'
out_name = 'lreg_gs_01'
out_full_path = ''.join([out_path, out_name, '.pkl'])

# pickle needs the file opened in binary write mode
with open(out_full_path, mode='wb') as fp:
    pickle.dump(lreg_gs_01, fp)

## Features = Basic Counts + POS Counts

In [None]:
# Model 2 features: basic counts plus POS counts, minus the excluded columns.
predictor_cols = [
    col
    for col in feature_select(basic_counts=True, pos_counts=True)
    if col not in cols_to_drop
]

In [None]:
# Re-split and re-scale with the model 2 feature set (same seed, same rows).
X_train, X_test, y_train, y_test, idx_train, idx_test = modelling_prep(df, predictor_cols, 'trans_code')

In [None]:
# Model 2: grid search on basic + POS counts, then print and store the scores.
lreg_gs_02 = lr_mc_gridsearch(X_train, y_train)
# The scoring helper is defined as gridsearch_score; the name used here
# before (lr_gridsearch_score) is not defined anywhere and raised a NameError.
lreg_gs_02_results = gridsearch_score(lreg_gs_02, X_train, y_train, X_test, y_test)

In [None]:
# Persist the fitted grid search for model 2 as a pickle file.
out_path = '../../../../Documents/murakami/pkl_models/'
out_name = 'lreg_gs_02'
out_full_path = ''.join([out_path, out_name, '.pkl'])

# pickle needs the file opened in binary write mode
with open(out_full_path, mode='wb') as fp:
    pickle.dump(lreg_gs_02, fp)

## Features = Basic Counts + POS Counts + words

In [None]:
# Model 3 features: basic counts, POS counts and word columns, minus the
# excluded columns.
predictor_cols = [
    col
    for col in feature_select(basic_counts=True, pos_counts=True, words=True)
    if col not in cols_to_drop
]

In [None]:
# Re-split and re-scale with the model 3 feature set (same seed, same rows).
X_train, X_test, y_train, y_test, idx_train, idx_test = modelling_prep(df, predictor_cols, 'trans_code')

In [None]:
# Model 3: grid search on basic + POS + word features, then print and store
# the scores.
lreg_gs_03 = lr_mc_gridsearch(X_train, y_train)
# The scoring helper is defined as gridsearch_score; the name used here
# before (lr_gridsearch_score) is not defined anywhere and raised a NameError.
lreg_gs_03_results = gridsearch_score(lreg_gs_03, X_train, y_train, X_test, y_test)

In [None]:
# Persist the fitted grid search for model 3 as a pickle file.
out_path = '../../../../Documents/murakami/pkl_models/'
out_name = 'lreg_gs_03'
out_full_path = ''.join([out_path, out_name, '.pkl'])

# pickle needs the file opened in binary write mode
with open(out_full_path, mode='wb') as fp:
    pickle.dump(lreg_gs_03, fp)

## Features = Basic Counts + POS Counts + words + adj

In [None]:
# Model 4 features: basic counts, POS counts, word and adjective columns,
# minus the excluded columns.
predictor_cols = [
    col
    for col in feature_select(basic_counts=True, pos_counts=True, words=True, adj=True)
    if col not in cols_to_drop
]

In [None]:
# Re-split and re-scale with the model 4 feature set (same seed, same rows).
X_train, X_test, y_train, y_test, idx_train, idx_test = modelling_prep(df, predictor_cols, 'trans_code')

In [None]:
# Model 4: grid search on basic + POS + word + adjective features, then print
# and store the scores.
lreg_gs_04 = lr_mc_gridsearch(X_train, y_train)
# The scoring helper is defined as gridsearch_score; the name used here
# before (lr_gridsearch_score) is not defined anywhere and raised a NameError.
lreg_gs_04_results = gridsearch_score(lreg_gs_04, X_train, y_train, X_test, y_test)

In [None]:
# Persist the fitted grid search for model 4 as a pickle file.
out_path = '../../../../Documents/murakami/pkl_models/'
out_name = 'lreg_gs_04'
out_full_path = ''.join([out_path, out_name, '.pkl'])

# pickle needs the file opened in binary write mode
with open(out_full_path, mode='wb') as fp:
    pickle.dump(lreg_gs_04, fp)

## Features = Basic Counts + POS Counts + words + adj + adv

In [None]:
# Model 5 features: basic counts, POS counts, word, adjective and adverb
# columns, minus the excluded columns.
predictor_cols = [
    col
    for col in feature_select(basic_counts=True, pos_counts=True, words=True, adj=True, adv=True)
    if col not in cols_to_drop
]

In [None]:
# Re-split and re-scale with the model 5 feature set (same seed, same rows).
X_train, X_test, y_train, y_test, idx_train, idx_test = modelling_prep(df, predictor_cols, 'trans_code')

In [None]:
# Model 5: grid search on basic + POS + word + adjective + adverb features,
# then print and store the scores.
lreg_gs_05 = lr_mc_gridsearch(X_train, y_train)
# The scoring helper is defined as gridsearch_score; the name used here
# before (lr_gridsearch_score) is not defined anywhere and raised a NameError.
lreg_gs_05_results = gridsearch_score(lreg_gs_05, X_train, y_train, X_test, y_test)

In [None]:
# Persist the fitted grid search for model 5 as a pickle file.
out_path = '../../../../Documents/murakami/pkl_models/'
out_name = 'lreg_gs_05'
out_full_path = ''.join([out_path, out_name, '.pkl'])

# pickle needs the file opened in binary write mode
with open(out_full_path, mode='wb') as fp:
    pickle.dump(lreg_gs_05, fp)

## Features = Basic Counts + POS Counts + words + adj + adv + vader

In [None]:
# Model 6 features: every group (basic counts, POS counts, vader sentiment,
# words, adjectives, adverbs), minus the excluded columns.
predictor_cols = [
    col
    for col in feature_select(basic_counts=True, pos_counts=True, vader=True,
                              words=True, adj=True, adv=True)
    if col not in cols_to_drop
]

In [None]:
# Re-split and re-scale with the model 6 feature set (same seed, same rows).
X_train, X_test, y_train, y_test, idx_train, idx_test = modelling_prep(df, predictor_cols, 'trans_code')

In [None]:
# Model 6: grid search on all feature groups including vader sentiment, then
# print and store the scores.
lreg_gs_06 = lr_mc_gridsearch(X_train, y_train)
# The scoring helper is defined as gridsearch_score; the name used here
# before (lr_gridsearch_score) is not defined anywhere and raised a NameError.
lreg_gs_06_results = gridsearch_score(lreg_gs_06, X_train, y_train, X_test, y_test)

In [None]:
# Persist the fitted grid search for model 6 as a pickle file.
out_path = '../../../../Documents/murakami/pkl_models/'
out_name = 'lreg_gs_06'
out_full_path = ''.join([out_path, out_name, '.pkl'])

# pickle needs the file opened in binary write mode
with open(out_full_path, mode='wb') as fp:
    pickle.dump(lreg_gs_06, fp)

## Confusion Matrices

In [None]:
# Result dictionaries of models 1-6 in model order; the cells below rely on
# the list position (index + 1) being the model number.
lreg_results = [lreg_gs_01_results,
                lreg_gs_02_results,
                lreg_gs_03_results,
                lreg_gs_04_results,
                lreg_gs_05_results,
                lreg_gs_06_results,]

In [None]:
# Print the model number followed by its labelled confusion matrix.
# NOTE(review): the translator names assume trans_code 0/1/2 correspond to
# Birnbaum/Rubin/Gabriel - confirm against the encoding step.
for i, results in enumerate(lreg_results):
    print(i+1)
    conmat_df = pd.DataFrame(
        results['conmat'],
        index=['actual birnbaum', 'actual rubin', 'actual gabriel'],
        columns=['predicted birnbaum', 'predicted rubin', 'predicted gabriel'],
    )
    print(conmat_df)

## Confusion Matrix Comments
* Model 1 - basic counts: does very well for Birnbaum and Rubin but very badly for Gabriel
* Model 2 - basic + POS counts: slight improvement for Birnbaum, a slight drop for Rubin. Gabriel is much improved but still the lowest accuracy
* Model 3 - basic + POS + word counts: Gabriel accuracy improves significantly
* Model 4 - basic + POS + word counts + adj: no significant improvement on the test scores
* Model 5 - basic + POS + word counts + adj + adv: slight improvement across the board
* Model 6 - basic + POS + word counts + adj + adv + vader: no significant improvement on the test scores

## Classification Report
**TODO:** the classification report is still to be set up.

In [None]:
# predictions = lreg_gs_01.predict(X_test)
# print(classification_report(y_test, predictions))

In [None]:
# Pull each score out of the per-model result dictionaries into parallel
# lists (one entry per model, in model order).
model_nums = list(range(1, len(lreg_results) + 1))
cv_scores = [results['Best_CV_Score'] for results in lreg_results]
train_scores = [results['Best_Train_Score'] for results in lreg_results]
test_scores = [results['Best_Test_Score'] for results in lreg_results]
auc_0 = [results['AUC_Class_0'] for results in lreg_results]
auc_1 = [results['AUC_Class_1'] for results in lreg_results]
auc_2 = [results['AUC_Class_2'] for results in lreg_results]

In [None]:
# Compare the six models: accuracy scores (top) and per-class ROC-AUC (bottom).
sns.set_palette(['gray', 'tomato', 'darkred', 'black'])
dict_cv_scores = {'model': model_nums,
                  'cv_scores': cv_scores,
                  'train_scores': train_scores,
                  'test_scores': test_scores,
                  'auc_0': auc_0,
                  'auc_1': auc_1,
                  'auc_2': auc_2,}
df_cv_scores = pd.DataFrame(dict_cv_scores)
# constant column so the baseline is drawn as a flat reference line
df_cv_scores['baseline'] = baseline_acc

fig, ax = plt.subplots(nrows=2, figsize=(12,12))

# sort once and reuse for every line
scores_by_model = df_cv_scores.sort_values(by='model')

for score_col in ['cv_scores', 'train_scores', 'test_scores', 'baseline']:
    scores_by_model.plot(x='model', y=score_col, linestyle='--', marker='o', ax=ax[0])

for auc_col in ['auc_0', 'auc_1', 'auc_2']:
    scores_by_model.plot(x='model', y=auc_col, linestyle='--', marker='o', ax=ax[1])

# Name of the feature group added at each model; the leading '' is the
# original placeholder entry and is skipped below.
labels = ['', 'basic_counts', 'POS_counts', 'sel_words', 'sel_adj', 'sel_adv', 'sentiment']

for axis in ax:
    # Fix the tick positions to the model numbers before labelling them, so
    # each label sits on its own model regardless of where matplotlib's
    # automatic locator would have placed the ticks.
    axis.set_xticks(list(scores_by_model['model']))
    axis.set_xticklabels(labels[1:])
    axis.tick_params(axis='x', labelrotation=45)

plt.savefig('../../../../Documents/murakami/plots/lreg_sel_feature_scores.png')
plt.show()

## Run the best logistic regression

In [None]:
# Best hyperparameters found by the model 6 grid search.
# NOTE(review): the final model below is fitted without the vader columns
# (the model 5 feature set) - confirm these are the parameters intended for it.
lreg_gs_06.best_params_

In [None]:
# Final model: all feature groups except vader sentiment (same flags as
# model 5), minus the excluded columns.
predictor_cols = feature_select(basic_counts=True, pos_counts=True, vader=False, words=True, adj=True, adv=True)
predictor_cols = [x for x in predictor_cols if x not in cols_to_drop]

# The next steps repeat what modelling_prep() does, but inline, so X, y,
# indices and scaler stay available as notebook-level variables.
X = df[predictor_cols]
y = df['trans_code']

indices = list(df.index)

# stratified 80/20 split with the same seed as the grid-search runs
X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
    X, y, indices, test_size=0.2, stratify=y, random_state=1)

# scaler is fitted on the training rows only, then applied to the test rows
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns)

# NOTE(review): C and penalty are hard-coded here - presumably copied from a
# grid-search best_params_ output; confirm they match the chosen model.
lreg_final = LogisticRegression(C=1, penalty='l1', solver='liblinear', multi_class='ovr', max_iter=1000)

lreg_final.fit(X_train, y_train)
predictions = lreg_final.predict(X_test)
# accuracy on the held-out test set (displayed as the cell output)
lreg_final.score(X_test, y_test)

In [None]:
# code could be useful for refactoring some of the code above - extracting from tuples/dicts etc to df
# df_pred = pd.DataFrame([(x.r_ui, x.est) for x in predictions_full],
#                        columns=['Rating', 'Predicted'])

In [None]:
# Persist the fitted final logistic regression as a pickle file.
out_path = '../../../../Documents/murakami/pkl_models/'
out_name = 'lreg_final'
out_full_path = ''.join([out_path, out_name, '.pkl'])

# pickle needs the file opened in binary write mode
with open(out_full_path, mode='wb') as fp:
    pickle.dump(lreg_final, fp)

## Precision-Recall Plot

In [None]:
# Predicted class probabilities for the test set (one column per class, in
# the order of lreg_final.classes_); reused by the plots below.
probabilities = lreg_final.predict_proba(X_test)

In [None]:
def plot_f1_lines(figsize=(8,6),fontsize=16):
    '''Create a figure with lines of constant f1-score, to be used as the
    background of a precision-recall plot.

    figsize  : figure size passed to plt.subplots
    fontsize : font size of the 'f1=...' annotations

    Returns the (fig, ax) pair so further curves can be drawn on the same axes.
    '''
    fig, ax = plt.subplots(figsize=figsize)
    ax.set_ylim([0, 1])

    # x grid shared by every curve
    grid = np.linspace(0.001, 0.999, 100)

    for f1_level in np.linspace(0.2, 0.9, 8):
        # f1 = 2xy / (x + y)  =>  y = 1 / (2/f1 - 1/x)
        curve = 1. / (2. / f1_level - 1. / grid)
        # only the positive branch is drawn
        positive = curve > 0
        ax.plot(grid[positive], curve[positive],
                color='lightblue', ls='--', alpha=0.9)
        # label each curve near its right-hand end
        ax.annotate('f1={0:0.1f}'.format(f1_level),
                    xy=(grid[-10], curve[-2]), fontsize=fontsize)

    return fig, ax

In [None]:
# Draw the precision-recall curves (including the micro-average) on top of
# the constant-f1 lines created by plot_f1_lines().
fig, ax = plot_f1_lines()
skplt.metrics.plot_precision_recall(y_test, probabilities, 
                       plot_micro=True, 
                       title_fontsize=20, text_fontsize=16, cmap=cmap, ax=ax)
# place the legend to the right of the axes
ax.legend(loc=[1.1,0])
plt.show()

In [None]:
# label binarizer - not sure if needed?
y_bin = label_binarize(y_test, lreg_final.classes_)

In [None]:
# Per-class area under the precision-recall curve (one-vs-rest).
# Uses the `probabilities` array computed once above instead of calling
# predict_proba again for every class.
print('Area under precision-recall curve:')
for i, class_ in enumerate(lreg_final.classes_):
    print('Class {}:'.format(class_), round(average_precision_score(y_bin[:,i], probabilities[:,i]),4))

## ROC Plot

In [None]:
# ROC curves per class plus the micro- and macro-averaged curves.
skplt.metrics.plot_roc(y_test, probabilities, plot_micro=True, plot_macro=True, 
                       title_fontsize=20, text_fontsize=16, figsize=(8,8), cmap=cmap)
plt.show()

In [None]:
# Per-class area under the ROC curve (one-vs-rest).
# Uses the `probabilities` array computed once above instead of calling
# predict_proba again for every class.
print('Area under ROC curve (ROC-AUC):')
for i, class_ in enumerate(lreg_final.classes_):
    print('Class {}:'.format(class_), round(roc_auc_score(y_bin[:,i], probabilities[:,i]),2))

In [None]:
# skplt.metrics.roc_curve(y_test==0, probabilities[:,0])