# <span style="color:red"> Main Modelling - Logistic Regression - Drop Features </span>

* As seen in the initial modelling, the features counting the words 'he' and 'had' are potentially linked more to specific novels than to translation style
* These two features are therefore dropped to reduce the risk of modelling the novel rather than the translator

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import textacy
import re
import pickle
import os

from collections import Counter

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import confusion_matrix, classification_report, average_precision_score, roc_auc_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import label_binarize

import scikitplot as skplt
from matplotlib.colors import ListedColormap
cmap = ListedColormap(sns.color_palette("husl", 3))

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# import class to run modelling steps
from translator_modelling import Modelling

# apply the 'ggplot' matplotlib style sheet to the figures in this notebook
plt.style.use('ggplot')

# ask the inline backend for 'retina' (high-resolution) figure output
%config InlineBackend.figure_format = 'retina'
# render matplotlib figures inline, directly below the cell that creates them
%matplotlib inline

# raise the display limits so up to 100 rows and 100 columns are shown in a cell
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 100)


Bad key "text.kerning_factor" on line 4 in
/Users/Steven/opt/anaconda3/envs/textacy/lib/python3.8/site-packages/matplotlib/mpl-data/stylelib/_classic_test_patch.mplstyle.
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.1.3/matplotlibrc.template
or from the matplotlib source distribution


### Load the dataframe containing text chunks and related variables

In [2]:
# location of the pickled dataframe holding the text chunks and their features
in_full_path = '../../../../Documents/murakami/pkl3/df_all_v02.pkl'

# NOTE(review): pickle.load can execute arbitrary code - only load files from a trusted source
with open(in_full_path, mode='rb') as pickle_file:
    df = pickle.load(pickle_file)

### Check the dataframe is as expected

In [3]:
# display only the first row - enough to confirm the columns loaded as expected
df.head(1)

Unnamed: 0,book_chunk_no,number,title,text,fstop_indices,split_indices,chunks,translator,book_title,parsed,n_sents,n_words,n_chars,n_syllables,n_unique_words,n_long_words,n_monosyllable_words,n_polysyllable_words,trans_code,chunk_length,n_sents_norm,n_words_norm,n_chars_norm,n_syllables_norm,n_unique_words_norm,n_long_words_norm,n_monosyllable_words_norm,n_polysyllable_words_norm,vader_compound,vader_neg,vader_neu,vader_pos,pron_count,verb_count,det_count,adj_count,num_count,punct_count,noun_count,adp_count,cconj_count,sconj_count,adv_count,aux_count,part_count,propn_count,space_count,intj_count,sym_count,x_count,...,then_adv,more_adv,even_adv,why_adv,maybe_adv,again_adv,now_adv,just_adv,how_adv,where_adv,very_adv,only_adv,there_adv,still_adv,so_adv,too_adv,when_adv,all_adv,here_adv,never_adv,as_adv,new_adj,other_adj,more_adj,small_adj,deep_adj,whole_adj,first_adj,bad_adj,little_adj,next_adj,much_adj,own_adj,hard_adj,last_adj,only_adj,big_adj,right_adj,long_adj,old_adj,strange_adj,same_adj,young_adj,sure_adj,able_adj,real_adj,different_adj,good_adj,few_adj,vlong_words_count
0,0,1,Wednesday Afternoon Picnic,IT WAS A short one-paragraph item in the morn...,"[57, 97, 115, 196, 318, 385, 420, 445, 504, 65...","[967, 1924, 2998, 3982, 4935, 5975, 6995, 7961...",IT WAS A short one-paragraph item in the morni...,Alfred Birnbaum,A Wild Sheep Chase,"(IT, WAS, A, short, one, -, paragraph, item, i...",15,174,742,240,116,33,128,17,0,944,15.889831,184.322034,786.016949,254.237288,122.881356,34.957627,135.59322,18.008475,-0.4798,0.075,0.862,0.064,18.0,20.0,31.0,9.0,2.0,33.0,51.0,19.0,6.0,3.0,5.0,5.0,3.0,2.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [4]:
# (number of rows, number of columns) of the loaded dataframe
df.shape

(5212, 142)

In [10]:
# list the column names at positions 60-99 to look up individual feature names
# (this range includes the `*_w` word-count columns such as 'he_w' and 'had_w')
df.columns[60:100]

Index(['adv_count_norm', 'aux_count_norm', 'part_count_norm',
       'propn_count_norm', 'space_count_norm', 'intj_count_norm',
       'sym_count_norm', 'x_count_norm', 'the_w', 'to_w', 'and_w', 'of_w',
       'you_w', 'was_w', 'he_w', 'she_w', 'had_w', 'said_w', 'like_w',
       'just_w', 'time_w', 'know_w', 'think_w', 'got_w', 'right_w', 'adj_list',
       'adv_list', 'adj_str', 'adv_str', 'really_adv', 'back_adv',
       'always_adv', 'then_adv', 'more_adv', 'even_adv', 'why_adv',
       'maybe_adv', 'again_adv', 'now_adv', 'just_adv'],
      dtype='object')

### Check the baseline accuracy (proportion of the most frequent translator class)

In [6]:
# share of each translator class in the data; the largest share is the accuracy
# obtained by always predicting the most frequent class
class_shares = df['trans_code'].value_counts(normalize=True)
baseline_acc = class_shares.max()
baseline_acc

0.39927091327705294

## 1. Features = Basic Counts + POS Counts + words + adj + adv
* choose predictor features - dropping the words 'had' and 'he'
* set predictor and target variables X, y
* perform train/test split
* normalise predictor variables
* gridsearch logistic regression
* score the fitted model
* save fitted model

In [11]:
# --- estimator and modelling helper ---
# one-vs-rest logistic regression using the liblinear solver
model = LogisticRegression(solver='liblinear', multi_class='ovr', max_iter=1000)
lr = Modelling(df, model)

# --- predictor columns ---
# start from all of the selected feature groups ...
predictor_cols = lr.feature_select(basic_counts=True, pos_counts=True, words=True, adj=True, adv=True)

# ... then remove individual features:
#   - counts potentially related to page formatting
formatting_cols = ['num_count_norm', 'punct_count_norm', 'space_count_norm', 'sym_count_norm', 'x_count_norm']
#   - counts of the words 'he' and 'had' (potentially tied to the novel rather than the translator)
novel_word_cols = ['he_w', 'had_w']
cols_to_drop = formatting_cols + novel_word_cols
predictor_cols = lr.drop_features(predictor_cols, cols_to_drop)

# --- data preparation ---
# set X and y, perform the train/test split and normalise the predictor variables
X_train, X_test, y_train, y_test, idx_train, idx_test = lr.modelling_prep(predictor_cols, 'trans_code')

# --- grid search ---
# regularisation strengths and penalty types to search over
params = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
}
lr.gridsearch(params, X_train, y_train)

# --- scoring ---
# score the fitted model on the train and test sets and keep the results for the cells below
lreg_gs_01_results = lr.gridsearch_score(X_train, y_train, X_test, y_test)

# --- persist the fitted model ---
out_path = '../../../../Documents/murakami/pkl_models/'
out_name = 'lreg_gs_05_drop_features'
out_full_path = f'{out_path}{out_name}.pkl'
lr.save_model(out_full_path)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   52.8s
[Parallel(n_jobs=2)]: Done  50 out of  50 | elapsed:   54.9s finished


Best Parameters:
{'C': 0.1, 'penalty': 'l1'}
Best estimator mean cross validated training score:
0.6485998716033177
Best estimator score on the full training set:
0.6749820100743583
Best estimator score on the test set:
0.6807286673058485
ROC-AUC score on the test set:
Class 0: 0.85
Class 1: 0.85
Class 2: 0.83


## Confusion Matrix

In [13]:
# label the confusion matrix rows (actual class) and columns (predicted class) by translator
row_labels = ['actual birnbaum', 'actual rubin', 'actual gabriel']
col_labels = ['predicted birnbaum', 'predicted rubin', 'predicted gabriel']
conmat_df = pd.DataFrame(lreg_gs_01_results['conmat'], index=row_labels, columns=col_labels)
print(conmat_df)

                 predicted birnbaum  predicted rubin  predicted gabriel
actual birnbaum                 235               68                 42
actual rubin                     55              315                 46
actual gabriel                   56               66                160


## Classification Report

In [12]:
# predict the test set with the grid-searched model, then report per-class precision/recall/f1
y_test_pred = lr.model_gs.predict(X_test)
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           0       0.68      0.68      0.68       345
           1       0.70      0.76      0.73       416
           2       0.65      0.57      0.60       282

    accuracy                           0.68      1043
   macro avg       0.68      0.67      0.67      1043
weighted avg       0.68      0.68      0.68      1043

