# <span style="color:red"> Main Modelling - Simple Neural Network - Gridsearch </span>
* read in pickle
* keep 3 translators
* final selected features based on initial analysis with logistic regression
* drop 'A Wild Sheep Chase' due to unique text
* run a grid search over a simple neural network (scikit-learn `MLPClassifier`)

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import textacy
import re
import pickle
import os

from collections import Counter

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import confusion_matrix, classification_report, average_precision_score, roc_auc_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC

from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn import datasets
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error
from sklearn.preprocessing import label_binarize
from sklearn.neural_network import MLPClassifier

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# import class to run modelling steps
from translator_modelling import Modelling

# use the 'ggplot' style sheet for matplotlib figures
plt.style.use('ggplot')

# set the inline backend's figure format to 'retina'
%config InlineBackend.figure_format = 'retina'
# select the inline matplotlib backend for this notebook
%matplotlib inline

# let a displayed dataframe show up to 100 rows and 100 columns in a cell
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 100)

### Load the dataframe containing text chunks and related variables

In [3]:
# location of the pickled dataframe
in_full_path = '../../../../Documents/murakami/pkl3/df_all_v02.pkl'

# NOTE(review): pickle.load can execute arbitrary code, so only load files
# that were produced by a trusted source
with open(in_full_path, mode='rb') as pkl_file:
    df = pickle.load(pkl_file)

### Check the dataframe is as expected

In [4]:
# display the first row only, to check the columns of the loaded dataframe
df.head(1)

Unnamed: 0,book_chunk_no,number,title,text,fstop_indices,split_indices,chunks,translator,book_title,parsed,n_sents,n_words,n_chars,n_syllables,n_unique_words,n_long_words,n_monosyllable_words,n_polysyllable_words,trans_code,chunk_length,n_sents_norm,n_words_norm,n_chars_norm,n_syllables_norm,n_unique_words_norm,n_long_words_norm,n_monosyllable_words_norm,n_polysyllable_words_norm,vader_compound,vader_neg,vader_neu,vader_pos,pron_count,verb_count,det_count,adj_count,num_count,punct_count,noun_count,adp_count,cconj_count,sconj_count,adv_count,aux_count,part_count,propn_count,space_count,intj_count,sym_count,x_count,...,then_adv,more_adv,even_adv,why_adv,maybe_adv,again_adv,now_adv,just_adv,how_adv,where_adv,very_adv,only_adv,there_adv,still_adv,so_adv,too_adv,when_adv,all_adv,here_adv,never_adv,as_adv,new_adj,other_adj,more_adj,small_adj,deep_adj,whole_adj,first_adj,bad_adj,little_adj,next_adj,much_adj,own_adj,hard_adj,last_adj,only_adj,big_adj,right_adj,long_adj,old_adj,strange_adj,same_adj,young_adj,sure_adj,able_adj,real_adj,different_adj,good_adj,few_adj,vlong_words_count
0,0,1,Wednesday Afternoon Picnic,IT WAS A short one-paragraph item in the morn...,"[57, 97, 115, 196, 318, 385, 420, 445, 504, 65...","[967, 1924, 2998, 3982, 4935, 5975, 6995, 7961...",IT WAS A short one-paragraph item in the morni...,Alfred Birnbaum,A Wild Sheep Chase,"(IT, WAS, A, short, one, -, paragraph, item, i...",15,174,742,240,116,33,128,17,0,944,15.889831,184.322034,786.016949,254.237288,122.881356,34.957627,135.59322,18.008475,-0.4798,0.075,0.862,0.064,18.0,20.0,31.0,9.0,2.0,33.0,51.0,19.0,6.0,3.0,5.0,5.0,3.0,2.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [5]:
# (number of rows, number of columns) of the loaded dataframe
df.shape

(5212, 142)

### Check the baseline

In [6]:
# proportion of rows per 'trans_code' value; the largest proportion is the
# accuracy obtained by always predicting the most frequent class
class_shares = df['trans_code'].value_counts(normalize=True)
baseline_acc = class_shares.max()
baseline_acc

0.39927091327705294

In [7]:
# --- settings -----------------------------------------------------------------
# grid of MLPClassifier parameters to search
# NOTE(review): scikit-learn documents that the 'lbfgs' solver does not use
# minibatches, so the three batch_size values repeat the same lbfgs candidate
params = {
    'hidden_layer_sizes': [(8, 8, 8), (8, 8, 8, 8, 8)],
    'alpha': [0.01, 0.1, 1, 10, 100],
    'solver': ['lbfgs', 'sgd', 'adam'],
    'activation': ['logistic', 'tanh', 'relu'],
    'batch_size': [10, 20, 50],
}

# individual features to leave out, e.g. those potentially related to page formatting
cols_to_drop = [
    'num_count_norm',
    'punct_count_norm',
    'space_count_norm',
    'sym_count_norm',
    'x_count_norm',
]

# where the fitted model is written at the end of this cell
out_path = '../../../../Documents/murakami/pkl_models/'
out_name = 'mlp_gs_01'
out_full_path = f'{out_path}{out_name}.pkl'

# --- modelling ----------------------------------------------------------------
# wrap the dataframe and an unfitted classifier in the project's modelling helper
model = MLPClassifier(max_iter=500, random_state=42)
mlp = Modelling(df, model)

# pick the feature groups to use, then remove the excluded individual features
predictor_cols = mlp.feature_select(
    basic_counts=True,
    vader=False,
    pos_counts=True,
    words=True,
    adv=True,
    adj=True,
)
predictor_cols = mlp.drop_features(predictor_cols, cols_to_drop)

# prep data i.e. set X and y, train/test split, normalise predictor variables
(X_train, X_test,
 y_train, y_test,
 idx_train, idx_test) = mlp.modelling_prep(predictor_cols, 'trans_code')

# run the grid search on the training data, then score the fitted model
mlp.gridsearch(params, X_train, y_train)
mlp_gs_01_results = mlp.gridsearch_score(X_train, y_train, X_test, y_test)

# persist the fitted model
mlp.save_model(out_full_path)

Fitting 5 folds for each of 270 candidates, totalling 1350 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:  5.4min
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed: 13.2min
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed: 17.0min
[Parallel(n_jobs=2)]: Done 796 tasks      | elapsed: 44.3min
[Parallel(n_jobs=2)]: Done 1246 tasks      | elapsed: 75.5min
[Parallel(n_jobs=2)]: Done 1350 out of 1350 | elapsed: 77.3min finished


Best Parameters:
{'activation': 'tanh', 'alpha': 100, 'batch_size': 10, 'hidden_layer_sizes': (8, 8, 8), 'solver': 'lbfgs'}
Best estimator mean cross validated training score:
0.6884106160449791
Best estimator score on the full training set:
0.7699688174622211
Best estimator score on the test set:
0.7123681687440077
ROC-AUC score on the test set:
Class 0: 0.86
Class 1: 0.87
Class 2: 0.88


### Confusion Matrix

In [None]:
# `mlp_gs_01` is not assigned anywhere earlier in this notebook, so calling it
# directly fails with NameError on a clean run. Re-load the fitted model that
# the modelling cell saved to `out_full_path`.
# NOTE(review): assumes Modelling.save_model wrote the fitted grid search with
# pickle — TODO confirm against translator_modelling
with open(out_full_path, 'rb') as fp:
    mlp_gs_01 = pickle.load(fp)

# predicted 'trans_code' for each row of the test set
predictions = mlp_gs_01.predict(X_test)

In [None]:
# confusion matrix for the test set with classes ordered 0, 1, 2;
# rows are labelled as actual classes and columns as predicted classes
translators = ['birnbaum', 'rubin', 'gabriel']
conmat = confusion_matrix(y_test, predictions, labels=[0, 1, 2])

confusion = pd.DataFrame(
    conmat,
    index=[f'actual {name}' for name in translators],
    columns=[f'predicted {name}' for name in translators],
)
confusion

* OK results: the test accuracy of about 0.71 is well above the majority-class baseline of about 0.40

### Classification Report

In [None]:
# text report comparing the test-set targets with the predictions
report = classification_report(y_test, predictions)
print(report)

### Save the models

In [None]:
# build the pickle path for a model named 'svm_gs_01'
# NOTE(review): the 'svm' name and the three-level '../../../' prefix differ from
# the MLP modelling cell ('mlp_gs_01', four levels) — looks carried over from an
# SVM notebook; confirm the intended target
out_path = '../../../Documents/murakami/pkl_models/'
out_name = 'svm_gs_01'
out_full_path = out_path + out_name + '.pkl'

In [None]:
# save pickle
with open(out_full_path, 'wb') as fp:
    pickle.dump(svm_gs_01, fp)
    
# read back pickle
with open (out_full_path, 'rb') as fp:
    svm_gs_01_read = pickle.load(fp)

In [None]:
# display the `best_params_` attribute of the re-loaded object
# (presumably a fitted grid search — verify)
svm_gs_01_read.best_params_

In [None]:
# build the pickle path for a model named 'svm_gs_02'
# NOTE(review): the 'svm' name and the three-level '../../../' prefix differ from
# the MLP modelling cell ('mlp_gs_01', four levels) — looks carried over from an
# SVM notebook; confirm the intended target
out_path = '../../../Documents/murakami/pkl_models/'
out_name = 'svm_gs_02'
out_full_path = out_path + out_name + '.pkl'

In [None]:
# save pickle
with open(out_full_path, 'wb') as fp:
    pickle.dump(svm_gs_02, fp)
    
# read back pickle
with open (out_full_path, 'rb') as fp:
    svm_gs_02_read = pickle.load(fp)

In [None]:
# build the pickle path for a model named 'svm_gs_03'
# NOTE(review): the 'svm' name and the three-level '../../../' prefix differ from
# the MLP modelling cell ('mlp_gs_01', four levels) — looks carried over from an
# SVM notebook; confirm the intended target
out_path = '../../../Documents/murakami/pkl_models/'
out_name = 'svm_gs_03'
out_full_path = out_path + out_name + '.pkl'

In [None]:
# save pickle
with open(out_full_path, 'wb') as fp:
    pickle.dump(svm_gs_03, fp)
    
# read back pickle
with open (out_full_path, 'rb') as fp:
    svm_gs_03_read = pickle.load(fp)