# Modeling 

In [89]:
from sklearn import model_selection, preprocessing, naive_bayes, metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.metrics import confusion_matrix
from sklearn import decomposition, ensemble
import pandas as pd
import numpy as np

## Setting dataframes
- df_baseline is just numerical data
- just metadata dataframe (in the works)
- df_text is with vectorized description
    - this was already split into train and test sets in preprocessing
    - joined to df after splitting into train/test

In [45]:
# load in dataframe
df = pd.read_csv('csv_files/9_1_all_books_df.csv')

In [46]:
# this just includes pages and ratings along with the target of best seller or not
df_baseline = df[['rating', 'pages', 'best_seller']]

In [47]:
# this data is already split into the training set with a reset index
# so it will be concated with the df after spliting into train and test sets
df_text_X = pd.read_csv('csv_files/X_train_nlp.csv')
df_text_y= pd.read_csv('csv_files/y_train_nlp.csv', header=None)
df_text_X_test = pd.read_csv('csv_files/X_test_nlp.csv')
df_text_y_test = pd.read_csv('csv_files/y_test_nlp.csv', header=None)

In [48]:
df_text_X.tail()
# df_text_X.shape

Unnamed: 0,aaron,ab,abandon,abandonment,abby,abc,abduct,abducted,ability,able,...,zelda,zest,zeus,zillion,zimmer,zoe,zone,zoom,zorie,zoroastrian
1093,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1094,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1095,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1096,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1097,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [49]:
df_text_y.tail()
# df_text_y.shape

Unnamed: 0,0
1093,1
1094,1
1095,1
1096,0
1097,1


In [50]:
df_text_X_test.head()
df_text_X_test.shape

(275, 5842)

In [51]:
df_text_y_test.head()
# df_text_y_test.shape
type(df_text_y_test)

# this is a series
type(df_text_y_test[0])

pandas.core.series.Series

In [52]:
df_text_y.columns

Int64Index([0], dtype='int64')

In [54]:
# look at dataframe
print(df_baseline.columns)
print(df_baseline.shape)
df_baseline.head()

Index(['rating', 'pages', 'best_seller'], dtype='object')
(1373, 3)


Unnamed: 0,rating,pages,best_seller
0,3.56,415,0
1,4.07,346,0
2,3.47,355,0
3,3.42,368,0
4,4.35,480,0


In [55]:
# split the dataset into training and validation datasets for just numerical data
X_train_rating, X_test_rating, y_train_rating, y_test_rating = model_selection.train_test_split(df_baseline[['rating', 'pages']], 
                                                                    df_baseline['best_seller'], 
                                                                    test_size = .2,
                                                                   random_state=42)

In [56]:
# this actually doesn't need to happen... the text data is already split...
# split the dataset with nlp and other features
X_train, X_test, y_train, y_test = model_selection.train_test_split(df[['rating', 'pages']], 
                                                                    df['best_seller'], 
                                                                    test_size = .2,
                                                                   random_state=42)

In [57]:
df_text = df_text_X 

In [58]:
df_text['_target'] = df_text_y[0]

In [59]:
df_text['_target'].dtype

dtype('int64')

In [60]:
df_text.head()

Unnamed: 0,aaron,ab,abandon,abandonment,abby,abc,abduct,abducted,ability,able,...,zest,zeus,zillion,zimmer,zoe,zone,zoom,zorie,zoroastrian,_target
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [61]:
# deep vs shallow copies
df_text_X = df_text_X.loc[:, df_text_X.columns != '_target']

In [62]:
y_train_rating.tail()

1095    1
1130    1
1294    1
860     0
1126    1
Name: best_seller, dtype: int64

## Numerical Baseline model
- On numerical data only

In [63]:
base = DummyClassifier(random_state=42)

In [64]:
base.fit(X_train_rating, y_train_rating)

DummyClassifier(constant=None, random_state=42, strategy='stratified')

In [65]:
print('mean training accuracy' ,base.score(X_train_rating, y_train_rating))
print('mean test accuracy', base.score(X_test_rating, y_test_rating))
# mean training accuracy 0.663023679417122
# mean test accuracy 0.6654545454545454

mean training accuracy 0.663023679417122
mean test accuracy 0.6654545454545454


In [112]:
base_2 = DummyClassifier(strategy= 'most_frequent', random_state=42)

In [113]:
base_2.fit(X_train_rating, y_train_rating)

DummyClassifier(constant=None, random_state=42, strategy='most_frequent')

In [117]:
print('mean training accuracy' ,base_2.score(X_train_rating, y_train_rating))
print('mean test accuracy', base_2.score(X_test_rating, y_test_rating))
# mean training accuracy 0.7932604735883424
# mean test accuracy 0.7745454545454545

mean training accuracy 0.7932604735883424
mean test accuracy 0.7745454545454545


# Tree based Modeling

## Numerical Random Forest

In [66]:
random_forest = RandomForestClassifier()

In [67]:
# use X_train_rating, y_train_rating for numerical models
random_forest.fit(X_train_rating, y_train_rating)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [68]:
random_meta_predictions = random_forest.predict(X_test_rating)

In [69]:
confusion_matrix(y_test_rating, random_meta_predictions)

array([[199,  14],
       [ 45,  17]])

In [70]:
random_forest.score(X_test_rating, y_test_rating)

0.7854545454545454

In [71]:
random_forest.decision_path(X_train_rating)

(<1098x4400 sparse matrix of type '<class 'numpy.int64'>'
 	with 133408 stored elements in Compressed Sparse Row format>,
 array([   0,  443,  906, 1353, 1798, 2213, 2642, 3095, 3524, 3959, 4400]))

In [72]:
random_forest.decision_path(X_test_rating)

(<275x4400 sparse matrix of type '<class 'numpy.int64'>'
 	with 33063 stored elements in Compressed Sparse Row format>,
 array([   0,  443,  906, 1353, 1798, 2213, 2642, 3095, 3524, 3959, 4400]))

In [73]:
random_forest.feature_importances_

array([0.51598471, 0.48401529])

## Text Modeling Random Forest
- only using text features

In [74]:
random_forest_text = RandomForestClassifier()

In [102]:
# use df_text_X, df_text_y[0] for text models
random_forest_text.fit(df_text_X, df_text_y[0])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [103]:
# look at what features are important...
random_forest_text.feature_importances_
len(random_forest_text.feature_importances_)

5842

In [104]:
max(random_forest_text.feature_importances_)

0.012752379738969094

In [78]:
# random_forest_text.feature_importances_.where(max(random_forest_text.feature_importances_))
np.where(random_forest_text.feature_importances_ == max(random_forest_text.feature_importances_))
# this is 'prize'... maybe a little to telling

(array([4021]),)

In [79]:
random_forest_text.feature_importances_[4091]

0.01005868477116326

In [80]:
df_text_X.head()

Unnamed: 0,aaron,ab,abandon,abandonment,abby,abc,abduct,abducted,ability,able,...,zelda,zest,zeus,zillion,zimmer,zoe,zone,zoom,zorie,zoroastrian
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [81]:
# name of column
print(df_text_X.columns[4091])
# amount of times word appears
df_text_X.iloc[:,4091].sum()
# this is a word to remove...

prize


23

In [82]:
# look at accuracy for training set
random_forest_text.score(df_text_X, df_text_y[0])

0.9699453551912568

In [83]:
random_text_predictions = random_forest_text.predict(df_text_X_test)

In [84]:
confusion_matrix(df_text_y_test[0], random_text_predictions)

array([[203,  10],
       [ 47,  15]])

In [85]:
# look at accuracy for test set
random_forest_text.score(df_text_X_test, df_text_y_test)

0.7927272727272727

Initially, the text model performs marginally better (> .1) on the test set that the meta data model, as seen in the subset accuracy for both models.

# Logistic Regression

## Numerical Logistic Regression

In [90]:
log  = LogisticRegression()

In [92]:
# use X_train_rating, y_train_rating for numerical models
log.fit(X_train_rating, y_train_rating)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [96]:
log_predictions_training = log.predict(X_train_rating)

In [98]:
confusion_matrix(y_train_rating, log_predictions_training)

array([[866,   5],
       [218,   9]])

In [127]:
log.score(X_train_rating, y_train_rating)

0.7969034608378871

In [95]:
log_predictions = log.predict(X_test_rating)

In [97]:
confusion_matrix(y_test_rating,log_predections)

array([[213,   0],
       [ 58,   4]])

- The initial logistic model predicts the major class 98% of the time. 
- Will look optimizing for specificity by inspecting the true negative instances.




### Cross Validated  Numerical Logistic

## Text Modeling Logistic Regression

In [106]:
log_text = LogisticRegression()

In [107]:
# use df_text_X, df_text_y[0] for text models
log_text.fit(df_text_X, df_text_y[0])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [122]:
log_predictions_text_training = log_text.predict(df_text_X)

In [124]:
confusion_matrix(df_text_y[0], log_predictions_text_training)

array([[871,   0],
       [ 37, 190]])

In [126]:
log_text.score(df_text_X, df_text_y[0])

0.9663023679417122

Text features perform better than numerical features wrt to accuracy for logistic regression baseline models.

### Cross Validated Text Modeling Logistic

# things to do
- get the text test set... check
- find most important features....
- word counts...
- look at correctly predicted minority classes in baseline models


In [121]:
df_text_X.shape

(1098, 5842)

In [119]:
X_train_rating

Unnamed: 0,rating,pages
529,4.33,288
243,4.26,136
1310,4.38,720
664,4.43,672
745,3.60,352
1302,4.12,421
746,3.75,384
873,4.56,248
54,4.40,342
405,3.90,320
