# Modeling 

In [43]:
from sklearn import model_selection, preprocessing, naive_bayes, metrics, svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn import decomposition, ensemble
import pandas as pd
import numpy as np

## Setting dataframes
- df_baseline is just numerical data
- just metadata dataframe (in the works)
- df_text is with vectorized description
    - this was already split into train and test sets to preprocess
    - going to be joined to df after splitting into train/test

In [2]:
# load in dataframe
df = pd.read_csv('csv_files/9_1_all_books_df.csv')

In [3]:
# this just includes pages and ratings along with the target of best seller or not
df_baseline = df[['rating', 'pages', 'best_seller']]

In [4]:
# this data is already split into the training set with a reset index
# so it will be concated with the df after spliting into train and test sets
df_text_x = pd.read_csv('csv_files/X_train_nlp.csv')
df_text_y= pd.read_csv('csv_files/y_train_nlp.csv', header=None)


In [5]:
df_text_y.columns

Int64Index([0], dtype='int64')

In [6]:
# df_text_y[0]


In [7]:
# look at dataframe
print(df_baseline.columns)
print(df_baseline.shape)
df_baseline.head()

Index(['rating', 'pages', 'best_seller'], dtype='object')
(1373, 3)


Unnamed: 0,rating,pages,best_seller
0,3.56,415,0
1,4.07,346,0
2,3.47,355,0
3,3.42,368,0
4,4.35,480,0


In [8]:
# split the dataset into training and validation datasets for just numerical data
X_train_rating, X_test_rating, y_train_rating, y_test_rating = model_selection.train_test_split(df_baseline[['rating', 'pages']], 
                                                                    df_baseline['best_seller'], 
                                                                    test_size = .2,
                                                                   random_state=42)

In [9]:
# this actually doesn't need to happen... the text data is already split...
# split the dataset with nlp and other features
X_train, X_test, y_train, y_test = model_selection.train_test_split(df[['rating', 'pages']], 
                                                                    df['best_seller'], 
                                                                    test_size = .2,
                                                                   random_state=42)

In [10]:
df_text = df_text_x 

In [12]:
df_text['_target'] = df_text_y[0]

In [13]:
df_text['_target'].dtype

dtype('int64')

In [14]:
df_text.head()

Unnamed: 0,aaron,ab,abandon,abandonment,abby,abc,abduct,abducted,ability,able,...,zest,zeus,zillion,zimmer,zoe,zone,zoom,zorie,zoroastrian,_target
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
# deep vs shallow copies
df_text_x = df_text_x.loc[:, df_text_x.columns != '_target']

In [15]:
y_train_rating.tail()

1095    1
1130    1
1294    1
860     0
1126    1
Name: best_seller, dtype: int64

## Numerical Baseline model
- On numerical data only

In [16]:
base = DummyClassifier(random_state=42)

In [17]:
base.fit(X_train_rating, y_train_rating)

DummyClassifier(constant=None, random_state=42, strategy='stratified')

In [18]:
print('mean training accuracy' ,base.score(X_train_rating, y_train_rating))
print('mean test accuracy', base.score(X_test_rating, y_test_rating))
# mean training accuracy 0.663023679417122
# mean test accuracy 0.6654545454545454

mean training accuracy 0.663023679417122
mean test accuracy 0.6654545454545454


# Tree based Modeling

## Numerical Random Forest

In [19]:
random_forest = RandomForestClassifier()

In [20]:
random_forest.fit(X_train_rating, y_train_rating)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [21]:
random_forest.score(X_test_rating, y_test_rating)

0.7745454545454545

In [22]:
random_forest.decision_path(X_train_rating)

(<1098x4484 sparse matrix of type '<class 'numpy.int64'>'
 	with 137886 stored elements in Compressed Sparse Row format>,
 array([   0,  461,  902, 1385, 1838, 2303, 2750, 3167, 3624, 4079, 4484]))

In [23]:
random_forest.decision_path(X_test_rating)

(<275x4484 sparse matrix of type '<class 'numpy.int64'>'
 	with 34240 stored elements in Compressed Sparse Row format>,
 array([   0,  461,  902, 1385, 1838, 2303, 2750, 3167, 3624, 4079, 4484]))

In [24]:
random_forest.feature_importances_

array([0.47867717, 0.52132283])

# Logistic Regression

In [25]:
# log  = LogisticRegression()

# Text modeling

In [35]:
random_forest_text = RandomForestClassifier()

In [36]:
random_forest_text.fit(df_text_x, df_text_y[0])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [48]:
# look at what features are important...
random_forest_text.feature_importances_
len(random_forest_text.feature_importances_)

5842

In [38]:
max(random_forest_text.feature_importances_)

0.02222247784053615

In [45]:
# random_forest_text.feature_importances_.where(max(random_forest_text.feature_importances_))
np.where(random_forest_text.feature_importances_ == max(random_forest_text.feature_importances_))

(array([4091]),)

In [46]:
random_forest_text.feature_importances_[4091]

0.02222247784053615

In [53]:
df_text_x

Unnamed: 0,aaron,ab,abandon,abandonment,abby,abc,abduct,abducted,ability,able,...,zelda,zest,zeus,zillion,zimmer,zoe,zone,zoom,zorie,zoroastrian
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [54]:
# df_text_x.iloc[:,4091]
df_text_x.columns[4091]
# 

'prize'

In [39]:
# look at accuracy for training set
random_forest_text.score(df_text_x, df_text_y[0])

0.9644808743169399

# things to do
- get the text test set...
- find most important features....
- word counts...
