# Modeling 

In [1]:
from sklearn import model_selection, preprocessing, naive_bayes, metrics, svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn import decomposition, ensemble
import pandas as pd

  from numpy.core.umath_tests import inner1d


## Setting dataframes
- df_baseline is just numerical data
- just metadata dataframe (in the works)
- df_text is with vectorized description
    - this was already split into train and test sets to preprocess
    - going to be joined to df after splitting into train/test

In [2]:
# load in dataframe
df = pd.read_csv('csv_files/9_1_all_books_df.csv')

In [3]:
# this just includes pages and ratings along with the target of best seller or not
df_baseline = df[['rating', 'pages', 'best_seller']]

In [4]:
# this data is already split into the training set with a reset index
# so it will be concated with the df after spliting into train and test sets
df_text_x = df_text = pd.read_csv('csv_files/X_train_nlp.csv')


In [5]:
# look at dataframe
print(df_baseline.columns)
print(df_baseline.shape)
df_baseline.head()

Index(['rating', 'pages', 'best_seller'], dtype='object')
(1373, 3)


Unnamed: 0,rating,pages,best_seller
0,3.56,415,0
1,4.07,346,0
2,3.47,355,0
3,3.42,368,0
4,4.35,480,0


In [6]:
# split the dataset into training and validation datasets for just numerical data
X_train_rating, X_test_rating, y_train_rating, y_test_rating = model_selection.train_test_split(df_baseline[['rating', 'pages']], 
                                                                    df_baseline['best_seller'], 
                                                                    test_size = .2,
                                                                   random_state=42)

In [7]:
# split the dataset with nlp and other features
X_train, X_test, y_train, y_test = model_selection.train_test_split(df[['rating', 'pages']], 
                                                                    df['best_seller'], 
                                                                    test_size = .2,
                                                                   random_state=42)

In [8]:
y_train_rating.head()

529     0
243     0
1310    1
664     0
745     0
Name: best_seller, dtype: int64

## Numerical Baseline model
- On numerical data only

In [9]:
base = DummyClassifier(random_state=42)

In [10]:
base.fit(X_train_rating, y_train_rating)

DummyClassifier(constant=None, random_state=42, strategy='stratified')

In [11]:
print('mean training accuracy' ,base.score(X_train_rating, y_train_rating))
print('mean test accuracy', base.score(X_test_rating, y_test_rating))

mean training accuracy 0.663023679417122
mean test accuracy 0.6654545454545454


# Tree based Modeling

## Numerical Random Forest

In [12]:
random_forest = RandomForestClassifier()

In [13]:
random_forest.fit(X_train_rating, y_train_rating)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [14]:
random_forest.score(X_test_rating, y_test_rating)

0.7963636363636364

In [15]:
random_forest.decision_path(X_train_rating)

(<1098x4462 sparse matrix of type '<class 'numpy.int64'>'
 	with 134129 stored elements in Compressed Sparse Row format>,
 array([   0,  455,  898, 1329, 1800, 2237, 2672, 3095, 3512, 3987, 4462]))

In [16]:
random_forest.decision_path(X_test_rating)

(<275x4462 sparse matrix of type '<class 'numpy.int64'>'
 	with 33363 stored elements in Compressed Sparse Row format>,
 array([   0,  455,  898, 1329, 1800, 2237, 2672, 3095, 3512, 3987, 4462]))

In [17]:
random_forest.feature_importances_

array([0.45490538, 0.54509462])

# Logistic Regression