# Modeling 

In [None]:
from sklearn import model_selection, preprocessing, naive_bayes, metrics
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.metrics import confusion_matrix
from sklearn import decomposition, ensemble
import pandas as pd
import numpy as np

## Setting dataframes
- df_baseline is just numerical data
- just metadata dataframe (in the works)
- df_text is with vectorized description
    - this was already split into train and test sets in preprocessing
    - joined to df after splitting into train/test

In [None]:
# load in dataframe
df = pd.read_csv('csv_files/9_1_all_books_df.csv')

In [None]:
# this just includes pages and ratings along with the target of best seller or not
df_baseline = df[['rating', 'pages', 'best_seller']]

In [None]:
# this data is already split into the training set with a reset index
# so it will be concated with the df after spliting into train and test sets
df_text_X = pd.read_csv('csv_files/X_train_nlp.csv')
df_text_y= pd.read_csv('csv_files/y_train_nlp.csv', header=None)
df_text_X_test = pd.read_csv('csv_files/X_test_nlp.csv')
df_text_y_test = pd.read_csv('csv_files/y_test_nlp.csv', header=None)

In [None]:
# Preview the last rows of the vectorized training features.
# (Swap in the commented-out line to see the shape instead.)
df_text_X.tail()
# df_text_X.shape

In [None]:
# Preview the last rows of the training labels.
df_text_y.tail()
# df_text_y.shape

In [None]:
# NOTE: with the default notebook settings only the final expression of a cell
# is echoed, so the head() preview is discarded and only the shape is shown.
df_text_X_test.head()
df_text_X_test.shape

In [None]:
# NOTE: same as above -- of the three expressions in this cell only the last
# one (the type of column 0) is echoed by default.
df_text_y_test.head()
# df_text_y_test.shape
type(df_text_y_test)

# the frame was read with header=None, so column 0 holds the labels;
# selecting it yields a Series
type(df_text_y_test[0])

In [None]:
# Column labels of the training-label frame.
df_text_y.columns

In [None]:
# Sanity-check the numerical baseline frame: column names, shape, first rows.
print(df_baseline.columns)
print(df_baseline.shape)
df_baseline.head()

In [None]:
# split the dataset into training and validation datasets for just numerical data
X_train_rating, X_test_rating, y_train_rating, y_test_rating = model_selection.train_test_split(df_baseline[['rating', 'pages']], 
                                                                    df_baseline['best_seller'], 
                                                                    test_size = .2,
                                                                   random_state=42)

In [None]:
# this actually doesn't need to happen... the text data is already split...
# split the dataset with nlp and other features
X_train, X_test, y_train, y_test = model_selection.train_test_split(df[['rating', 'pages']], 
                                                                    df['best_seller'], 
                                                                    test_size = .2,
                                                                   random_state=42)

In [None]:
df_text = df_text_X 

In [None]:
df_text['_target'] = df_text_y[0]

In [None]:
df_text['_target'].dtype

In [None]:
df_text.head()

In [None]:
# deep vs shallow copies
df_text_X = df_text_X.loc[:, df_text_X.columns != '_target']

In [None]:
y_train_rating.tail()

## Numerical Baseline model
- On numerical data only

In [None]:
base = DummyClassifier(random_state=42)

In [None]:
base.fit(X_train_rating, y_train_rating)

In [None]:
print('mean training accuracy' ,base.score(X_train_rating, y_train_rating))
print('mean test accuracy', base.score(X_test_rating, y_test_rating))
# mean training accuracy 0.663023679417122
# mean test accuracy 0.6654545454545454

In [None]:
base_2 = DummyClassifier(strategy= 'most_frequent', random_state=42)

In [None]:
base_2.fit(X_train_rating, y_train_rating)

In [None]:
print('mean training accuracy' ,base_2.score(X_train_rating, y_train_rating))
print('mean test accuracy', base_2.score(X_test_rating, y_test_rating))
# mean training accuracy 0.7932604735883424
# mean test accuracy 0.7745454545454545

# Tree based Modeling

## Numerical Random Forest

In [None]:
random_forest = RandomForestClassifier()

In [None]:
# use X_train_rating, y_train_rating for numerical models
random_forest.fit(X_train_rating, y_train_rating)

In [None]:
random_meta_predictions = random_forest.predict(X_test_rating)

In [None]:
confusion_matrix(y_test_rating, random_meta_predictions)

In [None]:
random_forest.score(X_test_rating, y_test_rating)

In [None]:
random_forest.decision_path(X_train_rating)

In [None]:
random_forest.decision_path(X_test_rating)

In [None]:
random_forest.feature_importances_

## Text Modeling Random Forest
- only using text features

In [None]:
random_forest_text = RandomForestClassifier()

In [None]:
# use df_text_X, df_text_y[0] for text models
random_forest_text.fit(df_text_X, df_text_y[0])

In [None]:
# look at what features are important...
random_forest_text.feature_importances_
len(random_forest_text.feature_importances_)

In [None]:
max(random_forest_text.feature_importances_)

In [None]:
# random_forest_text.feature_importances_.where(max(random_forest_text.feature_importances_))
np.where(random_forest_text.feature_importances_ == max(random_forest_text.feature_importances_))
# this is 'prize'... maybe a little to telling

In [None]:
random_forest_text.feature_importances_[4091]

In [None]:
df_text_X.head()

In [None]:
# name of column
print(df_text_X.columns[4091])
# amount of times word appears
df_text_X.iloc[:,4091].sum()
# this is a word to remove...

In [None]:
# look at accuracy for training set
random_forest_text.score(df_text_X, df_text_y[0])

In [None]:
random_text_predictions = random_forest_text.predict(df_text_X_test)

In [None]:
confusion_matrix(df_text_y_test[0], random_text_predictions)

In [None]:
# look at accuracy for test set
random_forest_text.score(df_text_X_test, df_text_y_test)

Initially, the text model performs marginally better (> .1) on the test set than the metadata model, as seen in the subset accuracy for both models.

# Logistic Regression

## Numerical Logistic Regression

In [None]:
log  = LogisticRegression()

In [None]:
# use X_train_rating, y_train_rating for numerical models
log.fit(X_train_rating, y_train_rating)

In [None]:
log_predictions_training = log.predict(X_train_rating)

In [None]:
confusion_matrix(y_train_rating, log_predictions_training)

In [None]:
log.score(X_train_rating, y_train_rating)

In [None]:
log_predictions = log.predict(X_test_rating)

In [None]:
confusion_matrix(y_test_rating,log_predictions)

In [None]:
target_names = ['Not Bestseller', 'Bestseller']
print(classification_report(y_test_rating,log_predictions, target_names=target_names))


- The initial logistic model predicts the majority class 98% of the time.
- Will look at optimizing for specificity by inspecting the true negative instances.




### Cross Validated  Numerical Logistic

## Text Modeling Logistic Regression

In [None]:
log_text = LogisticRegression()

In [None]:
# use df_text_X, df_text_y[0] for text models
log_text.fit(df_text_X, df_text_y[0])

In [None]:
log_predictions_text_training = log_text.predict(df_text_X)

In [None]:
confusion_matrix(df_text_y[0], log_predictions_text_training)

In [None]:
df_text_y[0].shape

In [None]:
log_text_confusion = plot_confusion_matrix(log_text, df_text_X, df_text_y[0], cmap='ocean')

In [None]:
log_text.score(df_text_X, df_text_y[0])

In [None]:
target_names = ['Not Bestseller', 'Bestseller']
print(classification_report(df_text_y[0], log_predictions_text_training, target_names=target_names))



Text features perform better than numerical features with respect to accuracy for the logistic regression baseline models.

### Cross Validated Text Modeling Logistic

# things to do
- get the text test set... check
- find most important features....
- word counts...
- look at correctly predicted minority classes in baseline models
- add voting classifier? `from sklearn.ensemble import VotingClassifier`


In [None]:
# Import the top-level package only to inspect which release is installed.
import sklearn

In [None]:
# Echo the installed scikit-learn version string.
sklearn.__version__