In [1]:
# import libraries:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [11]:
# Read in the training dataset
X_train = pd.read_csv('data/X_train_df_final.csv', index_col=0)
y_train = pd.read_csv('data/y_train.csv', index_col=0)

# Read in the testing dataset
X_test = pd.read_csv('data/X_test_df_final.csv', index_col=0)
y_test = pd.read_csv('data/y_test.csv', index_col=0)

##### Sanity check training dataset

In [7]:
X_train.head()

Unnamed: 0,pages,numRatings,bbeScore,year,month,series_bool,genre_count,Adult,Adventure,Audiobook,...,Other,Paranormal,Romance,Science Fiction,Science Fiction Fantasy,Supernatural,Suspense,Thriller,Urban Fantasy,Young Adult
35783,22.0,666,89,1974,1,0,52,0,0,0,...,1,0,0,0,0,0,0,0,0,0
39773,176.0,161,83,1983,1,0,95,0,0,0,...,1,0,0,0,0,0,0,0,0,0
51074,339.0,2535,24,2014,5,1,171,0,0,0,...,1,0,1,0,0,0,0,0,0,1
7233,201.0,17367,370,2009,7,0,139,0,0,0,...,1,0,0,1,0,0,0,0,0,0
6868,224.0,94434,389,2016,11,0,125,1,0,1,...,1,0,0,0,0,0,0,0,0,0


In [9]:
X_train.shape

(33041, 38)

In [12]:
y_train.head()

Unnamed: 0,rating
35783,1
39773,0
51074,0
7233,1
6868,0


In [13]:
y_train.shape

(33041, 1)

##### Sanity check testing dataset

In [14]:
X_test.head()

Unnamed: 0,pages,numRatings,bbeScore,year,month,series_bool,genre_count,Adult,Adventure,Audiobook,...,Other,Paranormal,Romance,Science Fiction,Science Fiction Fantasy,Supernatural,Suspense,Thriller,Urban Fantasy,Young Adult
33489,346.0,1864,92,2015,9,1,147,0,0,0,...,1,0,0,0,0,0,0,0,0,0
12086,160.0,12646,194,2002,1,1,140,0,0,0,...,1,0,0,0,0,0,0,0,0,0
8714,513.0,62049,289,2019,9,1,131,0,0,0,...,1,0,0,0,0,0,0,0,0,0
36527,432.0,3347,88,2012,3,0,127,0,0,0,...,1,0,0,0,0,0,0,0,0,0
12745,345.0,115,190,2014,6,1,65,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [15]:
X_test.shape

(8261, 38)

In [16]:
y_test.head()

Unnamed: 0,rating
33489,1
12086,0
8714,1
36527,1
12745,0


In [17]:
y_test.shape

(8261, 1)

<a id = 'base_model'></a>
## Baseline Modeling

The baseline will be a logistic regression model. This will be used for future testing in how other models perform.

The model will predict whether a book has a good rating (1) or a bad rating (0).<a id = 'base_model'></a>


##### Scale the data

Since the majority of columns are binary, only the non-binary columns will be scaled. 
First, seperate the binary from non-binary columns and then apply the standard scaler to only the non-binary columns. 

In [21]:
from sklearn.preprocessing import StandardScaler

# Identify non-binary columns
# non_binary_cols = [col for col in X_train_df_final.columns if len(X_train_df_final[col].unique()) > 2]
non_binary_cols = ['pages', 'numRatings', 'bbeScore', 'year', 'month']

# Initialize the StandardScaler
scaler = StandardScaler()

# Scale only non-binary columns
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

X_train_scaled[non_binary_cols] = scaler.fit_transform(X_train[non_binary_cols])
X_test_scaled[non_binary_cols] = scaler.transform(X_test[non_binary_cols])

In [22]:
# Import logisitc regression
from sklearn.linear_model import LogisticRegression

# Instantiate and fit logisitc regression
logreg = LogisticRegression()
logreg.fit(X_train_scaled, y_train)

# Train and test scores
print(f'Train score: {logreg.score(X_train_scaled, y_train)}')
print(f'Test score: {logreg.score(X_test_scaled, y_test)}')

  y = column_or_1d(y, warn=True)


Train score: 0.6415665385430223
Test score: 0.517491829076383


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


The baseline logisitc regression has a train score of 64 percent and a test score of 52 percent. There seems to be overfitting and the overall test accuracy is not strong. This will still be used as a baseline and hopefully testing out different models will improve the prediction accuracy for the test set.

In [26]:
# Extract the coefficients
coefficients = logreg.coef_[0]

# Calculate the odds ratios
odds_ratio = np.exp(coefficients)

# Get the feature names
feature_names = X_train.columns

# Create a DataFrame with feature names and their corresponding coefficients and odds ratios
feature_coefficients = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': coefficients,
    'Odds_ratio': odds_ratio
})

# Sort the coefficients from highest to lowest
feature_coefficients = feature_coefficients.sort_values(by='Coefficient', ascending=False)

# Display 
feature_coefficients

Unnamed: 0,Feature,Coefficient,Odds_ratio
5,series_bool,0.833176,2.300613
0,pages,0.489714,1.631849
11,Childrens,0.449391,1.567357
12,Classics,0.398184,1.489119
14,Contemporary Romance,0.30178,1.352264
34,Suspense,0.192391,1.212144
28,Other,0.185002,1.20322
1,numRatings,0.137388,1.147273
3,year,0.103011,1.108504
26,Nonfiction,0.057649,1.059343


Highest odds ratio:
- The series columns has the highest coefficient of around 0.83. If the book is in a series, it has 2.3 times better odds to have a good review than if it's not in a series.
- From the genres columns, Childrens, Classocs and Contemporary Romance have the highest odds ratios.

Lowest odds ratio:
- From the genres columns, Chick Lit has the highest negative correlation of around -0.71 and Mystery being second with around -0.62

Notes:
- In future, the genres could be filtered through a bit more. For example, in the top 30 genres, there is Historical, Historical Fiction, and History. These could be put into a single category to allow for other genres.

##### Confusion Matrix

##### Model Summary

|    | Model Type          | Train Accuracy | Test Accuracy | Parameters | Notes    |
|----|---------------------|----------------|---------------|------------|----------|
| 1. | Logistic Regression | 62 %           | 52 %          | None       | Baseline |

## Decision Tree Classifier

##### Baseline DTC Model

In [28]:
from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)

print(f'Train score: {dtc.score(X_train, y_train)}')
print(f'Test score: {dtc.score(X_test, y_test)}')

Train score: 1.0
Test score: 0.5322600169471008


The baseline model is over fitting a lot, with a gap of almost 50 percent. The next step is to use hyper parameters to see if we can obtain a better test score.

##### Model Summary

|    | Model Type               | Train Accuracy | Test Accuracy | Parameters | Notes    |
|----|--------------------------|----------------|---------------|------------|----------|
| 1. | Logistic Regression      | 62 %           | 52 %          | None       | Baseline |
| 2. | Decision Tree Classifier | 100 %          | 53 %          | None       | Baseline |

In [None]:
# Create a pipeline
estimators = [ 
    ('PCA', PCA(n_components=20)),
    ('dt_model', DecisionTreeClassifier())
]

pipe = Pipeline(estimators)

In [None]:
# Create parameters
params = {'dt_model__max_depth': [3, 6, 9],
          'dt_model__min_samples_leaf' : [3, 6, 9],
          'dt_model__criterion': ['gini', 'entropy'],
         }

grid_search = GridSearchCV(pipe, param_grid=params, cv=5)

In [None]:
# Scoring the test set
pipe.fit(X_train_df, y_train)

print(f'Test score: {pipe.score(X_test_df, y_test)}')

In [None]:
# Fit the pipeline with the best parameters on the entire training set
grid = GridSearchCV(pipe, params, cv=5)

fittedgrid = grid_search.fit(X_train_df, y_train)

## Random Forest

##### Parameter Optimization

##### Confusion Matrix

##### Classification Report

Precision

Recall

## XG Boost

##### Parameter Optimization

##### Confusion Matrix

##### Classification Report

Precision

Recall

## 

Actionable insights:
- Which genres have highest coefficients? 

Future Ideas:
- Count vectorizing the description - interesting key words
    

Other:
- I