In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import seaborn as sns

In [2]:
df_combined = pd.read_csv("../CombinedBooksRandomized.csv")
df_combined.tail()

Unnamed: 0,Title,Author,Author Average Rating,Author Votes,Review Count,Distinct Works,Year,Rating,Raters,Genres
1723,The Bear and the Nightingale (The Winternight ...,Katherine Arden,4.23,353862,48530,14,2017,4.1,170787,"Fantasy,Historical,Fiction,Young Adult,Histori..."
1724,Only Superhuman,Christopher L. Bennett,3.87,15137,1503,60,2012,2.86,389,"Science Fiction,Fiction,Comics,Fantasy,Sequent..."
1725,"Delirium (Delirium, #1)",Lauren Oliver,3.88,1385551,106047,51,2011,3.96,452659,"Young Adult,Science Fiction,Romance,Science Fi..."
1726,A Tree Grows in Brooklyn (Paperback),Betty Smith,4.28,448442,25874,15,1943,4.29,434302,"Classics,Fiction,Historical,Young Adult,Young ..."
1727,The Third Translation,Matt Bondurant,3.58,6388,851,7,2005,2.58,474,"Fiction,Mystery,Thriller,Northern Africa,Histo..."


In [3]:
minRaters=min(df_combined.Raters)
maxRaters=max(df_combined.Raters)
meanRaters=df_combined.Raters.mean()
print('Min Raters :',minRaters)
print('Max Raters :',maxRaters)
print('Mean Raters :',meanRaters)

Min Raters : 41
Max Raters : 8958054
Mean Raters : 316506.9895833333


In [4]:
#rating class column
conditions  = [ df_combined.Rating >= 4, (df_combined.Rating < 4) & (df_combined.Rating > 2), (df_combined.Rating <= 2) ]
choices     = [ "good", 'above average', 'bad' ]
df_combined["rating_class"] = np.select(conditions, choices, default="bad")
#rater class column
conditions  = [ df_combined.Raters >= 25000, (df_combined.Raters < 25000) & (df_combined.Raters> 1000), (df_combined.Raters <= 1000) ]
choices     = [ "large", 'medium', 'small' ]
df_combined["raters_class"] = np.select(conditions, choices, default="small")

df_combined['worth_reading']=np.where(((df_combined.rating_class == "above average") & (df_combined.raters_class == "large"))|((df_combined.rating_class == "good") & ((df_combined['raters_class'] == "large")|(df_combined["raters_class"] =="medium"))) ,'yes','no')

In [5]:
df_combined.head()

Unnamed: 0,Title,Author,Author Average Rating,Author Votes,Review Count,Distinct Works,Year,Rating,Raters,Genres,rating_class,raters_class,worth_reading
0,The Left Hand of Darkness (Paperback),Ursula K. Le Guin,4.07,1258005,85585,753,1969,4.09,155037,"Science Fiction,Fiction,Fantasy,Classics,Scien...",good,large,yes
1,Wish You Were Here (Hardcover),Jodi Picoult,3.99,4447186,273761,82,2021,3.99,153115,"Fiction,Contemporary,Audiobook,Romance,Adult,A...",above average,large,yes
2,If I Had Your Face (Hardcover),Frances Cha,3.76,41079,5356,4,2020,3.76,41078,"Fiction,Contemporary,Literary Fiction,Adult,Fe...",above average,large,yes
3,I'll Be Gone in the Dark: One Woman's Obsessiv...,Michelle McNamara,4.12,210117,18190,2,2018,4.12,210038,"Nonfiction,Crime,Audiobook,Mystery,Mystery,His...",good,large,yes
4,1984 (Paperback),George Orwell,4.1,8007938,201434,729,1949,4.19,4034352,"Classics,Fiction,Science Fiction,Science Ficti...",good,large,yes


In [6]:
#split dataset in features and target variable
feature_cols=["Author Average Rating", "Author Votes", "Review Count", "Distinct Works", "Year"]
X = df_combined[feature_cols] # Features
y = df_combined['worth_reading'] # Target variable

print(X)
print(y)

      Author Average Rating  Author Votes  Review Count  Distinct Works  Year
0                      4.07       1258005         85585             753  1969
1                      3.99       4447186        273761              82  2021
2                      3.76         41079          5356               4  2020
3                      4.12        210117         18190               2  2018
4                      4.10       8007938        201434             729  1949
...                     ...           ...           ...             ...   ...
1723                   4.23        353862         48530              14  2017
1724                   3.87         15137          1503              60  2012
1725                   3.88       1385551        106047              51  2011
1726                   4.28        448442         25874              15  1943
1727                   3.58          6388           851               7  2005

[1728 rows x 5 columns]
0       yes
1       yes
2       yes
3  

In [46]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)


In [47]:
from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
X_train = sc_x.fit_transform(X_train)
X_test = sc_x.transform(X_test)

In [48]:
from sklearn import linear_model
from sklearn.model_selection import GridSearchCV


logr_model =linear_model.LogisticRegression(random_state = 42)
logr_model.fit(X_train,y_train)

parameter_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    "solver" : ['newton-cg', 'lbfgs', 'liblinear'],
    "penalty" : ['l2']

}

clf = GridSearchCV(logr_model, param_grid= parameter_grid, cv = 10)
clf.fit(X_train, y_train)


In [49]:
pd.DataFrame(clf.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_penalty,param_solver,params,split0_test_score,split1_test_score,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.004208,0.000749,0.000395,0.000484,0.01,l2,newton-cg,"{'C': 0.01, 'penalty': 'l2', 'solver': 'newton...",0.905172,0.87069,...,0.87931,0.844828,0.844828,0.862069,0.878261,0.895652,0.834783,0.872939,0.025402,14
1,0.002198,0.000397,0.0006,0.00049,0.01,l2,lbfgs,"{'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}",0.905172,0.87069,...,0.87931,0.844828,0.844828,0.862069,0.878261,0.895652,0.834783,0.872939,0.025402,14
2,0.001199,0.000401,0.000299,0.000456,0.01,l2,liblinear,"{'C': 0.01, 'penalty': 'l2', 'solver': 'liblin...",0.913793,0.887931,...,0.896552,0.887931,0.836207,0.896552,0.895652,0.913043,0.878261,0.892834,0.022859,13
3,0.004999,6e-06,0.000501,0.000501,0.1,l2,newton-cg,"{'C': 0.1, 'penalty': 'l2', 'solver': 'newton-...",0.922414,0.887931,...,0.896552,0.887931,0.827586,0.905172,0.913043,0.913043,0.878261,0.896297,0.027776,10
4,0.0033,0.000642,0.000301,0.00046,0.1,l2,lbfgs,"{'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}",0.922414,0.887931,...,0.896552,0.887931,0.827586,0.905172,0.913043,0.913043,0.878261,0.896297,0.027776,10
5,0.001901,0.000539,0.0002,0.0004,0.1,l2,liblinear,"{'C': 0.1, 'penalty': 'l2', 'solver': 'libline...",0.913793,0.887931,...,0.896552,0.887931,0.827586,0.905172,0.913043,0.913043,0.878261,0.894573,0.026047,12
6,0.005102,0.001042,0.000299,0.000457,1.0,l2,newton-cg,"{'C': 1, 'penalty': 'l2', 'solver': 'newton-cg'}",0.913793,0.887931,...,0.905172,0.913793,0.87069,0.913793,0.921739,0.913043,0.878261,0.901477,0.016455,7
7,0.003703,0.000461,9.9e-05,0.000297,1.0,l2,lbfgs,"{'C': 1, 'penalty': 'l2', 'solver': 'lbfgs'}",0.913793,0.887931,...,0.905172,0.913793,0.87069,0.913793,0.921739,0.913043,0.878261,0.901477,0.016455,7
8,0.0017,0.000458,0.0002,0.0004,1.0,l2,liblinear,"{'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}",0.913793,0.887931,...,0.905172,0.913793,0.862069,0.913793,0.921739,0.913043,0.878261,0.901477,0.018172,7
9,0.004901,0.000701,0.000498,0.000498,10.0,l2,newton-cg,"{'C': 10, 'penalty': 'l2', 'solver': 'newton-cg'}",0.913793,0.887931,...,0.905172,0.913793,0.87931,0.913793,0.921739,0.913043,0.878261,0.902339,0.014979,1


In [50]:
clf.best_estimator_

In [51]:
logr_class = linear_model.LogisticRegression(C=10, random_state=42, solver='newton-cg')
logr_class.fit(X_train, y_train)
predictions = logr_class.predict(X_test)
predictions

from sklearn.metrics import accuracy_score, confusion_matrix

print("Trained acc: ", accuracy_score(y_train, logr_class.predict(X_train)))
print("Test accuracy:", accuracy_score(y_test, predictions))
print("Confusion matrix train: ", "\n", confusion_matrix(y_train, logr_class.predict(X_train)))
print("Confusion matrix test: ", "\n",confusion_matrix(y_test, predictions))


Trained acc:  0.9006050129645635
Test accuracy: 0.9124343257443083
Confusion matrix train:  
 [[253  87]
 [ 28 789]]
Confusion matrix test:  
 [[147  38]
 [ 12 374]]


In [52]:
from sklearn.metrics import classification_report
print(classification_report(y_train, logr_class.predict(X_train)))
print("---------------------------------------------------------")
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

          no       0.90      0.74      0.81       340
         yes       0.90      0.97      0.93       817

    accuracy                           0.90      1157
   macro avg       0.90      0.85      0.87      1157
weighted avg       0.90      0.90      0.90      1157

---------------------------------------------------------
              precision    recall  f1-score   support

          no       0.92      0.79      0.85       185
         yes       0.91      0.97      0.94       386

    accuracy                           0.91       571
   macro avg       0.92      0.88      0.90       571
weighted avg       0.91      0.91      0.91       571



In [53]:
print(y.ravel())
print(predictions.ravel())

['yes' 'yes' 'yes' ... 'yes' 'yes' 'no']
['no' 'yes' 'yes' 'yes' 'yes' 'no' 'yes' 'yes' 'yes' 'yes' 'yes' 'yes'
 'yes' 'yes' 'yes' 'yes' 'no' 'yes' 'yes' 'yes' 'no' 'yes' 'yes' 'yes'
 'yes' 'yes' 'yes' 'yes' 'yes' 'yes' 'yes' 'yes' 'yes' 'no' 'no' 'yes'
 'yes' 'yes' 'yes' 'no' 'yes' 'yes' 'yes' 'yes' 'no' 'no' 'yes' 'no' 'yes'
 'yes' 'yes' 'no' 'yes' 'yes' 'yes' 'no' 'yes' 'yes' 'yes' 'yes' 'no' 'no'
 'yes' 'no' 'no' 'no' 'yes' 'no' 'no' 'yes' 'yes' 'yes' 'yes' 'yes' 'yes'
 'yes' 'yes' 'no' 'yes' 'no' 'yes' 'no' 'no' 'yes' 'yes' 'no' 'yes' 'yes'
 'no' 'no' 'yes' 'yes' 'no' 'yes' 'yes' 'yes' 'yes' 'yes' 'yes' 'no' 'yes'
 'yes' 'yes' 'no' 'yes' 'yes' 'no' 'no' 'no' 'yes' 'no' 'yes' 'yes' 'yes'
 'yes' 'yes' 'no' 'yes' 'no' 'no' 'yes' 'no' 'yes' 'yes' 'yes' 'yes' 'yes'
 'yes' 'yes' 'yes' 'yes' 'yes' 'yes' 'no' 'no' 'yes' 'no' 'yes' 'yes'
 'yes' 'yes' 'no' 'yes' 'yes' 'yes' 'no' 'yes' 'yes' 'yes' 'yes' 'yes'
 'yes' 'yes' 'no' 'no' 'yes' 'no' 'no' 'yes' 'no' 'no' 'yes' 'yes' 'yes'
 'yes' 'ye