In [1]:
import pandas as pd
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
import numpy as np

import warnings
# Silence every warning for the rest of the session so cell outputs stay clean.
# NOTE(review): with no category/module filter this also hides deprecation and
# convergence warnings from pandas / scikit-learn — consider narrowing it.
warnings.filterwarnings("ignore")

In [2]:
# Read the combined book dataset from the parent directory and preview the
# first rows. The path is relative to the notebook's working directory.
combined_csv_path = "../CombinedBooksRandomized.csv"
df_combined = pd.read_csv(combined_csv_path)
df_combined.head()

Unnamed: 0,Title,Author,Author Average Rating,Author Votes,Review Count,Distinct Works,Year,Rating,Raters,Genres
0,The Left Hand of Darkness (Paperback),Ursula K. Le Guin,4.07,1258005,85585,753,1969,4.09,155037,"Science Fiction,Fiction,Fantasy,Classics,Scien..."
1,Wish You Were Here (Hardcover),Jodi Picoult,3.99,4447186,273761,82,2021,3.99,153115,"Fiction,Contemporary,Audiobook,Romance,Adult,A..."
2,If I Had Your Face (Hardcover),Frances Cha,3.76,41079,5356,4,2020,3.76,41078,"Fiction,Contemporary,Literary Fiction,Adult,Fe..."
3,I'll Be Gone in the Dark: One Woman's Obsessiv...,Michelle McNamara,4.12,210117,18190,2,2018,4.12,210038,"Nonfiction,Crime,Audiobook,Mystery,Mystery,His..."
4,1984 (Paperback),George Orwell,4.1,8007938,201434,729,1949,4.19,4034352,"Classics,Fiction,Science Fiction,Science Ficti..."


In [3]:
# Summarise how many ratings each book received (min / max / mean).
# The vectorised Series reductions are used for all three statistics so they
# treat missing values the same way (skipped) and avoid iterating the column
# element by element in Python, as the builtin min()/max() would.
minRaters = df_combined.Raters.min()
maxRaters = df_combined.Raters.max()
meanRaters = df_combined.Raters.mean()
print('Min Raters :', minRaters)
print('Max Raters :', maxRaters)
print('Mean Raters :', meanRaters)

Min Raters : 41
Max Raters : 8958054
Mean Raters : 316506.9895833333


In [4]:
# --- Rating class: bucket the book rating into good / mediocre / bad -------
#   good     : Rating >= 4
#   mediocre : 2 < Rating < 4
#   bad      : Rating <= 2 (also the fallback when no mask matches)
rating = df_combined["Rating"]
conditions = [
    rating >= 4,
    (rating > 2) & (rating < 4),
    rating <= 2,
]
choices = ["good", "mediocre", "bad"]
df_combined["rating_class"] = np.select(conditions, choices, default="bad")

# --- Raters class: bucket the number of raters into large / medium / small --
#   large  : Raters >= 25000
#   medium : 1000 < Raters < 25000
#   small  : Raters <= 1000 (also the fallback when no mask matches)
raters = df_combined["Raters"]
conditions = [
    raters >= 25000,
    (raters > 1000) & (raters < 25000),
    raters <= 1000,
]
choices = ["large", "medium", "small"]
df_combined["raters_class"] = np.select(conditions, choices, default="small")

# --- Target label -----------------------------------------------------------
# A book is "worth reading" when it is
#   * mediocre but rated by a large audience, or
#   * good and rated by a large or medium audience.
is_mediocre = df_combined["rating_class"] == "mediocre"
is_good = df_combined["rating_class"] == "good"
is_large = df_combined["raters_class"] == "large"
is_medium = df_combined["raters_class"] == "medium"

worth_mask = (is_mediocre & is_large) | (is_good & (is_large | is_medium))
df_combined["worth_reading"] = np.where(worth_mask, "yes", "no")

In [5]:
df_combined.head()

Unnamed: 0,Title,Author,Author Average Rating,Author Votes,Review Count,Distinct Works,Year,Rating,Raters,Genres,rating_class,raters_class,worth_reading
0,The Left Hand of Darkness (Paperback),Ursula K. Le Guin,4.07,1258005,85585,753,1969,4.09,155037,"Science Fiction,Fiction,Fantasy,Classics,Scien...",good,large,yes
1,Wish You Were Here (Hardcover),Jodi Picoult,3.99,4447186,273761,82,2021,3.99,153115,"Fiction,Contemporary,Audiobook,Romance,Adult,A...",mediocre,large,yes
2,If I Had Your Face (Hardcover),Frances Cha,3.76,41079,5356,4,2020,3.76,41078,"Fiction,Contemporary,Literary Fiction,Adult,Fe...",mediocre,large,yes
3,I'll Be Gone in the Dark: One Woman's Obsessiv...,Michelle McNamara,4.12,210117,18190,2,2018,4.12,210038,"Nonfiction,Crime,Audiobook,Mystery,Mystery,His...",good,large,yes
4,1984 (Paperback),George Orwell,4.1,8007938,201434,729,1949,4.19,4034352,"Classics,Fiction,Science Fiction,Science Ficti...",good,large,yes


In [6]:
# Separate the predictors from the label.
# Rating and Raters are left out of the predictors: the `worth_reading`
# label is derived directly from those two columns.
feature_cols = [
    "Author Average Rating",
    "Author Votes",
    "Review Count",
    "Distinct Works",
    "Year",
]
X = df_combined[feature_cols]     # feature matrix
y = df_combined["worth_reading"]  # target labels ("yes" / "no")

# Plain-text dump of both objects, one after the other.
print(X, y, sep="\n")

      Author Average Rating  Author Votes  Review Count  Distinct Works  Year
0                      4.07       1258005         85585             753  1969
1                      3.99       4447186        273761              82  2021
2                      3.76         41079          5356               4  2020
3                      4.12        210117         18190               2  2018
4                      4.10       8007938        201434             729  1949
...                     ...           ...           ...             ...   ...
1723                   4.23        353862         48530              14  2017
1724                   3.87         15137          1503              60  2012
1725                   3.88       1385551        106047              51  2011
1726                   4.28        448442         25874              15  1943
1727                   3.58          6388           851               7  2005

[1728 rows x 5 columns]
0       yes
1       yes
2       yes
3  

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Hold out 33% of the rows for testing; the fixed seed makes the split
# reproducible across runs.
# NOTE(review): the split is not stratified on `y` — confirm that is intended
# given the yes/no class imbalance.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Base estimator with default settings; n_neighbors is tuned by the grid search.
# NOTE(review): the features are passed in unscaled although their magnitudes
# differ by orders of magnitude; k-NN is distance-based, so consider
# standardising them before fitting.
knn_model = KNeighborsClassifier()




In [8]:
from sklearn.model_selection import GridSearchCV

# Candidate neighbourhood sizes: k = 1, 2, ..., 30.
k_range = [k for k in range(1, 31)]
parameter_grid = {"n_neighbors": k_range}

# Exhaustive search over k with 3-fold cross-validation on the training split.
grid_search = GridSearchCV(knn_model, parameter_grid, cv=3)
grid_search.fit(X_train, y_train)

In [9]:
# Tabulate the cross-validation results: one row per candidate n_neighbors,
# with fit/score timings, per-split scores, mean/std test score and rank.
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.003333,0.000943,0.009,0.001414,1,{'n_neighbors': 1},0.860104,0.88342,0.909091,0.884205,0.020007,29
1,0.002666,0.000471,0.007333,0.000471,2,{'n_neighbors': 2},0.821244,0.839378,0.854545,0.838389,0.013613,30
2,0.002331,0.000474,0.008669,0.001697,3,{'n_neighbors': 3},0.904145,0.88342,0.919481,0.902348,0.014776,27
3,0.002,2e-06,0.006999,0.000816,4,{'n_neighbors': 4},0.901554,0.875648,0.883117,0.886773,0.010888,28
4,0.002,2e-06,0.007344,0.000462,5,{'n_neighbors': 5},0.914508,0.891192,0.922078,0.909259,0.013144,24
5,0.002,1e-06,0.007323,0.000501,6,{'n_neighbors': 6},0.904145,0.88601,0.922078,0.904078,0.014725,26
6,0.002003,5e-06,0.00733,0.000473,7,{'n_neighbors': 7},0.909326,0.896373,0.924675,0.910125,0.011568,21
7,0.002334,0.000473,0.007669,0.000467,8,{'n_neighbors': 8},0.914508,0.896373,0.919481,0.91012,0.009931,23
8,0.001657,0.000464,0.007341,0.000482,9,{'n_neighbors': 9},0.917098,0.893782,0.927273,0.912718,0.014019,19
9,0.002002,1e-06,0.007336,0.000471,10,{'n_neighbors': 10},0.92228,0.893782,0.92987,0.915311,0.015535,3


In [10]:
# Best hyper-parameter combination found by the grid search.
# NOTE: despite its name, `final_model` is the parameter dict
# (`best_params_`), not a fitted estimator.
final_model = grid_search.best_params_
final_model

{'n_neighbors': 19}

Starting from the best parameters found by the grid search, we refit the model and tweak further parameters where needed to avoid overfitting.

In [14]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Fit the final classifier with the hyper-parameters selected by the grid
# search. Taking them from `grid_search.best_params_` rather than repeating
# the number by hand keeps this cell in sync if the search is re-run.
knn_class = KNeighborsClassifier(**grid_search.best_params_)
knn_class.fit(X_train, y_train)

# Predict once per split and reuse the results for every metric below.
train_predictions = knn_class.predict(X_train)
predictions = knn_class.predict(X_test)

# Comparing train and test scores side by side shows how large the gap
# between fitted and held-out performance is.
print("Trained acc: ", accuracy_score(y_train, train_predictions))
print("Test accuracy:", accuracy_score(y_test, predictions))
print("Confusion matrix train: ", "\n", confusion_matrix(y_train, train_predictions))
print("Confusion matrix test: ", "\n",confusion_matrix(y_test, predictions))


Trained acc:  0.9178910976663786
Test accuracy: 0.9352014010507881
Confusion matrix train:  
 [[262  78]
 [ 17 800]]
Confusion matrix test:  
 [[158  27]
 [ 10 376]]


In [15]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for the training split, then for the
# test split, separated by a dashed rule.
train_report = classification_report(y_train, knn_class.predict(X_train))
test_report = classification_report(y_test, predictions)
separator = "---------------------------------------------------------"
print(train_report, separator, test_report, sep="\n")

              precision    recall  f1-score   support

          no       0.94      0.77      0.85       340
         yes       0.91      0.98      0.94       817

    accuracy                           0.92      1157
   macro avg       0.93      0.87      0.90      1157
weighted avg       0.92      0.92      0.92      1157

---------------------------------------------------------
              precision    recall  f1-score   support

          no       0.94      0.85      0.90       185
         yes       0.93      0.97      0.95       386

    accuracy                           0.94       571
   macro avg       0.94      0.91      0.92       571
weighted avg       0.94      0.94      0.93       571

