In [1]:
import ast
import pprint

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score, classification_report, precision_score, roc_curve, auc
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import LabelBinarizer
from sklearn.svm import SVC

pd.options.mode.chained_assignment = None

In [2]:
# Load the cleaned dataset. Each cell of the "array" column holds a nested
# list written out as text, so it has to be parsed back into Python objects.
# NOTE(review): this used to pass `eval` as the converter, which executes any
# expression found in the file. `ast.literal_eval` only accepts Python
# literals (lists, numbers, strings, ...), so a tampered CSV cell raises
# instead of running code.
xy = pd.read_csv("cleaned_ebook_info.csv", converters={"array": ast.literal_eval})
xy.head()

Unnamed: 0,array,rating
0,"[[[1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], ...",4.0
1,"[[[0, 80, 92], [0, 79, 92], [1, 79, 92], [3, 7...",4.0
2,"[[[52, 41, 71], [44, 34, 69], [43, 35, 72], [4...",4.0
3,"[[[179, 110, 68], [179, 111, 66], [179, 113, 6...",4.0
4,"[[[0, 37, 64], [0, 49, 76], [0, 65, 94], [0, 8...",4.0


In [3]:
# Turn every parsed nested list in the "array" column into a NumPy array.
xy["array"] = xy["array"].map(np.array)

In [4]:
# Fit a LabelBinarizer on the ratings and store the encoded labels in a new
# "Binary" column (the preview below shows rating 4.0 encoded as 0).
# NOTE(review): storing the result in a single column assumes exactly two
# distinct rating values — confirm against the data.
lb = LabelBinarizer().fit(xy["rating"])
xy["Binary"] = lb.transform(xy["rating"])
xy.head()

Unnamed: 0,array,rating,Binary
0,"[[[1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 1, 1], ...",4.0,0
1,"[[[0, 80, 92], [0, 79, 92], [1, 79, 92], [3, 7...",4.0,0
2,"[[[52, 41, 71], [44, 34, 69], [43, 35, 72], [4...",4.0,0
3,"[[[179, 110, 68], [179, 111, 66], [179, 113, 6...",4.0,0
4,"[[[0, 37, 64], [0, 49, 76], [0, 65, 94], [0, 8...",4.0,0


In [5]:
# Collect the per-row pixel arrays and their ratings as two parallel lists.
# The arrays are kept as ndarrays rather than converted back with `.tolist()`:
# that round trip created one Python int object per array element only for
# the values to be stacked into an ndarray again in the next step.
X = list(xy["array"])
Y = list(xy["rating"])

In [6]:
# Stack the collected samples into one array and cast the ratings to int,
# then report both shapes as a sanity check.
X = np.array(X)
Y = np.array(Y)
Y = Y.astype(int)
print("X shape:", X.shape)
print("Y shape:", Y.shape)

X shape: (1838, 218, 145, 3)
Y shape: (1838,)


In [7]:
# Flatten the three trailing axes of the 4-D array into a single feature
# axis, giving one row per sample.
n_rows, dim_a, dim_b, dim_c = X.shape
X = X.reshape((n_rows, dim_a * dim_b * dim_c))
print("X shape:", X.shape)

X shape: (1838, 94830)


In [8]:
def grid_cv(X, Y, kernel, C, folds):
    """Cross-validate an ``SVC`` once per candidate ``C`` value.

    Parameters
    ----------
    X
        Feature matrix, passed unchanged to ``cross_val_score``.
    Y
        Target labels, passed unchanged to ``cross_val_score``.
    kernel
        Kernel name forwarded to ``SVC``.
    C
        Iterable of candidate ``C`` values. Any iterable is accepted (list,
        tuple, generator, ...); values are evaluated in iteration order.
    folds
        Forwarded as the ``cv`` argument of ``cross_val_score``.

    Returns
    -------
    dict
        ``{"C": [...], "precision": [...]}`` — two parallel lists holding each
        candidate value and the mean of its per-fold precision scores. Both
        lists are empty when ``C`` yields no values.
    """
    grid = {"C": [], "precision": []}

    # Iterate over the candidates directly instead of indexing by position,
    # so C does not need to support len() or integer indexing.
    for c_value in C:
        estimator = SVC(C=c_value, kernel=kernel)
        fold_scores = cross_val_score(estimator, X, Y, cv=folds, scoring="precision")
        grid["C"].append(c_value)
        grid["precision"].append(np.mean(fold_scores))

    return grid

In [9]:
# Candidate regularisation strengths shared by every kernel.
C = [0.01, 0.1, 1, 10, 100]

# One 5-fold sweep per kernel against the binarised labels, run in the
# order rbf, linear, poly.
svm_rbf, svm_linear, svm_poly = [
    grid_cv(X, xy["Binary"], kernel=kernel_name, C=C, folds=5)
    for kernel_name in ("rbf", "linear", "poly")
]

In [10]:
# Show the three sweep results, one dict per line (rbf, linear, poly).
print(svm_rbf, svm_linear, svm_poly, sep="\n")

{'C': [0.01, 0.1, 1, 10, 100], 'precision': [0.5233947399597204, 0.5233947399597204, 0.5918146588324253, 0.6059401517780542, 0.6071849544717256]}
{'C': [0.01, 0.1, 1, 10, 100], 'precision': [0.6026985767119869, 0.6026985767119869, 0.6026985767119869, 0.6026985767119869, 0.6026985767119869]}
{'C': [0.01, 0.1, 1, 10, 100], 'precision': [0.5238689255350335, 0.5593756947721744, 0.6005538847117794, 0.6014603405735549, 0.5977619047619047]}


In [11]:
# Baseline: logistic regression (liblinear solver) scored with 5-fold
# cross-validated precision on the binarised labels.
LR_clf = LogisticRegression(solver="liblinear")
LR_cv = cross_val_score(
    LR_clf,
    X,
    xy["Binary"],
    scoring="precision",
    cv=5,
)

In [13]:
# Mean precision across the logistic-regression folds.
print(np.asarray(LR_cv).mean())

0.6015160873629204


In [23]:
# Compare the highest mean precision found in any SVM sweep with the mean
# precision of the logistic-regression baseline.
print(
    "Best SVM Precision:",
    max(max(sweep["precision"]) for sweep in (svm_rbf, svm_linear, svm_poly)),
)

print("Logistic Regression Precision:", np.mean(LR_cv))


Best SVM Precision: 0.6071849544717256
Logistic Regression Precision: 0.6015160873629204
