In [18]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score

In [19]:
#Load and clean data
df = pd.read_csv('base_df.csv')

#Handling Price Outliers in Price Cols
df_cont = pd.read_csv("con_dqr_df.csv")
minp = df_cont.loc[df_cont['Feature'] == "MinPrice"]
maxp = df_cont.loc[df_cont["Feature"] == "MaxPrice"]
avgp = df_cont.loc[df_cont["Feature"] == "AvgPrice"]
miniqr = 1.5 * (minp["Q3"] - minp["Q1"]).values[0]
maxiqr = 1.5 * (maxp["Q3"] - maxp["Q1"]).values[0]
avgiqr = 1.5 * (avgp["Q3"] - avgp["Q1"]).values[0]
df = df[(df["MinPrice"] >= minp["Q1"].values[0] - miniqr) | (df["MinPrice"] <= minp["Q3"].values[0] + miniqr)]
df = df[(df["MaxPrice"] >= maxp["Q1"].values[0] - maxiqr) | (df["MaxPrice"] <= maxp["Q3"].values[0] + maxiqr)]
df = df[(df["AvgPrice"] >= avgp["Q1"].values[0] - avgiqr) | (df["MinPrice"] <= avgp["Q3"].values[0] + avgiqr)]
df = df.dropna()
encode = pd.get_dummies(df[["Country", "Location", "Cuisine1", "Cuisine2"]])

#Create Price Cols in $50 steps
x = 50
while x <= df["MinPrice"].max():
    df["MinPrice < " + str(x)] = np.where(df['MinPrice'] < x, 1, 0)
    x = x + 50
x = 50
while x <= df["MaxPrice"].max():
    df["MaxPrice < " + str(x)] = np.where(df['MaxPrice'] < x, 1, 0)
    x = x + 50
x - 50
while x <= df["AvgPrice"].max():
    df["AvgPrice < " + str(x)] = np.where(df['AvgPrice'] < x, 1, 0)
    x = x + 50    
df = df.drop(columns=["Unnamed: 0", "Country", "Location", "Cuisine", "Cuisine1", "Cuisine2", "Longitude", "Latitude", "MinPrice", "MaxPrice", "AvgPrice"])
df = pd.merge(left=df,right=encode,left_index=True,right_index=True)
display(df)

Unnamed: 0,HasPhoneNum,HasWebsiteUrl,HasMultiCuisine,Award,MinPrice < 50,MinPrice < 100,MinPrice < 150,MinPrice < 200,MinPrice < 250,MinPrice < 300,...,Cuisine2_Thai and Vietnamese,Cuisine2_Traditional British,Cuisine2_Traditional Cuisine,Cuisine2_Tuscan,Cuisine2_Udon,Cuisine2_Umbrian,Cuisine2_Vegan,Cuisine2_Vegetarian,Cuisine2_Venetian,Cuisine2_World Cuisine
0,1,1,1,3,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,1,1,3,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,1,3,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
4,1,1,1,3,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6497,1,1,1,0,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
6498,1,1,1,0,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
6499,1,1,1,0,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
6500,1,1,1,0,0,1,1,1,1,1,...,0,0,1,0,0,0,0,0,0,0


In [20]:
#Split training and testing data into 70% and 30%
train = df.sample(frac=0.70, random_state=50)
test = df.drop(train.index)
#Assign attribute and target features
x_train = train.drop(columns=["Award"])
y_train = train["Award"]
x_test = test.drop(columns=["Award"])
y_test = test["Award"]

KNN Jaccard

In [26]:
from sklearn.model_selection import GridSearchCV
#Find the best n_neighbor via GridSearch Cross Val
n_neighbors = list(range(1,11))
ideal_n = dict(n_neighbors=n_neighbors)
opt_knn = KNeighborsClassifier(metric='jaccard')
find_n = GridSearchCV(opt_knn, ideal_n, cv=10)
model = find_n.fit(df.drop(columns=["Award"]),df["Award"])
print('Ideal N:', model.best_estimator_.get_params()['n_neighbors'])
k = model.best_estimator_.get_params()['n_neighbors']



Ideal N: 1


In [None]:
#Train and Predict KNN
knn = KNeighborsClassifier(n_neighbors=k, metric='jaccard')
knn.fit(x_train, y_train)
predictions = knn.predict(x_test)

#Display Confusion Matrix
cmx = confusion_matrix(y_test, predictions)
cmxd = ConfusionMatrixDisplay(cmx)
cmxd.plot()

In [None]:
#Print Metrics
print("Accuracy Score")
print(knn.score(x_test, y_test))
print("F1 Score")
print(f1_score(y_test, predictions, average='weighted'))

In [None]:
#Perform K-Fold Cross Validation
knn_cv = KNeighborsClassifier(n_neighbors=k, metric='jaccard')
cv_scores = cross_val_score(knn_cv, df.drop(columns=["Award"]), df["Award"], scoring='accuracy', cv=10)
print(cv_scores)
print(np.mean(cv_scores))

KNN Sokal Michener

In [None]:
from sklearn.model_selection import GridSearchCV
#Find the best n_neighbor via GridSearch Cross Val
n_neighbors = list(range(1,11))
ideal_n = dict(n_neighbors=n_neighbors)
opt_knn = KNeighborsClassifier(metric='sokalmichener')
find_n = GridSearchCV(opt_knn, ideal_n, cv=10)
model = find_n.fit(df.drop(columns=["Award"]),df["Award"])
print('Ideal N:', model.best_estimator_.get_params()['n_neighbors'])
k = model.best_estimator_.get_params()['n_neighbors']

In [None]:
#Train and Predict
knn = KNeighborsClassifier(n_neighbors=k, metric='sokalmichener')
knn.fit(x_train, y_train)
predictions = knn.predict(x_test)

#Plot Confusion Matrix
cmx = confusion_matrix(y_test, predictions)
cmxd = ConfusionMatrixDisplay(cmx)
cmxd.plot()
print(knn.score(x_test, y_test))

In [None]:
#Print Metrics
print("Accuracy Score")
print(knn.score(x_test, y_test))
print("F1 Score")
print(f1_score(y_test, predictions, average='weighted'))

In [None]:
#Perform K-Fold Cross Validation
knn_cv = KNeighborsClassifier(n_neighbors=k, metric='sokalmichener')
cv_scores = cross_val_score(knn_cv, df.drop(columns=["Award"]), df["Award"])
print(cv_scores)
print(np.mean(cv_scores))

KNN Russell Rao

In [None]:
from sklearn.model_selection import GridSearchCV
#Find the best n_neighbor via GridSearch Cross Val
n_neighbors = list(range(1,11))
ideal_n = dict(n_neighbors=n_neighbors)
opt_knn = KNeighborsClassifier(metric='russellrao')
find_n = GridSearchCV(opt_knn, ideal_n, cv=10)
model = find_n.fit(df.drop(columns=["Award"]),df["Award"])
print('Ideal N:', model.best_estimator_.get_params()['n_neighbors'])
k = model.best_estimator_.get_params()['n_neighbors']

In [None]:
#Train and Predict
knn = KNeighborsClassifier(n_neighbors=k, metric='russellrao')
knn.fit(x_train, y_train)
predictions = knn.predict(x_test)

#Plot Confusion Matrix
cmx = confusion_matrix(y_test, predictions)
cmxd = ConfusionMatrixDisplay(cmx)
cmxd.plot()

In [None]:
#Print Metrics
print("Accuracy Score")
print(knn.score(x_test, y_test))
print("F1 Score")
print(f1_score(y_test, predictions, average='weighted'))

In [None]:
#Perform K-Fold Cross Validation
knn_cv = KNeighborsClassifier(n_neighbors=k, metric='russellrao')
cv_scores = cross_val_score(knn_cv, df.drop(columns=["Award"]), df["Award"])
print(cv_scores)
print(np.mean(cv_scores))