# Improving Matches For A Dating Site
### Roman Ahmad Zeia

### Learning Outcomes
### We will be choosing different models (at least 3 models), evaluation methodologies  (e.g., cross-validation), performance metrics, and perform model selection, before evaluating the model on the test data set.

### We will be building 3 different models (Decision Tree, Nearest Neighbour and Support Vector Machine)

In [1]:
# import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time 
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, f1_score, make_scorer



In [2]:
#import data set and prepare data

test_data = pd.read_csv('datingData_test.txt', sep='\t', header=None)
train_data = pd.read_csv('datingData_training.txt',sep = '\t', header=None)

train_data.columns = ['FrequentFlyerMiles','VideoGameTime','IceCreamConsumed','Label']
test_data.columns = ['FrequentFlyerMiles','VideoGameTime','IceCreamConsumed','Label']



x = train_data.drop(['Label'], axis = 1)
y = train_data['Label']

x_test = test_data.drop(['Label'], axis = 1)
y_test = test_data['Label']


# X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=None)

train_data.head()


Unnamed: 0,FrequentFlyerMiles,VideoGameTime,IceCreamConsumed,Label
0,40920,8.326976,0.953952,largeDoses
1,14488,7.153469,1.673904,smallDoses
2,26052,1.441871,0.805124,didntLike
3,75136,13.147394,0.428964,didntLike
4,38344,1.669788,0.134296,didntLike


In [3]:
# Decision Tree Model

DecisionTreeModel = DecisionTreeClassifier(criterion='gini', max_depth=5, random_state=None)

# 5-Fold Cross Validation
cv_scores_accuracy = cross_val_score(DecisionTreeModel, x, y, cv=10, scoring='accuracy')
cv_scores_precision = cross_val_score(DecisionTreeModel, x, y, cv=10, scoring='precision_weighted')
cv_scores_f1score = cross_val_score(DecisionTreeModel, x, y, cv=10, scoring='f1_weighted')

cv_accuracy_mean = np.mean(cv_scores_accuracy)
cv_precision_mean = np.mean(cv_scores_precision)
cv_scores_f1 = np.mean(cv_scores_f1score)

print("Cross Validation Results")
print(f"\n Accuracy: {cv_accuracy_mean}, Precision: {cv_precision_mean}, F1 Score: {cv_scores_f1}")



Cross Validation Results

 Accuracy: 0.9366666666666665, Precision: 0.9435427763502705, F1 Score: 0.9413755325351145


In [4]:
# Nearest Neighbor Model

# We will be using grid search cross validation to perform cross validation and to find the best k value for our Model simulataneously.


KneighboursModel = KNeighborsClassifier()

param_grid = {'n_neighbors': range(1,5)} 

grid_search = GridSearchCV(KneighboursModel, param_grid, cv=10, scoring='accuracy', return_train_score=False)
grid_search.fit(x, y)

best_k = grid_search.best_params_['n_neighbors']
best_score = grid_search.best_score_

print(f"Best K: {best_k}")
print(f"Accuracy: {best_score}")



Best K: 3
Accuracy: 0.7816666666666666


In [5]:
#Support Vector Machine

SVM = SVC(kernel='linear')

svm_cv_scores = cross_val_score(SVM, x, y, cv=10, scoring='accuracy')
svm_cv_avg = np.mean(svm_cv_scores)
print("Cross Validation Results")
print(f"\n Accuracy: {svm_cv_avg}")


Cross Validation Results

 Accuracy: 0.9066666666666666


### After evaluating all three models with cross validation we get the following results

### Decision Tree Accuracy: 93.6%
### Nearest Neighbor Accuracy: 78%
### SVM: 90.6%

### We will be using our Decision Tree model with parameters max depth of 5 and using gini index as it gives us the highest accuracy results

In [14]:
# Evaluating on test data set

DecisionTreeModel.fit(x, y)

prediction = DecisionTreeModel.predict(x_test)

y_actual = test_data['Label']

print("Predictions of model")
print(prediction)

accuracy = accuracy_score(y_actual, prediction)
print(f'\n Accuracy of the model: {accuracy:.2f} %')
prediction.size




Predictions of model
['smallDoses' 'didntLike' 'largeDoses' 'didntLike' 'smallDoses'
 'didntLike' 'largeDoses' 'didntLike' 'largeDoses' 'didntLike' 'didntLike'
 'smallDoses' 'smallDoses' 'smallDoses' 'largeDoses' 'smallDoses'
 'smallDoses' 'didntLike' 'smallDoses' 'largeDoses' 'largeDoses'
 'smallDoses' 'largeDoses' 'smallDoses' 'didntLike' 'smallDoses'
 'largeDoses' 'largeDoses' 'didntLike' 'largeDoses' 'smallDoses'
 'largeDoses' 'smallDoses' 'didntLike' 'smallDoses' 'didntLike'
 'didntLike' 'didntLike' 'smallDoses' 'largeDoses' 'smallDoses'
 'smallDoses' 'didntLike' 'smallDoses' 'smallDoses' 'didntLike'
 'largeDoses' 'didntLike' 'largeDoses' 'largeDoses' 'largeDoses'
 'smallDoses' 'smallDoses' 'largeDoses' 'largeDoses' 'didntLike'
 'smallDoses' 'smallDoses' 'smallDoses' 'largeDoses' 'didntLike'
 'smallDoses' 'didntLike' 'largeDoses' 'didntLike' 'smallDoses'
 'largeDoses' 'didntLike' 'didntLike' 'didntLike' 'smallDoses'
 'smallDoses' 'largeDoses' 'didntLike' 'largeDoses' 'didntLike'
 

400