In [259]:
import numpy as np
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler

In [130]:
df = pd.read_csv("Datasets/Social_Network_Ads.csv")
X = df.iloc[:,1:-1]
y = df.iloc[:,-1]
X = pd.get_dummies(X, columns=[X.columns[0]], drop_first=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=2)
scale = StandardScaler()
X_train = scale.fit_transform(X_train)
X_test = scale.fit_transform(X_test)
X_scale = scale.fit_transform(X)

In [131]:
# Code inspired by ikt215 assignment 3
def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((x1 - x2)**2))

def predict(X_train, y_train, x_test, k):
    distances = []
    targets = []

    for i, x_train in enumerate(X_train):
        distance = euclidean_distance(x_test, x_train)
        distances.append([distance, int(y_train.iloc[i])])
    
    distances = sorted(distances)
    
    for i in range(k):
        targets.append(distances[i][1])
    
    return np.argmax(np.bincount(targets))

predictions = []

for x_test in X_test:
    predictions.append(predict(X_train, y_train, x_test, 10))

cm = confusion_matrix(y_test, predictions)
accuracy = (cm[1,1] + cm[0,0]) / np.sum(cm)

print("Confusion Matrix: \n", cm)
print("Accuracy: ", accuracy)

Confusion Matrix: 
 [[45  3]
 [ 5 27]]
Accuracy:  0.9


In [132]:
k_fold_cross = KFold(n_splits=5, shuffle=True, random_state=2)

acc_for_k = []
for k in range(1,16):
    accuracy_list = []
    for i, j in k_fold_cross.split(X_scale):
        Xtrain, Xtest = X_scale[i], X_scale[j]
        ytrain, ytest = y[i], y[j]

        predict(Xtrain, y_train, Xtest, 3)

        predictions = []
        for x_test in Xtest:
            predictions.append(predict(X_train, y_train, x_test, k))

        cm = confusion_matrix(ytest, predictions)
        accuracy_list.append((cm[1,1] + cm[0,0]) / np.sum(cm))

    average_acc = np.mean(accuracy_list)
    acc_for_k.append((average_acc, k))
    print(f"Average accuracy for k = {k}:\t", average_acc)

print('\nBest k:', max(acc_for_k, key=lambda x: x[0])[1])

Average accuracy for k = 1:	 0.975
Average accuracy for k = 2:	 0.9175000000000001
Average accuracy for k = 3:	 0.9225
Average accuracy for k = 4:	 0.9125
Average accuracy for k = 5:	 0.9225
Average accuracy for k = 6:	 0.9125
Average accuracy for k = 7:	 0.9125
Average accuracy for k = 8:	 0.9125
Average accuracy for k = 9:	 0.9125
Average accuracy for k = 10:	 0.9175000000000001
Average accuracy for k = 11:	 0.9174999999999999
Average accuracy for k = 12:	 0.9175000000000001
Average accuracy for k = 13:	 0.9125
Average accuracy for k = 14:	 0.9125
Average accuracy for k = 15:	 0.9099999999999999

Best k: 1


In [239]:
def calculate_pca(X, components):
    X = (X - np.mean(X)) / np.std(X)
    
    covariance_matrix = np.cov(X, rowvar=False)
    eigenvalues, eigenvectors = np.linalg.eig(covariance_matrix)
    sort_by_value = (np.argsort(eigenvalues))[::-1]
    eigenvalues = eigenvalues[sort_by_value]
    eigenvectors = eigenvectors[:, sort_by_value]
    pca = np.dot(X, eigenvectors[:, :components])
    ratio = eigenvalues/sum(eigenvalues)

    return(pca, ratio)

print('Information in each component', calculate_pca(X_scale, 3)[1])

Information in each component [0.40008219 0.31856284 0.28135497]


In [277]:
X_train, X_test, y_train, y_test = train_test_split(calculate_pca(X, 1)[0], y, test_size = 0.2, random_state=0)
predictions = []

start = time.time()
for x_test in X_test:
    predictions.append(predict(X_train, y_train, x_test, 10))

elapsed = time.time() - start

cm = confusion_matrix(y_test, predictions)
accuracy = (cm[1,1] + cm[0,0]) / np.sum(cm)
print("Accuracy for PC1: ", accuracy)
print("Time: ", elapsed)

  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)


Accuracy for PC1:  0.85
Time:  0.3210020065307617


In [278]:
X_train, X_test, y_train, y_test = train_test_split(calculate_pca(X, 2)[0], y, test_size = 0.2, random_state=0)
predictions = []

start = time.time()

for x_test in X_test:
    predictions.append(predict(X_train, y_train, x_test, 10))

elapsed = time.time() - start

cm = confusion_matrix(y_test, predictions)
accuracy = (cm[1,1] + cm[0,0]) / np.sum(cm)
print("Accuracy for PC2: ", accuracy)
print("Time: ", elapsed)

  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)


Accuracy for PC2:  0.9
Time:  0.3269994258880615


In [273]:
X_train, X_test, y_train, y_test = train_test_split(calculate_pca(X, 3)[0], y, test_size = 0.2, random_state=0)
predictions = []

start = time.time()

for x_test in X_test:
    predictions.append(predict(X_train, y_train, x_test, 10))

elapsed = time.time() - start

cm = confusion_matrix(y_test, predictions)
accuracy = (cm[1,1] + cm[0,0]) / np.sum(cm)
print("Accuracy for PC3: ", accuracy)
print("Time: ", elapsed)

  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)


Accuracy for PC3:  0.95
Time:  0.3357357978820801
