In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

In [2]:
def splitdf(df):
    y = df[df.columns[-1]].to_numpy()
    x = df[df.columns[:-1]].to_numpy()
    return x,y

In [10]:
def euclid_dist(x,y):
    return np.sqrt(np.sum((np.array(x)-np.array(y))**2))

In [35]:
def KNN(k,traindf,xnew):
    x,y = splitdf(traindf)
    dists = []
    for i in range(len(x)):
        dists.append((y[i],euclid_dist(x[i],xnew)))
    dists.sort(key=lambda x:x[1])
    neighbors=[]
    for i in range(k):
        neighbors.append(dists[i][0])
    return neighbors

In [5]:
def KNN_predict(k,traindf,xtest):
    preds = []
    for xnew in xtest:
        neighbors = KNN(k,traindf,xnew)
        pred = max(neighbors,key=neighbors.count)
        preds.append(pred)
    return preds

In [6]:
spam_train = pd.read_csv("spam_train.csv")
spam_test = pd.read_csv("spam_test.csv")

In [7]:
spam_test = spam_test.iloc[: , 1:]
xtest,ytest = splitdf(spam_test)

In [19]:
k_values = [1, 5, 11, 21, 41, 61, 81, 101, 201, 401]

In [37]:
results = []
for k in k_values:
    ypred = KNN_predict(k,spam_train,xtest)
    results.append((k,accuracy_score(ytest,ypred)))
print(results)

[(1, 0.7522816166883963), (5, 0.7548891786179922), (11, 0.7648848326814428), (21, 0.7466318991742721), (41, 0.7522816166883963), (61, 0.7375054324206867), (81, 0.7266405910473707), (101, 0.7288135593220338), (201, 0.7314211212516297), (401, 0.7196870925684485)]


In [28]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [36]:
def zKNN(k,traindf,xnew):
    x,y = splitdf(traindf)
    x=scaler.fit_transform(x)
    dists = []
    for i in range(len(x)):
        dists.append((y[i],euclid_dist(x[i],xnew)))
    dists.sort(key=lambda x:x[1])
    neighbors=[]
    for i in range(k):
        neighbors.append(dists[i][0])
    return neighbors
def zKNN_predict(k,traindf,xtest):
    preds = []
    for xnew in xtest:
        neighbors = zKNN(k,traindf,xnew)
        pred = max(neighbors,key=neighbors.count)
        preds.append(pred)
    return preds

In [38]:
zxtest = scaler.fit_transform(xtest)
results = []
for k in k_values:
    ypred = zKNN_predict(k,spam_train,zxtest)
    results.append((k,accuracy_score(ytest,ypred)))
print(results)

[(1, 0.8231203824424164), (5, 0.8322468491960018), (11, 0.8748370273794003), (21, 0.8709256844850065), (41, 0.8704910908300739), (61, 0.8700564971751412), (81, 0.8696219035202086), (101, 0.8639721860060843), (201, 0.8461538461538461), (401, 0.8144285093437635)]


In [43]:
for i in range(50):
    predvalues = []
    for k in k_values:
        ypred = zKNN_predict(k,spam_train,zxtest)
        if ypred[i] == 1:
            predvalues.append("spam")
        else:
            predvalues.append("no-spam")
    print("t",i+1," ",predvalues)

t 1   ['spam', 'spam', 'spam', 'spam', 'spam', 'no-spam', 'no-spam', 'no-spam', 'no-spam', 'no-spam']
t 2   ['spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'no-spam', 'no-spam', 'no-spam']
t 3   ['spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam']
t 4   ['spam', 'spam', 'spam', 'spam', 'no-spam', 'no-spam', 'spam', 'spam', 'spam', 'spam']
t 5   ['spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam']
t 6   ['spam', 'spam', 'spam', 'no-spam', 'no-spam', 'spam', 'spam', 'spam', 'spam', 'spam']
t 7   ['spam', 'no-spam', 'no-spam', 'no-spam', 'no-spam', 'no-spam', 'no-spam', 'no-spam', 'no-spam', 'no-spam']
t 8   ['spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam']
t 9   ['spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam']
t 10   ['spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam']
t 11   ['spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 's