In [3]:
import pandas as pd 
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np

def cross_validation(datapts, gtsplit, n_folds=10):
    """Split samples and their labels into consecutive folds.

    Both inputs are cut with ``np.array_split``, so the folds keep the
    original row order (no shuffling) and differ in size by at most one
    element when the length is not divisible by ``n_folds``.

    Args:
        datapts: Array-like of samples, one row per sample.
        gtsplit: Array-like of ground-truth labels aligned with ``datapts``.
        n_folds: Number of folds to produce. Defaults to 10, the value that
            was previously hard-coded.

    Returns:
        A tuple ``(data_folds, label_folds)`` of two lists with ``n_folds``
        entries each.
    """
    data_folds = np.array_split(datapts, n_folds)
    label_folds = np.array_split(gtsplit, n_folds)
    return data_folds, label_folds

def inputdata(index, datasplit, gtsplit):
    """Build the train/test partition for one cross-validation round.

    The fold at position ``index`` is held out as the test set; every other
    fold is stacked (samples) or concatenated (labels) into the training set.

    Args:
        index: Position of the fold to hold out.
        datasplit: Sequence of sample folds.
        gtsplit: Sequence of label folds, parallel to ``datasplit``.

    Returns:
        ``(train, trainclasslabel, test, testclasslabel)`` as NumPy arrays.
    """
    training_samples = []
    for fold_no, fold in enumerate(datasplit):
        if fold_no != index:
            training_samples.append(fold)
    train = np.array(np.vstack(training_samples))

    training_labels = []
    for fold_no, fold in enumerate(gtsplit):
        if fold_no != index:
            training_labels.append(fold)
    trainclasslabel = np.array(np.concatenate(training_labels))

    test = np.array(datasplit[index])
    testclasslabel = np.array(gtsplit[index])
    return train, trainclasslabel, test, testclasslabel

#normalization definition
def normalize(df):
    """Min-max scale every column of ``df`` into the range [0, 1].

    Each column is transformed as ``(x - min) / (max - min)``. A constant
    column has ``max == min``; dividing by that zero range would fill the
    column with NaN, so such columns are scaled with a range of 1 and come
    out as all zeros instead.

    Args:
        df: DataFrame of numeric columns.

    Returns:
        A new DataFrame with the same index and columns; ``df`` itself is
        not modified.
    """
    column_min = df.min()
    column_range = df.max() - column_min
    # Replace zero ranges by 1 so constant columns map to 0 rather than NaN.
    safe_range = column_range.where(column_range != 0, 1)
    return (df - column_min) / safe_range

def Accuracy(y_true, y_pred):
    """Print and return the fraction of predictions that equal the truth.

    Positions are taken from ``y_pred``; the number of matches is divided by
    the length of ``y_true``.

    Args:
        y_true: Indexable sequence of ground-truth labels.
        y_pred: Indexable sequence of predicted labels.

    Returns:
        The accuracy as a float.
    """
    positions = range(len(y_pred))
    matches = sum(1 for pos in positions if y_pred[pos] == y_true[pos])
    accuracy = matches / float(len(y_true))
    print("accuray:", accuracy)
    return accuracy

def Recall(y_true, y_pred):
    """Print and return the macro-averaged recall.

    Per-class recall is the diagonal of the confusion matrix divided by the
    row sums (rows index the true class). A class whose row sums to zero
    would give 0/0 = NaN and turn the whole average into NaN, so such a
    class is scored as 0 instead.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.

    Returns:
        The unweighted mean of the per-class recalls.
    """
    cm = ConfusionMatrix(y_true, y_pred)
    true_totals = np.sum(cm, axis=1)
    per_class = np.divide(
        np.diag(cm),
        true_totals,
        out=np.zeros(len(true_totals), dtype=float),
        where=true_totals != 0,
    )
    recall = np.mean(per_class)
    print("recall:", recall)
    return recall

def Precision(y_true, y_pred):
    """Print and return the macro-averaged precision.

    Per-class precision is the diagonal of the confusion matrix divided by
    the column sums (columns index the predicted class). A class that is
    never predicted has a zero column sum; it would give 0/0 = NaN and turn
    the whole average into NaN, so such a class is scored as 0 instead.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.

    Returns:
        The unweighted mean of the per-class precisions.
    """
    cm = ConfusionMatrix(y_true, y_pred)
    predicted_totals = np.sum(cm, axis=0)
    per_class = np.divide(
        np.diag(cm),
        predicted_totals,
        out=np.zeros(len(predicted_totals), dtype=float),
        where=predicted_totals != 0,
    )
    precision = np.mean(per_class)
    print("precision:", precision)
    return precision

def fscore(prec, recall):
    """Print and return the F1 score (harmonic mean of precision and recall).

    Args:
        prec: Precision value.
        recall: Recall value.

    Returns:
        ``2 * prec * recall / (prec + recall)``, or 0.0 when the two values
        sum to zero (the formula would otherwise divide by zero).
    """
    denominator = prec + recall
    if denominator == 0:
        f1score = 0.0
    else:
        f1score = 2 * (prec * recall) / denominator
    print("f1score:", f1score)
    return f1score

def ConfusionMatrix(y_true, y_pred):
    """Count (true label, predicted label) pairs into a square matrix.

    Labels are first mapped to positions 0..n-1 in sorted order, so they do
    not have to be consecutive integers starting at zero. The label set is
    the union of the values in ``y_true`` and ``y_pred``, which means a
    prediction for a class absent from ``y_true`` still gets its own column.

    The earlier implementation anchored its histogram bins at the smallest
    observed ``true * n + pred`` code, which shifted every count whenever the
    top-left cell was empty; counting with ``bincount`` from zero avoids that.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, same length as ``y_true``.

    Returns:
        An ``(n, n)`` integer array where entry ``[i, j]`` is the number of
        samples whose true label is the i-th class and whose predicted label
        is the j-th class.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    labels = np.unique(np.concatenate([y_true, y_pred]))
    n_classes = len(labels)
    # labels is sorted and contains every value, so searchsorted returns the
    # exact position of each label.
    true_pos = np.searchsorted(labels, y_true)
    pred_pos = np.searchsorted(labels, y_pred)
    pair_codes = true_pos * n_classes + pred_pos
    counts = np.bincount(pair_codes, minlength=n_classes * n_classes)
    return counts.reshape(n_classes, n_classes)


def KNN(X_train, X_test, Y_train, k):
    """Classify each test sample by majority vote of its k nearest neighbours.

    Euclidean distances are computed for all test/train pairs at once via
    ``|a|^2 + |b|^2 - 2 a.b``. Floating-point rounding can make that
    expression slightly negative for (near-)identical points, and the square
    root of a negative number is NaN, which ``argsort`` ranks last; the
    squared distances are therefore clamped at zero before the root.

    Args:
        X_train: Training samples, one row per sample.
        X_test: Samples to classify, same number of columns as ``X_train``.
        Y_train: Numeric training labels aligned with ``X_train``.
        k: Number of neighbours that vote; must be at least 1.

    Returns:
        A float array with one predicted label per test sample. When several
        labels tie for the most votes, the smallest label is chosen.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    X_train = np.asarray(X_train, dtype=float)
    X_test = np.asarray(X_test, dtype=float)
    Y_train = np.asarray(Y_train)

    squared = (
        (X_test ** 2).sum(axis=1)[:, np.newaxis]
        + (X_train ** 2).sum(axis=1)
        - 2 * X_test.dot(X_train.T)
    )
    # Clamp tiny negative values caused by cancellation before taking the root.
    dists = np.sqrt(np.maximum(squared, 0.0))

    y_pred = np.zeros(len(X_test))
    for i, row in enumerate(dists):
        nearest = np.argsort(row)[:k]
        # np.unique returns labels sorted ascending, so argmax picks the
        # smallest label among those with the highest vote count.
        labels, votes = np.unique(Y_train[nearest], return_counts=True)
        y_pred[i] = labels[np.argmax(votes)]

    return y_pred

#demo execution
# pf1 = pd.read_csv("/content/project3_dataset3_train.txt", delimiter = "\t", header=None)
# pf2 = pd.read_csv("/content/project3_dataset3_test.txt", delimiter = "\t", header=None)
# x_train = pf1.iloc[:,:-1]
# y_train = pf1.iloc[:,-1]
# x_test = pf2.iloc[:,:-1]
# y_test = pf2.iloc[:,-1]
# x_train = x_train.values
# y_train = y_train.values
# x_test = x_test.values
# y_test = y_test.values
# k = [2,7,8,9]
# for i in k:
#   print("k =",i)
#   y_pred = KNN(x_train,x_test,y_train,int(i))
#   Accuracy(y_test,y_pred)
#   prec = Precision(y_test,y_pred)
#   rec = Recall(y_test,y_pred)
#   fscore(prec,rec)


# Project dataset execution: 10-fold cross-validated k-NN on one dataset file.
pf1 = pd.read_csv("/content/project3_dataset1.txt", delimiter="\t", header=None)  # change the file path for a new file

features = pf1.iloc[:, :-1]     # every column except the last is a feature
train_labels = pf1.iloc[:, -1]  # the last column holds the class label
k = 3                           # change the k value as required

# One-hot encode non-numeric feature columns. Only the feature columns are
# scanned (not the label column), and the column list is snapshotted because
# `features` is rebuilt inside the loop.
for column in list(features.columns):
    if not pd.api.types.is_numeric_dtype(features[column]):
        dummies = pd.get_dummies(features[column], prefix=column)
        features = pd.concat([features.drop(columns=[column]), dummies], axis=1)
features = features.astype(float)

# NOTE(review): scaling uses the min/max of the full dataset before it is
# split into folds, so test-fold values influence the scaling — confirm this
# is intended.
features = normalize(features)

# Plain arrays are passed so the folds are NumPy arrays rather than DataFrames.
datafold, classfold = cross_validation(features.to_numpy(), train_labels.to_numpy())
n_folds = len(datafold)

avg_accuracy = avg_precision = avg_recall = avg_f1score = 0
for index in range(n_folds):
    x_train, y_train, x_test, y_test = inputdata(index, datafold, classfold)
    result = KNN(x_train, x_test, y_train, k)
    knn_confusion_matrix = ConfusionMatrix(y_test, result)
    acc = Accuracy(y_test, result)
    prec = Precision(y_test, result)
    recall = Recall(y_test, result)
    f1score = fscore(prec, recall)
    print("\n")

    avg_accuracy += acc
    avg_precision += prec
    avg_recall += recall
    avg_f1score += f1score

# The accumulators hold sums over all folds; divide by the fold count to get
# the mean (three of these were previously multiplied by 10).
print("average accuracy:", avg_accuracy / n_folds)
print("average pecision:", avg_precision / n_folds)
print("average recall:", avg_recall / n_folds)
print("average f1score:", avg_f1score / n_folds)




accuray: 0.9824561403508771
precision: 0.9857142857142858
recall: 0.9782608695652174
f1score: 0.9819734345351044


accuray: 0.9473684210526315
precision: 0.9316239316239316
recall: 0.9455882352941176
f1score: 0.938554144264183


accuray: 0.9649122807017544
precision: 0.9777777777777779
recall: 0.9285714285714286
f1score: 0.9525395503746878


accuray: 0.9824561403508771
precision: 0.9864864864864865
recall: 0.9761904761904762
f1score: 0.9813114754098361


accuray: 0.9122807017543859
precision: 0.9205882352941177
recall: 0.8864864864864865
f1score: 0.9032155896714028


accuray: 0.9649122807017544
precision: 0.96875
recall: 0.962962962962963
f1score: 0.9658478130617136


accuray: 1.0
precision: 1.0
recall: 1.0
f1score: 1.0


accuray: 0.9824561403508771
precision: 0.9883720930232558
recall: 0.9666666666666667
f1score: 0.9773988897700238


accuray: 0.9473684210526315
precision: 0.9482758620689655
recall: 0.9516129032258065
f1score: 0.9499414519906323


accuray: 0.9642857142857143
precision: