In [3]:
"""
Problem 4. KNN (20 pts)
1)	Implement a basic KNN model on the yeast dataset. The task is to predict the compartment in a cell 
that a yeast protein will localize to base on the properties of its sequence. Do not use Scikit-learn.
2)	To optimize the results, test with Manhattan and Euclidean distance metrics. 
"""
import numpy as np
from collections import Counter


class KNN:

    def __init__(self, k=3, distance_algo="euclidean"):
        self.distance_algo = distance_algo
        self.k = k

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        predicted_labels = [self._predict(x) for x in X]
        return np.array(predicted_labels)

    def calc_euclidean_dist(self, x1, x2):
        return np.sqrt(np.sum((x1 - x2) ** 2))

    def calc_manhattan_dist(self, x1, x2):
        return np.sum(np.absolute(x1 - x2))

    def _predict(self, x):
        distances_from_all_other_points = []
        if (self.distance_algo == "euclidean"):
            distances_from_all_other_points = [
                self.calc_euclidean_dist(x, x_train) for x_train in self.X_train]
        else:
            distances_from_all_other_points = [
                self.calc_manhattan_dist(x, x_train) for x_train in self.X_train]

        # Taking only the most nearest k points to compare with
        k_indices = np.argsort(distances_from_all_other_points)[:self.k]
        neighbour_labels = [self.y_train[i] for i in k_indices]
        votes = Counter(neighbour_labels).most_common(1)
        return votes[0][0]


In [2]:
from numpy import genfromtxt
import numpy as np
import pandas as pd

total_data = genfromtxt('./yeast.data')
total_data[:, 1:8][1:5]
names = ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'label']
df = pd.read_csv('./yeast.data', header=None,
                 delim_whitespace=True, names=names)
df['class_int'] = pd.Categorical(df['label']).codes
y = np.array(df['class_int'])
X = total_data[:, 1:8]


In [9]:
# Splitting the data set into test and train data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)


In [10]:
classifier = KNN(3)
classifier.fit(X, y)
predictions = classifier.predict(X)
acc = np.sum(predictions == y) / len(y)
print("Accuracy of KNN on yeast dataset with euclidean distance for k = 3 is : ", acc)


Accuracy of KNN on yeast dataset with euclidean distance for k = 3 is :  0.8254716981132075


In [13]:
"""
3)	Report the model's accuracy for both distance metrics with k values from 5 to 20. 
"""
for i in range(5, 21):
    knn_classifier_euclidean = KNN(i, distance_algo="euclidean")
    knn_classifier_euclidean.fit(X_train, y_train)
    predictions_made_euclidean = knn_classifier_euclidean.predict(X_test)
    accuracy_euclidean = np.sum(
        predictions_made_euclidean == y_test) / len(y_test)

    knn_classifier_manhattan = KNN(i, distance_algo="manhattan")
    knn_classifier_manhattan.fit(X_train, y_train)
    predictions_made_manhattan = knn_classifier_manhattan.predict(X_test)
    accuracy_manhattan = np.sum(
        predictions_made_manhattan == y_test) / len(y_test)

    print("Accuracy for Euclidean metrics for k=",
          i, " is : ", accuracy_euclidean)
    print("Accuracy for Manhattan metrics for k=", i,
          " is : ", accuracy_manhattan, end="\n\n")


Accuracy for Euclidean metrics for k= 5  is :  0.5336322869955157
Accuracy for Manhattan metrics for k= 5  is :  0.5358744394618834

Accuracy for Euclidean metrics for k= 6  is :  0.5336322869955157
Accuracy for Manhattan metrics for k= 6  is :  0.5493273542600897

Accuracy for Euclidean metrics for k= 7  is :  0.547085201793722
Accuracy for Manhattan metrics for k= 7  is :  0.5269058295964125

Accuracy for Euclidean metrics for k= 8  is :  0.5582959641255605
Accuracy for Manhattan metrics for k= 8  is :  0.5426008968609866

Accuracy for Euclidean metrics for k= 9  is :  0.5381165919282511
Accuracy for Manhattan metrics for k= 9  is :  0.5493273542600897

Accuracy for Euclidean metrics for k= 10  is :  0.5605381165919282
Accuracy for Manhattan metrics for k= 10  is :  0.5650224215246636

Accuracy for Euclidean metrics for k= 11  is :  0.5426008968609866
Accuracy for Manhattan metrics for k= 11  is :  0.5560538116591929

Accuracy for Euclidean metrics for k= 12  is :  0.547085201793722


In [39]:
"""
Problem 5. Gaussian Process (20 pts)
1)	Construct Scikit-learn Gaussian Process models using basic, RBF, and Matern kernels. 
"""
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.model_selection import cross_val_score
from sklearn.gaussian_process.kernels import Matern
from sklearn.gaussian_process.kernels import RBF

column_names = ['s' + str(x) for x in range(1,33)]

df = pd.read_csv('./wdbc.data', names=column_names)
df_arr = np.asarray(df)
x = pd.get_dummies(df['s2'])
x = x.drop('B', axis=1)
df['s2'] = x
df.head()
X = df.drop('s2',axis=1)
y=df['s2']

In [42]:
X

Unnamed: 0,s1,s3,s4,s5,s6,s7,s8,s9,s10,s11,...,s23,s24,s25,s26,s27,s28,s29,s30,s31,s32
0,842302,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,842517,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,84300903,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,84348301,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,84358402,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,926682,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,926954,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,927241,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [40]:
from sklearn.model_selection import KFold  ## Importing KFold Library
kf = KFold(5)

In [52]:
from sklearn.gaussian_process.kernels import RBF, Matern

In [67]:
# 2)	Implement a 5-fold cross-validation algorithm to report the model and accuracy of each fold.
# 3)	Report the result. 
# RBF Kernel
gau = GaussianProcessClassifier(kernel=RBF())
from sklearn.model_selection import cross_val_score
g_reg = cross_val_score(gau, X,y, scoring='accuracy', cv=kf)
g_reg = abs(g_reg) 
print("Accuracy : ", g_reg) 


Accuracy :  [0.40350877 0.57017544 0.64912281 0.74561404 0.7699115 ]


In [66]:
# Matern Kernel
gau = GaussianProcessClassifier(kernel=Matern())
from sklearn.model_selection import cross_val_score
g_reg = cross_val_score(gau, X,y, scoring='accuracy', cv=kf)
g_reg = abs(g_reg) 
print("Accuracy is : ", g_reg)                      


Accuracy is :  [0.4122807  0.57894737 0.64912281 0.74561404 0.7699115 ]


In [68]:
## Basic kernel
gau = GaussianProcessClassifier() 
from sklearn.model_selection import cross_val_score
g_reg = cross_val_score(gau, X,y, scoring='accuracy', cv=kf)
g_reg = abs(g_reg)                       
print("Accuracy is", g_reg)

Accuracy is [0.40350877 0.57017544 0.64912281 0.74561404 0.7699115 ]
