In [218]:
import pandas as pd

# Path to the mice-protein table, relative to the notebook's working directory.
csv_path = '../data/MiceProtein_2f2c.csv'
df = pd.read_csv(csv_path)
# Preview the first five rows (five is the default for head()).
df.head()

Unnamed: 0,SCI1,NATR2,Response
0,42.207707,5.453817,P
1,38.970469,5.015322,P
2,39.224086,5.078089,P
3,33.271147,5.819807,P
4,31.183694,5.290515,P


In [173]:
## Dataset: expression levels of the SCI1 and NATR2 proteins measured in the cerebrospinal fluid (CSF) of control and trisomic mice.
## The mice were treated with a specific drug intended to stimulate learning.
## The Response variable records each mouse's response to the treatment: P = positive, N = negative.

In [None]:
## Overview scatter plot of the whole dataset, one colour per response class
import matplotlib.pyplot as plt

# NOTE(review): the canvas size (in inches) is taken from the largest SCI1 and
# NATR2 values, so it grows with the data -- confirm this is intended.
fig_width = max(df['SCI1'])
fig_height = max(df['NATR2'])
fig, ax = plt.subplots(figsize=(fig_width, fig_height))
ax.set_xlabel('SCI1', fontsize=40)
ax.set_ylabel('NATR2', fontsize=40)
ax.set_title('Mice protein', fontsize=40)
# Optional: enlarge the tick labels as well.
# plt.setp(ax.get_xticklabels(), fontsize=40)
# plt.setp(ax.get_yticklabels(), fontsize=40)
targets = ['P', 'N']
colors = ['b', 'r']
for target, color in zip(targets, colors):
    # Rows whose Response equals the current class label
    class_rows = df.loc[df['Response'] == target]
    ax.scatter(class_rows['SCI1'], class_rows['NATR2'], c=color, s=400)
# Legend entries are matched to the scatter calls in the order they were drawn.
ax.legend(targets, fontsize=40)
ax.grid()
## A new mouse has expression values 35 (SCI1) and 11.5 (NATR2).
## Would that mouse respond to the drug?
## Uncomment the next line to mark the new mouse on the plot:
## ax.plot(35, 11.5, color='black', marker='*', markersize=40)

In [None]:
## Zoom in on the neighbourhood of the new sample:
## SCI1 restricted to [30, 40] and NATR2 restricted to [7.5, 15]
new_sample = (35, 11.5)  # (SCI1, NATR2) of the new mouse
fig, ax = plt.subplots(figsize=(10, 7.5))
ax.set_xlabel('SCI1', fontsize=15)
ax.set_ylabel('NATR2', fontsize=15)
ax.set_title('Mice protein', fontsize=20)
ax.set_xlim(30, 40)
ax.set_ylim(7.5, 15)
targets = ['P', 'N']
colors = ['b', 'r']
for target, color in zip(targets, colors):
    # Rows whose Response equals the current class label
    class_rows = df.loc[df['Response'] == target]
    ax.scatter(class_rows['SCI1'], class_rows['NATR2'], c=color, s=200)
# The legend is built before the sample markers are drawn, so it lists only P and N.
ax.legend(targets)
ax.grid()
# Black star marks the new sample itself.
ax.plot(*new_sample, color='black', marker='*', markersize=20)
## Hollow circle centred on the new sample - try markersize 80, 120, 220
marker_style = {'color': 'black', 'marker': 'o', 'markersize': 120, 'fillstyle': 'none'}
ax.plot(*new_sample, **marker_style)
## fig.savefig("../results/knn_zoom.pdf")

In [221]:
## k-NN implementation step by step
## Step 1: Euclidean distance from every mouse in df to the new sample
test_x_SCI1, test_x_NATR2 = 35, 11.5
# Per-axis offsets between each row and the new sample
diff_SCI1 = df['SCI1'] - test_x_SCI1
diff_NATR2 = df['NATR2'] - test_x_NATR2
Euc_dist = (diff_SCI1**2 + diff_NATR2**2)**0.5

In [222]:
# Turn the distances into a one-column frame, attach it to the data by row
# index, and order the rows from nearest to farthest.
Euc_df = Euc_dist.to_frame(name='Euc_dist')
df_dist = df.join(Euc_df)
df_sortedDist = df_dist.sort_values(by='Euc_dist')
print(df_sortedDist)

          SCI1      NATR2 Response   Euc_dist
286  35.328458  10.970759        N   0.622881
133  34.410275  11.146482        P   0.687568
57   36.020573  11.687102        P   1.037582
287  34.208460  10.412262        N   1.345255
272  36.369440  12.354002        N   1.613904
..         ...        ...      ...        ...
102  11.362494   5.179193        P  24.468026
189  10.513913   6.811383        N  24.930937
190  10.042702   6.863643        N  25.384297
193   7.663928   6.859300        N  27.727187
192   4.705512   5.837582        N  30.819134

[300 rows x 4 columns]


In [223]:
## Predict with different k values and compare the predictions
from collections import Counter

k = 7
# Response labels of the first k rows of the distance-sorted table
nearest_labels = df_sortedDist['Response'].iloc[:k]
c = Counter(nearest_labels)
print(c)
## Majority vote: the most frequent label among the k neighbours wins
value, count = c.most_common(1)[0]
print(value)
## Class probabilities, giving every neighbour the same weight
pr_P, pr_N = c['P'] / k, c['N'] / k

Counter({'N': 4, 'P': 3})
N


In [229]:
# k-NN with scikit-learn's ready-made classifier
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

# Feature matrix (SCI1, NATR2) and the Response labels as a single-column 2-D array
X = df[['SCI1', 'NATR2']].values
y = df[['Response']].values
print(X.shape)
neigh = KNeighborsClassifier(n_neighbors=7)
# Flatten the label column to 1-D before fitting.
neigh.fit(X, y.ravel())
# Classify the new mouse (SCI1 = 35, NATR2 = 11.5).
print(neigh.predict([[35, 11.5]]))

(300, 2)
['N']
