In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from matplotlib import pyplot as plt
from sklearn.neighbors import KNeighborsClassifier

In [16]:
# Load the labelled training data.
df = pd.read_csv("Dataset2.csv")
column_names = df.columns

# filling missing data
# The file marks a missing value with '?'. Turn those into real NaNs first.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
df = df.replace('?', np.nan)

# Impute each column with its most frequent value (value_counts() is sorted
# by descending frequency and ignores NaN, so index[0] is the mode).
df = df.apply(lambda col: col.fillna(col.value_counts().index[0]))

In [17]:
# Partition the frame by dtype: 64-bit integer columns vs. object columns.
num_data, cat_data = (
    df.select_dtypes(include=[dtype_name]) for dtype_name in ('int64', 'object')
)
num_columns, cat_columns = num_data.columns, cat_data.columns

# Report which columns ended up in each group.
print("Numerical Vaiables: ", num_columns)
print("Categorical Vaiables: ", cat_columns)

Numerical Vaiables:  Index([], dtype='object')
Categorical Vaiables:  Index(['poisonous', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')


In [18]:
# Show the frequency of every category, one column at a time.
for column_name, column_values in df.items():
    header = "---- %s ---" % column_name
    print(header)
    print(column_values.value_counts())

---- poisonous ---
e    3383
p    3116
Name: poisonous, dtype: int64
---- cap-shape ---
x    2894
f    2542
k     671
b     362
s      26
c       4
Name: cap-shape, dtype: int64
---- cap-surface ---
y    2605
s    2026
f    1865
g       3
Name: cap-surface, dtype: int64
---- cap-color ---
n    1845
g    1487
e    1175
y     853
w     827
b     129
p     121
c      37
u      14
r      11
Name: cap-color, dtype: int64
---- bruises ---
f    3777
t    2722
Name: bruises, dtype: int64
---- odor ---
n    2828
f    1728
y     461
s     461
l     326
a     321
p     199
c     147
m      28
Name: odor, dtype: int64
---- gill-attachment ---
f    6330
a     169
Name: gill-attachment, dtype: int64
---- gill-spacing ---
c    5450
w    1049
Name: gill-spacing, dtype: int64
---- gill-size ---
b    4508
n    1991
Name: gill-size, dtype: int64
---- gill-color ---
b    1372
p    1198
w     979
n     863
h     590
g     574
u     394
k     310
e      72
y      72
o      56
r      19
Name: gill-color, dty

In [19]:
def mapping(df, flag):
    """Encode the categorical mushroom columns of ``df`` as integer codes.

    Each single-letter category is replaced by a fixed integer code taken
    from the per-column tables below.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns listed in ``feature_codes`` (and
        the ``'poisonous'`` label column when ``flag == 0``). Columns not
        listed in the tables are left untouched.
    flag : int
        ``0`` to also encode the ``'poisonous'`` label (``'p'`` -> 0,
        ``'e'`` -> 1); any other value leaves that column alone, which is
        what unlabeled data needs.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place, with the encoded columns
        converted to an integer dtype.

    Raises
    ------
    KeyError
        If an expected column is missing from ``df``.
    ValueError
        If a column contains a value (including NaN) that has no code. The
        message names the column and the offending values. Validation runs
        for every column before anything is written, so ``df`` is never left
        half-encoded.
    """
    target_codes = {'p': 0, 'e': 1}
    feature_codes = {
        'cap-shape': {'x': 0, 'f': 1, 'k': 2, 'b': 3, 's': 4, 'c': 5},
        'cap-surface': {'y': 0, 's': 1, 'f': 2, 'g': 3},
        'cap-color': {'n': 0, 'g': 1, 'e': 2, 'y': 3, 'w': 4, 'b': 5, 'p': 6, 'c': 7, 'u': 8, 'r': 9},
        'bruises': {'f': 0, 't': 1},
        'odor': {'n': 0, 'f': 1, 's': 2, 'y': 3, 'l': 4, 'a': 5, 'p': 6, 'c': 7, 'm': 8},
        'gill-attachment': {'f': 0, 'a': 1},
        'gill-spacing': {'c': 0, 'w': 1},
        'gill-size': {'b': 0, 'n': 1},
        'gill-color': {'b': 0, 'p': 1, 'w': 2, 'n': 3, 'h': 4, 'g': 5, 'u': 6, 'k': 7, 'y': 8, 'e': 9, 'o': 10, 'r': 11},
        'stalk-shape': {'t': 0, 'e': 1},
        'stalk-root': {'b': 0, 'e': 1, 'c': 2, 'r': 3},
        'stalk-surface-above-ring': {'s': 0, 'k': 1, 'f': 2, 'y': 3},
        'stalk-surface-below-ring': {'s': 0, 'k': 1, 'f': 2, 'y': 3},
        'stalk-color-above-ring': {'w': 0, 'p': 1, 'g': 2, 'n': 3, 'b': 4, 'o': 5, 'y': 6, 'c': 7, 'e': 8},
        'stalk-color-below-ring': {'w': 0, 'p': 1, 'g': 2, 'n': 3, 'b': 4, 'o': 5, 'y': 6, 'c': 7, 'e': 8},
        'veil-type': {'p': 0},
        'veil-color': {'w': 0, 'n': 1, 'o': 2, 'y': 3},
        'ring-number': {'o': 0, 'n': 1, 't': 2},
        'ring-type': {'p': 0, 'e': 1, 'l': 2, 'f': 3, 'n': 4},
        'spore-print-color': {'w': 0, 'n': 1, 'k': 2, 'h': 3, 'r': 4, 'b': 5, 'u': 6, 'y': 7, 'o': 8},
        'population': {'v': 0, 'y': 1, 's': 2, 'a': 3, 'n': 4, 'c': 5},
        'habitat': {'d': 0, 'p': 1, 'g': 2, 'l': 3, 'u': 4, 'm': 5, 'w': 6},
    }

    # The label column is only encoded when the caller says it is present.
    plan = dict(feature_codes)
    if flag == 0:
        plan = {'poisonous': target_codes, **feature_codes}

    # Phase 1: encode into a scratch dict and validate. Nothing is written to
    # `df` yet, so a failure cannot leave the caller's frame half-converted.
    encoded = {}
    for column, codes in plan.items():
        original = df[column]
        converted = original.map(codes)
        unmapped = converted.isna()  # no code is NaN, so NaN means "no entry"
        if unmapped.any():
            offenders = sorted(repr(value) for value in original[unmapped].unique())
            raise ValueError(
                "column %r contains values with no integer code: %s"
                % (column, ", ".join(offenders))
            )
        encoded[column] = converted.astype(int)

    # Phase 2: every column validated; commit the results in place.
    for column, converted in encoded.items():
        df[column] = converted
    return df
    

In [20]:
# Encode the training frame; flag=0 also converts the 'poisonous' label.
df = mapping(df, flag=0)

In [21]:
# Features are every column except the class label.
X = df.drop('poisonous', axis=1)
y = df['poisonous']
# Splitting to training and testing (80% / 20%).
# A fixed random_state makes the shuffle, and therefore every accuracy
# reported below, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [22]:
# k-nearest-neighbours with k=3: fit on the training split (fit() returns
# the estimator itself), then score on the held-out split.
neigh_3 = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
y_pred_3 = neigh_3.predict(X_test)
print("accurcy for 3 is ", 100 * accuracy_score(y_test, y_pred_3))


accurcy for 3 is  100.0


In [23]:
# k-nearest-neighbours with k=5: fit on the training split (fit() returns
# the estimator itself), then score on the held-out split.
neigh_5 = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred_5 = neigh_5.predict(X_test)
print("accurcy for 5 is ", 100 * accuracy_score(y_test, y_pred_5))


accurcy for 5 is  99.92307692307692


In [24]:
# k-nearest-neighbours with k=10: fit on the training split (fit() returns
# the estimator itself), then score on the held-out split.
neigh_10 = KNeighborsClassifier(n_neighbors=10).fit(X_train, y_train)
y_pred_10 = neigh_10.predict(X_test)
print("accurcy for 10 is ", 100 * accuracy_score(y_test, y_pred_10))


accurcy for 10 is  99.6923076923077


In [25]:
# Load the unlabeled data that the fitted models will be applied to.
df_test = pd.read_csv("Dataset2_Unknown.csv")
column_names = df_test.columns

# filling missing data
# The file marks a missing value with '?'. Turn those into real NaNs first.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
df_test = df_test.replace('?', np.nan)

# Impute each column with its most frequent value (value_counts() is sorted
# by descending frequency and ignores NaN, so index[0] is the mode).
# NOTE(review): the mode is taken from this file, not from the training
# data - confirm that this is the intended imputation.
df_test = df_test.apply(lambda col: col.fillna(col.value_counts().index[0]))

In [26]:
# Encode the unlabeled frame; flag=1 skips the 'poisonous' label column.
df_test = mapping(df_test, flag=1)

In [27]:
# Run each fitted classifier over the encoded unlabeled frame.
y_pred_test_3, y_pred_test_5, y_pred_test_10 = (
    fitted_model.predict(df_test)
    for fitted_model in (neigh_3, neigh_5, neigh_10)
)

In [30]:
# Dump the raw numeric predictions, one file per value of k.
for raw_path, raw_codes in (
    ("k_3.csv", y_pred_test_3),
    ("k_5.csv", y_pred_test_5),
    ("k_10.csv", y_pred_test_10),
):
    np.savetxt(raw_path, raw_codes, delimiter=",")

# Translate the numeric codes back to the dataset's letters:
# 0 -> 'p', anything else -> 'e'.
y_predict_3 = ['p' if code == 0 else 'e' for code in y_pred_test_3]
y_predict_5 = ['p' if code == 0 else 'e' for code in y_pred_test_5]
y_predict_10 = ['p' if code == 0 else 'e' for code in y_pred_test_10]

# Wrap each list in a single-column frame and write it out as CSV.
df_k3 = pd.DataFrame(y_predict_3)
df_k5 = pd.DataFrame(y_predict_5)
df_k10 = pd.DataFrame(y_predict_10)

df_k3.to_csv('k3_predict.csv')
df_k5.to_csv('k5_predicte.csv')
df_k10.to_csv('k10_predicte.csv')



