In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, name) for name in filenames]
    for path in full_paths:
        print(path)
        

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# EDA


In [None]:
# Load the mushroom dataset from the Kaggle input directory.
csv_path = '/kaggle/input/mushroom-classification/mushrooms.csv'
df = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows (head() defaults to n=5).
df.head()

In [None]:
# Print the frame summary (columns, dtypes, non-null counts).
# NOTE: a later cell looks for the '?' placeholder in 'stalk-root', so the
# non-null counts shown here will not reveal those missing values.
df.info()

In [None]:
# Show how often each category occurs, one column at a time.
for column_name in df.columns:
    category_counts = df[column_name].value_counts()
    print(category_counts)

### stalk-root has 2480 missing values (encoded as `?`)

In [None]:
# Quantify the missing 'stalk-root' values. They are encoded with the '?'
# placeholder, so compare against it directly: unlike value_counts()['?'],
# this yields 0.0 instead of a KeyError when no placeholder is present.
missing_stalk_root = (df['stalk-root'] == '?').mean()
sns.displot(x='stalk-root', data=df)
plt.show()
# Report the share as a rounded percentage with an explicit unit.
print("{:.1f}% of the data for stalk root is missing".format(missing_stalk_root * 100))
print("It seems reasonable to drop the stalk root columns")

In [None]:
# Drop 'stalk-root'. Re-binding the name (instead of mutating in place) and
# ignoring an already-absent column keeps this cell safe to re-run.
df = df.drop(columns=['stalk-root'], errors='ignore')

In [None]:
# Display the remaining column labels to confirm the drop above took effect.
df.columns

In [None]:
# List the distinct categories present in every column.
for column_name in df.columns:
    distinct_values = df[column_name].unique()
    print(column_name, distinct_values)

* Each record seems to contain only categorical data.

### What will we analyse?
1. Count of poisonous and non-poisonous mushrooms
2. Relationship between the poisonous / non-poisonous counts and each categorical feature (hopefully we get some insight out of that)

In [None]:
# Count of poisonous and edible mushrooms: bar chart first, then the raw numbers.
ax = sns.countplot(data=df, x="class")
plt.show()
class_counts = df['class'].value_counts()
print(class_counts)

* The dataset is well balanced

In [None]:
# Relationship between the class counts and each categorical feature:
# one figure per feature, with the bars split by that feature's categories.
for col in df.columns[1:]:
    fig, ax = plt.subplots(figsize=(15, 8))
    sns.countplot(x="class", hue=col, data=df, ax=ax)
    ax.set_title(col)
    ax.legend(bbox_to_anchor=(0.9, 1))
    plt.show()

### The set of redundant features is
1. `veil-type`


In [None]:
# Remove the redundant feature. Re-binding the name (instead of mutating in
# place) and ignoring an already-absent column keeps this cell safe to re-run.
df = df.drop(columns=['veil-type'], errors='ignore')

Before proceeding further, let us shuffle the dataset

In [None]:
# Shuffle the rows with a fixed seed so the positional split and every metric
# computed afterwards are reproducible under "Restart & Run All".
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
#performing one hot encoding
df_x = df.iloc[:,1:]
df_y = df.iloc[:,0]



df_x_orig = df_x
df_y_orig = df_y

def convert_categorical_to_binary(df, columns):
    """Dummy-encode categorical columns into 0/1 indicator columns.

    For each column, the distinct values are taken in order of first
    appearance and one indicator column is produced for every value except
    the last (the last value is the implicit all-zeros baseline). Columns
    with fewer than two distinct values carry no information and are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the categorical data. Rows are read by position, so any
        index (not only the default 0..n-1) is handled correctly.
    columns : iterable
        Labels of the columns of ``df`` to encode, in output order.

    Returns
    -------
    pandas.DataFrame
        Float indicator matrix with one row per row of ``df`` and a default
        0..n-1 index. Column labels restart at 0 for every encoded source
        column, so callers typically relabel them afterwards. An empty
        DataFrame is returned when no column yields an indicator.
    """
    encoded_blocks = []
    for col in columns:
        values = df[col].to_numpy()
        # Every distinct value except the last gets its own indicator column.
        kept_categories = np.asarray(df[col].unique())[:-1]
        if len(kept_categories) == 0:
            continue
        # Broadcast rows against kept categories: (n, 1) == (1, m) -> (n, m).
        indicators = (values[:, None] == kept_categories[None, :]).astype(float)
        encoded_blocks.append(pd.DataFrame(indicators))
    if not encoded_blocks:
        return pd.DataFrame()
    # Single concat at the end instead of growing a frame inside the loop.
    return pd.concat(encoded_blocks, axis=1)

# Encode the features (every column after the first) and the label (first column).
# NOTE(review): the label indicator appears to be 1 for whichever class value
# occurs first in the shuffled frame, so the "positive" class used by
# precision/recall depends on row order -- confirm this is intended.
feature_columns = df.columns[1:]
label_columns = [df.columns[0]]
df_x = convert_categorical_to_binary(df, feature_columns)
df_y = convert_categorical_to_binary(df, label_columns)

# Give the encoded feature columns unique positional labels 0..k-1.
df_x.columns = np.array(list(range(df_x.shape[1])))

There seem to be too many features, so let us apply a dimensionality reduction technique.

# Chi Square method for dimensionality reduction

In [None]:
from sklearn.feature_selection import SelectKBest, chi2
# chi2 returns a pair; only its first element (the per-feature statistic) is kept.
chi2_result = chi2(df_x, df_y)
f_p_values = pd.DataFrame(chi2_result[0])


In [None]:
# Name the single score column and rank features from highest to lowest score.
# The 'F Score' label holds the statistic taken from chi2's first return value.
f_p_values.columns = ['F Score']
f_p_values.sort_values(by='F Score', ascending=False, inplace=True)
f_p_values

In [None]:
# How many features reach the score threshold of 100?
f_p_values.loc[f_p_values['F Score'] >= 100].shape

For now, let us keep all features with a chi-square score (the `F Score` column) of at least 100.

In [None]:
# Labels of the features whose score is at least 100.
meets_threshold = f_p_values['F Score'] >= 100
filtered_index = f_p_values.loc[meets_threshold].index

In [None]:
# Keep only the selected feature columns (all rows).
df_x = df_x.loc[:, filtered_index]

In [None]:
# Positional 70/20/10 split of the already-shuffled rows.
n_rows = df.shape[0]
train_size, test_size = int(.7 * n_rows), int(.2 * n_rows)
# The last slice takes every remaining row, so its size is the remainder;
# int(.1 * n_rows) can under-report it because of truncation.
cv_size = n_rows - train_size - test_size
print(train_size,test_size,cv_size)

test_end = train_size + test_size
x_train, x_test, x_cv = df_x.iloc[:train_size], df_x.iloc[train_size:test_end], df_x.iloc[test_end:]
y_train, y_test, y_cv = df_y.iloc[:train_size], df_y.iloc[train_size:test_end], df_y.iloc[test_end:]

# Model training

In [None]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

# KNN

In [None]:
# Applying k-NN: sweep k from 1 to 99 and record accuracy, precision, recall
# and f1 on both the training slice and the "test" slice for every k.
from sklearn.neighbors import KNeighborsClassifier

# Flatten the single-column label frames once instead of on every call.
y_train_flat = y_train.values.ravel()
y_test_flat = y_test.values.ravel()

neighCount = []
scores_test, scores_train = [], []
precision_test, precision_train = [], []
recall_test, recall_train = [], []
f1_test, f1_train = [], []

# (metric function, test-side list, train-side list)
metric_table = (
    (precision_score, precision_test, precision_train),
    (recall_score, recall_test, recall_train),
    (f1_score, f1_test, f1_train),
)

for nc in range(1, 100):
    neigh = KNeighborsClassifier(n_neighbors=nc)
    neighCount.append(nc)

    neigh.fit(x_train, y_train_flat)

    y_train_pred = neigh.predict(x_train)
    y_test_pred = neigh.predict(x_test)

    for metric, test_values, train_values in metric_table:
        test_values.append(metric(y_test_flat, y_test_pred))
        train_values.append(metric(y_train_flat, y_train_pred))

    scores_test.append(neigh.score(x_test, y_test_flat))
    scores_train.append(neigh.score(x_train, y_train_flat))

In [None]:
# Accuracy on the train and test slices as a function of k.
fig, ax = plt.subplots(figsize=(8, 6), dpi=80)
ax.plot(neighCount, scores_train, c='r', label="Train data")
ax.plot(neighCount, scores_test, c='b', label="Test data")
ax.set_title("Accuracy versus number of neighbours ")
ax.legend()
plt.show()

In [None]:
# Best test accuracy, and the largest k that attains it (last index of the max,
# found by searching the reversed list).
first_best = int(np.argmax(scores_test))
last_best = len(scores_test) - 1 - int(np.argmax(scores_test[::-1]))
print("Max accuracy for test set" , scores_test[first_best])
print("max number of nearest neighbours for best score " , neighCount[last_best])

In [None]:
# f1-score on the train and test slices as a function of k.
fig, ax = plt.subplots(figsize=(8, 6), dpi=80)
ax.plot(neighCount, f1_train, c='r', label="Train data")
ax.plot(neighCount, f1_test, c='b', label="Test data")
ax.set_title("f1-score versus number of neighbours ")
ax.legend()
plt.show()

In [None]:
# Best test f1-score, and the largest k that attains it (last index of the max,
# found by searching the reversed list).
first_best = int(np.argmax(f1_test))
last_best = len(f1_test) - 1 - int(np.argmax(f1_test[::-1]))
print("Max f1-score for test set" , f1_test[first_best])
print("max number of nearest neighbours for best score " , neighCount[last_best])

In [None]:
# Precision on the train and test slices as a function of k.
fig, ax = plt.subplots(figsize=(8, 6), dpi=80)
ax.plot(neighCount, precision_train, c='r', label="Train data")
ax.plot(neighCount, precision_test, c='b', label="Test data")
ax.set_title("precision versus number of neighbours ")
ax.legend()
plt.show()

In [None]:
# Best test precision, and the largest k that attains it (last index of the
# max, found by searching the reversed list).
first_best = int(np.argmax(precision_test))
last_best = len(precision_test) - 1 - int(np.argmax(precision_test[::-1]))
# The label previously said "f1-score" although the value printed is precision.
print("Max precision-score for test set" , precision_test[first_best])
print("max number of nearest neighbours for best score "  , neighCount[last_best])

In [None]:
# Recall on the train and test slices as a function of k.
fig, ax = plt.subplots(figsize=(8, 6), dpi=80)
ax.plot(neighCount, recall_train, c='r', label="Train data")
ax.plot(neighCount, recall_test, c='b', label="Test data")
ax.set_title("recall versus number of neighbours ")
ax.legend()
plt.show()

In [None]:
# Best test recall, and the largest k that attains it (last index of the max,
# found by searching the reversed list).
first_best = int(np.argmax(recall_test))
last_best = len(recall_test) - 1 - int(np.argmax(recall_test[::-1]))
print("Max recall-score for test set" , recall_test[first_best])
print("max number of nearest neighbours for best score" , neighCount[last_best])

* Taking the number of nearest neighbours as 8 seems most apt in this case, since it gives the best f1-score on the train and test sets.

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
# Refit k-NN with the chosen k and report per-class metrics plus the
# confusion matrix on the test slice.
chosen_k = 8
neigh = KNeighborsClassifier(n_neighbors=chosen_k)
neigh.fit(x_train, y_train.values.ravel())
y_test_pred = neigh.predict(x_test)
print(classification_report(y_test, y_test_pred))
cm = confusion_matrix(y_test, y_test_pred)
print(cm)

In [None]:
# Evaluation on the cross validation set: same model configuration, fitted on
# the training slice and scored on the rows held out as x_cv / y_cv.
chosen_k = 8
neigh = KNeighborsClassifier(n_neighbors=chosen_k)
neigh.fit(x_train, y_train.values.ravel())
y_cv_pred = neigh.predict(x_cv)
print(classification_report(y_cv, y_cv_pred))
cm = confusion_matrix(y_cv, y_cv_pred)
print(cm)

### Conclusion - We have successfully created a KNN-based model for mushroom classification
