In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the raw mushroom table from the Kaggle input directory and preview it.
csv_path = '../input/mushroom-classification/mushrooms.csv'
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Summarise the frame (column names, dtypes, non-null counts) before cleaning.
data.info()

In [None]:
# Print the frequency of every category, column by column, with a rule
# between columns so the output is easy to scan.
separator = '-' * 10
for column_name in data.columns:
    print(column_name)
    print(data[column_name].value_counts())
    print(separator)

#### Since there is only one unique value for veil-type, we can drop it.
#### Since a lot of values of stalk root are missing, we will drop it.

In [None]:
# Remove the two columns judged uninformative in the note above.
data = data.drop(columns=['veil-type', 'stalk-root'])

In [None]:
import matplotlib

# One count plot per feature, split by class, laid out on a 3-column grid.
# The number of rows is derived from the number of features so the grid can
# never overflow (a fixed 10x3 grid raises once there are more than 30
# features). It never drops below the original 10 rows, so the layout and
# figure size are unchanged for this dataset.
feature_columns = [col for col in data.columns if col != 'class']
GRID_COLUMNS = 3
grid_rows = max(10, -(-len(feature_columns) // GRID_COLUMNS))  # ceil division

matplotlib.rcParams.update({'font.size': 22})  # larger text for the big canvas
plt.figure(figsize=(40, 8 * grid_rows))
for position, col in enumerate(feature_columns, start=1):
    ax = plt.subplot(grid_rows, GRID_COLUMNS, position)
    sns.countplot(x='class', hue=col, data=data, ax=ax)

# Smaller text again for the plots that follow.
matplotlib.rcParams.update({'font.size': 12})

### Conclusions that can be drawn :
* A mushroom without bruises is more likely to be poisonous than a mushroom with bruises
* A mushroom with no odor is likely to be edible, whereas a mushroom with a fishy odor is likely to be poisonous
* A mushroom with large rings is likely to be poisonous whereas a mushroom with pendant ring is likely to be edible
* A mushroom with population of type several is likely to be poisonous
* A mushroom whose habitat is paths is likely to be poisonous
* A mushroom with a chocolate or white spore print color is likely to be poisonous, whereas one with a black or brown spore print color is likely to be edible

### Separating target and data

In [None]:
# Y keeps the target as a single-column frame; X holds every other column.
Y = data[['class']]
X = data.drop(columns=['class'])

### Encoding target 

In [None]:
# Encode the target as 1 = edible ('e'), 0 = everything else (poisonous).
# Building a new frame with assign() avoids writing into a frame that was
# sliced out of `data`. The dtype guard makes the cell safe to re-run:
# without it, a second run would compare integers with 'e' and turn every
# label into 0.
if not pd.api.types.is_integer_dtype(Y['class']):
    Y = Y.assign(**{'class': (Y['class'] == 'e').astype('int8')})
Y.head()

In [None]:
# Preview the feature frame before it is label-encoded.
X.head()

### Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit a separate encoder for each column and keep every one of them.
# Re-fitting a single shared encoder would retain only the last column's
# mapping; with one encoder per column the integer codes can be translated
# back with label_encoders[col].inverse_transform(...).
label_encoders = {}

for col in X.columns:
    label_encoder = LabelEncoder()
    X[col] = label_encoder.fit_transform(X[col])
    label_encoders[col] = label_encoder

In [None]:
# Preview the feature frame after label encoding.
X.head()

### One-hot encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder

n, m = X.shape

# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# dropped in later releases; try the new name first and fall back to the old
# one so the cell runs on either side of the rename.
try:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# cols maps each original feature name to the half-open range [start, stop)
# of column positions it occupies in X_encoded.
cols = {}
t = 0

# Start from a zero-width array so stacking still works if X has no columns.
encoded_parts = [np.empty((n, 0))]

for col in X.columns:
    # Drop the last indicator column of each feature: it is implied by the
    # remaining ones.
    part = OH_encoder.fit_transform(X[[col]])[:, :-1]
    width = part.shape[1]
    cols[col] = (t, t + width)
    t += width
    encoded_parts.append(part)

# Stack once at the end instead of rebuilding the matrix on every iteration.
X_encoded = pd.DataFrame(np.hstack(encoded_parts))

In [None]:
# Preview the one-hot encoded matrix (columns are positional integers).
X_encoded.head()

In [None]:
# Build the modelling frame: one-hot features plus the encoded target.
# pd.DataFrame(X_encoded) does not copy its input, so an explicit copy is
# taken to make sure adding 'class' here can never show up in X_encoded.
data = X_encoded.copy()
data['class'] = Y['class']
data.head()

In [None]:
# Single-row heatmap: correlation of every column with the target.
class_row = data.corr().loc[['class'], :]
plt.figure(figsize=(30, 2))
sns.heatmap(class_row, cmap='coolwarm')

In [None]:
# Keep the one-hot columns whose absolute correlation with the target is
# above the threshold. The correlation matrix is computed once and reused
# instead of being recomputed for the loop header.
CORR_THRESHOLD = 0.3
corr = data.corr().loc[['class'], :]
class_corr = corr.iloc[0].drop('class')
idx = class_corr[class_corr.abs() > CORR_THRESHOLD].index.tolist()
len(idx)

In [None]:
# Restrict the feature matrix to the columns listed in idx.
X_data = X_encoded[idx]

In [None]:
# Translate each selected one-hot column position back to the original
# feature whose [start, stop) range contains it.
high_corr_features = {
    feature
    for position in idx
    for feature, (start, stop) in cols.items()
    if start <= position < stop
}

In [None]:
# Show the names of the original features that own a selected column.
print(high_corr_features)

### We conclude that the following 11 features out of the 22 features are the most indicative of a poisonous mushroom
1. stalk-surface-below-ring (silky)
2. odor (foul)
3. ring-type (large)
4. gill-size (broad)
5. habitat (path)
6. spore-print-color (chocolate, white)
7. bruises (no)
8. population (several)
9. gill-spacing (close)
10. gill-color (buff)
11. stalk-surface-above-ring (silky)

In [None]:
# Preview the reduced feature matrix that is fed to the model.
X_data.head()

### Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

# A fixed seed makes the split, and every accuracy computed from it,
# reproducible across runs. Stratifying on the target keeps the class ratio
# the same in the train and test parts.
x_train, x_test, y_train, y_test = train_test_split(
    X_data, Y, test_size=0.2, random_state=42, stratify=Y
)

In [None]:
# Sanity check: shapes of the train and test features and targets.
print(x_train.shape,y_train.shape, x_test.shape, y_test.shape)

### Training and Testing the Model

In [None]:
from sklearn.neighbors import KNeighborsClassifier

test_accuracy, train_accuracy = [], []

# scikit-learn expects a 1-D target. Flattening works whether the targets are
# single-column frames or Series, and avoids the column-vector conversion
# warning that the global warnings filter would otherwise hide.
y_train_flat = np.ravel(y_train)
y_test_flat = np.ravel(y_test)

# Record train and test accuracy for k = 1 .. 19.
for i in range(1, 20):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(x_train, y_train_flat)
    train_accuracy.append(knn.score(x_train, y_train_flat))
    test_accuracy.append(knn.score(x_test, y_test_flat))

In [None]:
# The accuracy lists start at k = 1, so plot them against the real k values.
# Plotting the bare lists would use 0-based positions and shift the curve one
# step to the left of the axis labelled 'k'.
k_values = range(1, len(test_accuracy) + 1)
plt.plot(k_values, test_accuracy)
plt.plot(k_values, train_accuracy)
plt.legend(['test', 'train'])
plt.ylabel('accuracy')
plt.xlabel('k')
plt.show()

### Taking the value of k to be **_5_**

In [None]:
# Fit the final model with the chosen k and predict the held-out rows.
# The target is flattened to 1-D, which is the shape scikit-learn expects.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(x_train, np.ravel(y_train))

Y_predicted = knn.predict(x_test)

In [None]:
# Map numeric predictions back to the dataset's letters. The target was
# encoded as 1 = edible ('e') and 0 = poisonous ('p'), so 1 must become 'e';
# the reverse mapping would flip every label. Values that are already letters
# pass through unchanged, so re-running the cell is harmless.
class_letters = {1: 'e', 0: 'p'}
Y_predicted = [class_letters.get(pred, pred) for pred in Y_predicted]

In [None]:
# Wrap the predicted labels in a one-column frame and preview it.
prediction = pd.DataFrame({'class': Y_predicted})
prediction.head()