Attribute Information:

1. cap-shape: bell=b,conical=c,convex=x,flat=f, knobbed=k,sunken=s
2. cap-surface: fibrous=f,grooves=g,scaly=y,smooth=s
3. cap-color: brown=n,buff=b,cinnamon=c,gray=g,green=r, pink=p,purple=u,red=e,white=w,yellow=y
4. bruises?: bruises=t,no=f
5. odor: almond=a,anise=l,creosote=c,fishy=y,foul=f, musty=m,none=n,pungent=p,spicy=s
6. gill-attachment: attached=a,descending=d,free=f,notched=n
7. gill-spacing: close=c,crowded=w,distant=d
8. gill-size: broad=b,narrow=n
9. gill-color: black=k,brown=n,buff=b,chocolate=h,gray=g, green=r,orange=o,pink=p,purple=u,red=e, white=w,yellow=y
10. stalk-shape: enlarging=e,tapering=t
11. stalk-root: bulbous=b,club=c,cup=u,equal=e, rhizomorphs=z,rooted=r,missing=?
12. stalk-surface-above-ring: fibrous=f,scaly=y,silky=k,smooth=s
13. stalk-surface-below-ring: fibrous=f,scaly=y,silky=k,smooth=s
14. stalk-color-above-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o, pink=p,red=e,white=w,yellow=y
15. stalk-color-below-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o, pink=p,red=e,white=w,yellow=y
16. veil-type: partial=p,universal=u
17. veil-color: brown=n,orange=o,white=w,yellow=y
18. ring-number: none=n,one=o,two=t
19. ring-type: cobwebby=c,evanescent=e,flaring=f,large=l, none=n,pendant=p,sheathing=s,zone=z
20. spore-print-color: black=k,brown=n,buff=b,chocolate=h,green=r, orange=o,purple=u,white=w,yellow=y
21. population: abundant=a,clustered=c,numerous=n, scattered=s,several=v,solitary=y
22. habitat: grasses=g,leaves=l,meadows=m,paths=p, urban=u,waste=w,woods=d

In [None]:
# Libraries used in this notebook
import sys
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Visualization libraries
from matplotlib import pyplot as plt
import seaborn

# For manipulating the data
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder

# Models under the Sklearn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans

# to calculate the performances of the models 
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score # score evaluation
from sklearn.model_selection import cross_val_predict # prediction

In [None]:
# Read the mushroom dataset into a DataFrame.
# The path is relative to the Kaggle working directory; the CSV sits in the
# 'mushroom-classification' input folder.
csv_path = '../input/mushroom-classification/mushrooms.csv'
mush = pd.read_csv(csv_path)
# Print a summary of the loaded frame
mush.info()

In [None]:
# Data type pandas assigned to each column of the raw frame
# (shown as the cell output because it is the last expression).
mush.dtypes

In [None]:
# Quick look at the first rows of the raw data
# (shown as the cell output because it is the last expression).
mush.head()

In [None]:
# Count the NA entries in every column.
# NOTE(review): the attribute list codes a missing stalk-root as '?', which
# presumably is read as an ordinary string and not counted here - verify
# against the CSV.
missing_per_column = mush.isna().sum()
missing_per_column

In [None]:
# Bar chart with the number of samples in each class
class_counts = mush['class'].value_counts()
plt.figure(figsize=(7, 3))
plt.bar(class_counts.index, class_counts.values, color=['orange', 'pink'])
plt.show()

In [None]:
# Work on a copy so the raw frame `mush` stays untouched.
df = mush.copy()
# Every column holds single-letter category codes (see the attribute list),
# i.e. nominal data. LabelEncoder turns them into integer codes; the order of
# those integers carries no meaning for the categories.
# One encoder is kept per column so that the codes can be mapped back to the
# original letters later with encoders[column].inverse_transform(...).
encoders = {}
for features in df.columns:
    # A fresh encoder per column: refitting a single shared instance would
    # discard the mapping learned for every previous column.
    Lb = LabelEncoder()
    df[features] = Lb.fit_transform(df[features])
    encoders[features] = Lb
df.head()

In [None]:
# Class distribution of the hold-out set, i.e. the last 100 rows of df.
# The slice must be open-ended: [-100:-1] would drop the final row and
# plot only 99 samples.
df.iloc[-100:][['class']].value_counts().plot(kind='bar')

In [None]:
# Separate the attribute columns from the 'class' column once, then cut both
# at the same position: the last 100 rows are held out for testing and
# everything before them is used for training.
attribute_frame = df.drop(columns=['class'])
class_column = df['class']

# Training variables and training classes
X = attribute_frame.iloc[:-100]
Y = class_column.iloc[:-100]

# Testing variables and testing classes (100 samples)
x = attribute_frame.iloc[-100:]
y = class_column.iloc[-100:]

In [None]:
# The task asks for 2-fold cross validation
kfold = KFold(n_splits=2, shuffle=True, random_state=42)

# Mean cross-validated accuracy for every n_neighbors from 2 to 10
neighbor_counts = range(2, 11)
train_scores = [
    cross_val_score(
        KNeighborsClassifier(n_neighbors=n_neighbors),
        X,
        Y,
        cv=kfold,
        scoring="accuracy",
    ).mean()
    for n_neighbors in neighbor_counts
]

# One row per n_neighbors value
knn_models_performances = pd.DataFrame(
    {'Mean_training_scores': train_scores},
    index=neighbor_counts,
)

In [None]:
# Mean cross-validated accuracy against the number of neighbors.
# The line needs a label: plt.legend() has nothing to show otherwise and
# matplotlib emits a "no artists with labels" warning.
plt.plot(
    knn_models_performances.Mean_training_scores,
    color='orange',
    label='Mean CV accuracy',
)
plt.ylabel('Mean accuracies')
plt.xlabel('n_neighbors')
plt.title('Accuricies with respect to number of neighbors')
plt.legend()
plt.show()

In [None]:
# n_neighbors=2 is the value picked from the cross-validation sweep
knn = KNeighborsClassifier(n_neighbors=2)
training_score = []
testing_score = []

# Fit and evaluate the 2-neighbor KNN on each of the two folds
for train_index, test_index in kfold.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]
    knn.fit(X_train, Y_train)
    # Training accuracy must be measured on the rows the model was fitted on;
    # scoring on the full (X, Y) would mix in the held-out fold.
    training_score.append(knn.score(X_train, Y_train))
    train_predictions = knn.predict(X_test)
    testing_score.append(accuracy_score(Y_test, train_predictions))
# NOTE: after the loop `knn` is the model fitted on the last fold's training
# part only, not on all of X.

In [None]:
# Per-fold accuracies collected in the loop above
print(f"training_score{training_score}")
print(f"testing_score{testing_score}")

In [None]:
# Evaluate the fitted KNN on the 100 hold-out rows and draw the
# confusion matrix as an annotated heatmap
val_predictions = knn.predict(x)
cm = confusion_matrix(y_true=y, y_pred=val_predictions)
seaborn.heatmap(data=cm, annot=True)

In [None]:
# Text report from classification_report for the KNN hold-out predictions
validation_report = classification_report(y, val_predictions)
print(validation_report)

In [None]:
# KMeans is an unsupervised algorithm, so no fold splitting is done here;
# it is fitted on the training attributes only.
# Two clusters are requested because the task has two classes; n_init=100 was
# an arbitrary choice of restarts.
# `n_jobs` is not passed: current scikit-learn releases no longer accept it
# for KMeans and raise a TypeError. random_state matches the one used for
# KFold so the clustering is reproducible.
kmeans = KMeans(n_clusters=2, init='k-means++', n_init=100, random_state=42)
# Fit the clustering
kmeans.fit(X)

# Compare the cluster id of every training row with its true class.
# NOTE(review): cluster ids are assigned arbitrarily, so cluster 0 does not
# necessarily correspond to class 0 - read the matrix with that in mind.
cm_1 = confusion_matrix(Y, kmeans.labels_)

# Print and plot the matrix for easier reading
print(cm_1)
plt.figure(figsize=(7, 7))
seaborn.heatmap(cm_1, annot=True, cmap='Blues')
plt.show()

In [None]:
# Text report comparing the training classes with the KMeans cluster ids.
# NOTE(review): the cluster ids are used as-is as predicted classes.
training_report = classification_report(Y, kmeans.labels_)
print(training_report)

In [None]:
# Assign each hold-out row to a cluster, then print and plot the
# confusion matrix against the true classes
val_preds = kmeans.predict(x)
cm_2 = confusion_matrix(y_true=y, y_pred=val_preds)
print(cm_2)

seaborn.heatmap(data=cm_2, annot=True)

In [None]:
# Text report for the KMeans hold-out assignments
kmeans_validation_report = classification_report(y, val_preds)
print(kmeans_validation_report)