In [None]:
#install kmodes library
!pip install kmodes

In [None]:
#update sklearn
!pip install scikit-learn==0.24.2

## Data Description

* cap-shape: bell=b,conical=c,convex=x,flat=f, knobbed=k,sunken=s
* cap-surface: fibrous=f,grooves=g,scaly=y,smooth=s
* cap-color: brown=n,buff=b,cinnamon=c,gray=g,green=r,pink=p,purple=u,red=e,white=w,yellow=y
* bruises: bruises=t,no=f
* odor: almond=a,anise=l,creosote=c,fishy=y,foul=f,musty=m,none=n,pungent=p,spicy=s
* gill-attachment: attached=a,descending=d,free=f,notched=n
* gill-spacing: close=c,crowded=w,distant=d
* gill-size: broad=b,narrow=n
* gill-color: black=k,brown=n,buff=b,chocolate=h,gray=g, green=r,orange=o,pink=p,purple=u,red=e,white=w,yellow=y
* stalk-shape: enlarging=e,tapering=t
* stalk-root: bulbous=b,club=c,cup=u,equal=e,rhizomorphs=z,rooted=r,missing=?
* stalk-surface-above-ring: fibrous=f,scaly=y,silky=k,smooth=s
* stalk-surface-below-ring: fibrous=f,scaly=y,silky=k,smooth=s
* stalk-color-above-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y
* stalk-color-below-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y
* veil-type: partial=p,universal=u
* veil-color: brown=n,orange=o,white=w,yellow=y
* ring-number: none=n,one=o,two=t
* ring-type: cobwebby=c,evanescent=e,flaring=f,large=l,none=n,pendant=p,sheathing=s,zone=z
* spore-print-color: black=k,brown=n,buff=b,chocolate=h,green=r,orange=o,purple=u,white=w,yellow=y
* population: abundant=a,clustered=c,numerous=n,scattered=s,several=v,solitary=y
* habitat: grasses=g,leaves=l,meadows=m,paths=p,urban=u,waste=w,woods=d


## Import Libraries

In [None]:
#import EDA tools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#import modeling tools and metrics
from kmodes.kprototypes import KPrototypes
from kmodes.kmodes import KModes
from sklearn.metrics.cluster import rand_score

#ignore warnings
import warnings
warnings.filterwarnings('ignore')

## Add Dataset

In [None]:
#load the mushroom dataset and preview the first rows
csv_path = "../input/mushroom-classification/mushrooms.csv"
df = pd.read_csv(csv_path)
df.head()

## EDA

In [None]:
#find the missing values
#isna() only counts real NaNs; the data description lists "?" as the
#"missing" code for stalk-root, so count that placeholder as well
missing_summary = pd.DataFrame({
    "NaN": df.isna().sum(),
    "'?' placeholder": df.eq("?").sum(),
})
missing_summary

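#### No NaN values. Note, however, that `stalk-root` encodes missing entries as `?` (see Data Description), which `isna()` does not count.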

In [None]:
#work on an independent (deep) copy so the raw dataframe stays untouched
df1 = df.copy(deep=True)

In [None]:
#inspect column dtypes and non-null counts of the working copy
df1.info()

In [None]:
#Cast every string-typed column to an ordered pandas category
string_columns = [
    name for name, column in df1.items()
    if pd.api.types.is_string_dtype(column)
]
for name in string_columns:
    df1[name] = df1[name].astype("category").cat.as_ordered()

In [None]:
#re-check the dtypes after the category conversion
df1.info()

In [None]:
#Replace every non-numeric column by the integer codes of its categories
non_numeric_columns = [
    name for name, column in df1.items()
    if not pd.api.types.is_numeric_dtype(column)
]
for name in non_numeric_columns:
    df1[name] = pd.Categorical(df1[name]).codes

In [None]:
#preview the encoded dataframe (first rows only, like the raw preview above)
df1.head()

After encoding, class `1` is `p` and class `0` is `e` (category codes are assigned in sorted order).

In [None]:
#separate the target column (y) from the feature columns (X)
target_column = "class"
y = df1[target_column]
X = df1.drop(columns=[target_column])

In [None]:
#Correlation matrix
#plt.subplots returns a (figure, axes) pair: unpack it and draw on the axes explicitly
#NOTE(review): the features are integer codes of nominal categories, so these
#coefficients depend on the code order and should be read with caution
fig, ax = plt.subplots(figsize=(15,8))

sns.heatmap(X.corr(),
            annot=True,
            fmt="0.2f",
            cmap="inferno",
            ax=ax)
ax.set_title("Correlation matrix of the encoded features");

In [None]:
#number of distinct values in each feature column
X.nunique()

In [None]:
#rebuild X from the encoded frame without the target and the unimportant features
drops_list = ["veil-type", "cap-shape", "cap-color", "veil-color", "gill-attachment"]
X = df1.drop(columns=["class"] + drops_list)

## Visualizing

In [None]:
#one bar chart per feature: counts of each feature value split by target class
target = df1["class"]
for feature in X.columns:
    counts = pd.crosstab(df1[feature], target)
    counts.plot(kind="bar", color=["blue", "salmon"], figsize=(4,3))
    plt.title(feature)

## Data Modeling

#### NOTE
* We will use KModes and KPrototypes because they are designed for categorical data, whereas KMeans relies on Euclidean distance and is therefore not suitable for our dataset.

* k-modes is used for clustering categorical variables. It defines clusters based on the number of matching categories between data points. (This is in contrast to the more well-known k-means algorithm, which clusters numerical data based on Euclidean distance.) The k-prototypes algorithm combines k-modes and k-means and is able to cluster mixed numerical / categorical data.

In [None]:
#fit a two-cluster KModes model on the features and keep the cluster label of every row
kmodes = KModes(
    n_clusters=2,
    init='Cao',
    verbose=1,
)
clusters = kmodes.fit_predict(X)
clusters

In [None]:
#Rand index between the true classes and the KModes cluster labels
rand_score(labels_true=y, labels_pred=clusters)

In [None]:
#fit a two-cluster KPrototypes model on the same features
#NOTE(review): categorical=[0] declares only the first column of X as categorical,
#so the remaining label-encoded columns are presumably treated as numeric - confirm
#that this is intended
kproto = KPrototypes(
    n_clusters=2,
    init='Cao',
    verbose=0,
    random_state=42,
    max_iter=20,
    n_init=50,
    n_jobs=-2,
)
clusters1 = kproto.fit_predict(X, categorical=[0])

In [None]:
#Rand index between the true classes and the KPrototypes cluster labels
rand_score(labels_true=y, labels_pred=clusters1)

### The best Rand Index is ≈ 0.818 (81.80% of sample pairs grouped consistently with the true classes), obtained with KModes