# Goal 

Trying to understand k-modes clustering. 

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from kmodes.kmodes import KModes

### Categorical data 
Kmodes library takes care of the encoding decoding of categorical variables

In [2]:
x = ["Dog", "Blue", "Female", "Sad"]
y = ["Cat", "Yellow", "Male", "Happy"]
z = ["Sheep", "Yellow", "Male", "Happy"]
a = ["Sheep", "Yellow", "Female", "Happy"]

df2 = pd.DataFrame([x,y,z,a], columns= ["Pet", "Sky", "Gender", "Feeling"])

In [3]:
df2

Unnamed: 0,Pet,Sky,Gender,Feeling
0,Dog,Blue,Female,Sad
1,Cat,Yellow,Male,Happy
2,Sheep,Yellow,Male,Happy
3,Sheep,Yellow,Female,Happy


In [4]:
km_2 = KModes(n_clusters=2, init="Huang")
km_2.fit_predict(df2)

array([1, 0, 0, 0], dtype=uint16)

In [5]:
km_2.cluster_centroids_

array([['Sheep', 'Yellow', 'Male', 'Happy'],
       ['Dog', 'Blue', 'Female', 'Sad']], dtype='<U6')

In [6]:
km_2.labels_

array([1, 0, 0, 0], dtype=uint16)

In [8]:
df2["cluster_id"] = km_2.labels_

In [9]:
df2

Unnamed: 0,Pet,Sky,Gender,Feeling,cluster_id
0,Dog,Blue,Female,Sad,1
1,Cat,Yellow,Male,Happy,0
2,Sheep,Yellow,Male,Happy,0
3,Sheep,Yellow,Female,Happy,0


<br/>
<br/>


### Example 2

In [10]:
x = [0,1,0]
y = [0,1,1]
z = [1,0,1]
a = [1,0,1]
b = [1,0,0]

df = pd.DataFrame([x,y,z, a, b], columns= ["Pet", "Sky", "Gender"])

In [11]:
df

Unnamed: 0,Pet,Sky,Gender
0,0,1,0
1,0,1,1
2,1,0,1
3,1,0,1
4,1,0,0


In [12]:
km = KModes(n_clusters=2, init='Huang')

In [13]:
result = km.fit_predict(df)

In [14]:
km.cluster_centroids_

array([[1, 0, 1],
       [0, 1, 0]])

In [15]:
km.labels_

array([1, 1, 0, 0, 0], dtype=uint16)

<br/>
<br/>


### Lets try categorical and continuous data together 

In [17]:
iris_df = pd.read_csv("../input_data/iris.csv")
iris_df.head()

Unnamed: 0,Id,SepalLength,SepalWidth,PetalLength,PetalWidth,Species
0,1,5.1,3.5,1.4,0.2,setosa
1,2,4.9,3.0,1.4,0.2,setosa
2,3,4.7,3.2,1.3,0.2,setosa
3,4,4.6,3.1,1.5,0.2,setosa
4,5,5.0,3.6,1.4,0.2,setosa


In [20]:
from kmodes.kprototypes import KPrototypes

In [21]:
kP = KPrototypes(n_clusters=3, init='Huang', n_init=1, verbose=True)

In [27]:
kP.fit_predict(iris_df, categorical=[5])

Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 1, iteration: 1/100, moves: 31, ncost: 32283.194501888276
Run: 1, iteration: 2/100, moves: 9, ncost: 31425.200637564303
Run: 1, iteration: 3/100, moves: 3, ncost: 31326.797400000007
Run: 1, iteration: 4/100, moves: 0, ncost: 31326.797400000007


array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=uint16)

In [28]:
kP.cluster_centroids_

[array([[125.5  ,   6.588,   2.974,   5.552,   2.026],
        [ 25.5  ,   5.006,   3.428,   1.462,   0.246],
        [ 75.5  ,   5.936,   2.77 ,   4.26 ,   1.326]]), array([['virginica'],
        ['setosa'],
        ['versicolor']], dtype='<U10')]

In [30]:
iris_df["cluster_id"] = kP.labels_

In [34]:
iris_df[iris_df.Species == 'versicolor']

Unnamed: 0,Id,SepalLength,SepalWidth,PetalLength,PetalWidth,Species,cluster_id
50,51,7.0,3.2,4.7,1.4,versicolor,2
51,52,6.4,3.2,4.5,1.5,versicolor,2
52,53,6.9,3.1,4.9,1.5,versicolor,2
53,54,5.5,2.3,4.0,1.3,versicolor,2
54,55,6.5,2.8,4.6,1.5,versicolor,2
55,56,5.7,2.8,4.5,1.3,versicolor,2
56,57,6.3,3.3,4.7,1.6,versicolor,2
57,58,4.9,2.4,3.3,1.0,versicolor,2
58,59,6.6,2.9,4.6,1.3,versicolor,2
59,60,5.2,2.7,3.9,1.4,versicolor,2


Works! 