In [8]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly as py
import plotly.graph_objs as go
from sklearn.cluster import KMeans
import warnings
import os
# Suppress every warning for the rest of the session. Note that this also hides
# deprecation notices emitted by the libraries used below.
warnings.filterwarnings("ignore")
# Initialise plotly's offline notebook mode so the figures created below can be
# rendered inline with py.offline.iplot.
py.offline.init_notebook_mode(connected = True)

# Reading and Preprocessing Dataset

In [42]:
# Load the training data; the path is relative to the notebook's working directory.
df_data=pd.read_csv("Train.csv")
# Bare last expression: display the loaded frame as the cell output.
df_data

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,462809,Male,No,22,No,Healthcare,1.0,Low,4.0,Cat_4,D
1,462643,Female,Yes,38,Yes,Engineer,,Average,3.0,Cat_4,A
2,466315,Female,Yes,67,Yes,Engineer,1.0,Low,1.0,Cat_6,B
3,461735,Male,Yes,67,Yes,Lawyer,0.0,High,2.0,Cat_6,B
4,462669,Female,Yes,40,Yes,Entertainment,,High,6.0,Cat_6,A
...,...,...,...,...,...,...,...,...,...,...,...
8063,464018,Male,No,22,No,,0.0,Low,7.0,Cat_1,D
8064,464685,Male,No,35,No,Executive,3.0,Low,4.0,Cat_4,D
8065,465406,Female,No,33,Yes,Healthcare,1.0,Low,1.0,Cat_6,D
8066,467299,Female,No,27,Yes,Healthcare,1.0,Low,4.0,Cat_6,B


In [43]:
# Rename the opaque 'Var_1' column to 'Category' and discard the 'ID' column,
# then preview the first rows of the result.
df_data = (
    df_data
    .rename(columns={'Var_1': 'Category'})
    .drop(columns=['ID'])
)
df_data.head()

Unnamed: 0,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Category,Segmentation
0,Male,No,22,No,Healthcare,1.0,Low,4.0,Cat_4,D
1,Female,Yes,38,Yes,Engineer,,Average,3.0,Cat_4,A
2,Female,Yes,67,Yes,Engineer,1.0,Low,1.0,Cat_6,B
3,Male,Yes,67,Yes,Lawyer,0.0,High,2.0,Cat_6,B
4,Female,Yes,40,Yes,Entertainment,,High,6.0,Cat_6,A


## Removing NA values

In [44]:
# Number of missing values in each column.
df_data.isna().sum()

Gender               0
Ever_Married       140
Age                  0
Graduated           78
Profession         124
Work_Experience    829
Spending_Score       0
Family_Size        335
Category            76
Segmentation         0
dtype: int64

In [45]:
# Discard every row that has at least one missing value.
df_data = df_data.dropna()
# Show 10 randomly chosen rows; no random_state is passed, so the rows shown
# differ from run to run.
df_data.sample(10)

Unnamed: 0,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Category,Segmentation
7335,Male,Yes,42,No,Entertainment,8.0,Low,2.0,Cat_4,A
7646,Female,No,52,Yes,Artist,1.0,Low,3.0,Cat_6,C
5418,Female,Yes,26,No,Lawyer,0.0,Low,9.0,Cat_7,C
7051,Male,Yes,49,Yes,Artist,1.0,Average,4.0,Cat_2,A
7174,Male,No,48,Yes,Entertainment,0.0,Low,3.0,Cat_2,A
939,Female,No,35,No,Entertainment,4.0,Low,2.0,Cat_6,D
4585,Female,Yes,59,No,Artist,1.0,Low,3.0,Cat_4,C
6071,Male,No,31,Yes,Healthcare,0.0,Low,3.0,Cat_3,D
2767,Female,No,53,Yes,Engineer,1.0,Low,2.0,Cat_6,A
6571,Male,No,25,Yes,Entertainment,1.0,Low,3.0,Cat_6,A


## Encoding categorical columns

In [46]:
def get_encoded_dict(values):
    """Map each item of ``values`` to its position in the iteration order.

    Returns a dict ``{value: index}``. If a value occurs more than once, the
    index of its last occurrence is kept.
    """
    mapping = {}
    for position, value in enumerate(values):
        mapping[value] = position
    return mapping

In [47]:
# Build one value -> integer-code mapping per categorical column. The codes
# follow the order in which values first appear in the data (Series.unique()).
(
    gender_dict,
    married_dict,
    grad_dict,
    profession_dict,
    spend_dict,
    cat_dict,
    segment_dict,
) = (
    get_encoded_dict(df_data[column].unique())
    for column in (
        'Gender',
        'Ever_Married',
        'Graduated',
        'Profession',
        'Spending_Score',
        'Category',
        'Segmentation',
    )
)

In [48]:
# Column name -> {original value: integer code}; this nested layout is the
# form passed to DataFrame.replace for per-column replacement.
encoded_dict = dict(
    Gender=gender_dict,
    Ever_Married=married_dict,
    Graduated=grad_dict,
    Profession=profession_dict,
    Spending_Score=spend_dict,
    Category=cat_dict,
    Segmentation=segment_dict,
)

In [49]:
# Replace the values of each categorical column with the integer codes from
# encoded_dict (a nested {column: {value: code}} mapping).
df_data = df_data.replace(encoded_dict)
# Preview 10 randomly chosen encoded rows (unseeded, so the sample varies per run).
df_data.sample(10)

Unnamed: 0,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Category,Segmentation
152,1,1,47,1,1,0.0,2,2.0,1,2
590,1,0,43,1,3,2.0,0,4.0,1,1
5099,1,1,62,1,6,1.0,0,1.0,1,3
2394,0,1,65,0,6,0.0,2,2.0,1,2
60,0,1,48,1,3,0.0,2,2.0,1,2
5825,0,0,40,1,3,14.0,0,2.0,5,3
2451,0,1,51,1,3,1.0,0,3.0,1,2
4868,0,1,47,1,3,0.0,0,2.0,1,2
5158,1,1,45,1,3,3.0,0,3.0,1,2
2774,0,1,72,1,8,0.0,1,4.0,1,2


# Visualizing data

In [104]:
# Work on the encoded frame under a shorter alias.
df = df_data

# One marker per row, positioned by Age / Profession / Work_Experience and
# coloured by the Segmentation column.
trace1 = go.Scatter3d(
    x=df['Age'],
    y=df['Profession'],
    z=df['Work_Experience'],
    mode='markers',
    marker={
        'color': df['Segmentation'],
        'size': 2,
        'line': {
            'color': df['Segmentation'],
            'width': 12,
        },
        'opacity': 0.8,
    },
)
data = [trace1]

# Title and axis captions matching the three plotted columns.
layout = go.Layout(
    title='Data Visualization',
    scene={
        'xaxis': {'title': 'Age'},
        'yaxis': {'title': 'Profession'},
        'zaxis': {'title': 'Work_Experience'},
    },
)
fig = go.Figure(data=data, layout=layout)
py.offline.iplot(fig)

## Trying to see more features side-by-side

In [52]:
from itertools import combinations

df = df_data

# All 3-column combinations of every column except the last two; only the
# combinations at positions 50..59 of that list are plotted.
selected_triples = list(combinations(df.columns[:-2], 3))[50:60]

for x_col, y_col, z_col in selected_triples:
    # Same marker styling as the overview plot: colour follows Segmentation.
    trace1 = go.Scatter3d(
        x=df[x_col],
        y=df[y_col],
        z=df[z_col],
        mode='markers',
        marker={
            'color': df['Segmentation'],
            'size': 2,
            'line': {
                'color': df['Segmentation'],
                'width': 12,
            },
            'opacity': 0.8,
        },
    )
    data = [trace1]
    layout = go.Layout(
        title='Clusters',
        scene={
            'xaxis': {'title': x_col},
            'yaxis': {'title': y_col},
            'zaxis': {'title': z_col},
        },
    )
    fig = go.Figure(data=data, layout=layout)
    py.offline.iplot(fig)

# Preparing Input/Output data

In [53]:
# Every column except the last one is a feature; the last column is the target.
feature_cols = list(df_data.columns[:-1])
target_col = [df_data.columns[-1]]

## Separating features and target (no scaling is applied in the cells below)

In [54]:
# NOTE(review): train_test_split and MinMaxScaler are imported here but are not
# used in any of the visible cells, so the features go into clustering unscaled.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
# Feature matrix: all rows, feature columns only.
data_x = df_data.loc[:, feature_cols]
# Target: all rows, kept as a single-column DataFrame because target_col is a list.
data_y = df_data.loc[:, target_col]

In [55]:
# Sanity check: both frames must have the same number of rows.
for frame in (data_x, data_y):
    print(frame.shape)

(6665, 9)
(6665, 1)


---

# K-Means from Scratch

In [56]:
def euclideanDistance(point1,point2):
    """Return the Euclidean (L2) distance between two equal-length points.

    Parameters
    ----------
    point1, point2 : sequences of numbers (lists, tuples or 1-D arrays)
        Coordinates of the two points; both must have the same length.

    Returns
    -------
    The square root of the summed squared coordinate differences
    (``0`` raised to ``1/2``, i.e. ``0.0``, for two empty points).

    Raises
    ------
    ValueError
        If the points have different lengths. The previous version only
        printed a message and implicitly returned ``None``, which surfaced
        later as an unrelated error when the caller compared distances.
    """
    if len(point1) != len(point2):
        raise ValueError(
            f"ShapeMismatch: Got point1 with shape {len(point1)} "
            f"and point2 with shape {len(point2)}. "
            f"Expected shape {max(len(point1), len(point2))}"
        )
    squared_sum = sum((a - b) ** 2 for a, b in zip(point1, point2))
    return squared_sum ** (1/2)

In [57]:
import random
from matplotlib import pyplot as plt
def kmeans(dataset,k=None,centroid=None,randomize=False):
    """Run k-means on ``dataset`` and return the final centroids.

    Parameters
    ----------
    dataset : sequence of equal-length numeric rows
        A list of lists or a 2-D array; rows are accessed as ``dataset[i][j]``.
        The dataset itself is never modified.
    k : int, optional
        Number of clusters. Required when ``centroid`` is not given; when only
        ``centroid`` is given it defaults to ``len(centroid)``.
    centroid : sequence of rows, optional
        Initial centroids. The passed object is copied, not modified.
    randomize : bool, default False
        Only used when ``centroid`` is not given. ``False`` seeds the centroids
        with the first ``k`` rows; ``True`` draws ``k`` rows with
        ``random.randint`` (with replacement, so duplicates are possible).

    Returns
    -------
    list
        One entry per cluster, each a list with the per-feature mean of the
        rows assigned to that cluster. A cluster that ends up with no rows
        gets an all-zero centroid.

    Raises
    ------
    ValueError
        If neither ``k`` nor ``centroid`` is provided.
    """
    # `is None` instead of `== None`: comparing an array with == is element-wise
    # and cannot be used as a condition.
    if centroid is None:
        if k is None:
            raise ValueError("Need centroid or some cluster to start with..")
        if randomize:
            centroid = [dataset[random.randint(0, len(dataset) - 1)] for _ in range(k)]
        else:
            # list(...) copies the row references into a fresh list. A bare
            # slice of a NumPy array is a view, so writing new centroids into
            # it would overwrite the first k rows of the caller's data.
            centroid = list(dataset[0:k])
    else:
        # Work on a copy so the caller's initial centroids stay untouched.
        centroid = list(centroid)
        if k is None:
            k = len(centroid)

    previous_assignment = None
    while True:
        # Assignment step: index of the nearest centroid for every row
        # (ties resolve to the lowest centroid index).
        assignment = []
        for row in dataset:
            dists = [euclideanDistance(centroid[j], row) for j in range(k)]
            assignment.append(dists.index(min(dists)))

        # Converged once the assignment no longer changes between iterations.
        if assignment == previous_assignment:
            return centroid
        previous_assignment = assignment

        # Update step: per-cluster feature sums and member counts.
        n_features = len(centroid[0])
        sums = [[0 for _ in range(n_features)] for _ in range(k)]
        counts = [0 for _ in range(k)]
        for label, row in zip(assignment, dataset):
            for j in range(n_features):
                sums[label][j] += row[j]
            counts[label] += 1

        for i in range(k):
            if counts[i] == 0:
                # Empty cluster: keep the original fallback of an all-zero centroid.
                centroid[i] = [0 for _ in range(n_features)]
            else:
                centroid[i] = [total / counts[i] for total in sums[i]]

In [58]:
# Fix the seed of the `random` module so the random centroid initialisation
# inside kmeans(..., randomize=True) is repeatable.
random.seed(a=34)
# Cluster the feature matrix into 4 groups with the scratch implementation.
KCentroid=kmeans(data_x.to_numpy(),k=4,randomize=True)

In [59]:
# Label every row with the index of its nearest final centroid
# (ties resolve to the lowest centroid index).
centroid = KCentroid
k = 4
c = []
for row in data_x.to_numpy():
    dists = [euclideanDistance(centroid[j], row) for j in range(k)]
    c.append(dists.index(min(dists)))
predicted_y = c
# Ground-truth segment codes flattened to a 1-D array.
actual_y = data_y.to_numpy().reshape(-1)

In [60]:
def Eval(pred,act):
    """Pairwise agreement of a clustering with reference labels.

    Considers every unordered pair of positions that ``pred`` puts into the
    same cluster and returns the fraction of those pairs that ``act`` also
    puts into the same class (a pair-counting precision).

    Parameters
    ----------
    pred : sequence of hashable labels
        Predicted cluster label per item.
    act : sequence of hashable labels
        Reference label per item, aligned by position with ``pred``.

    Returns
    -------
    float
        ``(# pairs together in both pred and act) / (# pairs together in pred)``.

    Raises
    ------
    ZeroDivisionError
        If no predicted cluster contains at least two items, so there are no
        pairs to score (same exception type as before, now with a message).

    Notes
    -----
    The previous version only recognised the labels 0, 1, 2 and 3 (other
    labels were silently ignored) and materialised every index pair in
    memory. Counting pairs from cluster sizes gives the same value for labels
    0..3, works for any label set, and needs memory proportional to the
    number of distinct (pred, act) combinations.
    """
    from collections import Counter

    def _pair_count(size):
        # Number of unordered pairs that can be formed from `size` items.
        return size * (size - 1) // 2

    # Pairs that share a predicted cluster.
    predicted_pairs = sum(_pair_count(n) for n in Counter(pred).values())
    if predicted_pairs == 0:
        raise ZeroDivisionError(
            "Eval: no predicted cluster has two or more members, so there are no pairs to score"
        )

    # Pairs that share both the predicted cluster and the reference class:
    # exactly the pairs inside one cell of the (pred, act) contingency table.
    agreeing_pairs = sum(_pair_count(n) for n in Counter(zip(pred, act)).values())
    return agreeing_pairs / predicted_pairs

In [66]:
# The reported number is the value returned by Eval for the scratch k-means
# labels (predicted_y) against the encoded Segmentation column.
print(f"K-Means clustering accuracy compared with ground truth : {Eval(predicted_y,list(data_y['Segmentation']))}")

K-Means clustering accuracy compared with ground truth : 0.32567058575778907


# K-Means Library

In [73]:
# KMeans is already imported in the first cell; this repeats that import.
from sklearn.cluster import KMeans
# Library k-means with 4 clusters, random initialisation, 4 restarts and a fixed seed.
kmc = KMeans(n_clusters=4,init='random',n_init=4,random_state=34)
kmc.fit(data_x.to_numpy())

KMeans(init='random', n_clusters=4, n_init=4, random_state=34)

In [74]:
# Score the library clustering (kmc.labels_) with the same Eval function used
# for the scratch implementation.
print(f"K-Means clustering Library version accuracy : {Eval(kmc.labels_,list(data_y['Segmentation']))}")

K-Means clustering Library version accuracy : 0.3242128979952658


- As observed, the library version gives nearly the same accuracy as the scratch implementation.
- Next, let's try TWO-STAGE CLUSTERING.

---

# Two Stage Clustering - Part 1 (Agglomerative + KMeans)

In [100]:
from sklearn.cluster import AgglomerativeClustering

# Stage 1: complete-linkage agglomerative clustering with the L1 distance,
# cut at 5000 clusters.
# The distance argument was renamed from `affinity` to `metric` in newer
# scikit-learn releases and `affinity` was later removed, so try the new
# keyword first and fall back to the old one on releases that do not know it
# (an unknown keyword raises TypeError at construction time).
try:
    agc = AgglomerativeClustering(linkage="complete", metric="l1", n_clusters=5000, compute_distances=True)
except TypeError:
    agc = AgglomerativeClustering(linkage="complete", affinity="l1", n_clusters=5000, compute_distances=True)
agc.fit(data_x.to_numpy())

AgglomerativeClustering(affinity='l1', compute_distances=True,
                        linkage='complete', n_clusters=5000)

In [101]:
# Distinct stage-1 cluster ids (buffLst[0]) and their sizes (buffLst[1]).
buffLst = np.unique(agc.labels_, return_counts=True)

# cluster id -> number of rows assigned to it
clustCount = dict(zip(buffLst[0], buffLst[1]))

# Publish one DataFrame per stage-1 cluster as a module-level variable named
# data<cluster id>; the evaluation cell below looks them up through globals().
# Clusters with more than 4 rows get their rows of data_x (original index and
# row order preserved); smaller clusters get an empty frame with the feature
# columns.
# Rows are selected with one boolean mask per cluster instead of
# DataFrame.append, which was removed in pandas 2.0 and copied the whole
# frame on every appended row.
for cluster_id in buffLst[0]:
    if clustCount[cluster_id] > 4:
        members = data_x[agc.labels_ == cluster_id].copy()
    else:
        members = pd.DataFrame(columns=df_data.columns[:-1])
    globals()['data%s' % cluster_id] = members

In [102]:
import random

# Stage 2: run the scratch k-means (k=4) inside every stage-1 cluster that has
# more than 4 rows, score each with Eval, and report the unweighted mean.
accList = []
weights = []
for s in buffLst[0]:
    varBuff = globals()['data%s' % s]
    if clustCount[s] <= 4:
        continue

    # Ground-truth segment of every row in this sub-cluster, looked up by index label.
    datazero = [data_y.loc[row_label, 'Segmentation'] for row_label in varBuff.index]

    # Re-seed before every run so each sub-cluster's initialisation is repeatable.
    random.seed(a=95)
    KCentroid = kmeans(varBuff.to_numpy(), k=4, randomize=True)
    centroid = KCentroid

    # Assign every row to its nearest centroid (lowest index wins ties).
    k = 4
    c = []
    for point in varBuff.to_numpy():
        dists = [euclideanDistance(centroid[j], point) for j in range(k)]
        c.append(dists.index(min(dists)))
    predicted_y = c
    actual_y = datazero

    accList.append(Eval(predicted_y, actual_y))

print(np.mean(accList))

0.5454695767195766


---

# Two Stage Clustering - Part 2 (KMeans + KMeans)

In [97]:
# KMeans is already imported in the first cell; this repeats that import.
from sklearn.cluster import KMeans
# Stage 1 for the KMeans + KMeans variant: 5000 clusters. This rebinds `kmc`,
# replacing the 4-cluster model fitted in the "K-Means Library" section.
kmc = KMeans(n_clusters=5000,init='random',n_init=4,random_state=34)
kmc.fit(data_x.to_numpy())

KMeans(init='random', n_clusters=5000, n_init=4, random_state=34)

In [98]:
# Distinct stage-1 cluster ids (buffLst[0]) and their sizes (buffLst[1]).
buffLst = np.unique(kmc.labels_, return_counts=True)

# cluster id -> number of rows assigned to it
clustCount = dict(zip(buffLst[0], buffLst[1]))

# Publish one DataFrame per stage-1 cluster as a module-level variable named
# data<cluster id>; the evaluation cell below looks them up through globals().
# Clusters with more than 4 rows get their rows of data_x (original index and
# row order preserved); smaller clusters get an empty frame with the feature
# columns.
# Rows are selected with one boolean mask per cluster instead of
# DataFrame.append, which was removed in pandas 2.0 and copied the whole
# frame on every appended row.
for cluster_id in buffLst[0]:
    if clustCount[cluster_id] > 4:
        members = data_x[kmc.labels_ == cluster_id].copy()
    else:
        members = pd.DataFrame(columns=df_data.columns[:-1])
    globals()['data%s' % cluster_id] = members

In [99]:
import random

# Stage 2: run the scratch k-means (k=4) inside every stage-1 cluster that has
# more than 4 rows, score each with Eval, and report the unweighted mean.
accList = []
for s in buffLst[0]:
    varBuff = globals()['data%s' % s]
    if clustCount[s] > 4:
        # Ground-truth segment per row, looked up by the row's index label.
        datazero = []
        for row_label in varBuff.index:
            datazero.append(data_y.loc[row_label, 'Segmentation'])

        # Same seed for every sub-cluster keeps each run repeatable.
        random.seed(a=95)
        KCentroid = kmeans(varBuff.to_numpy(), k=4, randomize=True)

        centroid = KCentroid
        k = 4
        # Nearest-centroid label for every row (lowest index wins ties).
        c = []
        for point in varBuff.to_numpy():
            dists = []
            for j in range(k):
                dists.append(euclideanDistance(centroid[j], point))
            c.append(dists.index(min(dists)))
        predicted_y = c
        actual_y = datazero

        accList.append(Eval(predicted_y, actual_y))

print(np.mean(accList))

0.5946608946608947
