- Use only the following columns: 'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome', 'age', 'duration', 'euribor3m', where age, duration and euribor3m are the numerical columns.
- Convert all categorical columns to numeric by using LabelEncoder()
- Standardize all the columns before using K-Prototype clustering
- Remember that you also need to convert the final dataframe to a matrix for applying K-Prototype.
- First check K-prototype with the number of clusters as 5.
- Please keep in mind that the code may take some time to execute as there are so many categorical variables, so be patient.

In [84]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from kmodes.kprototypes import KPrototypes

In [85]:
# Load the bank-marketing dataset from the working directory and preview the first rows.
df = pd.read_csv('bankmarketing.csv')
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [86]:
# (rows, columns) of the raw dataset.
df.shape

(41188, 21)

In [87]:
# Count missing values per column.
df.isnull().sum()

age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64

In [88]:
# Inspect column dtypes to tell the categorical (object) columns from the numeric ones.
df.dtypes

age                 int64
job                object
marital            object
education          object
default            object
housing            object
loan               object
contact            object
month              object
day_of_week        object
duration            int64
campaign            int64
pdays               int64
previous            int64
poutcome           object
emp.var.rate      float64
cons.price.idx    float64
cons.conf.idx     float64
euribor3m         float64
nr.employed       float64
y                  object
dtype: object

Use only the following columns: 'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome', 'age', 'duration', 'euribor3m', where age, duration and euribor3m are the numerical columns.

In [89]:
# Restrict the frame to the ten categorical and three numerical features
# required by the exercise, keeping the categorical ones first.
categorical_features = ['job', 'marital', 'education', 'default', 'housing', 'loan',
                        'contact', 'month', 'day_of_week', 'poutcome']
numerical_features = ['age', 'duration', 'euribor3m']
df = df[categorical_features + numerical_features]

In [90]:
# Confirm that only the 13 selected columns remain and check their dtypes.
df.dtypes

job             object
marital         object
education       object
default         object
housing         object
loan            object
contact         object
month           object
day_of_week     object
poutcome        object
age              int64
duration         int64
euribor3m      float64
dtype: object

In [91]:
# Preview the reduced frame.
df.head()

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,day_of_week,poutcome,age,duration,euribor3m
0,housemaid,married,basic.4y,no,no,no,telephone,may,mon,nonexistent,56,261,4.857
1,services,married,high.school,unknown,no,no,telephone,may,mon,nonexistent,57,149,4.857
2,services,married,high.school,no,yes,no,telephone,may,mon,nonexistent,37,226,4.857
3,admin.,married,basic.6y,no,no,no,telephone,may,mon,nonexistent,40,151,4.857
4,services,married,high.school,no,no,yes,telephone,may,mon,nonexistent,56,307,4.857


Convert all categorical columns to numeric by using LabelEncoder()

In [92]:
# Numerical features, kept apart so that only the categorical ones get label-encoded.
df_num = df.loc[:, ['age', 'duration', 'euribor3m']]
df_num.head()

Unnamed: 0,age,duration,euribor3m
0,56,261,4.857
1,57,149,4.857
2,37,226,4.857
3,40,151,4.857
4,56,307,4.857


In [101]:
# Summary statistics of the numerical features.
df_num.describe()

Unnamed: 0,age,duration,euribor3m
count,41188.0,41188.0,41188.0
mean,40.02406,258.28501,3.621291
std,10.42125,259.279249,1.734447
min,17.0,0.0,0.634
25%,32.0,102.0,1.344
50%,38.0,180.0,4.857
75%,47.0,319.0,4.961
max,98.0,4918.0,5.045


In [93]:
# Categorical features: everything left once the numerical columns are removed.
df_cat = df.drop(columns=['age', 'duration', 'euribor3m'])
df_cat.head()

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,day_of_week,poutcome
0,housemaid,married,basic.4y,no,no,no,telephone,may,mon,nonexistent
1,services,married,high.school,unknown,no,no,telephone,may,mon,nonexistent
2,services,married,high.school,no,yes,no,telephone,may,mon,nonexistent
3,admin.,married,basic.6y,no,no,no,telephone,may,mon,nonexistent
4,services,married,high.school,no,no,yes,telephone,may,mon,nonexistent


In [94]:
from sklearn.preprocessing import LabelEncoder

# Fit a separate encoder for each categorical column and keep every one of them,
# so the integer codes can later be mapped back to the original labels with
# label_encoders[col].inverse_transform(...). Re-fitting a single shared encoder
# would only retain the mapping of the last column.
label_encoders = {}

for col in df_cat.columns:
    le = LabelEncoder()
    df_cat[col] = le.fit_transform(df_cat[col])
    label_encoders[col] = le

df_cat.head()

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,day_of_week,poutcome
0,3,1,0,0,0,0,1,6,1,1
1,7,1,3,1,0,0,1,6,1,1
2,7,1,3,0,2,0,1,6,1,1
3,0,1,1,0,0,0,1,6,1,1
4,7,1,3,0,0,2,1,6,1,1


In [95]:
# Merge the two halves column-wise: numerical features first, followed by the
# label-encoded categorical ones.
frames = [df_num, df_cat]
df_encoded = pd.concat(frames, axis='columns')
df_encoded.head()

Unnamed: 0,age,duration,euribor3m,job,marital,education,default,housing,loan,contact,month,day_of_week,poutcome
0,56,261,4.857,3,1,0,0,0,0,1,6,1,1
1,57,149,4.857,7,1,3,1,0,0,1,6,1,1
2,37,226,4.857,7,1,3,0,2,0,1,6,1,1
3,40,151,4.857,0,1,1,0,0,0,1,6,1,1
4,56,307,4.857,7,1,3,0,0,2,1,6,1,1


Standardize all the columns before using K-Prototype clustering

In [96]:
from sklearn.preprocessing import StandardScaler

# Standardize every column, then rebuild a DataFrame that carries the
# original column names (fit_transform returns a bare array).
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df_encoded)
df_scaled = pd.DataFrame(data=scaled_values, columns=df_encoded.columns)
df_scaled.head()

Unnamed: 0,age,duration,euribor3m,job,marital,education,default,housing,loan,contact,month,day_of_week,poutcome
0,1.533034,0.010471,0.71246,-0.201579,-0.283741,-1.753925,-0.5136,-1.087707,-0.452491,1.31827,0.762558,-0.718834,0.192622
1,1.628993,-0.421501,0.71246,0.911227,-0.283741,-0.34973,1.945327,-1.087707,-0.452491,1.31827,0.762558,-0.718834,0.192622
2,-0.290186,-0.12452,0.71246,0.911227,-0.283741,-0.34973,-0.5136,0.942127,-0.452491,1.31827,0.762558,-0.718834,0.192622
3,-0.002309,-0.413787,0.71246,-1.036184,-0.283741,-1.28586,-0.5136,-1.087707,-0.452491,1.31827,0.762558,-0.718834,0.192622
4,1.533034,0.187888,0.71246,0.911227,-0.283741,-0.34973,-0.5136,-1.087707,2.31144,1.31827,0.762558,-0.718834,0.192622


Remember that you also need to convert the final dataframe to a matrix for applying K-Prototype.

In [97]:
# Preview the standardized frame before converting it to a matrix.
df_scaled.head()

Unnamed: 0,age,duration,euribor3m,job,marital,education,default,housing,loan,contact,month,day_of_week,poutcome
0,1.533034,0.010471,0.71246,-0.201579,-0.283741,-1.753925,-0.5136,-1.087707,-0.452491,1.31827,0.762558,-0.718834,0.192622
1,1.628993,-0.421501,0.71246,0.911227,-0.283741,-0.34973,1.945327,-1.087707,-0.452491,1.31827,0.762558,-0.718834,0.192622
2,-0.290186,-0.12452,0.71246,0.911227,-0.283741,-0.34973,-0.5136,0.942127,-0.452491,1.31827,0.762558,-0.718834,0.192622
3,-0.002309,-0.413787,0.71246,-1.036184,-0.283741,-1.28586,-0.5136,-1.087707,-0.452491,1.31827,0.762558,-0.718834,0.192622
4,1.533034,0.187888,0.71246,0.911227,-0.283741,-0.34973,-0.5136,-1.087707,2.31144,1.31827,0.762558,-0.718834,0.192622


In [98]:
# Turn the standardized DataFrame into a plain NumPy array for K-Prototypes.
df_matrix = df_scaled.to_numpy()
df_matrix

array([[ 1.53303429,  0.01047142,  0.71245988, ...,  0.76255787,
        -0.71883445,  0.19262207],
       [ 1.62899323, -0.42150051,  0.71245988, ...,  0.76255787,
        -0.71883445,  0.19262207],
       [-0.29018564, -0.12451981,  0.71245988, ...,  0.76255787,
        -0.71883445,  0.19262207],
       ...,
       [ 1.53303429, -0.26722482, -1.49518647, ...,  1.19359295,
        -1.43436815,  0.19262207],
       [ 0.38152696,  0.70856893, -1.49518647, ...,  1.19359295,
        -1.43436815,  0.19262207],
       [ 3.26029527, -0.07438021, -1.49518647, ...,  1.19359295,
        -1.43436815, -2.56309793]])

First check K-prototype with the number of clusters as 5.

In [99]:
# Fit K-Prototypes with 5 clusters using Cao initialization.
# The categorical columns are the label-encoded ones; look up their positions
# in the scaled frame instead of hard-coding them (this yields 3 through 12).
categorical_positions = [df_scaled.columns.get_loc(col) for col in df_cat.columns]
kproto = KPrototypes(init='Cao', n_clusters=5)
clusters = kproto.fit_predict(df_matrix, categorical=categorical_positions)

In [100]:
# Centroids of the fitted model; the model was fit on the standardized matrix,
# so the values are in standardized units rather than the original scales.
print(kproto.cluster_centroids_)

[[-0.62216045 -0.26733156  0.68633146 -1.03618379 -0.2837415   1.05446393
  -0.51359969  0.94212743 -0.45249062 -0.75856997 -0.53054738 -0.00330075
   0.19262207]
 [-0.09571869  2.73806674  0.19281048 -1.03618379 -0.2837415   1.05446393
  -0.51359969  0.94212743 -0.45249062 -0.75856997  0.76255787  1.42776664
   0.19262207]
 [-0.6648193  -0.14310602 -1.40020174 -1.03618379  1.3585779   1.05446393
  -0.51359969  0.94212743 -0.45249062 -0.75856997  0.76255787 -0.71883445
   0.19262207]
 [ 0.76950456 -0.23206029  0.69462257 -0.75798228 -0.2837415  -0.34973033
  -0.51359969 -1.08770698 -0.45249062  1.31826996  0.76255787  0.71223295
   0.19262207]
 [ 1.53995676 -0.03512711 -1.45045228  0.35482378 -0.2837415   1.05446393
  -0.51359969  0.94212743 -0.45249062 -0.75856997  0.76255787 -0.00330075
   0.19262207]]


In [102]:
# Clustering cost reported by the fitted 5-cluster model.
print(kproto.cost_)

129005.14725409947


In [None]:
# Choosing optimal K: fit K-Prototypes for k = 1..7 on the standardized matrix
# and record the final clustering cost of each model, to look for an elbow.
cluster_range = list(range(1, 8))
cost = []
for num_clusters in cluster_range:
    kproto = KPrototypes(n_clusters=num_clusters, init='Cao')
    # df_matrix is the standardized matrix; columns 3..12 are the categorical ones.
    kproto.fit_predict(df_matrix, categorical=[3,4,5,6,7,8,9,10,11,12])
    cost.append(kproto.cost_)

# Plot cost against the actual number of clusters so the x-axis reads 1..7
# rather than the list positions 0..6.
plt.plot(cluster_range, cost, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Cost')
plt.title('K-Prototypes cost by number of clusters')
plt.show()