## K-Mode Clustering
### The data is related to the direct marketing campaigns of a bank. The marketing campaigns were based on phone calls. Often, more than one contact with the same client was required in order to assess whether the product (a bank term deposit) would be subscribed ('yes') or not ('no').

In [None]:
# Importing Libraries
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from kmodes.kmodes import KModes
import warnings
warnings.filterwarnings("ignore")

In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Load the bank marketing CSV into a DataFrame.
csv_path = '../input/bank-marketing-propensity-data/bank-additional-full.csv'
bank = pd.read_csv(csv_path)

In [None]:
# Preview the first rows of the raw data.
bank.head()

In [None]:
# List the column names available in the raw data.
bank.columns

In [None]:
## selecting categorical vars (plus the raw age, which is binned in a later cell)

# Take an explicit copy: a later cell assigns a new column to bank_cust, and
# doing that on a slice of `bank` triggers pandas' SettingWithCopyWarning
# (silenced here by the global warnings filter) with ambiguous semantics.
customer_cols = ['age', 'job', 'marital', 'education', 'default', 'housing',
                 'loan', 'contact', 'month', 'day_of_week', 'poutcome']
bank_cust = bank[customer_cols].copy()

In [None]:
# Preview the selected customer columns.
bank_cust.head()

In [None]:
## binning age column

# Bin edges are 0, 20, 30, ..., 100; each label names the span between two
# consecutive edges ('0-20', '20-30', ..., '90-100').
age_edges = [0] + list(range(20, 101, 10))
age_labels = [f'{lo}-{hi}' for lo, hi in zip(age_edges[:-1], age_edges[1:])]
bank_cust['age_bin'] = pd.cut(bank_cust['age'], age_edges, labels=age_labels)

In [None]:
## dropping age col

# The raw age is no longer needed once age_bin exists.
bank_cust = bank_cust.drop(columns='age')

In [None]:
# Summarise the columns (dtypes and non-null counts) after binning.
bank_cust.info()

### Data Pre-processing

In [None]:
from sklearn import preprocessing

# Fit one LabelEncoder per column and keep it. Re-fitting a single shared
# encoder on every column (as before) leaves only the last column's mapping,
# so integer codes could not be translated back to their category labels.
# With this dict, label_encoders[col].inverse_transform(codes) recovers them.
label_encoders = {col: preprocessing.LabelEncoder() for col in bank_cust.columns}
bank_cust = pd.DataFrame(
    {col: enc.fit_transform(bank_cust[col]) for col, enc in label_encoders.items()},
    index=bank_cust.index,
)
bank_cust.head()

In [None]:
## jobs count per category

# Since pandas 2.0, value_counts() names its result "count" rather than the
# source column. Rename explicitly so the frame always has a 'job' column,
# which is what the plotting cell reads, on every pandas version.
job_df = bank_cust['job'].value_counts().rename('job').to_frame()

In [None]:
# Bar chart of row counts per (label-encoded) job category.
job_codes = job_df.index
job_counts = job_df['job']
sns.barplot(x=job_codes, y=job_counts)
plt.show()

In [None]:
# Checking age count per category
# Since pandas 2.0, value_counts() names its result "count" rather than the
# source column. Rename explicitly so the frame always has an 'age_bin'
# column, which is what the plotting cell reads, on every pandas version.
age_df = bank_cust['age_bin'].value_counts().rename('age_bin').to_frame()

In [None]:
# Bar chart of row counts per (label-encoded) age bin.
age_codes = age_df.index
age_counts = age_df['age_bin']
ax = sns.barplot(x=age_codes, y=age_counts)
plt.show()

### K-Mode with "Cao" initialization

In [None]:
# Fit a two-cluster K-Modes model with Cao initialisation and label each row.
cao_params = {"n_clusters": 2, "init": "Cao", "n_init": 1, "verbose": 1}
km_cao = KModes(**cao_params)
fitClusters_cao = km_cao.fit_predict(bank_cust)

In [None]:
# Predicted cluster label for each row (Cao initialisation)

fitClusters_cao

In [None]:
# Tabulate the fitted centroids under the same column names as the input.
clusterCentroidsDf = pd.DataFrame(
    km_cao.cluster_centroids_,
    columns=bank_cust.columns,
)

In [None]:
# Mode of the clusters (values are the label-encoded category codes)

clusterCentroidsDf

### K-Mode with "Huang" initialization

In [None]:
# Huang initialisation picks its starting centroids at random, and with
# n_init=1 there is only a single attempt, so fix the seed to make the
# resulting clusters reproducible between notebook runs.
km_huang = KModes(n_clusters=2, init="Huang", n_init=1, verbose=1, random_state=42)
fitClusters_huang = km_huang.fit_predict(bank_cust)

In [None]:
# Predicted cluster label for each row (Huang initialisation)

fitClusters_huang

### Choosing K by comparing Cost against each K

In [None]:
# Fit a Cao-initialised K-Modes model for k = 1..4 and record each final cost.
cost = []
for k in range(1, 5):
    model = KModes(n_clusters=k, init="Cao", n_init=1, verbose=1)
    model.fit_predict(bank_cust)
    cost.append(model.cost_)

In [None]:
# Plot clustering cost against the number of clusters to look for an elbow.
# The x values are derived from `cost` (the loop above starts at k=1) so the
# axis cannot drift out of sync with the range that was actually fitted.
k_values = np.arange(1, len(cost) + 1)
plt.plot(k_values, cost)
plt.xticks(k_values)
plt.xlabel('Number of clusters (k)')
plt.ylabel('Cost')
plt.show()

#### We can see elbows at k=2 and k=3, so we'll choose the smaller value, k=2

In [None]:
# Refit the chosen model (k=2, Cao initialisation) and label each row.
final_params = {"n_clusters": 2, "init": "Cao", "n_init": 1, "verbose": 1}
km_cao = KModes(**final_params)
fitClusters_cao = km_cao.fit_predict(bank_cust)

In [None]:
# Cluster labels produced by the k=2 model fitted above.
fitClusters_cao

### Merge the predicted clusters with the original DF

In [None]:
# Attach the predicted cluster to each row WITHOUT mutating bank_cust.
# The previous reset_index() calls added an 'index' column to bank_cust
# itself: re-running the cell then raised, and any later re-clustering would
# have used that row id as a feature.
clustersDf = pd.DataFrame({'cluster_predicted': fitClusters_cao})
# Both frames carry a fresh 0..n-1 index, so the column-wise concat pairs
# rows by position.
combinedDf = pd.concat([bank_cust.reset_index(drop=True), clustersDf], axis=1)

In [None]:
# Preview the data with the cluster assignment attached.
combinedDf.head()

### Separate the data for clusters 0 and 1

In [None]:
# Rows assigned to cluster 1
in_cluster1 = combinedDf['cluster_predicted'] == 1
cluster1 = combinedDf.loc[in_cluster1]

In [None]:
# Rows assigned to cluster 0
in_cluster0 = combinedDf['cluster_predicted'] == 0
cluster0 = combinedDf.loc[in_cluster0]

In [None]:
# Size and column summary of cluster 1.
cluster1.info()

In [None]:
# Size and column summary of cluster 0.
cluster0.info()

In [None]:
# Checking the count per category for JOB
# Since pandas 2.0, value_counts() names its result "count" rather than the
# source column. Rename explicitly so each frame always has the 'job' column
# that the plotting cell reads, on every pandas version.

job1_df = cluster1['job'].value_counts().rename('job').to_frame()

job0_df = cluster0['job'].value_counts().rename('job').to_frame()

In [None]:
# Job distribution per cluster: cluster 1 on the left, cluster 0 on the right.
fig, ax = plt.subplots(1, 2, figsize=(12, 6))

sns.barplot(x=job1_df.index, y=job1_df['job'], ax=ax[0])
ax[0].set_title('Cluster 1')
sns.barplot(x=job0_df.index, y=job0_df['job'], ax=ax[1])
ax[1].set_title('Cluster 0')

# plt.show() matches the other plotting cells; fig.show() emits a
# "non-GUI backend" warning under the inline backend used by this notebook.
plt.show()

In [None]:
# Checking the count per category for age
# Since pandas 2.0, value_counts() names its result "count" rather than the
# source column. Rename explicitly so each frame always has the 'age_bin'
# column that the plotting cell reads, on every pandas version.

age1_df = cluster1['age_bin'].value_counts().rename('age_bin').to_frame()

age0_df = cluster0['age_bin'].value_counts().rename('age_bin').to_frame()

In [None]:
# Age-bin distribution per cluster: cluster 1 on the left, cluster 0 on the right.
fig, ax = plt.subplots(1, 2, figsize=(12, 6))

sns.barplot(x=age1_df.index, y=age1_df['age_bin'], ax=ax[0])
ax[0].set_title('Cluster 1')
sns.barplot(x=age0_df.index, y=age0_df['age_bin'], ax=ax[1])
ax[1].set_title('Cluster 0')

# plt.show() matches the other plotting cells; fig.show() emits a
# "non-GUI backend" warning under the inline backend used by this notebook.
plt.show()

In [None]:
# Counts of each (label-encoded) marital status within cluster 1.
cluster1['marital'].value_counts()

In [None]:
# Counts of each (label-encoded) marital status within cluster 0.
cluster0['marital'].value_counts()