## Introduction

In this kernel we will explore customer segmentation using K-Means clustering.

## Import Modules

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Default figure size and grid theme applied to the plots drawn after this cell.
sns.set(rc={'figure.figsize':(10,10)})
sns.set_style("whitegrid")
import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, so deprecation and data-handling warnings will not be shown.
warnings.filterwarnings("ignore")

In [None]:
# Load the German credit data; the first CSV column is used as the row index.
DATA_PATH = '../input/german-credit/german_credit_data.csv'
df = pd.read_csv(DATA_PATH, index_col=0)
df.head()

In [None]:
# Column names, dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

## EDA

In [None]:
## Age distribution (histogram with rug marks)

sns.distplot(df["Age"], color='c', hist=True, rug=True)

In [None]:
## Number of customers for each value of Sex

df["Sex"].value_counts().plot(kind="bar")

In [None]:
## Age spread of customers for each Sex

ax = sns.boxplot(data=df, x="Sex", y="Age", palette="viridis")

In [None]:
## Credit amount for each "Checking account" value, split by Sex

ax = sns.violinplot(
    data=df,
    x="Checking account",
    y="Credit amount",
    hue='Sex',
    palette="plasma",
)

In [None]:
## Credit amount for each Job value, split by Housing

ax = sns.boxplot(
    data=df,
    x="Job",
    y="Credit amount",
    hue='Housing',
    palette="plasma",
)

In [None]:
## Credit amount for each "Checking account" value, coloured by Job

ax = sns.swarmplot(
    data=df,
    x="Checking account",
    y="Credit amount",
    hue='Job',
    palette="spring",
)

In [None]:
## Credit amount for each Purpose, split by Job

ax = sns.boxplot(
    data=df,
    x="Purpose",
    y="Credit amount",
    hue='Job',
    palette="Pastel1",
)

In [None]:
## Age against Credit amount, coloured by Sex

ax = sns.scatterplot(
    data=df,
    x="Credit amount",
    y="Age",
    hue='Sex',
    palette="rainbow",
)

In [None]:
## Age against Credit amount, coloured by Purpose

ax = sns.scatterplot(
    data=df,
    x="Credit amount",
    y="Age",
    hue='Purpose',
    palette="jet_r",
)

## Feature Engineering

In [None]:
import sklearn.preprocessing as pre
from scipy.special import inv_boxcox
from scipy.stats import boxcox

In [None]:
## Categorical Data

# Replace df with the output of pandas' get_dummies (indicator columns for the
# categorical columns), then preview the first rows of the encoded frame.
df = pd.get_dummies(df)
df.head()

In [None]:
## Heatmap of the pairwise column correlations

correlations = df.corr()
sns.heatmap(correlations, cmap='twilight')

In this dataset, I'm going to use three columns as the clustering features: Age, Credit amount and Duration.

In [None]:
# Clustering features.
# An explicit copy keeps this frame independent of `df`, so the column
# assignments made to `Cluster` later in the notebook can never write back
# into (or warn about) the original data.
Cluster = df.loc[:, ["Age", "Credit amount", "Duration"]].copy()

In [None]:
## Skewness and distribution of each clustering feature before transforming

fig, (ax1, ax2, ax3) = plt.subplots(3,1, figsize=(8,8))
feature_axes = zip(("Age", "Credit amount", "Duration"), (ax1, ax2, ax3))
for feature, axis in feature_axes:
    # Print the skew first, then draw that feature on its own row.
    print('Skew Value : ' + str(Cluster[feature].skew()))
    sns.distplot(Cluster[feature], ax=axis)
plt.tight_layout()

Next we handle the skewed distributions of the clustering columns. I'm going to compare four transformations: Log Transform, Log1p Transform (log(1 + x)), Box-Cox Transform and Square Root Transform. The transformation whose resulting skew value is closest to 0 is the best one to use.

In [None]:
def scalling(df, column):
    """Plot four candidate skew-reducing transforms of ``df[column]``.

    Draws a 2x2 grid of distribution plots (log1p, square root, natural log
    and Box-Cox) and writes the skewness of each transformed column into the
    subplot title, so the transforms can be compared side by side.

    Args:
        df: DataFrame containing the column to inspect.
        column: Name of the column in ``df`` to transform.

    Returns:
        None. A new matplotlib figure is created as a side effect.

    Raises:
        ValueError: If the column contains zero or negative values, for which
            the log and Box-Cox transforms are not defined.
    """
    values = df[column]

    # Fail early with a clear message instead of plotting -inf/NaN from
    # np.log and only erroring once Box-Cox is reached.
    if (values <= 0).any():
        raise ValueError(
            "Column %r must be strictly positive for the log and Box-Cox "
            "transforms" % (column,)
        )

    f = plt.figure(figsize=(15,13))

    # (subplot position, title prefix, transform, colour).
    # Box-Cox also returns its fitted lambda; only the transformed values are
    # needed here, wrapped in a Series so .skew() is available.
    panels = [
        (221, 'skew value Log 1 transform: ', np.log1p, 'b'),
        (222, 'Skew Value Square Root Transform: ', np.sqrt, 'c'),
        (223, 'Skew value Log Transform: ', np.log, 'r'),
        (224, 'Skew Value Box Cox Transform: ',
         lambda s: pd.Series(boxcox(s, lmbda=None)[0]), 'g'),
    ]

    for position, title, transform, colour in panels:
        ax = f.add_subplot(position)
        # Transform once and reuse the result for both the plot and the skew.
        transformed = transform(values)
        sns.distplot(transformed, color=colour, ax=ax)
        ax.set_title(title + str(transformed.skew()))

In [None]:
# Compare the candidate transforms on the Age column.
scalling(Cluster, 'Age')

In [None]:
# Compare the candidate transforms on the Credit amount column.
scalling(Cluster, 'Credit amount')

In [None]:
# Compare the candidate transforms on the Duration column.
scalling(Cluster, 'Duration')

After comparing all the results, the Box-Cox transformation gives the best skew value (closest to 0) on all three columns, so we are going to use the Box-Cox transformation.

In [None]:
## Apply Transformation

# Box-Cox each clustering feature in place and remember the lambda fitted for
# every column. Keeping all three (rather than reusing one variable) is what
# makes it possible to map transformed values back to the original scale.
fitted_lambdas = {}
for column in ["Age", "Credit amount", "Duration"]:
    Cluster[column], fitted_lambdas[column] = boxcox(Cluster[column], lmbda=None)

# Backward compatibility: this name holds the lambda of the last column
# transformed (Duration), as it did when the columns were handled one by one.
fitted_lambda = fitted_lambdas["Duration"]
Cluster.head()

## Modelling

To determine the value of k for this model, I'm using the Elbow method.

In [None]:
from sklearn.cluster import KMeans 
from sklearn import metrics 
from scipy.spatial.distance import cdist
from mpl_toolkits.mplot3d import Axes3D

In [None]:
# Elbow method: for each candidate k, record the distortion, i.e. the mean
# Euclidean distance from every sample to its nearest cluster centre.
distortions = []
mapping1 = {}
K = range(1,15)

for k in K:
    # A single fit per k is enough (fit() returns the fitted estimator).
    # The fixed seed makes the elbow curve reproducible between runs.
    kmeanModel = KMeans(n_clusters=k, random_state=42).fit(Cluster)

    # Compute the distortion once and store it in both containers.
    nearest_centre_distance = np.min(
        cdist(Cluster, kmeanModel.cluster_centers_, 'euclidean'), axis=1)
    distortion = nearest_centre_distance.mean()

    distortions.append(distortion)
    mapping1[k] = distortion

In [None]:
# One "k : distortion" line per candidate number of clusters.
for n_clusters, distortion_value in mapping1.items():
    print(n_clusters, distortion_value, sep=' : ')

In [None]:
# Distortion against k; the "elbow" of this curve suggests the number of clusters.
plt.plot(K, distortions, 'bx-')
plt.gca().set(
    xlabel='Values of K',
    ylabel='Distortion',
    title='The Elbow Method using Distortion',
)
plt.show()

In the elbow method we choose the value of k for K-Means by looking at the distortion curve: the best k is the point after which the curve flattens out, so that adding more clusters yields little further improvement. In this case we use 4 as the number of clusters.

In [None]:
# Final model with the k chosen from the elbow plot.
# The seed is fixed because cluster ids are otherwise arbitrary between runs,
# which would change which colour each cluster gets in the plots that follow.
kmeans = KMeans(n_clusters=4, random_state=42)

# fit_predict fits the model and returns the cluster id of every row in one call.
y_pred = kmeans.fit_predict(Cluster)
print(kmeans.cluster_centers_)

In [None]:
# Store each row's K-Means cluster id on the main frame, then preview it.
df["label"] = kmeans.labels_
df.head()

In [None]:
# 3D view of the clustering features from df, coloured by predicted cluster.
fig = plt.figure(figsize=(10,6))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(
    df["Credit amount"],
    df["Duration"],
    df["Age"],
    c=y_pred,
    cmap='jet_r',
)
ax.set(xlabel="Credit amount", ylabel="Duration", zlabel="Age")

In [None]:
## Detailed overview: one 2D scatter per feature pair, coloured by cluster label

f = plt.figure(figsize=(15,13))
# (subplot position, x column, y column), drawn top to bottom.
panel_specs = (
    (311, "Credit amount", "Duration"),
    (312, "Age", "Credit amount"),
    (313, "Age", "Duration"),
)
for position, x_column, y_column in panel_specs:
    ax = f.add_subplot(position)
    ax = sns.scatterplot(x=x_column, y=y_column, hue='label', data=df,
                         palette="jet_r", ax=ax)

## Summary

The conclusion from this analysis is that there are 4 kinds of customers in this bank (the colours refer to the cluster plots above):

A. Brown customers: customers with long durations who mostly have high credit amounts; they are the youngest customers of this bank.

B. Yellow customers: they have the third-highest credit amount, the third-longest duration, and are the third-youngest group among all customers.

C. Cyan customers: they have the second-highest credit amount, the second-longest duration, and are the second-youngest of all the clusters.

D. Dark blue customers: they have the lowest credit amount in their accounts, the shortest duration, and are the oldest customers of this bank.