**Problem Statement**

This dataset contains credit card usage details for customers. We will cluster the customers based on their characteristics.

In [None]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import datetime as dt

import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score



In [None]:
# Load the credit-card dataset; the first row of the CSV holds the column names.
csv_path = "../input/ccdata/CC GENERAL.csv"
cc_df = pd.read_csv(csv_path, header=0, sep=",", encoding="ISO-8859-1")
cc_df.head()

In [None]:
# (rows, columns) of the data as loaded, before any rows are dropped
cc_df.shape


In [None]:
# Per-column dtype and non-null count; reveals which columns have missing values
cc_df.info()

In [None]:
# Discard every row that has a missing value in any column, then confirm the new size.
cc_df = cc_df.dropna(axis="index", how="any")
cc_df.shape

## EDA

In [None]:
# Box plots of the main numeric features, to inspect their spread and outliers.
boxplot_columns = [
    'BALANCE',
    'PURCHASES',
    'INSTALLMENTS_PURCHASES',
    'CREDIT_LIMIT',
    'MINIMUM_PAYMENTS',
    'PURCHASES_FREQUENCY',
]

for column in boxplot_columns:
    fig, ax = plt.subplots()
    # Pass the series by keyword: the meaning of the first positional argument
    # differs between seaborn releases, so `x=` pins a horizontal box plot.
    sns.boxplot(x=cc_df[column], ax=ax)
    ax.set_title(column)
    plt.show()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
cc_df.describe()

## Outlier Handling

In this case study we deliberately do not treat outliers: we want to analyse all types of customers, including those with extreme values.

## Hopkins Statistic

In [None]:
from sklearn.neighbors import NearestNeighbors
from random import sample
from numpy.random import uniform
import numpy as np
from math import isnan
 
def hopkins(X, random_state=None):
    """Estimate the Hopkins statistic of ``X`` as a measure of cluster tendency.

    ``m = max(1, int(0.1 * n))`` probe points are drawn uniformly from the
    bounding box of the data, and another ``m`` rows are sampled from the data
    without replacement. With ``u`` the distances from the uniform probes to
    their nearest data point and ``w`` the distances from the sampled rows to
    their nearest *other* data point, the statistic is
    ``sum(u) / (sum(u) + sum(w))``. Values close to 1 suggest clustered data;
    values around 0.5 suggest uniformly random data.

    Parameters
    ----------
    X : DataFrame or 2-D array-like of numbers, shape (n_rows, n_features)
        The data to test. Must contain at least two rows.
    random_state : int or None, default None
        Seed for the random sampling. ``None`` gives a different result on
        every call; pass an integer for a reproducible score.

    Returns
    -------
    float
        The Hopkins statistic, or 0.0 when every sampled distance is zero
        (for example when all rows are identical).

    Raises
    ------
    ValueError
        If ``X`` is not two-dimensional or has fewer than two rows.
    """
    data = np.asarray(X, dtype=float)
    if data.ndim != 2:
        raise ValueError("X must be two-dimensional (rows x features)")
    n, d = data.shape
    if n < 2:
        raise ValueError("X must contain at least two rows")

    # 10% sampling heuristic; at least one probe so small inputs do not
    # end up dividing zero by zero.
    m = max(1, int(0.1 * n))

    rng = np.random.RandomState(random_state)
    nbrs = NearestNeighbors(n_neighbors=1).fit(data)

    # Uniform probes inside the bounding box of the data. They are not part
    # of the fitted set, so their nearest data point is the FIRST neighbour.
    lower, upper = data.min(axis=0), data.max(axis=0)
    probes = rng.uniform(lower, upper, size=(m, d))
    u_dist, _ = nbrs.kneighbors(probes, n_neighbors=1, return_distance=True)
    u_total = u_dist[:, 0].sum()

    # Real rows are part of the fitted set, so their first neighbour is
    # themselves (distance 0); the SECOND neighbour is the nearest other row.
    chosen = rng.choice(n, size=m, replace=False)
    w_dist, _ = nbrs.kneighbors(data[chosen], n_neighbors=2, return_distance=True)
    w_total = w_dist[:, 1].sum()

    denominator = u_total + w_total
    if denominator == 0:
        # Degenerate data (no spread at all): the ratio is undefined.
        return 0.0

    return float(u_total / denominator)


In [None]:
# Score every column except CUST_ID (presumably a non-numeric identifier -- verify).
hopkins(cc_df.drop(columns=['CUST_ID']))

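**The Hopkins statistic averages about 0.9 across runs (values near 1 indicate a strong cluster tendency, values near 0.5 indicate uniformly random data), so the dataset is suitable for clustering.**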

In [None]:
# Drop the features that are not used for clustering.
unused_features = [
    'CUST_ID',
    'BALANCE_FREQUENCY',
    'PURCHASES_FREQUENCY',
    'INSTALLMENTS_PURCHASES',
    'ONEOFF_PURCHASES_FREQUENCY',
    'CASH_ADVANCE',
    'PURCHASES_INSTALLMENTS_FREQUENCY',
    'CASH_ADVANCE_TRX',
    'TENURE',
    'PURCHASES_TRX',
    'CASH_ADVANCE_FREQUENCY',
    'PRC_FULL_PAYMENT',
]
cc_df_data = (
    cc_df
    # 'cluster_id' only exists after the label-assignment cell further down has
    # run; removing it here keeps this cell re-runnable and stops the labels
    # from leaking back in as a clustering feature.
    .drop(columns=['cluster_id'], errors='ignore')
    .drop(columns=unused_features)
)
cc_df_data.head()

## Scaling the Data

In [None]:
# Standardise the selected features: fit the scaler on them, then transform them.
scaler = StandardScaler()
cc_df_scaled = scaler.fit(cc_df_data).transform(cc_df_data)
cc_df_scaled.shape

In [None]:
# Wrap the scaled values in a DataFrame that keeps both the column names and
# the row index of cc_df_data, so scaled rows can still be matched to customers.
# np.asarray makes the cell safe to re-run when cc_df_scaled is already a frame.
cc_df_scaled = pd.DataFrame(
    np.asarray(cc_df_scaled),
    columns=cc_df_data.columns,
    index=cc_df_data.index,
)
cc_df_scaled.head()

## Optimal Value of K

In [None]:
# Elbow curve: sum of squared distances (inertia) for each candidate k.
range_n_clusters = [2, 3, 4, 5, 6, 7, 8]
ssd = []
for num_clusters in range_n_clusters:
    # Fixed seed (the same one the final model uses) so the curve is reproducible.
    kmeans = KMeans(n_clusters=num_clusters, max_iter=50, random_state=50)
    kmeans.fit(cc_df_scaled)
    ssd.append(kmeans.inertia_)

# Plot against the actual k values; plotting `ssd` alone would label the
# x-axis with list positions 0..6 instead of the number of clusters.
fig, ax = plt.subplots()
ax.plot(range_n_clusters, ssd, marker='o')
ax.set_xlabel('Number of clusters (k)')
ax.set_ylabel('Sum of squared distances')
ax.set_title('Elbow curve')
plt.show()

In [None]:
# Silhouette analysis: mean silhouette score for each candidate k.
range_n_clusters = [2, 3, 4, 5, 6, 7, 8]

for num_clusters in range_n_clusters:
    # Fixed seed (the same one the final model uses) so the scores are
    # reproducible and comparable with the model that is finally chosen.
    kmeans = KMeans(n_clusters=num_clusters, max_iter=50, random_state=50)
    cluster_labels = kmeans.fit_predict(cc_df_scaled)

    silhouette_avg = silhouette_score(cc_df_scaled, cluster_labels)
    print("For n_clusters={0}, the silhouette score is {1}".format(num_clusters, silhouette_avg))
    
    

In [None]:
# Fit the final K-Means model with three clusters and a fixed seed.
kmeans = KMeans(random_state=50, max_iter=50, n_clusters=3).fit(cc_df_scaled)
kmeans

In [None]:
# Attach each customer's cluster label from the fitted model as a new column.
cc_df = cc_df.assign(cluster_id=kmeans.labels_)
cc_df.head()

In [None]:
# Cluster profiling: per-cluster mean of selected features, shown as grouped bars.
profile_columns = ['PURCHASES', 'BALANCE', 'ONEOFF_PURCHASES', 'CREDIT_LIMIT']
cluster_means = cc_df.groupby('cluster_id')[profile_columns].mean()
cluster_means.plot.bar()
plt.show()

- Cluster 2 customers use their credit cards the most
- Cluster 0 customers use their credit cards the least
- Cluster 1 customers use their credit cards more than cluster 0 but less than cluster 2

In [None]:
# Purchases against credit limit, with each point coloured by its cluster.
sns.scatterplot(
    data=cc_df,
    x='CREDIT_LIMIT',
    y='PURCHASES',
    hue='cluster_id',
    palette='Set1',
)
plt.show()