# 1. Data Description
* CUST_ID : Identification of Credit Card holder (Categorical)
* BALANCE : Balance amount left in their account to make purchases
* BALANCEFREQUENCY : How frequently the Balance is updated, score between 0 and 1 (1 = frequently updated, 0 = not frequently updated)
* PURCHASES : Amount of purchases made from account
* ONEOFF_PURCHASES : Maximum purchase amount done in one-go
* INSTALLMENTSPURCHASES : Amount of purchase done in installment
* CASHADVANCE : Cash in advance given by the user
* PURCHASESFREQUENCY : How frequently the Purchases are being made, score between 0 and 1 (1 = frequently purchased, 0 = not frequently purchased)
* ONEOFFPURCHASESFREQUENCY : How frequently Purchases are happening in one-go (1 = frequently purchased, 0 = not frequently purchased)
* PURCHASESINSTALLMENTSFREQUENCY : How frequently purchases in installments are being done (1 = frequently done, 0 = not frequently done)
* CASHADVANCEFREQUENCY : How frequently cash in advance is being paid
* CASHADVANCETRX : Number of transactions made with "Cash in Advance"
* PURCHASESTRX : Number of purchase transactions made
* CREDIT_LIMIT : Limit of Credit Card for user
* PAYMENTS : Amount of Payment done by user
* MINIMUM_PAYMENTS : Minimum amount of payments made by user
* PRCFULLPAYMENT : Percent of full payment paid by user
* TENURE : Tenure of credit card service for user

# 2. EDA tool install(dataprep)

In [None]:
!pip install dataprep
%%hide output

# 3. Load library

In [None]:
import pandas as pd
import numpy as np
import math

from dataprep.eda import create_report

from sklearn.cluster import KMeans,DBSCAN
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import matplotlib.pyplot as plt
import matplotlib.cm as cm

# 4. Check dataframe and EDA

In [None]:
def countnull(data):
    """Print the number of missing values in each column of *data*.

    The per-column counts are laid out as a single-row table (one column per
    column of *data*) and written to stdout. Nothing is returned.
    """
    missing_per_column = data.isnull().sum(axis=0)
    print(missing_per_column.to_frame().T)

In [None]:
# Load the credit-card dataset, then show its first five rows and its shape.
df = pd.read_csv('../input/ccdata/CC GENERAL.csv')
for preview in (df.head(5), df.shape):
    print(preview)

In [None]:
# Build the dataprep EDA report for the raw data; the bare expression on the
# last line keeps it as the cell's output value.
eda_report = create_report(df)
eda_report

> Save EDA result (format : HTML)

In [None]:
# Build the EDA report again and write it to disk under ./EDA.
eda_report = create_report(df)
eda_report.save('./EDA')

In [None]:
# Print the per-column count of missing values in the raw data.
countnull(df)

# 5. Preprocessing (Remove NaN values, MinMaxScaler, PCA)

In [None]:
# Drop the customer identifier and the MINIMUM_PAYMENTS column in place;
# neither is used for clustering.
df.drop(columns=['CUST_ID', 'MINIMUM_PAYMENTS'], inplace=True)

In [None]:
# Remove, in place, every row that still contains a missing value.
df.dropna(axis='index', inplace=True)

In [None]:
# Disabled alternative: scaling with StandardScaler.
# NOTE(review): the disabled snippet transforms `df_sc`, which is not defined
# anywhere in the visible notebook; it would have to use `df` if re-enabled.
# sc = StandardScaler()
# df_sc = sc.fit_transform(df_sc)

# Scale the cleaned features with MinMaxScaler (default settings); `df_mc` is
# the input to the PCA steps that follow.
mc = MinMaxScaler()
df_mc = mc.fit_transform(df)

In [None]:
# Exploratory PCA fit with 11 components; its explained-variance figures are
# inspected below to choose the final number of components.
pca = PCA(n_components=11)
df_mc_pca = pca.fit_transform(df_mc)

In [None]:
# Print the main attributes of the fitted PCA, one labelled entry per line.
pca_summary = (
    ('singular value : ', pca.singular_values_),
    ('singular vector :\n ', pca.components_.T),
    ('eigen_value : ', pca.explained_variance_),
    ('explained variance ratio : ', pca.explained_variance_ratio_),
)
for label, values in pca_summary:
    print(label, values)

> How to select n_components for PCA: keep the smallest number of components whose cumulative explained variance ratio reaches 95%

In [None]:
# Pick the smallest number of components whose cumulative explained variance
# ratio reaches 95%.
cumsum = np.cumsum(pca.explained_variance_ratio_)
print(cumsum)
reached = cumsum >= 0.95
# np.argmax on an all-False mask returns 0, which would silently give d = 1.
# If the fitted components never reach the threshold, keep all of them.
d = int(np.argmax(reached)) + 1 if reached.any() else len(cumsum)
print("dimension : ", d)

In [None]:
# Refit PCA with the selected number of components `d`; this overwrites both
# `pca` and `df_mc_pca` from the exploratory fit.
pca = PCA(n_components=d)
df_mc_pca = pca.fit_transform(df_mc)

# 6. Optimize the number of clusters (Elbow method)

In [None]:
# Find the optimal number of clusters using the elbow method: fit K-Means for
# k = 1..10 and record the within-cluster sum of squares (the fitted model's
# `inertia_`).

k_values = range(1, 11)
WCSS = []

for i in k_values:
    # Fixed seed so the curve is reproducible between runs, like the other
    # KMeans fits in this notebook.
    model = KMeans(n_clusters=i, init='k-means++', random_state=123)
    model.fit(df_mc_pca)
    WCSS.append(model.inertia_)

fig = plt.figure(figsize=(14, 7))
plt.plot(k_values, WCSS, linewidth=4, markersize=12, marker='o', color='green')
# One tick per evaluated k; the previous np.arange(11) added a stray tick at 0.
plt.xticks(list(k_values))
plt.xlabel("Number of clusters")
plt.ylabel("WCSS")
plt.show()

# 7. Optimize the number of clusters (Silhouette score)

In [None]:
def visualize_silhouette(cluster_lists, X_features):
    """Draw one silhouette plot per candidate cluster count, side by side.

    For each value in *cluster_lists* a K-Means model (k-means++ init,
    ``max_iter=500``, ``random_state=0``) is fitted on *X_features*. Each
    subplot shows the sorted per-sample silhouette coefficients grouped by
    cluster, with the mean coefficient drawn as a dashed red vertical line and
    reported in the subplot title.

    Parameters
    ----------
    cluster_lists : sequence of int
        Candidate numbers of clusters, one subplot each. An empty sequence
        draws nothing.
    X_features : array-like
        Feature matrix to cluster, one row per sample.

    Returns
    -------
    None
        The figure is created through pyplot and is not returned.
    """
    n_cols = len(cluster_lists)
    if n_cols == 0:
        # Nothing to plot; plt.subplots cannot build a zero-column grid.
        return

    # squeeze=False always yields a 2-D array of Axes, so the same indexing
    # works when only a single cluster count is requested.
    _, axs = plt.subplots(figsize=(4 * n_cols, 4), nrows=1, ncols=n_cols,
                          squeeze=False)
    n_samples = len(X_features)

    for ax, n_cluster in zip(axs[0], cluster_lists):
        clusterer = KMeans(n_clusters=n_cluster, max_iter=500, random_state=0,
                           init='k-means++')
        cluster_labels = clusterer.fit_predict(X_features)

        # The silhouette score is the mean of the per-sample coefficients, so
        # it is derived from them instead of computing the distances twice.
        sil_values = silhouette_samples(X_features, cluster_labels)
        sil_avg = sil_values.mean()

        ax.set_title('Number of Cluster : ' + str(n_cluster) + '\n'
                     'Silhouette Score :' + str(round(sil_avg, 3)))
        ax.set_xlabel("The silhouette coefficient values")
        ax.set_ylabel("Cluster label")
        ax.set_xlim([-0.1, 1])
        # Leave a 10-unit gap below, between and above the cluster bands.
        ax.set_ylim([0, n_samples + (n_cluster + 1) * 10])
        ax.set_yticks([])  # Clear the yaxis labels / ticks
        ax.set_xticks([0, 0.2, 0.4, 0.6, 0.8, 1])

        y_lower = 10
        for i in range(n_cluster):
            # Sorted coefficients of the samples assigned to cluster i.
            ith_cluster_sil_values = np.sort(sil_values[cluster_labels == i])

            size_cluster_i = ith_cluster_sil_values.shape[0]
            y_upper = y_lower + size_cluster_i

            color = cm.nipy_spectral(float(i) / n_cluster)
            ax.fill_betweenx(np.arange(y_lower, y_upper), 0,
                             ith_cluster_sil_values,
                             facecolor=color, edgecolor=color, alpha=0.7)
            # Label the band with its cluster index at mid-height.
            ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
            y_lower = y_upper + 10

        ax.axvline(x=sil_avg, color="red", linestyle="--")

In [None]:
# Compare silhouette plots for 2 to 10 clusters on the PCA-reduced data.
visualize_silhouette([2, 3, 4, 5, 6, 7, 8, 9, 10], df_mc_pca)

In [None]:
# Final K-Means model: 3 clusters fitted on the PCA-reduced data, with a
# fixed seed so the labels are reproducible.
model = KMeans(n_clusters = 3, init = 'k-means++', random_state=123)
model.fit(df_mc_pca)

In [None]:
# Attach the K-Means label of each row to the DataFrame. Rows with missing
# values were already dropped from `df` before scaling, so the labels line up.
# NOTE(review): the object cast is presumably there so Plotly colours the
# clusters as discrete categories rather than on a continuous scale — confirm.
df['cluster'] = model.labels_.astype(object)

# 8. Visualization (Plotly)

In [None]:
# K-Means clusters in the ONEOFF_PURCHASES vs. PURCHASES plane.
fig = px.scatter(
    data_frame=df,
    x="ONEOFF_PURCHASES",
    y="PURCHASES",
    color="cluster",
)
fig.show()

In [None]:
# K-Means clusters in the CREDIT_LIMIT vs. PURCHASES plane.
fig = px.scatter(
    data_frame=df,
    x='CREDIT_LIMIT',
    y='PURCHASES',
    color="cluster",
)
fig.show()

In [None]:
# K-Means clusters in the BALANCE vs. PURCHASES plane.
fig = px.scatter(
    data_frame=df,
    x="BALANCE",
    y="PURCHASES",
    color="cluster",
)
fig.show()

# 9. DBSCAN

In [None]:
# Density-based clustering of the PCA-reduced data (Euclidean distance,
# eps=0.6, at least 10 samples per neighbourhood).
model_db = DBSCAN(eps=0.6, min_samples=10, metric='euclidean')
model_db.fit(df_mc_pca)
# Store the labels shifted up by one.
# NOTE(review): presumably this moves DBSCAN's noise label (-1 by scikit-learn
# convention) to 0, so noise points show up as "cluster 0" — confirm intent.
df['cluster_DB'] = model_db.labels_.astype(object)+1

In [None]:
# DBSCAN clusters in the ONEOFF_PURCHASES vs. PURCHASES plane.
fig = px.scatter(
    data_frame=df,
    x="ONEOFF_PURCHASES",
    y="PURCHASES",
    color="cluster_DB",
)
fig.show()

Work in progress....

Please upvote if you like this.