# Data Description
* CUSTID : Identification of Credit Card holder (Categorical)
* BALANCE : Balance amount left in their account to make purchases
* BALANCEFREQUENCY : How frequently the Balance is updated, score between 0 and 1 (1 = frequently updated, 0 = not frequently updated)
* PURCHASES : Amount of purchases made from account
* ONEOFFPURCHASES : Maximum purchase amount done in one-go
* INSTALLMENTSPURCHASES : Amount of purchase done in installment
* CASHADVANCE : Cash in advance given by the user
* PURCHASESFREQUENCY : How frequently the Purchases are being made, score between 0 and 1 (1 = frequently purchased, 0 = not frequently purchased)
* ONEOFFPURCHASESFREQUENCY : How frequently Purchases are happening in one-go (1 = frequently purchased, 0 = not frequently purchased)
* PURCHASESINSTALLMENTSFREQUENCY : How frequently purchases in installments are being done (1 = frequently done, 0 = not frequently done)
* CASHADVANCEFREQUENCY : How frequently the cash in advance is being paid
* CASHADVANCETRX : Number of transactions made with "Cash in Advance"
* PURCHASESTRX : Number of purchase transactions made
* CREDITLIMIT : Limit of Credit Card for user
* PAYMENTS : Amount of Payment done by user
* MINIMUM_PAYMENTS : Minimum amount of payments made by user
* PRCFULLPAYMENT : Percent of full payment paid by user
* TENURE : Tenure of credit card service for user


# Import Libraries

In [None]:
#import EDA tools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#import modeling tools and metrics
from sklearn.cluster import KMeans, SpectralClustering, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture 
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import davies_bouldin_score

#ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Load Dataset

In [None]:
# Read the credit-card CSV into a DataFrame and preview it.
DATA_PATH = "../input/ccdata/CC GENERAL.csv"
df = pd.read_csv(DATA_PATH)
df

# Data Cleaning

In [None]:
# Count the missing entries in every column.
df.isnull().sum()

In [None]:
# Drop the customer identifier: it is a label, not a feature to cluster on.
df1 = df.drop(columns='CUST_ID')

# Fill each gap from the next valid row below it. `DataFrame.bfill()` replaces
# the deprecated `fillna(method='bfill')` spelling. Back-fill cannot reach a gap
# in the trailing rows (there is no later value to copy), so a forward-fill
# pass afterwards closes any that remain.
df1 = df1.bfill().ffill()

# Alternative kept for reference: impute with the column mean instead.
#df1["MINIMUM_PAYMENTS"].fillna(df1["MINIMUM_PAYMENTS"].mean(), inplace=True)
#df1["CREDIT_LIMIT"].fillna(df1["CREDIT_LIMIT"].mean(), inplace=True)

In [None]:
#Detecting outliers
# For every numeric column, print the percentage of rows that fall outside the
# open interval (5th percentile, 95th percentile).
n_rows = df1.shape[0]
for col in df1.select_dtypes(include=['float64','int64']).columns:
    max_threshold = df1[col].quantile(0.95)
    min_threshold = df1[col].quantile(0.05)
    # Strict comparisons: values equal to a threshold are counted as outliers,
    # so columns with many tied values can report well above 10%.
    inside_band = (df1[col] < max_threshold) & (df1[col] > min_threshold)
    # Only the count is needed. No DataFrame-like name is rebound here, so
    # re-running this cell cannot clobber the filtered frame built later.
    n_outliers = n_rows - int(inside_band.sum())
    print(" outlier in ",col,"is" ,int((n_outliers/n_rows)*100),"%")

In [None]:
#remove outliers from columns having nearly 10% outlier
# Keep only the rows whose BALANCE, CREDIT_LIMIT and PAYMENTS all lie strictly
# between that column's 5th and 95th percentiles.
lower_q, upper_q = 0.05, 0.95
balance = df1["BALANCE"]
credit_limit = df1["CREDIT_LIMIT"]
payments = df1["PAYMENTS"]

min_threshold_BALANCE = balance.quantile(lower_q)
max_threshold_BALANCE = balance.quantile(upper_q)
min_threshold_CREDIT_LIMIT = credit_limit.quantile(lower_q)
max_threshold_CREDIT_LIMIT = credit_limit.quantile(upper_q)
min_threshold_PAYMENTS = payments.quantile(lower_q)
max_threshold_PAYMENTS = payments.quantile(upper_q)

keep_rows = (
    credit_limit.lt(max_threshold_CREDIT_LIMIT)
    & credit_limit.gt(min_threshold_CREDIT_LIMIT)
    & balance.lt(max_threshold_BALANCE)
    & balance.gt(min_threshold_BALANCE)
    & payments.lt(max_threshold_PAYMENTS)
    & payments.gt(min_threshold_PAYMENTS)
)
df1_no_outlier = df1[keep_rows]

In [None]:
#Normalizing the Data
# NOTE: with its default arguments sklearn's `normalize` rescales each ROW
# (customer) to unit L2 norm; it is not a per-feature scaler.
# `normalize` returns a bare array, so the original column names are passed
# back in to keep the features identifiable in later tables and plots.
normalized_df = pd.DataFrame(normalize(df1_no_outlier), columns=df1_no_outlier.columns)
normalized_df.head()

In [None]:
#Correlation matrix
# `plt.subplots` returns a (figure, axes) pair; unpack it so `fig` really is the
# figure, and draw the heatmap on that axes explicitly rather than relying on
# whichever axes happens to be current.
fig, ax = plt.subplots(figsize=(15,8))

sns.heatmap(normalized_df.corr(),
            annot=True,
            fmt="0.2f",
            cmap="inferno",
            ax=ax)

##### Dimension Reduction (PCA)

In [None]:
# Fit PCA keeping every component so the explained-variance spectrum can be inspected.
pca = PCA()
pcadf = pd.DataFrame(pca.fit_transform(normalized_df))

In [None]:
# Cumulative explained-variance ratio against the number of components kept.
cumulative_variance = pca.explained_variance_ratio_.cumsum()
# 1-based counts derived from the fitted PCA, so the x axis reads as "number of
# components" and the ticks always cover every component.
component_counts = np.arange(1, len(cumulative_variance) + 1)

plt.subplots(figsize=(12,6))
plt.plot(component_counts, cumulative_variance)
plt.xticks(component_counts)
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance ratio');

* Conclusion : We should choose 2 as n_components

In [None]:
# Project the normalized data onto its first two principal components.
pca = PCA(n_components=2)
pcadf = pd.DataFrame(pca.fit_transform(normalized_df), columns=['PC1', 'PC2'])

pcadf.head()

In [None]:
# Show the fraction of total variance captured by each retained principal component.
pca.explained_variance_ratio_

In [None]:
# Running total of the explained-variance ratios of the retained components.
np.cumsum(pca.explained_variance_ratio_)

# Train model
### Choosing k as number of clusters

`Elbow method`
* This method helps us to choose the right k for number of clusters

`Silhouette Score`
* This metric can help us to evaluate our performance and find out the best k for number of clusters

In [None]:
#using elbow rule on pcadf (PCA applied)
# Fit KMeans for k = 1..9 and record the inertia of each fit. The seed is fixed
# so the curve is reproducible between runs and matches the seeded KMeans fits
# used in the rest of the notebook.
K = range(1,10)
inertia = [KMeans(n_clusters=k, random_state=42).fit(pcadf).inertia_ for k in K]

plt.figure(figsize=(15,6))
plt.plot(K, inertia, 'bx-')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')
plt.title('The Elbow Method \n(PCA applied)')
plt.show()

In [None]:
#Creating evaluate function to evaluate our models performance
def evaluate(model, data, silhouette_visualizer=False):
    """
    Fit a clustering model and print its Silhouette and Davies-Bouldin scores.

    Parameters:
    model : estimator exposing `fit_predict(data)`; it is (re)fitted by this call.
    data : the samples to cluster and to score the resulting labels on.
    silhouette_visualizer : default is False; pass True to also draw the
        yellowbrick silhouette diagram for `model`.

    Returns:
    None. The scores are printed, formatted to two decimals. If the model
    assigns every sample to a single cluster the scores are undefined and a
    message is printed instead of raising.
    """
    if silhouette_visualizer:
        # Imported lazily so yellowbrick is only required when a plot is requested.
        from yellowbrick.cluster import SilhouetteVisualizer

        visualizer = SilhouetteVisualizer(model, colors='yellowbrick')
        visualizer.fit(data)
        # Finalize the figure (title, axis labels, legend) and render it.
        visualizer.show()

    labels = model.fit_predict(data)

    # Both metrics require at least two distinct labels; report that plainly
    # rather than letting the metric functions raise in the middle of a sweep.
    if len(set(labels)) < 2:
        print("Silhouette Score : undefined (fewer than 2 clusters found)")
        print("Davies Bouldin Score : undefined (fewer than 2 clusters found)")
        return

    print(f"Silhouette Score : {silhouette_score(data, labels):.2f}")
    print(f"Davies Bouldin Score : {davies_bouldin_score(data, labels):.2f}")

### KMeans

In [None]:
# Score seeded KMeans fits for 2 through 10 clusters.
for n_clusters in range(2, 11):
    print(f"Number of Clusters : {n_clusters}")
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    evaluate(kmeans, pcadf)
    print("====================")

In [None]:
# Silhouette diagram and scores for the 3-cluster KMeans model.
kmeans = KMeans(random_state=42, n_clusters=3)
evaluate(model=kmeans, data=pcadf, silhouette_visualizer=True)

* Conclusion : We can accept KMeans with n_clusters = 3 because the cluster sizes are roughly similar.

  Let's try other estimators and choose the best one!

### GaussianMixture

In [None]:
# Score seeded Gaussian mixture fits for 2 through 10 components.
for n_components in range(2, 11):
    print(f"Number of Components : {n_components}")
    gmm = GaussianMixture(n_components=n_components, random_state=42)
    evaluate(gmm, pcadf)
    print("====================")

### Spectral Clustering

In [None]:
#training model and evaluation
# Score spectral clustering (RBF affinity) for 2 through 10 clusters. The seed
# is fixed so the scores are reproducible and comparable with the seeded
# KMeans and GaussianMixture sweeps.
for i in range(2,11):
    spc = SpectralClustering(n_clusters=i, affinity='rbf', random_state=42)
    print("Number of Clusters : " + str(i))
    evaluate(spc, pcadf)
    print("====================")

### Agglomerative Clustering

In [None]:
#train model and evaluate the performance
# Score agglomerative clustering for 2 through 10 clusters under three linkages.
linkages = ["ward", "single", "average"]
for i in range(2,11):
    for lk in linkages:
        # The distance is left at its default (euclidean). The `affinity=`
        # keyword that used to select it was deprecated in scikit-learn 1.2 and
        # later removed, so passing it fails on current releases.
        hcluster = AgglomerativeClustering(n_clusters=i, linkage=lk)
        print("Number of Clusters : " + str(i) + "  linkage : "+lk)
        evaluate(hcluster,pcadf)
        print("====================")

The best case is:

Number of Clusters = 3 and linkage = ward

# Clustering Visualization

In [None]:
#create plot model clustering function
def plot_model_clustering(model, data):
    """
    Fit `model` on `data`, scatter-plot the samples coloured by cluster label,
    then print the model's scores via `evaluate`.

    Parameters:
    model : clustering estimator exposing `fit_predict(data)`.
    data : DataFrame with 'PC1' and 'PC2' columns; it is copied, not modified.
    """
    plt.figure(figsize=(15, 7))

    # `fit_predict` works for every estimator used in this notebook, including
    # ones that do not expose a `labels_` attribute after fitting.
    labels = model.fit_predict(data)
    df_label = data.copy()
    df_label['labels'] = labels

    ax = sns.scatterplot(x='PC1', y='PC2', hue='labels', data=df_label, palette='bright')
    # Use the estimator's class name as the title instead of slicing its repr,
    # which breaks when the repr has no '(' or has nested parentheses.
    ax.set_title(type(model).__name__)

    evaluate(model, data)

In [None]:
# Plot the clusters found by the current `kmeans` estimator on the PC1/PC2 plane and print its scores.
plot_model_clustering(kmeans, pcadf)

In [None]:
# Ward-linkage agglomerative clustering with 3 clusters. The distance is left at
# its default (euclidean); the `affinity=` keyword that used to select it was
# deprecated in scikit-learn 1.2 and later removed.
hcluster = AgglomerativeClustering(n_clusters=3, linkage="ward")
plot_model_clustering(hcluster, pcadf)

### Last Note : Both KMeans and Agglomerative Clustering with 3 clusters are good choices; however, based on the Silhouette Score, KMeans is the best one!