# **Credit Card Clustering | DBSCAN & KMeans & PCA (PCA used for visualization)**
# The example is based on https://www.kaggle.com/code/youssefelbadry10/credit-card-clustering-dbscan-kmeans-pca/notebook

# Credit Card Dataset for Clustering is taken from https://www.kaggle.com/code/salomopolanco/data-mining-clustering

CUST_ID : Identification of Credit Card holder (Categorical)  

BALANCE : Balance amount left in their account to make purchases  

BALANCE_FREQUENCY : How frequently the Balance is updated, score between 0 and 1 (1 = frequently updated, 0 = not frequently updated)

PURCHASES : Amount of purchases made from account

ONEOFF_PURCHASES : Maximum purchase amount done in one-go

INSTALLMENTS_PURCHASES : Amount of purchase done in installment

CASH_ADVANCE : Cash in advance given by the user

PURCHASES_FREQUENCY : How frequently the Purchases are being made, score between 0 and 1 (1 = frequently purchased, 0 = not frequently purchased)

ONEOFF_PURCHASES_FREQUENCY : How frequently Purchases are happening in one-go (1 = frequently purchased, 0 = not frequently purchased)

PURCHASES_INSTALLMENTS_FREQUENCY : How frequently purchases in installments are being done (1 = frequently done, 0 = not frequently done)

CASH_ADVANCE_FREQUENCY : How frequently the cash in advance is being paid

CASH_ADVANCE_TRX : Number of Transactions made with "Cash in Advance"

PURCHASES_TRX : Number of purchase transactions made

CREDIT_LIMIT : Limit of Credit Card for user

PAYMENTS : Amount of Payment done by user

MINIMUM_PAYMENTS : Minimum amount of payments made by user

PRC_FULL_PAYMENT : Percent of full payment paid by user

TENURE : Tenure of credit card service for user

# **Import Libraries**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

In [None]:
import matplotlib.pyplot as plt 
import mpl_toolkits.mplot3d as Axes3D
import seaborn as sns 
import plotly.express as px 

from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.cluster import KMeans, DBSCAN
import scipy.cluster.hierarchy as sch 
from sklearn.decomposition import PCA 
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import silhouette_score

import warnings 
# Suppress every warning for the rest of the session to keep cell output clean.
# NOTE(review): this also hides deprecation / convergence warnings from the libraries used below.
warnings.filterwarnings("ignore")


# **Preprocessing & Exploration**

In [None]:
# Read the dataset from the current working directory.
df = pd.read_csv("CC GENERAL.csv")
# Bare expression as the last line of the cell so the notebook renders the frame.
df

In [None]:
# Print the column names, dtypes and non-null counts of the frame.
df.info()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Fill the gaps in these two columns with the mean of the respective column.
for col in ('MINIMUM_PAYMENTS', 'CREDIT_LIMIT'):
    df[col] = df[col].fillna(df[col].mean())

In [None]:
# The customer identifier is not a feature, so remove it before any numeric processing.
df = df.drop(columns=['CUST_ID'])

## **Box-Plots**
### To detect Outliers

In [None]:
def plot_boxplots_alternative(df):
    """Draw one horizontal box plot per numeric column of ``df``.

    The plots are laid out on a grid that is at most three columns wide and
    has exactly as many rows as needed. Missing values are dropped per column
    before plotting. If ``df`` has no numeric columns nothing is drawn.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric columns are plotted.
    """
    num_columns = df.select_dtypes(include='number').columns
    num_features = len(num_columns)
    if num_features == 0:
        # A grid with zero columns cannot be created and there is nothing to show.
        return

    cols = min(num_features, 3)
    rows = -(-num_features // cols)  # ceiling division: no spare empty row
    # squeeze=False always returns a 2-D array of Axes, even for a 1x1 grid,
    # so the flatten() below is valid for any number of features.
    fig, axes = plt.subplots(rows, cols, figsize=(20, rows * 6), squeeze=False)
    axes = axes.flatten()

    for ax, col in zip(axes, num_columns):
        # NaNs would turn the quartiles into NaN and leave the box empty.
        ax.boxplot(df[col].dropna().to_numpy(), vert=False, patch_artist=True,
                   boxprops=dict(facecolor='lightblue', color='navy'),
                   medianprops=dict(color='red'))
        ax.set_title(f'Boxplot of {col}', fontsize=14)
        ax.set_xlabel(col)

    # Remove the unused cells of the last grid row.
    for unused_ax in axes[num_features:]:
        fig.delaxes(unused_ax)

    plt.tight_layout()
    plt.show()

plot_boxplots_alternative(df)

## Distributions

In [None]:
# Kernel-density estimate for every non-object column, two plots per row.
numeric_cols = [col for col in df.columns if df[col].dtype != 'object']
# Never fewer than 9 rows (the layout used for the 17 feature columns), but
# grow the grid when there are more columns than 9 x 2 cells can hold.
n_rows = max(9, -(-len(numeric_cols) // 2))
plt.figure(figsize=(20, 35 * n_rows / 9))
for i, col in enumerate(numeric_cols):
    ax = plt.subplot(n_rows, 2, i+1)
    sns.kdeplot(df[col], ax=ax)
    plt.xlabel(col)

plt.show()

# **K-means Clustering With PCA**

In [None]:
# Standardise the features: learn the per-column statistics, then apply them to the same frame.
scaler = StandardScaler().fit(df)
scaled_data = scaler.transform(df)

In [None]:
def plot_elbow_method(scaled_data, max_k=10):
    """Plot K-Means inertia against the number of clusters (elbow method).

    One K-Means model (``random_state=0``) is fitted for every k from 1 to
    ``max_k`` and its inertia is plotted, so the "elbow" can be read off.

    Parameters
    ----------
    scaled_data : array-like
        Feature matrix handed unchanged to ``KMeans.fit``.
    max_k : int, default 10
        Largest number of clusters to try.
    """
    ks = range(1, max_k + 1)
    inertia = [KMeans(n_clusters=k, random_state=0).fit(scaled_data).inertia_ for k in ks]
    plt.figure(figsize=(8, 6))
    # The format string carries only marker and line style; the colour is given
    # once through `color` (naming it in both places makes matplotlib warn).
    plt.plot(ks, inertia, 'o-', markersize=8, color='royalblue')
    plt.xlabel('Number of clusters', fontsize=12)
    plt.ylabel('Inertia', fontsize=12)
    plt.title('Elbow Method for Optimal k', fontsize=14)
    plt.grid(True)
    plt.show()

plot_elbow_method(scaled_data)

In [None]:
# Fit K-Means with the chosen number of clusters and keep one label per row.
optimal_k = 3
kmeans = KMeans(n_clusters=optimal_k, random_state=0)
clusters = kmeans.fit_predict(scaled_data)
# Take an explicit copy: pd.DataFrame(df) is not an independent copy and, on
# some pandas versions, shares its internals with `df`, so the label column
# added below would also appear in `df`.
df_finalresult = df.copy()
df_finalresult['kmeans_cluster'] = clusters

inertia = kmeans.inertia_
print(f"K-Means Inertia for {optimal_k} clusters: {inertia}")
# 2-component PCA of the scaled features, used only for the scatter plots.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(scaled_data)

In [None]:
# Scatter of the two PCA components, coloured by K-Means cluster label.
fig, ax = plt.subplots(figsize=(10, 6))
scatter = ax.scatter(
    pca_result[:, 0],
    pca_result[:, 1],
    c=clusters,
    cmap='coolwarm',
    alpha=0.6,
)
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
ax.set_title('K-Means Clustering Results with PCA')
# One legend entry per cluster id 0 .. optimal_k - 1.
handles, _ = scatter.legend_elements()
legend_labels = [f'Cluster {i}' for i in range(optimal_k)]
ax.legend(handles, legend_labels, title="Clusters")
ax.grid()
plt.show()

In [None]:
# Independent copy of the features with the K-Means label attached; an explicit
# copy() guarantees that adding the column cannot modify `df` itself.
cluster_df = df.copy()
cluster_df['clusters'] = clusters
cluster_df.head(10)

In [None]:
# Silhouette analysis: measures how well each point fits its assigned cluster.
# Evaluated for k = 2 .. 10 (the score needs at least two clusters).
silhouette_scores = []
K = range(1, 11)
for k in K[1:]:
    # Use a separate name for the trial model so the fitted `kmeans` from the
    # earlier cell (the one matching `clusters` / `optimal_k`) is not overwritten.
    trial_kmeans = KMeans(n_clusters=k, random_state=0)
    trial_labels = trial_kmeans.fit_predict(scaled_data)
    score = silhouette_score(scaled_data, trial_labels)
    silhouette_scores.append(score)

plt.plot(K[1:], silhouette_scores, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Method')
plt.show()

### The best Silhouette score is achieved when K = 3

In [None]:
# Refit K-Means with the k suggested by the silhouette scores.
optimal_k2 = 3
kmeans2 = KMeans(n_clusters=optimal_k2, random_state=0)
clusters2 = kmeans2.fit_predict(scaled_data)
# Explicit copy so that adding the label column cannot leak into `df`
# (pd.DataFrame(df) is not an independent copy on every pandas version).
df_finalresult = df.copy()
df_finalresult['kmeans_cluster'] = clusters2

inertia = kmeans2.inertia_
print(f"K-Means Inertia for {optimal_k2} clusters: {inertia}")
# 2-component PCA of the scaled features, used only for the scatter plot.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(scaled_data)

In [None]:
# Scatter of the two PCA components, coloured by the labels of the model fitted
# with `optimal_k2` (i.e. `clusters2`, not the labels of the first K-Means run).
plt.figure(figsize=(10, 6))
scatter = plt.scatter(pca_result[:, 0], pca_result[:, 1], c=clusters2, cmap='coolwarm', alpha=0.6)
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.title('K-Means Clustering Results with PCA')
legend_labels = [f'Cluster {i}' for i in range(optimal_k2)]
handles, _ = scatter.legend_elements()
plt.legend(handles, legend_labels, title="Clusters")
plt.grid()
plt.show()

# **DBSCAN (Density-Based Spatial Clustering of Applications with Noise)**

In [None]:
# DBSCAN hyper-parameters: neighbourhood radius and minimum neighbours for a core point.
# NOTE(review): eps=0.2 on standardised multi-feature data may mark most points
# as noise -- consider confirming the value with a k-distance plot.
eps = 0.2
min_samples = 15

# DBSCAN
dbscan = DBSCAN(eps=eps, min_samples=min_samples)
dbscan_labels = dbscan.fit_predict(scaled_data)
# Explicit copy so that adding the label column cannot leak into `df`
# (pd.DataFrame(df) is not an independent copy on every pandas version).
df_finalresult = df.copy()
df_finalresult['dbscan_cluster'] = dbscan_labels

# Perform PCA
pca = PCA(n_components=2)
pca_result = pca.fit_transform(scaled_data)

plt.figure(figsize=(10, 6))
scatter = plt.scatter(pca_result[:, 0], pca_result[:, 1], c=dbscan_labels, cmap='coolwarm', alpha=0.4, s=0.3)
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.title('DBSCAN Clustering Results with PCA')
unique_labels = set(dbscan_labels)
# Legend handles come back in ascending label order, so the label texts must be
# sorted too (a set has no guaranteed order). num=None asks for one handle per
# distinct label instead of the default subsampling when there are many labels.
legend_labels = [f"Cluster {label}" if label != -1 else "Outliers" for label in sorted(unique_labels)]
handles, _ = scatter.legend_elements(num=None)
plt.legend(handles, legend_labels, title="Clusters")
plt.grid()
plt.show()