In [None]:

import numpy as np 
import pandas as pd 
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
import os

"""
Following is the Data Dictionary for Credit Card dataset :-

CUSTID : Identification of Credit Card holder (Categorical)
BALANCE : Balance amount left in their account to make purchases 
BALANCEFREQUENCY : How frequently the Balance is updated, score between 0 and 1 (1 = frequently updated, 0 = not frequently updated)
PURCHASES : Amount of purchases made from account
ONEOFFPURCHASES : Maximum purchase amount done in one-go
INSTALLMENTSPURCHASES : Amount of purchase done in installment
CASHADVANCE : Cash in advance given by the user
PURCHASESFREQUENCY : How frequently the Purchases are being made, score between 0 and 1 (1 = frequently purchased, 0 = not frequently purchased)
ONEOFFPURCHASESFREQUENCY : How frequently Purchases are happening in one-go (1 = frequently purchased, 0 = not frequently purchased)
PURCHASESINSTALLMENTSFREQUENCY : How frequently purchases in installments are being done (1 = frequently done, 0 = not frequently done)
CASHADVANCEFREQUENCY : How frequently the cash in advance being paid
CASHADVANCETRX : Number of Transactions made with "Cash in Advanced"
PURCHASESTRX : Numbe of purchase transactions made
CREDITLIMIT : Limit of Credit Card for user
PAYMENTS : Amount of Payment done by user
MINIMUM_PAYMENTS : Minimum amount of payments made by user
PRCFULLPAYMENT : Percent of full payment paid by user
TENURE : Tenure of credit card service for user

We are going to utilise clustering techniques to segment the customers based on their credit card details
"""



This dataset describes various credit card users; we can use it to segment different types of customers.

In [None]:
# Read the raw credit-card CSV into a DataFrame and preview its first ten rows.
df = pd.read_csv(filepath_or_buffer="../input/ccdata/CC GENERAL.csv")
df.head(n=10)

In [None]:
# Size of the loaded table as a (rows, columns) tuple.
df.shape


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Find all the categorical and non-numeric variables.

In [None]:
# Print each column's dtype and non-null count to spot non-numeric columns and missing values.
df.info()

Only CUST_ID is of string type, so we will drop it here.

In [None]:
# CUST_ID is only an identifier, so remove that column before any modelling.
df = df.drop(columns=["CUST_ID"])
df.head(n=10)

Now let us convert all the int types to float types.

In [None]:
# Map every remaining column to the float dtype, then cast the whole frame in one call.
datatypes = dict.fromkeys(df.columns, float)

df = df.astype(datatypes)
df.info()

Now we can see that there are some null values in the MINIMUM_PAYMENTS column. We will drop these rows here.

In [None]:
# Discard every row that contains at least one missing value.
# dropna() keeps the surviving rows' original index labels, so after this step
# the index is not guaranteed to be a contiguous 0..n-1 range.
df = df.dropna(axis=0, how="any")
df.info()

We would like to drop the balance frequency (it doesn't relate to customer behaviour) and the tenure, as it is mostly 12 months.

In [None]:
# Remove the two columns that are not used for segmentation.
df = df.drop(columns=["BALANCE_FREQUENCY", "TENURE"])
df.head(n=2)

Now we will try to segment our customers based on these attributes.

In [None]:
# Mean of every column (axis=0 aggregates down the rows, giving one value per column).
df.mean(axis=0) 

In [None]:
# Columns whose mean value is very low are treated as having little significance
# for the segmentation and are removed here.
low_mean_columns = [
    "PURCHASES_FREQUENCY",
    "ONEOFF_PURCHASES_FREQUENCY",
    "PURCHASES_INSTALLMENTS_FREQUENCY",
    "CASH_ADVANCE_FREQUENCY",
    "PRC_FULL_PAYMENT",
    "CASH_ADVANCE_TRX",
    "PURCHASES_TRX",
]
df = df.drop(columns=low_mean_columns)
df.head()

In [None]:
# Standardise every remaining feature so that no single column dominates the
# Euclidean distances used by k-means.
scaler = StandardScaler().fit(df)
X = scaler.transform(df)
X.shape

In [None]:
# Elbow curve: fit k-means for k = 1 .. n_clusters - 1 and record the inertia
# (within-cluster sum of squared distances) of each fit.
n_clusters = 25
cluster_counts = list(range(1, n_clusters))
inertia = []
for n in cluster_counts:
    # Fixed seed so that the curve is identical on every run of the notebook.
    km = KMeans(n_clusters=n, random_state=42)
    km.fit(X)
    inertia.append(km.inertia_)

# Plot against the actual number of clusters; plotting the list alone would
# use its 0-based position as x and shift every point one cluster to the left.
plt.plot(cluster_counts, inertia)
plt.xlabel("Number of clusters")
plt.ylabel("Inertia")
plt.show()

Let us choose 10 as the number of clusters.

In [None]:
# Final model on the standardised features: 10 clusters.
# The seed is fixed so that cluster ids (and therefore the plots and the
# commentary that refer to them) are the same on every run.
km = KMeans(n_clusters=10, random_state=42)
km.fit(X)
km.inertia_

In [None]:
# Attach each row's cluster id as a new "Cluster" column.
# km.labels_ is a plain array in the same row order as df, so it must be
# assigned positionally. Concatenating it as a separate DataFrame would align
# on index labels instead: df keeps its original labels after dropna(), the new
# frame would get 0..n-1, and labels would be paired with the wrong customers
# wherever rows had been dropped.
df_cluster = df.assign(Cluster=km.labels_)
df_cluster.info()
df_cluster.head()


Visualisation of clusters.


In [None]:

# Mean of every feature within each cluster (rows: cluster id, columns: feature).
# One groupby replaces ten hand-written per-cluster subsets and keeps working
# when the number of clusters changes.
cluster_means = df_cluster.groupby("Cluster")[list(df.columns)].mean()

fig, ax = plt.subplots(figsize=(10, 15))
for c in df.columns:
    # One line per feature: x is the cluster id, y is that feature's mean there.
    # Features are drawn in column order, so the default colour cycle is unchanged.
    ax.plot(cluster_means.index, cluster_means[c], label=c)

# Labels and legend are set once, after all lines have been drawn.
ax.set_xlabel("Clusters")
ax.set_ylabel("Mean_values")
ax.legend(loc=2, prop={'size': 8})
plt.show()
    
    
    
    
    
    
    

In [None]:
"""
Trends: Across all clusters.
We can see that credit_card limit(brown) and balances(blue) are directly linked.
The purchases(orange) and cash_advance(violet) are directly linked.
No clear trends between payments(pink) and purchases(orange).
Installments purchases(red) are not linked with balance or purchases,but are inversely linked with oneoff purchases(green).
"""

In [None]:
# Re-standardise the features; PCA is sensitive to the scale of its inputs.
scaler = StandardScaler()
X_PCA = scaler.fit_transform(df)

# Total explained-variance ratio when keeping 1 .. n_dim - 1 principal components.
n_dim = 7
component_counts = list(range(1, n_dim))
explained_variance = []
for n in component_counts:
    pca = PCA(n)
    # Only the fitted model is needed here; the projected data is not used.
    pca.fit(X_PCA)
    explained_variance.append(pca.explained_variance_ratio_.sum())

# Plot against the actual component count; plotting the list alone would use
# its 0-based position as x and shift every point one component to the left.
plt.plot(component_counts, explained_variance)
plt.xlabel("Number of principal components")
plt.ylabel("Explained variance ratio (sum)")
plt.show()

We will choose the number of dimensions as 4 here.


In [None]:
# Project the standardised features onto the first four principal components.
# X_PCA is overwritten with the projected data, so re-running this cell on its
# own would apply PCA to already-projected data.
pca = PCA(n_components=4)
X_PCA = pca.fit_transform(X_PCA)
pca.explained_variance_ratio_


In [None]:
# Fraction of the total variance retained by the kept principal components.
pca.explained_variance_ratio_.sum()

In [None]:
# Elbow curve on the PCA-reduced data: fit k-means for k = 1 .. n_clusters - 1
# and record the inertia of each fit.
n_clusters = 25
cluster_counts = list(range(1, n_clusters))
inertia = []
for n in cluster_counts:
    # Fixed seed so that the curve is identical on every run of the notebook.
    km = KMeans(n_clusters=n, random_state=42)
    # Only the fitted model is needed; the distance matrix from fit_transform is not.
    km.fit(X_PCA)
    inertia.append(km.inertia_)

# Plot against the actual number of clusters; plotting the list alone would
# use its 0-based position as x and shift every point one cluster to the left.
plt.plot(cluster_counts, inertia)
plt.xlabel("Number of clusters")
plt.ylabel("Inertia")
plt.show()

Again we are going to choose the value 10 as the number of clusters.

In [None]:
# Final model on the PCA-reduced features: 10 clusters.
# The seed is fixed so that cluster ids are the same on every run; fit() is
# used because the distance matrix returned by fit_transform() is not needed.
km = KMeans(n_clusters=10, random_state=42)
km.fit(X_PCA)
km.inertia_

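Here we see a substantial drop in inertia compared with clustering on all 8 scaled features. Part of this drop is expected: the principal components that were discarded carry variance that no longer contributes to the inertia.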

In [None]:
# Attach each row's PCA-based cluster id as a new "Cluster_PCA" column.
# km.labels_ is a plain array in the same row order as df, so it must be
# assigned positionally. Concatenating it as a separate DataFrame would align
# on index labels instead: df keeps its original labels after dropna(), the new
# frame would get 0..n-1, and labels would be paired with the wrong customers
# wherever rows had been dropped.
df_cluster_1 = df.assign(Cluster_PCA=km.labels_)
df_cluster_1.head()

Mean values of each attribute for each of the clusters used.

In [None]:

# Mean of every feature within each PCA-based cluster (rows: cluster id,
# columns: feature). One groupby replaces ten hand-written per-cluster subsets
# and keeps working when the number of clusters changes.
cluster_means = df_cluster_1.groupby("Cluster_PCA")[list(df.columns)].mean()

fig, ax = plt.subplots(figsize=(10, 15))
for c in df.columns:
    # One line per feature: x is the cluster id, y is that feature's mean there.
    # Features are drawn in column order, so the default colour cycle is unchanged.
    ax.plot(cluster_means.index, cluster_means[c], label=c)

# Labels and legend are set once, after all lines have been drawn.
ax.set_xlabel("Clusters")
ax.set_ylabel("Mean_values")
ax.legend(loc=2, prop={'size': 8})
plt.show()
    
    
    

In [None]:
"""
We see the same trends here as above.
Only the correlations seem much sharper.
Also here purchases(pink) is much more correlated with balance(blue) and purchases(orange) here.
"""

Thus, applying PCA before clustering lowers the inertia of the k-means clustering.