In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import sklearn.preprocessing as pp
from sklearn.cluster import KMeans, AgglomerativeClustering, Birch
from sklearn.metrics import silhouette_score
import seaborn as sns
import scipy.cluster.hierarchy as shc
import nbformat
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Abstract
Clustering is an effective technique used to find logical groupings in data. For this paper I will explore at length the k-means clustering approach in an effort to identify customer segments in credit card data. I will then use the BIRCH & Agglomerative clustering approaches to compare their treatment of the data with the k-means treatment of the data. I will then close with a condensed version of some lessons learned while putting this paper together.

# Introduction
This document contains a cursory look at clustering techniques. In this document, I attempt to use various clustering techniques to identify customer “clusters” in consumer credit card data while observing how each technique handles the data set and how the results may be similar or dissimilar. This document provides a walk-thru of the data treatment and methods used to discern the number of clusters. The purpose of this analysis is to identify different customer personas in the credit card data.

In this document, I walk through an analysis of the data with a K-Means clustering approach complete with an interpretation of the data clusters. Following that analysis is a general comparison of how the K Means, Birch, and Agglomerative clustering approaches differ in their treatment of the data.

The data for this document is a public dataset of consumer credit card data published on Kaggle [1]. The dataset is accompanied by several user-submitted kernels that provide some starting points and insight for this analysis [2]. For this document, I have noted the most useful kernels in the references section of this document. My work differs in that it combines several approaches not contained in a single kernel. My treatment of the data and analysis of the results also varies significantly from any single kernel published on Kaggle. My analysis does not align with other approaches. My steps to prepare the data, identify the number of clusters and interpretations of the results will, in some cases, differ significantly. I also provide details concerning my data preparation steps not shared in the public kernels. Additionally, my treatment of the data provides a direct comparison of the output of the various clustering approaches.

# Data
The consumer credit card data consists of 8950 rows with 18 columns. 

In [None]:
# Load the credit card dataset from the Kaggle input directory.
raw_data = pd.read_csv(
    "/kaggle/input/ccdata/CC GENERAL.csv",
)

In [None]:
# Report the dataset dimensions as (rows, columns), then preview the leading records.
dimensions = raw_data.shape
print(dimensions)
raw_data.head(n=5)  # first 5 records

# Data Preparation

Columns from this point will be referenced as features. The data contained 314 null values that were imputed with mean values from their respective columns. MINIMUM_PAYMENTS had 313 null values, which were imputed with 864.21. CREDIT_LIMIT had 1 null value, which was imputed with 4494.45.

For this analysis I chose to use all of the features minus the CUST_ID feature. The CUST_ID is specific to an individual and does not help with this analysis. However, the CUST_ID could provide a connection to more demographic information that could help create better clusters and better customer segmentation.

I did explore trying to reduce the number of features by removing all features with low variance and features with a high level of correlation. The approach did reduce 7 additional features, but the reduction of the features made interpreting the clusters less intuitive in the end; or at least harder to express in terms of customer segments.

In [None]:
# Impute missing values with the column mean. The filled column is assigned
# back to the frame rather than calling fillna(..., inplace=True) on a column
# selection: that form is chained assignment, which pandas deprecates and
# which does not modify the parent frame under copy-on-write.
for column in ('MINIMUM_PAYMENTS', 'CREDIT_LIMIT'):
    raw_data[column] = raw_data[column].fillna(raw_data[column].mean())

# Count remaining nulls per column. Printed explicitly because a notebook cell
# only auto-displays its final expression.
print(raw_data.isnull().sum().sort_values(ascending=False))

# Drop CUST_ID from the working data. errors='ignore' keeps the cell safe to
# re-run once the column has already been removed.
raw_data = raw_data.drop(columns='CUST_ID', errors='ignore')

In [None]:
# Summary statistics for every column of the working data.
summary_stats = raw_data.describe()
summary_stats

In [None]:
# Low-variance filter: keep only the features whose variance reaches the
# threshold. The threshold is an absolute variance value, not a percentage.
variance_threshold = 10
var = raw_data.var()      # per-feature variance, indexed by column name
cols = raw_data.columns   # columns

# Iterate (label, variance) pairs so every variance is matched with its own
# column name; a positional offset into `cols` would attach each variance to a
# neighbouring column.
variable1 = [name for name, value in var.items() if value >= variance_threshold]

print(variable1)

# Build the reduced frame from the computed selection so the two cannot drift apart.
filtered_data = raw_data[variable1].copy()

filtered_data.info()

# Data Preparation for K-Means Clustering

For K-Means clustering, the algorithm works best with variables with normalized data with the same variance and standard deviations. As an initial check, I investigated the skewness of a subset of my variables. I found that most (if not all) of the features not normalized between 0 and 1 are skewed.

Based on my analysis of skewness, I then log-transformed the data. I then evaluated the mean and standard deviations of my data. The data showed that I still needed to center my data. To center my data, I used sklearn.preprocessing.StandardScaler on my data. This process has the effect of subtracting the mean values from each observation in each feature [3].

The final step was to normalize my data. Normalizing my data kept the means and standard deviations uniform across the data set while scaling my data to have unit norm [4].

I did not explicitly address outliers in my data set.

I also chose to re-use this dataset as prepared for the other clustering techniques applied to this data for consistency. I also assume that since these approaches are all measured clustering approaches that the data preparation should transfer. In a real-world application, my assumption would need to be validated.

In [None]:
# Compare the raw and log-transformed distribution of four dollar-valued
# features. Layout is a 2 x 4 grid: each feature occupies two neighbouring
# panels, raw data first and log data second.
figure = plt.figure(figsize=(16, 9))

# (column name, human-readable label used on the x axis)
skew_check_features = (
    ('PURCHASES', 'Purchases'),
    ('BALANCE', 'Balance'),
    ('CREDIT_LIMIT', 'Credit Limit'),
    ('MINIMUM_PAYMENTS', 'Minimum Payments'),
)

for position, (column, label) in enumerate(skew_check_features):
    raw_axis = figure.add_subplot(2, 4, 2 * position + 1)
    log_axis = figure.add_subplot(2, 4, 2 * position + 2)

    # NOTE(review): sns.distplot is deprecated in recent seaborn releases in
    # favour of histplot/displot; kept here so the rendered charts do not change.
    sns.distplot(raw_data[column], axlabel=f'{label} Raw Data', ax=raw_axis)

    # log1p(x) == log(1 + x); the +1 keeps zero-valued entries finite.
    sns.distplot(np.log1p(raw_data[column]), axlabel=f'{label} Log Data', ax=log_axis)

figure.tight_layout()

Many of the features have values that are either 1 or 0. However, the features that deal with dollar figures vary quite a bit. I chose to log transform these values to reduce their scale and bring them closer to a normal distribution. This also helps with scaling and grouping the data when analyzing the clusters, because it has the effect of grouping the values into a discernible range.

The final step in preparing the data was transforming the data with sklearn.preprocessing.StandardScaler and sklearn.preprocessing.normalize.

In [None]:
# Apply a log(1 + x) transform to the dollar-amount and transaction-count features.
cols = [
    'BALANCE', 'PURCHASES', 'ONEOFF_PURCHASES', 'INSTALLMENTS_PURCHASES',
    'CASH_ADVANCE', 'CASH_ADVANCE_TRX', 'PURCHASES_TRX', 'CREDIT_LIMIT',
    'PAYMENTS', 'MINIMUM_PAYMENTS',
]

# Shift by 1 first so zero-valued entries become log(1) = 0 instead of -inf.
shifted = raw_data[cols] + 1
raw_data[cols] = np.log(shifted)

raw_data.head()

In [None]:
# Fit the standard scaler on the prepared data, then apply it to the same data.
scaler = pp.StandardScaler()
scaler.fit(raw_data)
scaled_df = scaler.transform(raw_data)

# Normalize the scaled array and wrap the resulting numpy array in a DataFrame
# (columns are positional integers, not the original feature names).
normalized_df = pd.DataFrame(pp.normalize(scaled_df))

normalized_df.describe()

# Clustering Approaches

Cluster analysis or clustering is the task of grouping a set of objects in such a way that objects in the same group (called a cluster) are more similar (in some sense) to each other than to those in other groups (clusters). It is a main task of exploratory data mining, and a common technique for statistical data analysis, used in many fields, including machine learning, pattern recognition, image analysis, information retrieval, bioinformatics, data compression, and computer graphics[5].

For this data set and this analysis, we are attempting to cluster credit card data to identify customer segments based on their credit card transaction history. 
 
The first approach in attempting to cluster the data is the K-Means clustering approach complete with an interpretation of the data clusters. Following that analysis is a general comparison of how the K Means, Birch, and Agglomerative clustering approaches differ in their treatment of the data.

## K-Means Clustering
k-means clustering is a method of vector quantization, originally from signal processing, that is popular for cluster analysis in data mining. k-means clustering aims to partition n observations into k clusters in which each observation belongs to the cluster with the nearest mean, serving as a prototype of the cluster.[6]

To identify a general estimate of the number of clusters to consider for the k-means approach, I created an elbow chart using the Elbow method. The Elbow method is a heuristic method of interpretation and validation of consistency within cluster analysis designed to help finding the appropriate number of clusters in a dataset. It is often ambiguous and not very reliable, and hence other approaches for determining the number of clusters such as the Silhouette method are preferable. [7]

I then evaluated the silhouette scores for a range of cluster options based on the elbow chart; figure 1. The silhouette measures how similar a point is in the cluster compared to other clusters in a range of -1 to 1. [8]

In [None]:
figure = plt.figure(figsize=(16, 9))
elbow = figure.add_subplot(1, 2, 1)      # elbow chart
kmean_sil = figure.add_subplot(1, 2, 2)  # silhouette bar chart

# --- Elbow chart: K-Means inertia (sum of squared errors) for k = 1 .. n_clusters - 1
n_clusters = 10
elbow_k = list(range(1, n_clusters))
cost = []
for i in elbow_k:
    kmean = KMeans(i)
    kmean.fit(normalized_df)
    cost.append(kmean.inertia_)

# Draw the curve once, against the real k values, so the x axis reads as the
# number of clusters (plotting `cost` alone would place k=1 at x=0).
elbow.plot(elbow_k, cost, 'bx-')
elbow.set_ylabel('Sum of Squared Errors', fontsize=15)
elbow.set_xlabel('Number of Clusters', fontsize=15)
elbow.set_title('K-MEANS Clustering SSE: Elbow Chart', fontsize=15)

# --- Silhouette score for k = 2 .. 7 (the score is undefined for a single cluster)
k = list(range(2, 8))
silhouette_scores = [
    silhouette_score(normalized_df, KMeans(n_clusters=n_cluster).fit_predict(normalized_df))
    for n_cluster in k
]

# Plotting a bar graph to compare the results
kmean_sil.bar(k, silhouette_scores)
kmean_sil.set_title('K-MEANS: Number of Clusters vs. Silhouette Score', fontsize=15)
kmean_sil.set_xlabel('Number of Clusters', fontsize=15)
kmean_sil.set_ylabel('Silhouette Score', fontsize=15)

The elbow chart appears to recommend somewhere between 3 and 5 potential clusters. The silhouette score for 2 clusters is the highest with the rest of the scores from 3 to 7 clusters being very close to equal. 

In the effort to pick the ideal number of clusters, we can use the mathematical approaches outlined in the charts. Additionally, in making this decision having expertise with data and the domain it represents is very beneficial in the analysis process. In this case, I would like to see some variety without trying to spread the analysis too thin. I am interested in more general groupings than I am in finding niche groups in the fictional customer segment. For this analysis and the rest of this effort, I did my work with 3 clusters.

The number of features associated with this data makes seeing the data very difficult. After trying several methods, I landed on snake plots and heatmaps as the most useful visuals to compare the clusters holistically.

In [None]:
# Final K-Means model with 3 clusters. A fixed random_state keeps the cluster
# numbering stable between runs, which the written interpretation of the
# individual clusters relies on.
kmean = KMeans(n_clusters=3, random_state=42)
kmean.fit(normalized_df)
labels = kmean.labels_

# Attach the labels to the prepared feature table as a trailing 'cluster' column.
# assign() places the label array positionally, so it does not depend on the
# frame's index matching a fresh 0..n-1 range.
clusters = raw_data.assign(cluster=labels)

In [None]:
# Reshape to long format (one row per observation/feature pair) for the snake plot.
# NOTE: `clusters` is built from raw_data (after its log transform), not from
# the scaled/normalized matrix.
datamart_melt = pd.melt(
    clusters.reset_index(),
    id_vars=['cluster'],
    value_vars=['BALANCE', 'BALANCE_FREQUENCY', 'PURCHASES',
                'ONEOFF_PURCHASES', 'INSTALLMENTS_PURCHASES',
                'CASH_ADVANCE', 'PURCHASES_FREQUENCY', 'ONEOFF_PURCHASES_FREQUENCY',
                'PURCHASES_INSTALLMENTS_FREQUENCY', 'CASH_ADVANCE_FREQUENCY',
                'CASH_ADVANCE_TRX', 'PURCHASES_TRX', 'CREDIT_LIMIT', 'PAYMENTS',
                'MINIMUM_PAYMENTS', 'PRC_FULL_PAYMENT', 'TENURE'],
    var_name='ATTRIBUTES',
    value_name='VALUES',
)

plt.figure(figsize=(24, 9))
chart = sns.lineplot(x="ATTRIBUTES", y="VALUES", hue='cluster', data=datamart_melt)
plt.title('Snake Plot of Standardized Attributes')
plt.xticks(rotation=75)
plt.legend(loc='upper left', fontsize='large')

# Tighten the figure that was just drawn. The `figure` variable still points at
# a figure created in an earlier cell, so it must not be used here.
plt.tight_layout()

The relative importance of each feature is calculated as the mean of the cluster divided by the mean of the population. This calculation gives you a number relative to the population mean: a value of 1 means the cluster matches the population average for that feature. The further the value is from 1, the more significant that feature is in relation to the total population [5].

In [None]:
# Relative importance: each cluster's mean for a feature divided by the
# population mean of that feature (a value of 1.0 equals the population average).
cluster_avg = clusters.groupby(['cluster']).mean()
population_avg = clusters.mean()
population_avg = population_avg.drop('cluster')
relative_imp = cluster_avg / population_avg
relative_imp = relative_imp.sort_index()  # rows ordered by cluster label

plt.figure(figsize=(20, 8))
plt.title('Relative Importance of Attributes')
sns.heatmap(data=relative_imp, annot=True, fmt='.2f', cmap='RdYlGn', yticklabels='auto')
plt.ylim(0, len(relative_imp))  # one heatmap row per cluster
plt.xlabel('Features')
plt.ylabel('Clusters')
# Tighten the heatmap figure itself; `figure` refers to a figure from an earlier cell.
plt.tight_layout()
plt.savefig('kmeans_heat.png')
plt.show()

In [None]:
# For every column of `clusters`, draw one histogram panel per cluster label.
for feature in clusters.columns:
    facets = sns.FacetGrid(clusters, col='cluster')
    facets.map(plt.hist, feature)
    plt.show()

## K-Means Interpretation

Based on the data I would classify the three customer segments as follows:

Cluster 0: Uses their credit card frequently for daily purchases and whatever catches their eye. Their credit card is used for daily spending with the accumulation of a balance they start paying on quickly but may be working on paying off that debt in installments. They carry an average to above average credit limit. They don’t use cash advances. 

Cluster 1: Is the reluctant credit card user with the preference for cash advances when necessary. Maybe using cash to get out of a jam as they only transact very infrequently with only one-off purchases via cash advance. They will then primarily only pay the minimum payments towards paying down their debt. 

Cluster 2: Everyday credit card user for daily items. This segment of users uses their credit card much less often than cluster 0 and does not use cash advances. Their credit limit leans more average than high and they lead the pack in minimum payments. 

## Compared to K-Means: BIRCH & Agglomerative Clustering

As part of my analysis, I also compared the k-means approach and its handling of the data with BIRCH and Agglomerative Clustering approaches. 

BIRCH (balanced iterative reducing and clustering using hierarchies) is an unsupervised data mining algorithm used to perform hierarchical clustering over particularly large data-sets.[1] An advantage of BIRCH is its ability to incrementally and dynamically cluster incoming, multi-dimensional metric data points in an attempt to produce the best quality clustering for a given set of resources (memory and time constraints). In most cases, BIRCH only requires a single scan of the database. [9]

Agglomerative clustering is another form of hierarchical clustering. This approach is a form of “bottom up” clustering: “…each observation starts in its own cluster, and pairs of clusters are merged as one moves up the hierarchy.” [10] 

All of the approaches seem to have a clear preference for 2 clusters. Based on the analysis of the K-Means results, I could see those two segments representing the extremes based (in a general sense, not the actual features) on the total spend in association with purchase frequency,  cash advances and the number and the amount of money spent on one-off purchases.
 
Where the results get a little more interesting is that K-Means and BIRCH have a strong preference for several more clusters, 6 and 7 respectively. If I were looking for more considerable variation and possibly looking for niche customer segments that are not as obvious, pursuing the analysis of 6 to 7 clusters might provide that insight.

Per the snake plots it is very easy to see that all three approaches had a similar outcome and treatment of the data. The clusters are not mapped one to one but the visual is pretty telling.

In [None]:
# Side-by-side silhouette scores for K-Means, BIRCH and Agglomerative clustering.
figure = plt.figure(figsize=(21, 7))
kmean_sil, birch_sil, agg_sil = (figure.add_subplot(1, 3, slot) for slot in (1, 2, 3))

kmean_silhouette_scores, birch_silhouette_scores, agg_silhouette_scores = [], [], []

# (estimator class, score accumulator, target axis, chart title)
panels = (
    (KMeans, kmean_silhouette_scores, kmean_sil,
     'K-MEANS: Number of Clusters vs. Silhouette Score'),
    (Birch, birch_silhouette_scores, birch_sil,
     'BIRCH: Number of Clusters vs. Silhouette Score'),
    (AgglomerativeClustering, agg_silhouette_scores, agg_sil,
     'Agglomerative: Number of Clusters vs. Silhouette Score'),
)

# Candidate cluster counts; every algorithm is fitted once per count, in the
# order K-Means, BIRCH, Agglomerative.
k = [2, 3, 4, 5, 6, 7]
for n_cluster in k:
    for estimator_cls, scores, _axis, _title in panels:
        assignments = estimator_cls(n_clusters=n_cluster).fit_predict(normalized_df)
        scores.append(silhouette_score(normalized_df, assignments))

# One bar chart per algorithm.
for _estimator_cls, scores, axis, title in panels:
    axis.bar(k, scores)
    axis.set_title(title, fontsize=10)
    axis.set_xlabel('Number of Clusters', fontsize=20)
    axis.set_ylabel('Silhouette Score', fontsize=20)

figure.tight_layout()

## BIRCH Clustering

In [None]:
# Fit BIRCH with 3 final clusters on the normalized features and keep its labels.
birch_cluster = Birch(n_clusters=3)
labels = birch_cluster.fit(normalized_df).labels_

# Append the labels to the prepared feature table as a 'cluster' column.
cluster_column = pd.DataFrame({'cluster': labels})
birch_clusters = pd.concat([raw_data, cluster_column], axis=1)
birch_clusters.head()

In [None]:
# Prep data for snake plot: long format, one row per observation/feature pair.
datamart_melt = pd.melt(
    birch_clusters.reset_index(),
    id_vars=['cluster'],
    value_vars=['BALANCE', 'BALANCE_FREQUENCY', 'PURCHASES',
                'ONEOFF_PURCHASES', 'INSTALLMENTS_PURCHASES',
                'CASH_ADVANCE', 'PURCHASES_FREQUENCY', 'ONEOFF_PURCHASES_FREQUENCY',
                'PURCHASES_INSTALLMENTS_FREQUENCY', 'CASH_ADVANCE_FREQUENCY',
                'CASH_ADVANCE_TRX', 'PURCHASES_TRX', 'CREDIT_LIMIT', 'PAYMENTS',
                'MINIMUM_PAYMENTS', 'PRC_FULL_PAYMENT', 'TENURE'],
    var_name='ATTRIBUTES',
    value_name='VALUES',
)

# Snake plot
plt.figure(figsize=(24, 9))
chart = sns.lineplot(x="ATTRIBUTES", y="VALUES", hue='cluster', data=datamart_melt)
plt.title('BIRCH: Snake Plot of Standardized Attributes')
plt.xticks(rotation=75)
plt.legend(loc='upper left', fontsize='large')

# Tighten the figure just drawn; `figure` refers to a figure from an earlier cell.
plt.tight_layout()

# Relative importance heatmap, computed from the BIRCH cluster assignments
# (cluster mean / population mean) so the chart matches its title instead of
# re-plotting the K-Means table.
birch_cluster_avg = birch_clusters.groupby('cluster').mean()
birch_population_avg = birch_clusters.drop(columns='cluster').mean()
birch_relative_imp = (birch_cluster_avg / birch_population_avg).sort_index()

plt.figure(figsize=(20, 8))
plt.title('BIRCH: Relative Importance of Attributes')
sns.heatmap(data=birch_relative_imp, annot=True, fmt='.2f', cmap='RdYlGn', yticklabels='auto')
plt.ylim(0, len(birch_relative_imp))  # one heatmap row per cluster
plt.xlabel('Features')
plt.ylabel('Clusters')
plt.tight_layout()
# Saved under its own name so the K-Means heatmap image is not overwritten.
plt.savefig('birch_heat.png')
plt.show()

## Agglomerative Clustering

In [None]:
# Model fit with 3 clusters.
agg_cluster = AgglomerativeClustering(n_clusters=3)
agg_cluster.fit(normalized_df)

# Use this model's own labels; the `labels` variable still holds the
# assignments produced by a previously fitted model.
agg_clusters = pd.concat(
    [raw_data, pd.DataFrame({'cluster': agg_cluster.labels_})], axis=1
)

# Data prep for snake plot: long format, one row per observation/feature pair.
datamart_melt = pd.melt(
    agg_clusters.reset_index(),
    id_vars=['cluster'],
    value_vars=['BALANCE', 'BALANCE_FREQUENCY', 'PURCHASES',
                'ONEOFF_PURCHASES', 'INSTALLMENTS_PURCHASES',
                'CASH_ADVANCE', 'PURCHASES_FREQUENCY', 'ONEOFF_PURCHASES_FREQUENCY',
                'PURCHASES_INSTALLMENTS_FREQUENCY', 'CASH_ADVANCE_FREQUENCY',
                'CASH_ADVANCE_TRX', 'PURCHASES_TRX', 'CREDIT_LIMIT', 'PAYMENTS',
                'MINIMUM_PAYMENTS', 'PRC_FULL_PAYMENT', 'TENURE'],
    var_name='ATTRIBUTES',
    value_name='VALUES',
)

# Snake plot
plt.figure(figsize=(24, 9))
chart = sns.lineplot(x="ATTRIBUTES", y="VALUES", hue='cluster', data=datamart_melt)
plt.title('Agglomerative: Snake Plot of Standardized Attributes')
plt.xticks(rotation=75)
plt.legend(loc='upper left', fontsize='large')

# Tighten the figure just drawn; `figure` refers to a figure from an earlier cell.
plt.tight_layout()

# Relative importance heatmap, computed from the agglomerative assignments
# (cluster mean / population mean) so the chart matches its title instead of
# re-plotting the K-Means table.
agg_cluster_avg = agg_clusters.groupby('cluster').mean()
agg_population_avg = agg_clusters.drop(columns='cluster').mean()
agg_relative_imp = (agg_cluster_avg / agg_population_avg).sort_index()

plt.figure(figsize=(20, 8))
plt.title('Agglomerative: Relative Importance of Attributes')
sns.heatmap(data=agg_relative_imp, annot=True, fmt='.2f', cmap='RdYlGn', yticklabels='auto')
plt.ylim(0, len(agg_relative_imp))  # one heatmap row per cluster
plt.xlabel('Features')
plt.ylabel('Clusters')
plt.tight_layout()
plt.savefig('Agglomerative_heat.png')
plt.show()

### References

1. https://www.kaggle.com/arjunbhasin2013/ccdata 
2. https://www.kaggle.com/arjunbhasin2013/ccdata/kernels 
3. https://www.edupristine.com/blog/k-means-algorithm 
4. https://en.wikipedia.org/wiki/BIRCH 
5. https://scikit-learn.org/stable/modules/preprocessing.html 
6. https://inseaddataanalytics.github.io/INSEADAnalytics/CourseSessions/Sessions45/ClusterAnalysisReading.html
7. https://en.wikipedia.org/wiki/Hierarchical_clustering