# Credit Card Clustering and Segmentation

## This case requires developing a customer segmentation to define a marketing strategy. The sample dataset summarizes the usage behavior of about 9000 active credit card holders during the last 6 months. The file is at the customer level with 18 behavioral variables.

In [None]:
import os

# Print the full path of every file found under the Kaggle input directory,
# one path per line, in the order os.walk visits them.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

In [None]:
# used to suppress display of warnings
import warnings

# os is used to provide a way of using operating system dependent functionality
# We use it for setting working folder
import os

# Pandas is used for data manipulation and analysis
import pandas as pd 

# Numpy is used for large, multi-dimensional arrays and matrices, along with mathematical operators on these arrays
import numpy as np

# Matplotlib is a data visualization library for 2D plots of arrays, built on NumPy arrays 
# and designed to work with the broader SciPy stack
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import pyplot

# Seaborn is based on matplotlib, which aids in drawing attractive and informative statistical graphics.
import seaborn as sns


## Scikit-learn features various classification, regression and clustering algorithms
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn import metrics
from sklearn import preprocessing
from sklearn.metrics import average_precision_score, confusion_matrix, accuracy_score, classification_report, plot_confusion_matrix


import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn.preprocessing import StandardScaler
from IPython.display import display
from sklearn.cluster import KMeans 
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
from sklearn.metrics import homogeneity_score, completeness_score, \
v_measure_score, adjusted_rand_score, adjusted_mutual_info_score, silhouette_score
%matplotlib inline


In [None]:
# Silence all warnings so library notices do not clutter the cell outputs
warnings.filterwarnings('ignore')

# Show up to 50 columns and every row when a DataFrame is displayed
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', None)

# Render floats with 7 digits after the decimal point
pd.set_option('display.float_format', '{:.7f}'.format)

### Check a few observations and get familiar with the data

In [None]:
# Read the credit-card dataset from the Kaggle input folder and preview the first rows
DATA_PATH = '/kaggle/input/ccdata/CC GENERAL.csv'
df = pd.read_csv(DATA_PATH)
df.head()

### Check the size and info of the data set.

In [None]:
# Dimensions of the loaded frame as (rows, columns)
df.shape

- 8950 rows and 18 columns

In [None]:
# Column dtypes and non-null counts for every column
df.info()

### Check for missing values. Impute the missing values if there are any.

In [None]:
# Number of missing values in each column
df.isnull().sum()

'MINIMUM_PAYMENTS' and 'CREDIT_LIMIT' have null values

In [None]:
#imputing null values

# Replace missing entries with the column median.
# The result is assigned back to df[col] instead of calling
# df[col].fillna(..., inplace=True): the in-place form mutates an intermediate
# Series and is not guaranteed to update df under pandas Copy-on-Write.
for col in ['CREDIT_LIMIT', 'MINIMUM_PAYMENTS']:
    df[col] = df[col].fillna(df[col].median())

### Drop unnecessary columns

In [None]:
# CUST_ID is an identifier column and is removed before the analysis.
# errors='ignore' makes the cell safe to re-run once the column is already gone
# (the plain drop raises KeyError on a second execution).
df = df.drop(columns='CUST_ID', errors='ignore')

### Check correlation among features and comment your findings

In [None]:
#Function to plot correlation matrix

def correlation_plot(df):
    """Draw a lower-triangle heatmap of absolute pairwise correlations.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to analyse. Only its numeric columns are correlated, so a
        stray non-numeric column does not make ``corr()`` fail.

    Returns
    -------
    None
        The heatmap is rendered with ``plt.show()``.
    """
    corr = df.select_dtypes(include='number').corr().abs()

    # Mask the diagonal and the upper triangle by position. Masking on
    # "value == 0" (as a zero-filled tril would require) also hides any pair
    # whose correlation is genuinely zero.
    mask = np.triu(np.ones_like(corr, dtype=bool))

    # Apply the style first so the new figure and axes pick it up
    sns.set_style(style = 'white')
    plt.figure(figsize = (15,10))
    sns.heatmap(corr, mask = mask, center = 0.5, cmap = 'Blues',
                xticklabels = corr.columns, yticklabels = corr.index,
                cbar = False, annot = True, linewidths = 1)
    plt.show()

correlation_plot(df)

### Check distribution of features and comment your findings

In [None]:
#univariate analysis

feature_columns = ['BALANCE', 'BALANCE_FREQUENCY', 'PURCHASES', 'ONEOFF_PURCHASES',
       'INSTALLMENTS_PURCHASES', 'CASH_ADVANCE', 'PURCHASES_FREQUENCY',
       'ONEOFF_PURCHASES_FREQUENCY', 'PURCHASES_INSTALLMENTS_FREQUENCY',
       'CASH_ADVANCE_FREQUENCY', 'CASH_ADVANCE_TRX', 'PURCHASES_TRX',
       'CREDIT_LIMIT', 'PAYMENTS', 'MINIMUM_PAYMENTS', 'PRC_FULL_PAYMENT',
       'TENURE']

# One panel per feature on a 9x2 grid (18 slots for 17 features)
fig, axes = plt.subplots(9, 2, figsize=(15, 45))
panels = axes.ravel()

for ax, column in zip(panels, feature_columns):
    if column == 'TENURE':
        # TENURE is drawn as counts per value. The column must be passed as
        # x=...: recent seaborn releases treat the first positional argument
        # as `data`, not `x`.
        sns.countplot(x=df[column], ax=ax)
    else:
        sns.distplot(df[column], ax=ax)
    ax.set_title(column)

# Drop the grid slots that have no feature assigned to them
for ax in panels[len(feature_columns):]:
    ax.remove()

# Keep titles and tick labels of neighbouring panels from overlapping
fig.tight_layout()
plt.show()

### Standardize the data using appropriate methods

In [None]:
# Standardize the features: learn the scaling parameters from df, then apply
# them to obtain the scaled array used for clustering
scaler = StandardScaler().fit(df)
data = scaler.transform(df)

### Build a k-means algorithm for clustering credit card data

### Build k means model on various k values and plot the inertia against various k values

In [None]:
# Fit K-Means for k = 1..9 on the standardized data and record each model's
# inertia (sum of squared distances of samples to their closest cluster center).
sse = {}
for k in range(1, 10):
    # A fixed random_state and an explicit n_init make the curve reproducible
    # across runs and across scikit-learn versions.
    kmeans = KMeans(n_clusters=k, max_iter=1000, n_init=10, random_state=42).fit(data)
    sse[k] = kmeans.inertia_
plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of cluster")
plt.ylabel("SSE")
plt.show()

### Evaluate the model using Silhouette coefficient

In [None]:
# Silhouette coefficient of K-Means on the standardized data for k = 2..7
cluster_range = range(2, 8)
silhouette_scores = []

for n_cluster in cluster_range:
    # Seeded so the scores (and therefore the chosen k) do not change between runs
    labels = KMeans(n_clusters=n_cluster, n_init=10, random_state=42).fit_predict(data)
    silhouette_scores.append(silhouette_score(data, labels))

# Plotting a bar graph to compare the results.
# The x positions come from the same range as the loop so they cannot drift apart.
k = list(cluster_range)
plt.bar(k, silhouette_scores)
plt.xlabel('Number of clusters', fontsize = 10)
plt.ylabel('Silhouette Score', fontsize = 10)
plt.show()

### Plot an elbow plot to find the optimal value of k

In [None]:
# Elbow plot: inertia of K-Means on the standardized data for k = 1..9
ks = range(1, 10)
inertias = []

for k in ks:
    # Seeded so the elbow is reproducible between runs
    model = KMeans(n_clusters=k, n_init=10, random_state=42)
    model.fit(data)
    inertias.append(model.inertia_)

# style.context limits the 'bmh' style to this figure. plt.style.use() would
# change the global rcParams and silently restyle every later plot in the
# notebook, and calling it after plt.figure() only styles part of this one.
with plt.style.context('bmh'):
    plt.figure(figsize=(8,5))
    plt.plot(ks, inertias, '-o')
    plt.xlabel('Number of clusters, k')
    plt.ylabel('Inertia')
    plt.xticks(ks)
    plt.show()

### Which k value gives the best result?

- k = 3 gives the highest silhouette score and also comparable inertia value

### Apply PCA to the dataset and repeat the above steps on the new features generated using PCA.

In [None]:
# Variance carried by each principal component of the standardized data
pca = PCA(random_state=123).fit(data)
features = range(pca.n_components_)

# Bar chart restricted to the first 15 components
n_shown = 15
shown = features[:n_shown]

plt.figure(figsize=(8,4))
plt.bar(shown, pca.explained_variance_[:n_shown], color='lightskyblue')
plt.xticks(shown)
plt.xlabel('PCA feature')
plt.ylabel('Variance')
plt.show()

In [None]:
# Applying PCA

# Reduce the data to its first two principal components.
# PCA is fitted on the standardized array `data`, not on the raw frame `df`:
# PCA is driven by variance, so on unscaled input the columns with the largest
# numeric scale dominate the components. `data` also never contains a Cluster
# label, so re-running this cell later cannot leak that label into the projection.
# (PCA is already imported in the notebook's import cell.)
pca = PCA(n_components = 2, random_state = 123)
x_principal = pd.DataFrame(pca.fit_transform(data), columns = ['P1', 'P2'])

x_principal.head()

In [None]:
# Fit K-Means for k = 1..9 on the two PCA features and record each model's
# inertia (sum of squared distances of samples to their closest cluster center).
sse = {}
for k in range(1, 10):
    # A fixed random_state and an explicit n_init make the curve reproducible
    # across runs and across scikit-learn versions.
    kmeans = KMeans(n_clusters=k, max_iter=1000, n_init=10, random_state=42).fit(x_principal)
    sse[k] = kmeans.inertia_
plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of cluster")
plt.ylabel("SSE")
plt.show()

In [None]:
# Silhouette coefficient of K-Means on the two PCA features for k = 2..7
cluster_range = range(2, 8)
silhouette_scores = []

for n_cluster in cluster_range:
    # Seeded so the scores do not change between runs
    labels = KMeans(n_clusters=n_cluster, n_init=10, random_state=42).fit_predict(x_principal)
    silhouette_scores.append(silhouette_score(x_principal, labels))

# Plotting a bar graph to compare the results.
# The x positions come from the same range as the loop so they cannot drift apart.
k = list(cluster_range)
plt.bar(k, silhouette_scores)
plt.xlabel('Number of clusters', fontsize = 10)
plt.ylabel('Silhouette Score', fontsize = 10)
plt.show()

### Create a new column as a cluster label in the original data frame and perform cluster analysis. Check the correlation of cluster labels with various features and mention your inferences

In [None]:
# Final model with k = 3, fitted on the standardized array `data`.
# k was selected on the standardized features, so the final fit uses the same
# input; fitting on the raw frame would let the largest-scale columns dominate
# the distance computation. `data` never contains the Cluster column, so
# re-running this cell does not feed earlier labels back in as a feature.
# NOTE: kmeans.cluster_centers_ are therefore in standardized units; use
# scaler.inverse_transform(kmeans.cluster_centers_) to express them in the
# original units.
kmeans = KMeans(n_clusters = 3, init = 'k-means++', n_init = 10, random_state = 42)
y_kmeans = kmeans.fit_predict(data)
df['Cluster'] = y_kmeans

In [None]:
# Preview the frame, which now carries the Cluster label column
df.head()

In [None]:
# Correlation of the new Cluster label with the features.
# correlation_plot is already defined in the earlier correlation cell; an
# identical second definition would only shadow it and invite the two copies
# to drift apart, so the existing function is reused here.
correlation_plot(df)

In [None]:
#Cluster Analysis

# Scatter of BALANCE against CREDIT_LIMIT, coloured by cluster label.
# Columns are selected by name rather than by position (0 and 12) so the plot
# does not depend on the column order of the frame.
x_col, y_col = 'BALANCE', 'CREDIT_LIMIT'
cluster_colours = ['red', 'blue', 'green']

for label, colour in enumerate(cluster_colours):
    members = df[y_kmeans == label]
    plt.scatter(members[x_col], members[y_col], c = colour,
                label = 'Cluster {}'.format(label + 1))

# Centroids are taken as the per-cluster means of the two plotted columns, in
# the original units. The previous code plotted cluster_centers_ columns 0 and 1
# (BALANCE vs BALANCE_FREQUENCY) on a BALANCE vs CREDIT_LIMIT chart, and
# cluster_centers_ is only in original units when the model was fitted on
# unscaled data; the group means are correct either way.
centroids = df.groupby(y_kmeans)[[x_col, y_col]].mean()
plt.scatter(centroids[x_col], centroids[y_col], s = 300, c = 'yellow', label = 'Centroids')
plt.title('Credit card Clustering')
plt.xlabel('Balance')
plt.ylabel('Credit Limit')
plt.legend()
plt.show()

In [None]:
# Cluster label plotted against every feature, one panel per feature.
# sns.pairplot creates its own figure, so no plt.figure() call precedes it;
# such a call only produces an extra empty figure in the output.
feature_columns = ['BALANCE', 'BALANCE_FREQUENCY', 'PURCHASES', 'ONEOFF_PURCHASES',
       'INSTALLMENTS_PURCHASES', 'CASH_ADVANCE', 'PURCHASES_FREQUENCY',
       'ONEOFF_PURCHASES_FREQUENCY', 'PURCHASES_INSTALLMENTS_FREQUENCY',
       'CASH_ADVANCE_FREQUENCY', 'CASH_ADVANCE_TRX', 'PURCHASES_TRX',
       'CREDIT_LIMIT', 'PAYMENTS', 'MINIMUM_PAYMENTS', 'PRC_FULL_PAYMENT',
       'TENURE']

pp = sns.pairplot(data=df,
                  y_vars=['Cluster'],
                  x_vars=feature_columns)

#save plot
# Saved before plt.show() so the file is written while the figure is still open
pp.savefig('pp.png')
plt.show()

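We can see that our inertia and silhouette scores improve drastically after applying Principal Component Analysis with n_components = 2 together with the K-means algorithm. Note, however, that inertia depends on the scale and dimensionality of the feature space, so inertia values obtained on the PCA projection are not directly comparable with those obtained on the original standardized data.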