In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input directory,
# then print them one per line in the order os.walk yields them.
input_file_paths = (
    os.path.join(folder, fname)
    for folder, _subdirs, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for path in input_file_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Location of the credit-card dataset inside the Kaggle input directory.
DATA_PATH = '/kaggle/input/ccdata/CC GENERAL.csv'

df = pd.read_csv(DATA_PATH)
print(df.shape)  # (rows, columns) of the raw frame
df.head()

Preprocessing the data first.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the columns of df.
df.describe()

For all the columns/features, we see that the standard deviation is quite high and the min and max values are far apart, with the distribution skewed towards lower values, as can be seen from the **75% mark, mean and max**. This means that the dataset has a lot of outliers, and these outliers need to be dealt with. Simply dropping the rows that contain outliers would result in quite a lot of data loss.

In [None]:
# Number of missing (NaN) entries in each column.
df.isna().sum()

Only 2 columns have null values. The missing values are a small fraction of the entire dataset (1/8950) and (313/8950) and hence can be easily imputed. We'll impute CREDIT_LIMIT with its mean value. Since MINIMUM_PAYMENTS is a continuous variable skewed towards the lower side, we could impute it with either the mean or the median; it shouldn't make much of a difference, since the fraction of missing values is quite small. We'll go with imputing with mean values.

In [None]:
# Impute the two columns that contain missing values with their own column mean.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on a column selection: that chained in-place form is
# deprecated in pandas 2.x and leaves `df` unchanged when Copy-on-Write is active.
# Series.mean() skips NaN by default, so the mean is taken over the observed values.
df['MINIMUM_PAYMENTS'] = df['MINIMUM_PAYMENTS'].fillna(df['MINIMUM_PAYMENTS'].mean())
df['CREDIT_LIMIT'] = df['CREDIT_LIMIT'].fillna(df['CREDIT_LIMIT'].mean())

In [None]:
# Re-count missing entries per column to check the result of the imputation.
df.isna().sum()

Since there are too many outliers, let's convert the entire dataset's values to categorical values. Since we are interested in finding similarities through clusters, it is a good idea to group values that fall in a particular range and assign them a category. Later we'll normalize these category values as well, to make sure that no large value in any column dominates/skews the clustering result.

In [None]:
# All features with outlandish values
columns = ['BALANCE', 'PURCHASES', 'ONEOFF_PURCHASES', 'INSTALLMENTS_PURCHASES', 'CASH_ADVANCE', 'CREDIT_LIMIT','PAYMENTS', 'MINIMUM_PAYMENTS']

# (lower, upper, code): a value v receives `code` when lower < v <= upper.
# The last bucket is open-ended (everything above 10000).
amount_buckets = [
    (0, 500, 1),
    (500, 1000, 2),
    (1000, 3000, 3),
    (3000, 5000, 4),
    (5000, 10000, 5),
    (10000, float('inf'), 6),
]

for col in columns:
    range_col = f'{col}_RANGE'
    amounts = df[col]
    # Code 0 is the fallback for anything no bucket matches (values <= 0, NaN).
    df[range_col] = 0
    for lower, upper, code in amount_buckets:
        df.loc[amounts.gt(lower) & amounts.le(upper), range_col] = code

In [None]:
columns=['BALANCE_FREQUENCY', 'PURCHASES_FREQUENCY', 'ONEOFF_PURCHASES_FREQUENCY', 'PURCHASES_INSTALLMENTS_FREQUENCY', 'CASH_ADVANCE_FREQUENCY', 'PRC_FULL_PAYMENT']

# Consecutive bucket edges; bucket k (1..10) covers (freq_edges[k-1], freq_edges[k]].
freq_edges = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

for feature in columns:
    target = feature + '_RANGE'
    ratios = df[feature]
    # Code 0 is kept for anything outside every bucket (values <= 0 or > 1.0, NaN).
    df[target] = 0
    for code, (low, high) in enumerate(zip(freq_edges[:-1], freq_edges[1:]), start=1):
        in_bucket = (ratios > low) & (ratios <= high)
        df.loc[in_bucket, target] = code

In [None]:
columns=['PURCHASES_TRX', 'CASH_ADVANCE_TRX']

# Consecutive bucket edges; bucket k covers (trx_edges[k-1], trx_edges[k]].
# The final edge is infinite, so bucket 8 holds every count above 100.
trx_edges = [0, 5, 10, 15, 20, 30, 50, 100, float('inf')]

for name in columns:
    bucket_col = name + '_RANGE'
    counts = df[name]
    # Code 0 is the fallback for counts that match no bucket (<= 0, NaN).
    df[bucket_col] = 0
    for code in range(1, len(trx_edges)):
        lower, upper = trx_edges[code - 1], trx_edges[code]
        df.loc[counts.gt(lower) & counts.le(upper), bucket_col] = code

Since each binned feature now has a corresponding `_RANGE` column, we can drop the original columns (along with the `CUST_ID` identifier).

In [None]:
# The customer identifier plus every raw column that was replaced by a *_RANGE column.
raw_columns = [
    'CUST_ID',
    'BALANCE',
    'BALANCE_FREQUENCY',
    'PURCHASES',
    'ONEOFF_PURCHASES',
    'INSTALLMENTS_PURCHASES',
    'CASH_ADVANCE',
    'PURCHASES_FREQUENCY',
    'ONEOFF_PURCHASES_FREQUENCY',
    'PURCHASES_INSTALLMENTS_FREQUENCY',
    'CASH_ADVANCE_FREQUENCY',
    'CASH_ADVANCE_TRX',
    'PURCHASES_TRX',
    'CREDIT_LIMIT',
    'PAYMENTS',
    'MINIMUM_PAYMENTS',
    'PRC_FULL_PAYMENT',
]
df.drop(columns=raw_columns, inplace=True)

# Plain NumPy view of the remaining columns, used as the clustering input.
X = np.asarray(df)
df.head()

In [None]:
columns = df.columns
for col in columns:
    # TENURE is skipped; only the remaining columns are plotted.
    if col == 'TENURE':
        continue
    # Frequency of each distinct value, ordered by the value itself.
    counts = df[col].value_counts().sort_index()
    plt.plot(counts.index, counts.values)
    plt.title(col)
    plt.show()

The above graphs show that the frequencies for lower values are high, since most values in the data are small. This is evident from the **minimum, 1st quartile, median, 3rd quartile and maximum values** of the data distribution we obtained with *df.describe()*. This process took a certain number of trials but didn't consume much time. Now we standardize all the values with `StandardScaler`, so that every feature has zero mean and unit variance.

In [None]:
# Learn per-column mean and standard deviation, then rescale X with them
# (StandardScaler defaults: centre to zero mean, scale to unit variance).
scaler = StandardScaler().fit(X)
X = scaler.transform(X)
X.shape

Now that we have converted data from continuous to discrete values and brought it down to a particular range, we've made sure that we give equal importance to all the features. Now, we can go ahead and apply K-means. Let's use **Elbow Method** to choose an optimal value of K.

In [None]:
# Elbow method: fit K-means for K = 1 .. clusters-1 and record the inertia
# (sum of squared distances of samples to their closest centre) for each K.
clusters = 25  # exclusive upper bound, so K runs over 1..24
cost = []
for i in range(1, clusters):
    # random_state makes the curve reproducible across runs; n_init is pinned
    # explicitly because its default differs between scikit-learn versions.
    kmeans = KMeans(n_clusters=i, n_init=10, random_state=42)
    kmeans.fit(X)
    cost.append(kmeans.inertia_)

In [None]:
# cost[0] belongs to K = 1, so plot against the actual K values rather than the
# list index; otherwise the elbow read off the x-axis is one too small.
k_values = range(1, len(cost) + 1)
plt.plot(k_values, cost, 'ro-')
plt.xlabel('Number of clusters (K)')
plt.ylabel('Inertia')
plt.title('Elbow method')
plt.show()

We seem to reach an inflection point when K = 6. After this value of K, the cost decreases very slowly.

In [None]:
# Final model with K = 6. A fixed random_state keeps the cluster numbering (and
# therefore the colours assigned to each label) stable across runs; n_init is
# pinned explicitly because its default differs between scikit-learn versions.
kmeans = KMeans(n_clusters=6, n_init=10, random_state=42)
kmeans.fit(X)
labels = kmeans.labels_  # cluster index for each row of X

The output of this step is the `labels` array, which contains the cluster number for each record/row of the dataset. Let us add it as a `cluster` column at the end of the dataset.

In [None]:
# Attach the labels positionally as a new 'cluster' column. Concatenating a
# freshly built DataFrame on axis=1 aligns on the index instead, which only
# gives the right result while df still carries the default 0..n-1 index.
clusters = df.assign(cluster=labels)
clusters

In order to visualize the clusters created and see if they're well-defined, we need to reduce the dimensionality of the data, since it's difficult to visualize n-dimensional data in 2-dimensional space. However, while reducing the dimensionality of the data, we want to make sure that we retain as much of the information in the original features as possible. For this, we use Principal Component Analysis (PCA), which helps us to achieve the objective mentioned above.

In [None]:
# Project the scaled feature matrix onto its first two principal components.
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(X)
x = principalComponents[:, 0]  # first component
y = principalComponents[:, 1]  # second component
print(principalComponents.shape)

# One colour per cluster label (0..5).
colors = dict(enumerate(['red', 'blue', 'green', 'yellow', 'orange', 'purple']))

In [None]:
# 2-D coordinates of every row together with its cluster label.
final_df = pd.DataFrame(dict(x=x, y=y, label=labels))
# Split the rows by cluster label so each cluster can be drawn separately.
groups = final_df.groupby(labels)

Finally, we plot all the clusters together in a single plot, using one colour per cluster.

In [None]:
fig, ax = plt.subplots(figsize=(15, 10))

# Draw each cluster as unconnected markers in its own colour.
for name, group in groups:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=5, color=colors[name], mec='none')

# Axis styling does not depend on the cluster, so it is applied once, after the loop.
ax.set_aspect('auto')
# Hide ticks and tick labels. tick_params expects booleans here: the string
# 'off' is truthy, so it leaves the ticks visible instead of hiding them.
ax.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
ax.tick_params(axis='y', which='both', left=False, labelleft=False)

ax.set_title("Customers Segmentation based on their Credit Card usage bhaviour.")
plt.show()