In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found
for root_dir, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(root_dir, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Problem Statement:
### We have data of about 9000 credit card holders for last 6 months. Our job is to group these customers based on their credit card usage.

In [None]:
# import the required packages
import pandas as pd # used of data wrangling also known as MS Excel for Python
import numpy as np # used for large datasets in array
import seaborn as sns # package to visualize data
import matplotlib.pyplot as plt # another package to visualize data
from sklearn.preprocessing import StandardScaler, normalize # to use preprocessing of data
from sklearn.cluster import KMeans # The main package: The ML itself
from sklearn.decomposition import PCA # package for dimentionality reductions

In [None]:
# Read the credit card CSV into a pandas DataFrame and preview its first 5 rows
creditcard_df = pd.read_csv(filepath_or_buffer='/kaggle/input/ccdata/CC GENERAL.csv')
creditcard_df.head(5)

In [None]:
# CUST_ID: Identification of Credit Card holder
# BALANCE: Balance amount left in customer's account to make purchases
# BALANCE_FREQUENCY: How frequently the Balance is updated, score between 0 and 1 (1 = frequently updated, 0 = not frequently updated)
# PURCHASES: Amount of purchases made from account
# ONEOFF_PURCHASES: Maximum purchase amount done in one-go
# INSTALLMENTS_PURCHASES: Amount of purchase done in installment
# CASH_ADVANCE: Cash in advance given by the user
# PURCHASES_FREQUENCY: How frequently the Purchases are being made, score between 0 and 1 (1 = frequently purchased, 0 = not frequently purchased)
# ONEOFF_PURCHASES_FREQUENCY: How frequently Purchases are happening in one-go (1 = frequently purchased, 0 = not frequently purchased)
# PURCHASES_INSTALLMENTS_FREQUENCY: How frequently purchases in installments are being done (1 = frequently done, 0 = not frequently done)
# CASH_ADVANCE_FREQUENCY: How frequently the cash in advance being paid
# CASH_ADVANCE_TRX: Number of Transactions made with "Cash in Advance"
# PURCHASES_TRX: Number of purchase transactions made
# CREDIT_LIMIT: Limit of Credit Card for user
# PAYMENTS: Amount of Payment done by user
# MINIMUM_PAYMENTS: Minimum amount of payments made by user  
# PRC_FULL_PAYMENT: Percent of full payment paid by user
# TENURE: Tenure of credit card service for user

## WOAH, hold on a minute. Where are the labels??
### Surprise, we don't have labels because this is Unsupervised Learning. Unsupervised Learning is a kind of machine learning where the goal is to look for patterns in the data and segregate the data points based on their features. It's like how a baby differentiates between a cat and a dog without actually knowing which one is called a cat and which one is called a dog. The algorithm looks for "features" to distinguish the groups.

In [None]:
# Check the datatype and the non-null count of every column using "info()";
# a non-null count lower than the number of rows means the column has missing values
creditcard_df.info()

### Note that we have null values in columns CREDIT_LIMIT and MINIMUM_PAYMENTS, we will address this later

In [None]:
# Get summary statistics (count, mean, std, min, quartiles, max) of every numeric column using "describe()"
creditcard_df.describe()

# Visualize and Explore Dataset

In [None]:
# Let's revisit the missing data: draw the boolean null mask as a heatmap
# (one row per customer, one column per feature) so missing cells stand out
# NOTE: seaborn is already imported as `sns` in the imports cell, so this re-import is redundant
import seaborn as sns
sns.heatmap(creditcard_df.isnull(), yticklabels=False, cbar=False, cmap='Blues')

The heatmap above makes the missing values in the "MINIMUM_PAYMENTS" column easy to see, but missing values in the other columns are barely visible. Let's try a different method.

In [None]:
# This gives a clearer sense of the missing data in a tabular form:
# the number of null entries in each column
creditcard_df.isnull().sum()

In [None]:
# Goal: fill the missing "MINIMUM_PAYMENTS" entries with the column mean.
# Before writing that as a single line, let's build it up step by step.
# Step 1: a boolean mask with one entry per row - True where "MINIMUM_PAYMENTS" is null, False otherwise
creditcard_df['MINIMUM_PAYMENTS'].isna()

In [None]:
# Step 2: to replace the null values we have to identify the location of those rows.
# Using the mask as a row selector returns only the "MINIMUM_PAYMENTS" entries that are missing,
# i.e. the rows where the mask from the previous cell is True
creditcard_df.loc[creditcard_df['MINIMUM_PAYMENTS'].isna(), 'MINIMUM_PAYMENTS']

In [None]:
# Step 3: replace the missing "MINIMUM_PAYMENTS" entries with the mean of the column
# (mean() skips null values, so it is computed over the observed entries only)
creditcard_df['MINIMUM_PAYMENTS'] = creditcard_df['MINIMUM_PAYMENTS'].fillna(creditcard_df['MINIMUM_PAYMENTS'].mean())

In [None]:
# Let's check again for any missing values: number of null entries per column
creditcard_df.isnull().sum()

Great, no missing values in "MINIMUM_PAYMENTS" now but we still have a missing data for "CREDIT_LIMIT" column

In [None]:
# Same imputation for the "CREDIT_LIMIT" column: replace missing entries with the column mean
creditcard_df['CREDIT_LIMIT'] = creditcard_df['CREDIT_LIMIT'].fillna(creditcard_df['CREDIT_LIMIT'].mean())

In [None]:
# Final check for missing values: every column should now report 0 null entries
creditcard_df.isnull().sum()

Great, no missing values anywhere

In [None]:
# Let's check if we have any duplicate rows/entries in the dataset:
# duplicated() flags every row that repeats an earlier row, and sum() counts the flagged rows
creditcard_df.duplicated().sum()

This returns 0, meaning our dataset does not have any duplicate entries

In [None]:
# Drop the "CUST_ID" column: it is just an identifier, not a feature. In most cases
# IDs are dropped unless they can be used to derive other features.
# errors='ignore' keeps this cell idempotent: re-running it after the column is already
# gone is a no-op instead of raising KeyError (which the in-place drop did).
creditcard_df = creditcard_df.drop(columns='CUST_ID', errors='ignore')
creditcard_df.head()

In [None]:
# Let's check out the names of the remaining columns (the features used for clustering)
creditcard_df.columns

In [None]:
# Number of feature columns left in the DataFrame (second entry of its shape)
n = creditcard_df.shape[1]
n

In [None]:
# Re-inspect column dtypes and non-null counts after imputing missing values and dropping "CUST_ID"
creditcard_df.info()

## Visualize the data.
Now it's time to plot graphs and see what we can derive just by looking at the different features. We will do this using seaborn's Distribution Plot (distplot), which combines a histogram (matplotlib's hist) with a KDE Plot (kdeplot) from the seaborn library.
* KDE Plot represents the Kernel Density Estimate
* KDE is used for visualizing the Probability Density of a continuous variable. 
* KDE demonstrates the probability density at different values in a continuous variable. 

In [None]:
# Distribution plot (histogram + KDE) of every feature, one subplot per column.
feature_names = creditcard_df.columns
# Derive the number of subplot rows from the data instead of hard-coding 17, so the
# grid always matches the number of columns being plotted.
n_features = len(feature_names)

plt.figure(figsize=(10,50))
for position, feature in enumerate(feature_names, start=1):
    plt.subplot(n_features, 1, position)
    # NOTE(review): sns.distplot and the 'bw' KDE keyword are deprecated in recent seaborn
    # releases (they still work but emit warnings); migrate to histplot/kdeplot when the
    # notebook's seaborn version is pinned.
    sns.distplot(creditcard_df[feature], kde_kws={'color':'b', 'lw':3, 'label':'KDE', 'bw':0.1}, hist_kws={'color':'g'})
    plt.title(feature)

plt.tight_layout()
# A few observations
# Mean of balance is somewhere between $1000 and $2000
# 'BALANCE_FREQUENCY' for most customers is updated frequently at 1
# For 'PURCHASES_FREQUENCY', there are two distinct groups of customers at 0 and 1
# For 'ONEOFF_PURCHASES_FREQUENCY' and 'PURCHASES_INSTALLMENTS_FREQUENCY' most users don't do one-off purchases or installment purchases frequently
# Very small number of customers pay their balance in full 'PRC_FULL_PAYMENT'~0
# Average credit limit is around $5000
# Most customers have tenure between 11 and 12

In [None]:
# Correlation shows how pairs of features move together.
# A positive correlation means the two features tend to increase together (directly related);
# a negative correlation means one tends to decrease as the other increases (inversely related).
f, ax = plt.subplots(figsize=(20,10))
correlations = creditcard_df.corr()
sns.heatmap(data=correlations, annot=True, ax=ax)

## Theory behind K-Means
The objective of K-means is simple: identify patterns in the data points and group similar data points together into "k" groups, where "k" is the number of clusters to be defined. In other words, the K-means algorithm identifies k centroids and then allocates every data point to the cluster of its nearest centroid, keeping the distances between points and their centroids as small as possible. The steps are as follows:
1. Choose number of clusters "K".
2. Select random "K" points in the data hyperspace.
3. Assign each data point to its nearest centroid, hence creating "K" number of clusters.
4. Sum the distance between each data point and its nearest/assigned centroid.
5. Re-arrange the centroids (move each one to the mean of the points assigned to it) so that the sum moves towards its minimum.
6. Go to step 3 and repeat until condition 7 or 8 is met.
7. There is no change in the sum of distance.
8. Pre-defined number of iterations are reached

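# Elbow Method
The elbow method runs K-means for a range of "k" values and plots the sum of squared distances of the points to their nearest centroid (the inertia) against "k"; the "elbow" of that curve, where adding more clusters stops reducing the inertia substantially, suggests a good number of clusters.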

In [None]:
#TODO: Add details

# Implement Elbow Method

In [None]:
# Standardize every feature so that features measured on large scales do not dominate
# the distance computations used by K-Means.
scaler = StandardScaler()
scaler.fit(creditcard_df)
creditcard_df_scaled = scaler.transform(creditcard_df)
creditcard_df_scaled.shape

In [None]:
# Display the scaled feature matrix returned by the scaler
creditcard_df_scaled

In [None]:
# Elbow method: fit K-Means for k = 1..19 and record the inertia (within-cluster sum of
# squared distances) for each k.
scores_1 = []
range_values = range(1,20)

for i in range_values:
    # A fixed random_state (and explicit n_init) makes the curve reproducible across runs.
    kmeans = KMeans(n_clusters=i, n_init=10, random_state=42)
    kmeans.fit(creditcard_df_scaled)
    scores_1.append(kmeans.inertia_)

# Plot against the actual number of clusters. Plotting scores_1 alone uses the list
# index (0..18) as x, which shifts every point one cluster to the left and makes the
# elbow read one k too low.
plt.plot(range_values, scores_1, 'bx-')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Inertia')
plt.title('Elbow method')
plt.show()

# Apply K-Means method

In [None]:
# Fit the final K-Means model with 7 clusters on the scaled features.
# A fixed random_state makes the cluster ids and centers reproducible under
# "Restart & Run All"; without it every run can produce a different labelling.
kmeans = KMeans(n_clusters=7, n_init=10, random_state=42)
kmeans.fit(creditcard_df_scaled)
# Cluster id assigned to each customer (one entry per row of the scaled data)
labels = kmeans.labels_

In [None]:
# Shape of the centroid matrix: (number of clusters, number of features)
kmeans.cluster_centers_.shape

In [None]:
# Centroids in the scaled feature space, one row per cluster and one column per feature.
# Pass the column Index directly: wrapping it in a list ([creditcard_df.columns]) makes
# pandas build MultiIndex columns, which complicates later column selection.
cluster_centers = pd.DataFrame(data=kmeans.cluster_centers_, columns=creditcard_df.columns)
cluster_centers

In [None]:
# To understand the centroids better, map them back to the original feature units
# with the scaler's inverse transformation (rounded to 4 decimals).
# Start from kmeans.cluster_centers_ rather than from the `cluster_centers` variable:
# the cell then no longer overwrites its own input, so re-running it does not apply
# the inverse transformation a second time to already-converted values.
cluster_centers = pd.DataFrame(
    data=np.round(scaler.inverse_transform(kmeans.cluster_centers_), 4),
    columns=creditcard_df.columns,  # plain Index, not [Index], to avoid MultiIndex columns
)
cluster_centers

In [None]:
# Cluster id assigned to each row by the fitted K-Means model
labels

In [None]:
# Shape of the label array
labels.shape

In [None]:
# Largest cluster id that was assigned
labels.max()

In [None]:
# Smallest cluster id that was assigned
labels.min()

In [None]:
# Predict the cluster of every row with the ALREADY fitted model.
# fit_predict() would refit K-Means from a fresh initialization, which can permute the
# cluster ids (so the result need not match `labels`) and silently replaces the model
# whose centers were inspected above. predict() reuses the fitted centroids, so
# y_kmeans matches `labels`.
y_kmeans = kmeans.predict(creditcard_df_scaled)
y_kmeans

In [None]:
# Attach the cluster id of every customer as an extra "cluster" column next to the original features
cluster_column = pd.DataFrame({'cluster': labels})
creditcard_df_cluster = pd.concat(
    [creditcard_df, cluster_column],
    axis=1,
)
creditcard_df_cluster.head()

In [None]:
# Plot, for every feature, one histogram per cluster (one figure per feature,
# one subplot per cluster).
# The cluster ids are read from the data instead of hard-coding 7, so the grid stays
# correct if the number of clusters chosen for K-Means changes.
cluster_ids = sorted(creditcard_df_cluster['cluster'].unique())
n_clusters = len(cluster_ids)

for i in creditcard_df.columns:
    # 5 inches of width per cluster (35 for the 7 clusters used above)
    plt.figure(figsize=(5 * n_clusters, 5))
    for position, j in enumerate(cluster_ids, start=1):
        plt.subplot(1, n_clusters, position)
        cluster = creditcard_df_cluster[creditcard_df_cluster['cluster'] == j]
        cluster[i].hist(bins = 20)
        plt.title('{}   \nCluster {}'.format(i,j))
    # Show each figure as soon as it is complete rather than keeping all of them open
    # until the end of the loop.
    plt.show()

# Principal Component Analysis (PCA)

In [None]:
# TODO: Add details

In [None]:
# Obtain the first two principal components of the scaled features.
# With the default svd_solver='auto', scikit-learn may select the randomized SVD solver
# for data of this size; a fixed random_state keeps the projection reproducible.
pca = PCA(n_components=2, random_state=42)
principal_comp = pca.fit_transform(creditcard_df_scaled)
principal_comp

In [None]:
# Wrap the two principal components in a DataFrame with readable column names
pca_columns = ['pca1', 'pca2']
pca_df = pd.DataFrame(principal_comp, columns=pca_columns)
pca_df.head()

In [None]:
# Concatenate the cluster labels to the PCA dataframe.
# Select only the two component columns first: the cell reassigns `pca_df`, so without
# this selection every re-run would append another duplicate "cluster" column.
pca_df = pd.concat([pca_df[['pca1', 'pca2']], pd.DataFrame({'cluster':labels})], axis=1)
pca_df.head()

In [None]:
# Scatter plot of the customers in the 2-D PCA space, coloured by their K-Means cluster
# (one palette colour per cluster)
plt.figure(figsize=(10,10))
ax = sns.scatterplot(
    data=pca_df,
    x='pca1',
    y='pca2',
    hue='cluster',
    palette=['red','green','blue','pink','yellow','gray','purple'],
)