In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, normalize
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In [None]:
# Load the credit card dataset into a DataFrame
creditCardData = pd.read_csv("../input/ccdata/CC GENERAL.csv")

# Features and their meaning
# CUST_ID: Identification of the credit card holder
# BALANCE: Balance amount left in the customer's account to make purchases
# BALANCE_FREQUENCY: How frequently the balance is updated, score between 0 and 1 (1 = frequently updated, 0 = not frequently updated)
# PURCHASES: Amount of purchases made from the account
# ONEOFFPURCHASES: Maximum purchase amount done in one go
# INSTALLMENTS_PURCHASES: Amount of purchases done in installments
# CASH_ADVANCE: Cash in advance given by the user
# PURCHASES_FREQUENCY: How frequently purchases are being made, score between 0 and 1 (1 = frequently purchased, 0 = not frequently purchased)
# ONEOFF_PURCHASES_FREQUENCY: How frequently purchases are happening in one go (1 = frequently purchased, 0 = not frequently purchased)
# PURCHASES_INSTALLMENTS_FREQUENCY: How frequently purchases in installments are being done (1 = frequently done, 0 = not frequently done)
# CASH_ADVANCE_FREQUENCY: How frequently the cash in advance is being paid
# CASH_ADVANCE_TRX: Number of transactions made with "Cash in Advance"
# PURCHASES_TRX: Number of purchase transactions made
# CREDIT_LIMIT: Credit card limit for the user
# PAYMENTS: Amount of payments done by the user
# MINIMUM_PAYMENTS: Minimum amount of payments made by the user
# PRC_FULL_PAYMENT: Percent of full payment paid by the user
# TENURE: Tenure of the credit card service for the user


In [None]:
# Show the loaded DataFrame as the cell output for a first visual check
creditCardData

In [None]:
# Get some insights from our data: column dtypes and non-null counts
creditCardData.info()
# We observe some null values in MINIMUM_PAYMENTS and CREDIT_LIMIT

In [None]:
# For more statistical insight (count, mean, std, quantiles per column), use .describe()
creditCardData.describe()

In [None]:
# Visualise where values are missing: the heatmap is drawn from the boolean null mask
missing_mask = creditCardData.isnull()
sns.heatmap(missing_mask, yticklabels = False, cbar = False, cmap = "Blues", linecolor = "Black")

In [None]:
# Alternatively, count the missing values in each column
creditCardData.isna().sum()

In [None]:
# Replace every missing MINIMUM_PAYMENTS entry with the mean of that column
minimum_payments_mean = creditCardData['MINIMUM_PAYMENTS'].mean()
creditCardData['MINIMUM_PAYMENTS'] = creditCardData['MINIMUM_PAYMENTS'].fillna(minimum_payments_mean)
# Rows that already had a value are left unchanged; only the null entries receive the mean

In [None]:
# Check the per-column null counts again
creditCardData.isna().sum()
# MINIMUM_PAYMENTS should now show zero missing values

In [None]:
# CREDIT_LIMIT gets the same treatment; first count how many values are missing
creditCardData['CREDIT_LIMIT'].isna().sum()

In [None]:
# Replace the missing CREDIT_LIMIT entries with the column mean, then confirm no nulls remain
credit_limit_mean = creditCardData['CREDIT_LIMIT'].mean()
creditCardData['CREDIT_LIMIT'] = creditCardData['CREDIT_LIMIT'].fillna(credit_limit_mean)
creditCardData['CREDIT_LIMIT'].isnull().sum()

In [None]:
# Drop the CUST_ID column: it only identifies the card holder and is not needed for clustering.
# Re-assigning the result (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second run no longer raises KeyError once the column is already gone.
creditCardData = creditCardData.drop(columns = ['CUST_ID'], errors = 'ignore')

In [None]:
# Check the number of columns after the drop (expected to go from 18 to 17)
n = creditCardData.shape[1]
n

In [None]:
# Columns remaining after CUST_ID was dropped
creditCardData.columns

In [None]:
# Quick image view of the correlation matrix between all features
feature_corr = creditCardData.corr()
plt.matshow(feature_corr)

In [None]:
# Annotated correlation heatmap, drawn explicitly on the axes of a wide figure
f, ax = plt.subplots(figsize = (20,10))
feature_corr = creditCardData.corr()
sns.heatmap(feature_corr, annot = True, ax = ax)

In [None]:
# Let's apply the elbow method to find a suitable number of clusters 'k' for our model

In [None]:
# First, create the scaler that will be used to standardize the features
scaler = StandardScaler()

In [None]:
# Fit the scaler on the data and transform it in one step;
# the fitted scaler is reused later to map cluster centroids back to the original units
creditCardScaled = scaler.fit_transform(creditCardData)

In [None]:
# Show the scaled feature matrix as the cell output
creditCardScaled

In [None]:
# Initialize the list that will store the WCSS (inertia) for each candidate k;
# the candidates come from range(1,30), i.e. k = 1 to 29
score = []

In [None]:
# Candidate cluster counts: range(1,30) yields k = 1, 2, ..., 29 (the upper bound is exclusive)
range_val = range(1,30)

In [None]:
# Fit one KMeans model per candidate k and record its inertia (WCSS).
# The list is rebuilt here so that re-running this cell does not append a second
# set of scores to the previous one.
# random_state pins the centroid initialisation so the elbow curve is reproducible;
# n_init is set explicitly because its library default has changed across
# scikit-learn releases.
score = []
for i in range_val:
    model = KMeans(n_clusters = i, n_init = 10, random_state = 42)
    model.fit(creditCardScaled)
    score.append(model.inertia_)

In [None]:
# Plot the WCSS for every candidate k.
# The scores are plotted against range_val so the x-axis shows the real number of
# clusters; plotting `score` alone would use the 0-based list index and shift
# every point one k to the left, which makes the elbow easy to misread.
plt.figure(figsize= (10,10))
plt.plot(list(range_val), score,'bx-')
plt.xticks(np.arange(0,30, step = 1))
plt.xlabel('Number of clusters k')
plt.ylabel('WCSS (inertia)')
plt.title('Elbow method')
plt.show()


In [None]:
# As we can see, the last k before the curve becomes roughly linear is 8,
# so we take k = 8 as the optimal number of clusters for our data

In [None]:
# Fit the final k-means model with the k chosen from the elbow plot.
# random_state makes the cluster ids and centroids reproducible between runs, so the
# interpretation of "cluster 3" etc. in the cells below stays valid after a re-run.
# n_init is set explicitly because its library default has changed across
# scikit-learn releases.
model = KMeans(n_clusters = 8, n_init = 10, random_state = 42)
model.fit(creditCardScaled)
# Cluster index assigned to each row of the scaled data
labels = model.labels_

In [None]:
# Inspect the shape of the centroid matrix (one row per cluster, one column per feature)
model.cluster_centers_.shape

In [None]:
# Put the (still scaled) centroids into a DataFrame labelled with the feature names.
# The column Index is passed directly: wrapping it in a list would make pandas build
# MultiIndex columns, so cluster_centroids['BALANCE'] would not return a plain Series.
cluster_centroids = pd.DataFrame(data = model.cluster_centers_, columns = creditCardData.columns)

In [None]:
# Show the centroids (in scaled feature space at this point)
cluster_centroids

In [None]:
# Map the centroids back from the standardized space to the original feature units.
# The inverse transform is applied to model.cluster_centers_ rather than to the
# previous value of cluster_centroids, so re-running this cell cannot un-scale the
# centroids a second time.
# The column Index is passed directly (not wrapped in a list) to get flat column names.
cluster_centroids = pd.DataFrame(
    data = scaler.inverse_transform(model.cluster_centers_),
    columns = creditCardData.columns,
)

In [None]:
# Show the centroids after the inverse transform (original feature units)
cluster_centroids

In [None]:
#Now we can extract certain groups according to our needs


In [None]:
# Now let's label our data according to the cluster each row belongs to;
# first check that there is one label per row
labels.shape

In [None]:
# Largest cluster index present in the labels
labels.max()

In [None]:
# Smallest cluster index present in the labels
labels.min()

In [None]:
# Also get the cluster predicted for each row by the already fitted model.
# predict() reuses the centroids learned above. fit_predict() would train the model
# again, so its cluster ids and centroids could stop matching the `labels` and
# `cluster_centroids` that were extracted earlier.
y_model = model.predict(creditCardScaled)
y_model

In [None]:
# Attach each row's cluster label as a new 'cluster' column.
# assign() adds the labels by position, so the result does not depend on the frame
# having a default 0..n-1 index; an index-aligned concat would introduce NaN rows
# if rows had been filtered before this point.
creditCardDataLabeled = creditCardData.assign(cluster = labels)

In [None]:
# Show the data together with its cluster label
creditCardDataLabeled

In [None]:
# Histogram of every feature, one panel per cluster.
# The number of panels is taken from the fitted model instead of a hard-coded 8, so
# the figure stays correct if a different k is chosen.
n_clusters = model.n_clusters
for i in creditCardData.columns:
    # squeeze=False keeps `axes` two-dimensional even when there is a single cluster
    fig, axes = plt.subplots(1, n_clusters, figsize = (35,5), squeeze = False)
    for j in range(n_clusters):
        panel = axes[0, j]
        # Rows that were assigned to cluster j
        cluster = creditCardDataLabeled[creditCardDataLabeled['cluster'] == j]
        cluster[i].hist(bins = 20, ax = panel)
        panel.set_title('{} \nCluster {}'.format(i,j))

    plt.show()
  

In [None]:
# Apply PCA to project the scaled features onto two principal components
# so the clusters can be drawn in 2D.
# random_state is fixed for reproducibility: depending on the scikit-learn version
# and the data shape, the default solver choice may be a randomized one.
pca = PCA(n_components = 2, random_state = 42)
principal_comp = pca.fit_transform(creditCardScaled)
principal_comp

In [None]:
# Wrap the two principal components in a DataFrame with named columns
pca_dataframe = pd.DataFrame(data = principal_comp,columns = ['component_1','component_2'])
pca_dataframe

In [None]:
# Add each row's cluster label next to its principal components.
# assign() overwrites an existing 'cluster' column, so re-running this cell does not
# append a duplicate 'cluster' column the way a repeated concat would.
pca_dataframe = pca_dataframe.assign(cluster = labels)
pca_dataframe

In [None]:
# Finally, scatter the two principal components with one colour per cluster
cluster_palette = ['green','red','pink','blue','gray','yellow','black','purple']
plt.figure(figsize=(10,10))
ax = sns.scatterplot(data = pca_dataframe, x = "component_1", y = "component_2", hue = "cluster", palette = cluster_palette)

In [None]:
#the_end