In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Table of Contents

* [Introduction and objectives](#introduction)
* [Get familiar with the data](#familiar)
* [Data preprocessing](#preprocessing) 
    - [Deal with null values](#preprocessing-one)
    - [Deal with feature dropping](#preprocessing-two)
    - [Deal with standardization](#preprocessing-three)
* [Build a machine learning model for clustering (KMeans)](#classifier)
* [Analyzing the models' result](#analyze)
* [Conclusion](#conclusion)

***Written by:*** *Fakhrul Hasbi*

<a id="introduction"></a>
## Introduction

As a brief context, the dataset is a collection of customer-level credit card behaviour data recorded in 17 different attributes. Segmenting customers with similar behaviour might help to create a targeted marketing strategy for each cluster's needs and avoid a *one size fits all* strategy.

## Objectives

1. Identify the features that act as factors for grouping the customers into different clusters.
2. Determine the appropriate number of clusters that group customers with similar behaviours.

<a id="familiar"></a>
## Get familiar with the data 

*-> **Import necessary** libraries*

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Plot appearance for the whole notebook. The seaborn context is applied first and the
# matplotlib style sheet second; both adjust matplotlib settings, so keep this order.
sns.set_context('notebook')
plt.style.use('fivethirtyeight')
# IPython magic: render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline

*-> **Read** the csv file*

In [None]:
# Load the credit card customer dataset from the Kaggle input directory.
csv_path = "/kaggle/input/ccdata/CC GENERAL.csv"
df = pd.read_csv(csv_path)

*-> **Quick** check of the dataframe*

In [None]:
# Preview the first five rows (five is the default for head()).
df.head()

In [None]:
# Print a summary of the raw frame (columns, non-null counts, dtypes) to spot missing values early.
df.info()

<a id="preprocessing"></a>
## Data preprocessing 

<a id="preprocessing-one"></a>
### Deal with null values 

*-> **Checking null values.** If any exist, the affected rows need to be removed.*

In [None]:
# Visualise where values are missing: one heatmap row per record, one column per feature.
missing_mask = df.isnull()
sns.heatmap(missing_mask, cbar=False, yticklabels=False)

In [None]:
# Drop every row that contains at least one missing value.
# The result is bound to `df` again instead of using inplace=True, so the cell reads as a
# plain transformation and the frame object displayed by earlier cells is not mutated.
df = df.dropna()

In [None]:
# Final check: count the missing values left in each feature.
null_counts = df.isnull().sum()
null_counts

<a id="preprocessing-two"></a>
### Deal with feature dropping

*-> **Drop the CUST_ID** column since it is merely an identification for rows*

In [None]:
# CUST_ID only identifies the row and carries no behavioural information, so remove it
# before scaling and clustering.
# errors="ignore" keeps this cell re-runnable: once the column is gone, a second run is a
# no-op instead of raising KeyError.
df = df.drop(columns="CUST_ID", errors="ignore")

In [None]:
# Print the frame summary again to confirm the result of the preprocessing steps so far.
df.info()

<a id="preprocessing-three"></a>
### Deal with standardization

*-> **Rescaling the data** so that all of the features are on a comparable scale*

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise every feature so that no single attribute dominates the distance
# computations used by KMeans.
sc = StandardScaler()
# fit_transform returns a bare array, so the column names and the row index are taken
# from df. Reusing df.index (which may have gaps after the null rows were dropped) keeps
# df_scaled aligned with df label-for-label instead of giving it a fresh 0..n-1 index.
df_scaled = pd.DataFrame(data=sc.fit_transform(df), columns=df.columns, index=df.index)

<a id="classifier"></a>
## Build a machine learning model for clustering (KMeans)

*-> We may proceed to **build the model** using a machine learning clustering algorithm (KMeans)*

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Start with an arbitrary number of clusters (k=3) to get a first feel for the model.
# random_state pins the centroid initialisation so the result is the same on every run;
# n_init is given explicitly because its default differs between scikit-learn releases.
model = KMeans(n_clusters=3, n_init=10, random_state=42)
model.fit(df_scaled)

In [None]:
# Show the inertia of the model that is currently fitted.
fitted_inertia = model.inertia_
print(fitted_inertia)

In [None]:
# Inertia measures how far the points are from the centre of their own cluster, so lower
# means tighter clusters. It tends to keep shrinking as k grows, so we are not after the
# minimum itself: we want a small inertia together with a small number of clusters.
# One model is fitted per k in 1..14. Each fit is seeded so the curve is reproducible,
# and the list is built without rebinding `model`, leaving the previously fitted model intact.
inertia_score = [
    KMeans(n_clusters=k, n_init=10, random_state=42).fit(df_scaled).inertia_
    for k in range(1, 15)
]

In [None]:
# Plot the inertia score against k. inertia_score holds one entry per k starting at k=1,
# so the x values are derived from its length rather than repeating the loop bounds.
# The elbow (where inertia stops decreasing significantly) appears to be at k=6, k=7 or k=8.
k_values = list(range(1, len(inertia_score) + 1))
plt.plot(k_values, inertia_score, marker='o', markersize=12)

In [None]:
# Fit the final KMeans model with k=7, one of the candidate elbow positions (k=6, 7 or 8)
# read off the inertia plot.
# random_state makes the cluster assignment (and hence every plot below) reproducible;
# n_init is given explicitly because its default differs between scikit-learn releases.
model = KMeans(n_clusters=7, n_init=10, random_state=42)
# Cluster label for each row of df_scaled, in row order.
predicted_labels = model.fit_predict(df_scaled)

<a id="analyze"></a>
## Analyzing the models' result 

*-> **Analyze** the model result. Refer to the comment for further detail.*

In [None]:
# Overview of every pairwise feature relationship in a single grid of plots.
sns.pairplot(data=df)

In [None]:
# Attach the cluster label of every customer to the data frame as a new column.
cluster_column = 'predicted_class'
df[cluster_column] = predicted_labels

In [None]:
# Absolute pairwise correlations between the features (the cluster label is excluded).
df_corr = df.drop("predicted_class", axis=1).corr().abs()

# For every feature, pick the *other* feature it is most strongly correlated with.
# The feature itself is dropped first, otherwise its self-correlation of 1 would always win;
# idxmax then returns the label of the highest remaining correlation.
# Sorting the two names makes (A, B) and (B, A) identical, and the set removes such
# duplicate combinations regardless of their order.
best_correlated_features = {
    tuple(sorted((col, df_corr[col].drop(labels=[col]).idxmax())))
    for col in df_corr.columns
}

In [None]:
# Display the unique feature pairs collected in best_correlated_features.
best_correlated_features

In [None]:
# Draw one scatter plot per best-correlated feature pair, coloured by predicted cluster.
# best_correlated_features is a set of string tuples, whose iteration order can change
# between kernel restarts; sorting gives the figures a stable, reproducible order.
for (x, y) in sorted(best_correlated_features):
    plt.figure()
    sns.scatterplot(data=df, x=x, y=y, hue='predicted_class', palette='coolwarm')
    plt.title("{} against {}".format(x, y))

In [None]:
# Payments against purchases, coloured by cluster: this pair is the easiest one to
# interpret intuitively.
plt.figure()
scatter_options = dict(x="PAYMENTS", y="PURCHASES", hue='predicted_class', palette="coolwarm")
sns.scatterplot(data=df, **scatter_options)
plt.title("Payments against Purchases")

In [None]:
# Cluster centres of the final KMeans model, one row per cluster.
# The model was fitted on df_scaled, so the centres are expressed in scaled units and the
# column names are taken directly from df_scaled. This avoids depending on the
# 'predicted_class' column having already been added to df.
df_centroids = pd.DataFrame(data=model.cluster_centers_, columns=df_scaled.columns)
df_centroids

<a id="conclusion"></a>
## Conclusion

*-> **End of the notebook assignment:** one of the graphs above (payments against purchases) with k=7 shows a group of customers who spend more on purchases and also make higher payments, although only a small number of customers fall within this cluster. A targeted marketing strategy might be applied to this cluster.*