In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found in it.
input_paths = (
    os.path.join(folder, entry)
    for folder, _, entries in os.walk('/kaggle/input')
    for entry in entries
)
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

The goal of this project is to use an AI/ML model to segment customers for a targeted ad campaign. To make the campaign successful, we have to split the customers into at least 3 distinct groups, a practice known as "marketing segmentation". This helps to maximize the campaign's conversion rate. For example, four general segments are:
1. Transactors: Customers who pay the least amount of interest and are very careful with their money. Generally they have a lower balance (USD 104), a lower cash advance (USD 303), and a percent of full payment of 23%.
2. Revolvers: (Most lucrative sector) They use the credit card as a loan. Generally they have the highest balance (USD 5000) and cash advance (USD 5000), a low purchase frequency, a high cash advance frequency (0.5), and a high number of cash advance transactions (16).
3. VIP/Prime: (This group is a specific target for increasing the credit limit and spending habits) High credit limit (USD 16K) and a high percentage of full payment.
4. Low Tenure: Low tenure (7 Years), low balance.

The steps performed in this task are:
1. Visualize and explore datasets
2. Scikit-Learn library to find the optimal number of clusters using elbow method
3. k-means using Scikit-Learn to perform customer segmentation
4. Principal Component Analysis (PCA) technique to perform dimensionality reduction and data visualization

## Note: This notebook will be updated with the passage of time. Your feedback will be highly appreciated. Please upvote, if you like it and find it helpful. Your support in terms of upvote and positive feedback will keep me motivated :)

In [None]:
# import libraries
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In [None]:
# Load the credit-card dataset from the Kaggle input directory.
DATA_PATH = '/kaggle/input/credit-card-segmentation/CC GENERAL.csv'
creditcard_df = pd.read_csv(DATA_PATH)

In [None]:
# Show the loaded DataFrame as this cell's output.
creditcard_df

In [None]:
# info() lists each column's dtype and non-null count, which shows where values are missing
# (here: CREDIT_LIMIT and MINIMUM_PAYMENTS).
creditcard_df.info()

In [None]:
# describe() summarizes every numeric column (count, mean, std, min, quartiles, max).
creditcard_df.describe()
# Author's reading of the summary: the balance is updated frequently (mean frequency ~0.9 on a 0-1 scale),
# and on average about 15 percent of payments are full payments.

In [None]:
# Look up the customer(s) with the largest one-off purchase.
# The maximum is computed from the data instead of comparing against a float literal
# copied from the describe() output (40761.25 on this dataset), so the lookup keeps
# working if the data changes and does not depend on exact float equality with a typed value.
max_oneoff_purchase = creditcard_df['ONEOFF_PURCHASES'].max()
creditcard_df[creditcard_df['ONEOFF_PURCHASES'] == max_oneoff_purchase]

In [None]:
# Look up the customer(s) with the largest cash advance amount.
# Note: this filters on the CASH_ADVANCE column (an amount), not on the number of
# cash advance transactions. The maximum is derived from the data rather than from the
# hard-coded threshold 47137 read off the describe() output.
max_cash_advance = creditcard_df['CASH_ADVANCE'].max()
creditcard_df[creditcard_df['CASH_ADVANCE'] == max_cash_advance]

## Visualize and Explore Data

In [None]:
# Visualize where values are missing: each highlighted cell of the heatmap is a null entry.
# Only a small share of the data appears to be missing.
missing_mask = creditcard_df.isnull()
sns.heatmap(missing_mask, cmap='winter_r', cbar=False, yticklabels=False)

In [None]:
# Count missing values per column. The author reports 1 null in CREDIT_LIMIT
# and 313 in MINIMUM_PAYMENTS.
creditcard_df.isnull().sum()

In [None]:
# Fill the missing values in both affected columns with that column's mean.
# The filled Series is assigned back to the DataFrame instead of calling
# fillna(inplace=True) on a column selection: that pattern is chained assignment and,
# with pandas copy-on-write, modifies a temporary object and leaves creditcard_df unchanged.
for column in ('MINIMUM_PAYMENTS', 'CREDIT_LIMIT'):
    creditcard_df[column] = creditcard_df[column].fillna(creditcard_df[column].mean())

In [None]:
# Number of missing values left in MINIMUM_PAYMENTS after the mean imputation.
creditcard_df['MINIMUM_PAYMENTS'].isnull().sum()

In [None]:
# Number of missing values left in CREDIT_LIMIT after the mean imputation.
creditcard_df['CREDIT_LIMIT'].isnull().sum()

In [None]:
# Redraw the null-value heatmap to confirm that no missing values remain.
remaining_missing = creditcard_df.isnull()
sns.heatmap(remaining_missing, cmap='winter_r', cbar=False, yticklabels=False)

In [None]:
# Count fully duplicated rows; the author reports that all entries are unique.
creditcard_df.duplicated().sum()

In [None]:
# Drop the customer ID column: the notebook treats it as a plain sequential identifier
# that carries no information for clustering.
# Reassigning the result with errors='ignore' (instead of inplace=True) makes the cell
# safe to re-run; a second execution no longer raises KeyError once CUST_ID is gone.
# The redundant axis=1 is omitted because columns= already selects the axis.
creditcard_df = creditcard_df.drop(columns='CUST_ID', errors='ignore')

In [None]:
# Report how many feature columns the DataFrame currently has.
n_columns = creditcard_df.shape[1]
print(f'Number of columns = {n_columns}')

In [None]:
# List the column labels of the DataFrame.
creditcard_df.columns

In [None]:
# Plot the distribution of every feature with distplot, which overlays a histogram with a
# KDE (kernel density estimate of the variable's probability density).
# The subplot grid is sized from the DataFrame instead of a hard-coded 17 rows, so the
# cell keeps working if the number of columns differs (e.g. CUST_ID was not dropped).
# NOTE(review): sns.distplot and the 'bw' KDE keyword are deprecated in recent seaborn
# releases; they are kept to preserve the original figure - consider migrating to histplot.
feature_names = list(creditcard_df.columns)
plt.figure(figsize=(10, 50))
for position, feature in enumerate(feature_names, start=1):
    plt.subplot(len(feature_names), 1, position)
    sns.distplot(
        creditcard_df[feature],
        kde_kws={'color': 'b', 'lw': 3, 'label': 'KDE', 'bw': 1.5},
        hist_kws={'color': 'g'},
    )
    plt.title(feature)
plt.tight_layout()

In [None]:
# Preview the first rows of the DataFrame.
creditcard_df.head()

Now we will plot correlation between features.
The correlation coefficient takes values between -1 and 1.
1.  A value closer to 0 implies weaker correlation (exact 0 implying no correlation)
2.  A value closer to 1 implies stronger positive correlation
3.  A value closer to -1 implies stronger negative correlation

In [None]:
# Pairwise Pearson correlation between the features, drawn as an annotated heatmap.
corr = creditcard_df.corr()
plt.figure(figsize=(12, 10))
sns.heatmap(data=corr, cmap=plt.cm.Reds, annot=True)
plt.show()

## K-Means Algorithm

K-means is an unsupervised machine learning algorithm: it groups data into clusters without using any labels. It uses Euclidean distance to measure similarity between attribute values. For more details, please click [here](https://towardsdatascience.com/machine-learning-algorithms-part-9-k-means-example-in-python-f2ad05ed5203).

## Elbow Method

The elbow method is a very popular way to choose the optimal number of clusters for a given problem. The within-cluster sum of
squared distances is calculated and plotted against the number of clusters. The elbow point of the plot is selected as the optimal number
of clusters for the given problem. For more details, please click [here](https://predictivehacks.com/k-means-elbow-method-code-for-python/#:~:text=K-Means%20Elbow%20Method%20code%20for%20Python.%20K-Means%20is,number%20is%20not%20optimal%20for%20the%20specific%20case.)

In [None]:
# Standardize every feature with StandardScaler before clustering.
# The fitted scaler is kept so the scaling can be inverted later.
scaler = StandardScaler().fit(creditcard_df)
creditcard_df_scaled = scaler.transform(creditcard_df)

In [None]:
# Shape of the scaled array produced by the scaler.
creditcard_df_scaled.shape

In [None]:
# Display the scaled feature values.
creditcard_df_scaled

In [None]:
# Elbow method: fit K-Means for k = 1..19 and record each model's inertia, i.e. the
# within-cluster sum of squared distances of the points to their centroid.
cluster_range = range(1, 20)
first_score = []
for k in cluster_range:
    # A fixed random_state (with an explicit n_init) makes the curve reproducible between runs.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(creditcard_df_scaled)
    first_score.append(kmeans.inertia_)

# Plot against the real k values. plt.plot(first_score) alone would use the list index
# (0..18) as the x-axis, so the elbow would be read one cluster too low.
plt.plot(list(cluster_range), first_score, 'bx')
plt.xticks(list(cluster_range))
plt.xlabel('Number of clusters (k)')
plt.ylabel('Inertia (within-cluster sum of squares)')
plt.title('Elbow method')
plt.show()

In [None]:
# The author reads the elbow of the plot above at about 7 or 8 clusters; 7 is used here.
# random_state (with an explicit n_init) pins the initialization so that the cluster
# numbering and the labels are the same on every run.
kmeans = KMeans(n_clusters=7, n_init=10, random_state=42)
kmeans.fit(creditcard_df_scaled)
labels = kmeans.labels_  # cluster index assigned to each row

In [None]:
# Cluster labels taken from the fitted model (kmeans.labels_).
labels

In [None]:
# Shape of the fitted model's centroid array.
kmeans.cluster_centers_.shape

In [None]:
# Build a DataFrame of the cluster centres (still in standardized units).
# The column Index is passed directly: wrapping it in a list ([creditcard_df.columns])
# makes pandas build a one-level MultiIndex with tuple labels such as ('BALANCE',)
# instead of plain column names.
cluster_centers = pd.DataFrame(data=kmeans.cluster_centers_, columns=creditcard_df.columns)
cluster_centers

In [None]:
# The centres are in standardized units; map them back to the original feature scale so
# the values can be interpreted directly.
# The inverse transform starts from kmeans.cluster_centers_ rather than from the
# cluster_centers variable that this cell overwrites, so re-running the cell does not
# apply the inverse scaling twice. Plain column names are used (no list wrapper) to avoid
# creating a one-level MultiIndex.
cluster_centers = pd.DataFrame(
    data=scaler.inverse_transform(kmeans.cluster_centers_),
    columns=creditcard_df.columns,
)
cluster_centers
# Comparing these per-feature values lets us relate the clusters to the segments described
# at the start of the notebook (e.g. Transactors, VIP).

In [None]:
labels.shape  # shape of the label array (the author notes one value per data point)

In [None]:
# Largest cluster index present in the labels.
labels.max()

In [None]:
# Smallest cluster index present in the labels.
labels.min()

In [None]:
# Get the cluster label of each point from the model that is already fitted.
# predict() is used instead of fit_predict(): fit_predict() would train K-Means again and
# replace the centroids, so the model could stop matching the `labels` array and the
# cluster centres inspected in the previous cells.
ykmeans = kmeans.predict(creditcard_df_scaled)
ykmeans

In [None]:
# Attach the cluster label to the original (unscaled) data so that each cluster's
# feature distributions can be plotted.
# assign() adds the labels by position; pd.concat(axis=1) aligns on the index instead and
# would introduce NaN rows if creditcard_df ever had a non-default index.
creditcard_df_cluster = creditcard_df.assign(cluster=labels)
creditcard_df_cluster.head()

In [None]:
# For every feature, draw one histogram per cluster side by side.
# The cluster ids are read from the data rather than hard-coded as range(7), so the cell
# stays correct if the number of clusters is changed.
cluster_ids = sorted(creditcard_df_cluster['cluster'].unique())
for feature in creditcard_df.columns:
    fig = plt.figure(figsize=(35, 5))
    for position, cluster_id in enumerate(cluster_ids, start=1):
        plt.subplot(1, len(cluster_ids), position)
        members = creditcard_df_cluster[creditcard_df_cluster['cluster'] == cluster_id]
        members[feature].hist(bins=20)
        plt.title('{} \nCluster {} '.format(feature, cluster_id))
    plt.show()
    # Release the figure once shown; this loop creates one figure per feature.
    plt.close(fig)

## Principal Component Analysis (PCA)

1. PCA is an unsupervised ML algorithm that tries to reduce the dimension of data while preserving the actual information. 
2. PCA algorithm could be used for dimensionality reduction by trying to find a new set of features called components while maintaining the original information. 
For more details and real time example, please click [here](https://towardsdatascience.com/pca-using-python-scikit-learn-e653f8989e60)

In [None]:
# Reduce the standardized features to 2 principal components so the clusters can be
# drawn on a 2-D scatter plot.
# random_state pins the result in case scikit-learn selects its randomized SVD solver.
pca = PCA(n_components=2, random_state=42)
pca_components = pca.fit_transform(creditcard_df_scaled)
pca_components

In [None]:
# Wrap the two principal components in a DataFrame with named columns.
pca_column_names = ['pca1', 'pca2']
pca_df = pd.DataFrame(pca_components, columns=pca_column_names)
pca_df.head()

In [None]:
# Add the cluster label of each point to the PCA coordinates.
# assign() overwrites an existing 'cluster' column, so re-running this cell is harmless;
# the previous pd.concat appended a second, duplicate 'cluster' column on every re-run.
pca_df = pca_df.assign(cluster=labels)
pca_df.head()

In [None]:
# Scatter plot of the two principal components, coloured by cluster.
# The original seven colours are used whenever they are enough; with more clusters a
# palette with one colour per cluster is generated, so the palette length always matches
# the number of hue levels.
base_palette = ['red', 'green', 'blue', 'pink', 'yellow', 'gray', 'black']
n_plot_clusters = pca_df['cluster'].nunique()
if n_plot_clusters <= len(base_palette):
    cluster_palette = base_palette[:n_plot_clusters]
else:
    cluster_palette = sns.color_palette('husl', n_plot_clusters)

plt.figure(figsize=(10, 10))
ax = sns.scatterplot(x='pca1', y='pca2', hue='cluster', data=pca_df, palette=cluster_palette)
plt.show()

## Acknowledgment

I am really thankful to [Coursera](https://www.coursera.org/projects/machine-learning-for-customer-segmentation) and [Ryan Ahmed](https://www.coursera.org/instructor/~48777395) for providing such a valuable opportunity to learn using a real-time project.