# Business Problem

This case requires developing a customer segmentation to define a marketing strategy. The sample dataset summarizes the usage behavior of about 9000 active credit card holders during the last 6 months. The file is at the customer level with 18 behavioral variables.

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
cust_data = pd.read_csv('/kaggle/input/ccdata/CC GENERAL.csv')

In [None]:
cust_data.tail()

In [None]:
cust_data.info()

# Data Inspection

In [None]:
#convert column names to lower case for easy interpretation
cust_data.columns = cust_data.columns.str.lower()

In [None]:
cust_data.columns

In [None]:
cust_data.head()

In [None]:
## Create important derived variables

def month_avg_purchase(prch, tenure):
    """Return ``prch`` divided by ``tenure``.

    Used to spread the total purchase amount over the tenure. No guard is
    applied for a zero ``tenure``.
    """
    purchase_per_period = prch / tenure
    return purchase_per_period

def month_cash_advance(cash, tenure):
    """Return ``cash`` divided by ``tenure``.

    Used to spread the total cash-advance amount over the tenure. No guard is
    applied for a zero ``tenure``.
    """
    cash_per_period = cash / tenure
    return cash_per_period

def monthly_usage(blnc, limit):
    """Return the balance-to-limit ratio ``blnc / limit``.

    No guard is applied for a zero or missing ``limit``.
    """
    usage_ratio = blnc / limit
    return usage_ratio

def prch_type(x, y):
    """Label a customer by the kinds of purchases they make.

    ``x`` is the one-off purchase amount and ``y`` the installment purchase
    amount. Returns one of ``'none'``, ``'one_off'``, ``'installments'`` or
    ``'both'``. If either amount fails every comparison (e.g. it is NaN),
    ``None`` is returned.
    """
    if x > 0:
        if y > 0:
            return 'both'
        if y <= 0:
            return 'one_off'
    elif x <= 0:
        if y > 0:
            return 'installments'
        if y <= 0:
            return 'none'
    # Neither amount matched a branch (NaN input): no label.
    return None
        

In [None]:
# Derive the ratio features column-wise. The helpers are plain divisions, so
# they accept whole columns; this avoids one Python-level call per row and
# follows pandas division semantics for zero or missing divisors.
cust_data['monthly_avg_purchase'] = month_avg_purchase(cust_data['purchases'], cust_data['tenure'])
cust_data['monthly_cash_advance'] = month_cash_advance(cust_data['cash_advance'], cust_data['tenure'])
cust_data['monthly_usage'] = monthly_usage(cust_data['balance'], cust_data['credit_limit'])
# prch_type branches on scalar comparisons, so it still runs once per customer.
cust_data['purchase_type'] = [
    prch_type(oneoff, installments)
    for oneoff, installments in zip(cust_data['oneoff_purchases'], cust_data['installments_purchases'])
]


In [None]:
cust_data.head()

In [None]:
## Renaming columns for better understanding
# purchases_frequency - frequency of months with at least 1 purchase
# balance_frequency - balance in last 12 months / balance

cust_data.rename(columns = {'balance' : 'avg_monthly_balance'}, inplace=True)

In [None]:
cust_data.head()

## Profiling

In [None]:
import pandas_profiling
cust = cust_data.profile_report()
#cust.to_file(output_file = 'cust_segmentation_profile.html')

In [None]:
cust

From the profile report we can see that purchases is highly correlated with the one-off purchases variable.

### Identify categorical and continuous variables

In [None]:
cust_data.info()

In [None]:
# Split the columns by dtype: numeric columns feed the continuous summaries,
# object columns are treated as categorical. The dtype mapping is read once
# instead of being rebuilt for every column.
column_dtypes = cust_data.dtypes
numeric_var_names = [col for col, dtype in column_dtypes.items()
                     if dtype in ['float64', 'int64', 'float32', 'int32']]
cat_var_names = [col for col, dtype in column_dtypes.items() if dtype in ['object']]
print(numeric_var_names)
print(cat_var_names)

In [None]:
cust_data.isna().sum()

## Data audit report

In [None]:
def continous_var_summary(x):
    """Build a one-column audit of a numeric Series.

    Returns a Series holding the non-null count, missing count, sum, mean,
    median, standard deviation, variance, minimum, a ladder of percentiles
    (P1 through P99) and the maximum, labelled in that order.
    """
    percentile_ladder = [('P1', 0.01), ('P5', 0.05), ('P10', 0.10), ('P25', 0.25),
                         ('P50', 0.50), ('P75', 0.75), ('P90', 0.90), ('P95', 0.95),
                         ('P99', 0.99)]

    labels = ['N', 'NMiss', 'Sum', 'Mean', 'Median', 'SD', 'Var', 'Min']
    stats = [x.count(), x.isnull().sum(), x.sum(), x.mean(), x.median(),
             x.std(), x.var(), x.min()]

    for label, level in percentile_ladder:
        labels.append(label)
        stats.append(x.quantile(level))

    labels.append('Max')
    stats.append(x.max())
    return pd.Series(stats, index=labels)

In [None]:
def categorical_var_summary(x):
    """Build a one-column audit of a categorical Series.

    Returns a Series with the non-null count (``N``), missing count
    (``NMiss``), the most frequent value (``Mode``), its frequency (``Freq``)
    and that frequency as a percentage of the non-null count (``Percent``,
    rounded to 2 decimals).

    A column with no non-null values has no mode: ``Mode`` and ``Percent`` are
    NaN and ``Freq`` is 0, rather than raising IndexError.
    """
    summary_index = ['N', 'NMiss', 'Mode', 'Freq', 'Percent']
    n_valid = x.count()
    n_missing = x.isnull().sum()

    if n_valid == 0:
        return pd.Series([n_valid, n_missing, float('nan'), 0, float('nan')],
                         index=summary_index)

    # The most frequent value sits first after the descending sort.
    counts = x.value_counts().sort_values(ascending = False)
    mode_value = counts.index[0]
    mode_freq = counts.iloc[0]
    return pd.Series([n_valid, n_missing, mode_value, mode_freq,
                      round((mode_freq*100)/n_valid, 2)],
                     index=summary_index)

In [None]:
cust_data[numeric_var_names].apply(lambda x : continous_var_summary(x)).T.round(1)

In [None]:
cust_data[cat_var_names].apply(lambda x : categorical_var_summary(x)).T.round(1)

# Data Cleaning

#### 1. Outlier treatment

In [None]:
#Handling Outliers - cap at the 5th/95th percentiles by default (pass e.g. 0.01/0.99 for a looser cap)
def outlier_capping(x, lower_q=0.05, upper_q=0.95):
    """Cap a numeric Series at its lower and upper percentiles.

    Parameters
    ----------
    x : pandas.Series
        Values to cap.
    lower_q, upper_q : float
        Quantile levels used as the floor and the ceiling.

    Returns
    -------
    pandas.Series
        A new Series with values below the ``lower_q`` quantile raised to it
        and values above the ``upper_q`` quantile lowered to it.
    """
    # Take both bounds from the untouched data. Computing the floor after the
    # ceiling has been applied shifts it on short series.
    floor = x.quantile(lower_q)
    ceiling = x.quantile(upper_q)
    return x.clip(lower=floor, upper=ceiling)

In [None]:
cust_data[numeric_var_names]=cust_data[numeric_var_names].apply(lambda x: outlier_capping(x))

#### 2. Missing value imputation

In [None]:
#Handling missing values
def Missing_imputation(x):
    """Return a copy of ``x`` with missing values replaced by the column mean."""
    column_mean = x.mean()
    return x.fillna(column_mean)

In [None]:
cust_data[numeric_var_names]=cust_data[numeric_var_names].apply(lambda x: Missing_imputation(x))

In [None]:
cust_data[numeric_var_names].apply(lambda x : continous_var_summary(x)).round(3).T

In [None]:
### Correlation Matrix

In [None]:
# Correlate the numeric columns only: the frame still holds string columns at
# this point, and DataFrame.corr raises on them in pandas >= 2.0 (older
# versions silently skipped them, so the resulting matrix is the same).
cust_corr = cust_data.select_dtypes(include='number').corr()
#cust_corr.to_excel('cust_corr.xlsx')
sns.heatmap(cust_corr)

1. purchases with one-off purchases & monthly_avg_purchase
2. one-off purchases with monthly_avg_purchase
3. cash_advance with monthly_cash_advance

We can drop the purchases and cash_advance columns based on our findings from the correlation matrix.

#### 3. Dummy variable creation

In [None]:
#helper that one-hot encodes a single column
def create_dummies(df, colname):
    """Replace ``colname`` in ``df`` with its dummy (one-hot) columns.

    The dummies are prefixed with ``colname`` and the first level is dropped.
    The remaining original columns keep their order, followed by the new
    dummy columns. ``df`` itself is left untouched; a new frame is returned.
    """
    encoded = pd.get_dummies(df[colname], prefix=colname, drop_first=True)
    remaining = df.drop(columns=colname)
    return pd.concat([remaining, encoded], axis=1)

In [None]:
# Categorical columns to encode, excluding the cust_id identifier.
cat_var_names = cust_data[cat_var_names].columns.difference(['cust_id'])
# Take an explicit copy: cat_var is assigned into later, and writing into a
# slice of cust_data triggers pandas' SettingWithCopy warning.
cat_var = cust_data[cat_var_names].copy()
cat_var.head()

In [None]:
# One-hot encode each categorical column in turn. create_dummies returns a new
# frame in which the source column is replaced by its dummy columns, so cat_var
# is rebound on every pass.
for c_feature in cat_var_names:
    cat_var[c_feature] = cat_var[c_feature].astype('category')
    cat_var = create_dummies(cat_var, c_feature)

In [None]:
cat_var.head()

#### 4. Dropping unnecessary variables

In [None]:
# cust_id is a unique identifier and carries no behavioural information, so it would only add unnecessary noise to the data. Hence we drop it.

cust_data.drop(columns=['cust_id'], inplace=True)

### Combining numeric and categorical data

In [None]:
data_final = pd.concat([cust_data[numeric_var_names], cat_var], axis = 1)

In [None]:
data_final.head()

# Standardizing the data

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
data_final.columns

In [None]:
# Prescreen the variables: leave out the columns judged less useful for segmentation
excluded_features = ['purchases', 'cash_advance']
data_feature = data_final.drop(columns=excluded_features)

In [None]:
sc = StandardScaler()

In [None]:
# Keep the feature names and row index on the scaled frame; wrapping the bare
# array alone would relabel the columns 0..n-1.
data_final_scaled = pd.DataFrame(
    sc.fit_transform(data_feature),
    columns=data_feature.columns,
    index=data_feature.index,
)

In [None]:
data_final_scaled.head()

# Applying PCA to reduce the variables

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Fit as many components as there are scaled features, so the full variance
# spectrum is available whatever the feature count.
pca = PCA(n_components=data_final_scaled.shape[1])
pca.fit(data_final_scaled)

In [None]:
pca.explained_variance_ 

In [None]:
#The amount of variance that each PC explains
var = pca.explained_variance_ratio_
var

In [None]:
#cummilative var explained
var1 = np.cumsum(np.round(pca.explained_variance_ratio_ , decimals=4)*100)
var1

In [None]:
# Number the components 1..k from the length of the data rather than a fixed count.
pd.DataFrame({'Eigen Values' : pca.explained_variance_, 'Cumulative Variance' : var1}, index=range(1, len(var1) + 1))

Either 7 or 8 seems a good candidate for the number of components in our clustering model.

In [None]:
# Seed the decomposition (same seed as the KMeans fits) so a randomized SVD
# solver, if scikit-learn selects one, gives repeatable components.
pca_final = PCA(n_components=7, random_state=123).fit(data_final_scaled)

In [None]:
pca_final.explained_variance_

In [None]:
# pca_final is already fitted; project with transform so the scores come from
# the same fit whose explained variance was inspected, without refitting.
reduced_cr = pca_final.transform(data_final_scaled)

In [None]:
dimensions = pd.DataFrame(reduced_cr)
dimensions.columns = ["C1", "C2", "C3", "C4", "C5","C6", "C7"]

In [None]:
print(dimensions.shape)
print(dimensions.head())

# Clustering model(k-means)

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Baseline 3-cluster k-means on the PCA component scores (fixed seed).
km_3 = KMeans(n_clusters=3, random_state=123)
km_3.fit(dimensions)

In [None]:
# Candidate solutions with 4 to 9 clusters, all fitted on the same PCA
# component scores with the same seed so they can be compared.
km_4 = KMeans(n_clusters=4, random_state=123).fit(dimensions)
km_5 = KMeans(n_clusters=5, random_state=123).fit(dimensions)
km_6 = KMeans(n_clusters=6, random_state=123).fit(dimensions)
km_7 = KMeans(n_clusters=7, random_state=123).fit(dimensions)
km_8 = KMeans(n_clusters=8, random_state=123).fit(dimensions)
km_9 = KMeans(n_clusters=9, random_state=123).fit(dimensions)

In [None]:
# Save the cluster labels of every candidate solution as columns on data_final
# (labels are attached positionally; no sorting is done here).
data_final['cluster_3'] = km_3.labels_
data_final['cluster_4'] = km_4.labels_
data_final['cluster_5'] = km_5.labels_
data_final['cluster_6'] = km_6.labels_
data_final['cluster_7'] = km_7.labels_
data_final['cluster_8'] = km_8.labels_
data_final['cluster_9'] = km_9.labels_

In [None]:
data_final.head(10)

### Segment Size check

In [None]:
data_final['cluster_3'].value_counts()

In [None]:
data_final['cluster_3'].value_counts()/sum(data_final['cluster_3'].value_counts())

In [None]:
data_final['cluster_4'].value_counts()/sum(data_final['cluster_4'].value_counts())

In [None]:
data_final['cluster_5'].value_counts()/sum(data_final['cluster_5'].value_counts())

In [None]:
data_final['cluster_6'].value_counts()/sum(data_final['cluster_6'].value_counts())

Here we can choose 5-6 cluster solution as optimum solution

# Quantitative Evaluation of model

### 1. Silhouette Coefficient(Higher the better)

In [None]:
from sklearn import  metrics

In [None]:
metrics.silhouette_score(dimensions, labels=km_3.labels_)

In [None]:
# Silhouette score for k = 2..15, each from a fresh seeded k-means fit on the
# PCA component scores; scores[i] corresponds to k_range[i].
k_range = range(2, 16)
scores = []
for k in k_range:
    km = KMeans(n_clusters=k, random_state=123)
    km.fit(dimensions)
    scores.append(metrics.silhouette_score(dimensions, labels=km.labels_))

In [None]:
scores

In [None]:
# Silhouette score against the number of clusters.
plt.plot(k_range, scores)
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Score')
# Pass a real boolean: any non-empty string (even 'False') is truthy.
plt.grid(True)

Here the 5-cluster solution seems optimal, as the silhouette score is highest around it.

### 2. Elbow Analysis

In [None]:
# Within-cluster sum of squares (inertia) for k = 2..19; errors[i] corresponds
# to cluster_range[i].
cluster_range = range(2, 20)
errors = []

for num_clusters in cluster_range:
    # Seed the fit like every other KMeans run here so the elbow curve is
    # reproducible between executions.
    clusters = KMeans(n_clusters=num_clusters, random_state=123)
    clusters.fit(dimensions)
    errors.append(clusters.inertia_)

In [None]:
errors

In [None]:
# Reuse cluster_range so the table stays in step with the loop that filled errors.
clusters_df = pd.DataFrame({'Cluster_no' : list(cluster_range), 'Unexpalined_variance' : errors})
clusters_df[0:10]

In [None]:
%matplotlib inline
# Elbow curve: inertia against the number of clusters.
plt.plot(clusters_df.Cluster_no, clusters_df.Unexpalined_variance, marker = 'o')
plt.xlabel('No of clusters')
plt.ylabel('Unexplained Variance(error)')
# Pass a real boolean: any non-empty string (even 'False') is truthy.
plt.grid(True)


Here, after 7 clusters the incremental decrease in error is almost constant.

# Qualitative Analysis(Profiling)

In [None]:
data_final.head()

In [None]:
# Stack the overall row count followed by the per-cluster counts of the 3- to
# 9-cluster solutions into one Series. Each solution's counts are ordered by
# cluster label, so the index repeats (0, 0, 1, 2, 0, 1, 2, 3, ...).
size = pd.concat([pd.Series(data_final.cluster_3.size), pd.Series(data_final.cluster_3.value_counts()).sort_index(), pd.Series(data_final.cluster_4.value_counts()).sort_index(),
          pd.Series(data_final.cluster_5.value_counts()).sort_index(), pd.Series(data_final.cluster_6.value_counts()).sort_index(), pd.Series(data_final.cluster_7.value_counts()).sort_index(),
          pd.Series(data_final.cluster_8.value_counts()).sort_index(), pd.Series(data_final.cluster_9.value_counts()).sort_index()])
size

In [None]:
# Segment sizes as absolute counts and as a share of all customers
# (data_final.cluster_3.size is the total number of rows).
Seg_size = pd.DataFrame(size, columns=['seg_size'])
Seg_pct = pd.DataFrame(size/data_final.cluster_3.size, columns= ['Seg_pct'])

In [None]:
pd.concat([Seg_size.T, Seg_pct.T], axis = 0)

In [None]:
# Mean value gives a good indication of the distribution of data. So we are finding mean value for each variable for each cluster
# The first column is the overall mean of every variable; the following columns
# are the per-cluster means of the 3-, 4-, ..., 9-cluster solutions, side by
# side, with variables as rows.
Profiling_output = pd.concat([data_final.apply(lambda x: x.mean()).T, data_final.groupby('cluster_3').apply(lambda x : x.mean()).T,
                             data_final.groupby('cluster_4').apply(lambda x : x.mean()).T, data_final.groupby('cluster_5').apply(lambda x : x.mean()).T, 
                             data_final.groupby('cluster_6').apply(lambda x : x.mean()).T, data_final.groupby('cluster_7').apply(lambda x : x.mean()).T,
                             data_final.groupby('cluster_8').apply(lambda x : x.mean()).T, data_final.groupby('cluster_9').apply(lambda x : x.mean()).T], axis =1)

In [None]:
Profiling_output

In [None]:
# Put the segment sizes and shares on top of the per-variable means, then
# label the columns: 'Overall' first, followed by one column per cluster of
# each solution.
# NOTE(review): KMk_j presumably corresponds to cluster label j-1 of the
# k-cluster solution (labels are ordered 0..k-1) - confirm before quoting.
Profiling_output_final=pd.concat([Seg_size.T, Seg_pct.T, Profiling_output], axis=0)
Profiling_output_final.columns = ['Overall', 'KM3_1', 'KM3_2', 'KM3_3',
                                'KM4_1', 'KM4_2', 'KM4_3', 'KM4_4',
                                'KM5_1', 'KM5_2', 'KM5_3', 'KM5_4', 'KM5_5',
                                'KM6_1', 'KM6_2', 'KM6_3', 'KM6_4', 'KM6_5','KM6_6',
                                'KM7_1', 'KM7_2', 'KM7_3', 'KM7_4', 'KM7_5','KM7_6','KM7_7',
                                'KM8_1', 'KM8_2', 'KM8_3', 'KM8_4', 'KM8_5','KM8_6','KM8_7','KM8_8',
                                'KM9_1', 'KM9_2', 'KM9_3', 'KM9_4', 'KM9_5','KM9_6','KM9_7','KM9_8', 'KM9_9']

In [None]:
Profiling_output_final

In [None]:
Profiling_output_final.to_csv('Profiling_output1.csv')

From profiling we conclude that *__8 cluster solution__* seems the best one. Below is the detailed characteristic description of each cluster for future marketing strategy-

**Cluster 0** - These include customers with an average credit limit who mostly make one-off purchases with their credit cards. They don't prefer cash transactions on their cards. They may be targeted with offers at different partner outlets.

**Cluster 1** - These are the customers with a high credit limit who spend a lot on purchases of both installment and one-off type. The amount and number of transactions are quite high for these card holders. As a result, the balance is quite low for them.

**Cluster 2** -  This cluster targets a group of customers who have a high balance and cash advances with low purchase frequency. We can assume that this customer segment uses their credit cards as a loan facility.

**Cluster 3** - This cluster includes uninvolved customers who rarely use their cards, and then only for small purchases. Hence they have low minimum payments in spite of a decent credit limit. We may target them with different marketing strategies such as EMI/installment purchases.

**Cluster 4** - These customers are similar to cluster no. 2 but with lower balance and lower credit limit.

**Cluster 5** - These customers purchase frequently with highest amount of installment purchases contrast of a lower cash advance percentage. They have lower credit limit maybe that is the reason for not spending on other type of services. Also they pay their bill on time compared to other customers.

**Cluster 6** - These are the customers who frequently use all the services with high amounts, whether for any kind of purchase or for cash transactions. They have the highest credit limit and minimum payment. In short, these are the involved customers.

**Cluster 7** -  These customers are almost similar to cluster no. 5 but with higher minimum payment and they don't pay their bill on time.