# Table of Contents

* [Getting started](#getting_started)
* [Data preprocessing](#preprocessing)
    * [Check for missing values](#imputation)
    * [Check for duplicates](#duplicates)
    * [Check for unique customer identifier](#unique)
    * [Check for outliers](#outliers)
    * [Robust scaler](#standardize)
* [Exploratory analysis](#EDA)
    * [Data types](#dtypes)
    * [Summary statistics](#summary_stats)
    * [Bootstrapping](#bootstrapping)
    * [Distributions](#distributions)
    * [Spearman correlation](#correlation)
    * [Multiple linear regression](#multiple_linear_regression)
* [Modeling](#modeling)
    * [K-means clustering](#cluster)
        * [Find optimal number of clusters](#elbow)
        * [Cluster customers](#cluster_customers)
    * [SMOTE](#SMOTE)
    * [Model selection](#model_selection)
        * [Decision tree classifier](#decision_tree)
        * [Multinomial logistic regression](#logistic_regression)
    * [Feature importance](#feature_importance)
    * [Visualize decision tree](#visualize)
* [Summarize insights from analysis](#summary)

# Getting started <a class="anchor" id="getting_started"></a>

In [None]:
# Link to dataset
# https://www.kaggle.com/arjunbhasin2013/ccdata?select=CC+GENERAL.csv

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import math

import seaborn as sns
import matplotlib.pyplot as plt

import statsmodels.api as sm

from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import r2_score, accuracy_score, classification_report
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer

from collections import Counter
from imblearn.over_sampling import SMOTE

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [None]:
# Read and preview data
# Try the Kaggle input path first; fall back to a local copy only when that
# file does not exist, so any other failure (e.g. a parse error) still surfaces.
try:
    df = pd.read_csv('/kaggle/input/ccdata/CC GENERAL.csv')
except FileNotFoundError:
    df = pd.read_csv('CC GENERAL.csv')
print(df.shape)
df.head()

In [None]:
# Description of columns

# CUST_ID : Identification of Credit Card holder (Categorical)
# BALANCE : Balance amount left in their account to make purchases
# BALANCE_FREQUENCY : How frequently the Balance is updated, score between 0 and 1 (1 = frequently updated, 0 = not frequently updated)
# PURCHASES : Amount of purchases made from account
# ONEOFF_PURCHASES : Maximum purchase amount done in one-go
# INSTALLMENTS_PURCHASES : Amount of purchase done in installment
# CASH_ADVANCE : Cash in advance given by the user
# PURCHASES_FREQUENCY : How frequently the Purchases are being made, score between 0 and 1 (1 = frequently purchased, 0 = not frequently purchased)
# ONEOFF_PURCHASES_FREQUENCY : How frequently Purchases are happening in one-go (1 = frequently purchased, 0 = not frequently purchased)
# PURCHASES_INSTALLMENTS_FREQUENCY : How frequently purchases in installments are being done (1 = frequently done, 0 = not frequently done)
# CASH_ADVANCE_FREQUENCY : How frequently the cash in advance is being paid
# CASH_ADVANCE_TRX : Number of transactions made with "Cash in Advance"
# PURCHASES_TRX : Number of purchase transactions made
# CREDIT_LIMIT : Limit of Credit Card for user
# PAYMENTS : Amount of Payment done by user
# MINIMUM_PAYMENTS : Minimum amount of payments made by user
# PRC_FULL_PAYMENT : Percent of full payment paid by user
# TENURE : Tenure of credit card service for user

In [None]:
# Create a list of all the features
# Numeric columns used as model features (every column except CUST_ID)
features = [
    'BALANCE',
    'BALANCE_FREQUENCY',
    'PURCHASES',
    'ONEOFF_PURCHASES',
    'INSTALLMENTS_PURCHASES',
    'CASH_ADVANCE',
    'PURCHASES_FREQUENCY',
    'ONEOFF_PURCHASES_FREQUENCY',
    'PURCHASES_INSTALLMENTS_FREQUENCY',
    'CASH_ADVANCE_FREQUENCY',
    'CASH_ADVANCE_TRX',
    'PURCHASES_TRX',
    'CREDIT_LIMIT',
    'PAYMENTS',
    'MINIMUM_PAYMENTS',
    'PRC_FULL_PAYMENT',
    'TENURE',
]

# Preprocessing <a class="anchor" id="preprocessing"></a>

## Check for missing values <a class="anchor" id="imputation"></a>

In [None]:
# Check for columns with missing values and impute them with the median.
def impute_nan(df):
    """Fill missing values in numeric columns with the column median.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. Imputed columns are overwritten on this object.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in. Non-numeric columns are
        left untouched because a median is not defined for them.
    """
    numeric_cols = df.select_dtypes(include='number').columns
    nan_cols = [col for col in numeric_cols if df[col].isnull().any()]
    if not nan_cols:
        return df

    print('Imputed features:', nan_cols)
    for col in nan_cols:
        # Assign the filled column back rather than calling
        # fillna(inplace=True) on df[col]: the latter operates on a temporary
        # object and does not update df under pandas copy-on-write.
        df[col] = df[col].fillna(df[col].median())
    return df

# Run the median imputation on the loaded data and keep the returned frame.
df = impute_nan(df)

## Check for duplicate rows <a class="anchor" id="duplicates"></a>

In [None]:
# Check for duplicates
def duplicates(x):
    """Report how many duplicate rows `x` has and return a duplicate-free frame.

    The count is printed. When nothing was dropped the original object is
    returned unchanged; otherwise the de-duplicated copy is returned."""
    deduped = x.drop_duplicates()
    dropped = len(x) - len(deduped)
    print('Duplicate rows:', dropped)
    return deduped if dropped else x

# Replace df with its de-duplicated version (unchanged when no duplicates exist).
df = duplicates(df)

## Check for unique customer identifier <a class="anchor" id="unique"></a>

In [None]:
# CUST_ID identifies rows uniquely only if it has as many distinct values as there are rows
custs = df['CUST_ID'].nunique()
rows = len(df)

message = 'CUST_ID is unique' if custs == rows else 'Need to create a unique identifier'
print(message)

## Check for outliers <a class="anchor" id="outliers"></a>

In [None]:
def outliers(df, features):
    """Count the outliers of each feature using the 1.5 * IQR rule.

    A value is an outlier when it lies below Q1 - 1.5*IQR or above
    Q3 + 1.5*IQR, where Q1 and Q3 are the 25th and 75th percentiles.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the columns named in `features`.
    features : list of str
        Numeric column names to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per feature with columns Feature, Num_Outliers,
        Percent_Outliers (a fraction between 0 and 1), IQR and Total_Rows,
        sorted by Percent_Outliers in descending order.
    """
    num_outliers = []
    pct_outliers = []
    total_rows = []
    interquartile_range = []
    for feature in features:
        column = df[feature]
        q1 = column.quantile(.25)
        q3 = column.quantile(.75)
        iqr = q3 - q1
        lower_fence = q1 - 1.5 * iqr
        upper_fence = q3 + 1.5 * iqr
        n_outliers = int(((column < lower_fence) | (column > upper_fence)).sum())
        n_rows = len(column)

        interquartile_range.append(iqr)
        num_outliers.append(n_outliers)
        # Guard against an empty frame so the share is 0 instead of a ZeroDivisionError
        pct_outliers.append(n_outliers / n_rows if n_rows else 0.0)
        total_rows.append(n_rows)

    count_outliers = pd.DataFrame({'Feature': features,
                                   'Num_Outliers': num_outliers,
                                   'Percent_Outliers': pct_outliers,
                                   'IQR': interquartile_range,
                                   'Total_Rows': total_rows})
    return count_outliers.sort_values('Percent_Outliers', ascending=False)

# Build the per-feature outlier table (sorted by share of outliers) and display it.
count_outliers = outliers(df=df, features=features)
count_outliers

In [None]:
# Visualize outliers with box plots (one horizontal box plot per feature)
for i in features:
    # Pass the series by keyword: in current seaborn the only positional
    # argument of boxplot is `data`, not `x`.
    sns.boxplot(x=df[i])
    plt.show()

## Robust scaler <a class="anchor" id="standardize"></a>
    Since most of the features contain a non-trivial number of outliers, use the robust scaler instead of the standard scaler. The robust scaler centers on the median and scales by the IQR, which are more reliable estimates of central tendency and spread in the presence of outliers.

In [None]:
# Robust-scale the feature columns and re-attach the customer identifier
X = df[features]
robust_scaler = RobustScaler()
scaled_values = robust_scaler.fit_transform(X)
X_scaled = pd.DataFrame(scaled_values, index=X.index, columns=X.columns)
df_scaled = pd.concat([df['CUST_ID'], X_scaled], axis=1)

# Exploratory analysis <a class="anchor" id="EDA"></a>

## Data types <a class="anchor" id="dtypes"></a>

In [None]:
# Print the pandas info summary for the scaled feature columns.
df_scaled[features].info()

## Summary statistics <a class="anchor" id="summary_stats"></a>
    Mean, median, min and max

In [None]:
# Mean, median, min and max of every scaled feature, colour-coded per column
summary_table = df_scaled[features].describe().T[['mean', '50%', 'min', 'max']]
summary_table = summary_table.rename(columns={'50%': 'median'})
summary_table.style.background_gradient(cmap='RdYlGn')

## Bootstrapping <a class="anchor" id="bootstrapping"></a>
    Estimate the population mean for each feature with bootstrap sampling.

In [None]:
# Bootstrap sampling
samples = 1000
# One seeded generator shared by all draws: the run stays reproducible, yet
# every resample differs. Passing the same integer seed to each call would
# return the identical sample 1000 times.
rng = np.random.RandomState(1)

est_popu_means = {}
for a in features:
    # Bootstrap resamples are drawn with replacement
    boot = [df[a].sample(frac=0.33, replace=True, random_state=rng).mean()
            for _ in range(samples)]
    est_popu_means[a] = sum(boot) / len(boot)

# Mean of each feature in the dataset
sampling_mean = [df[i].mean() for i in features]

df_means = pd.DataFrame({'Feature': list(est_popu_means.keys())
                       , 'Bootstrap_Mean': list(est_popu_means.values())
                       , 'Mean_in_dataset': sampling_mean})

df_means

## Distributions <a class="anchor" id="distributions"></a>
    Plot the distribution of each feature together with its estimated population mean from the bootstrap sampling.

In [None]:
# Histogram of each raw feature with a red line at its bootstrap mean
for i in features:
    matching_rows = df_means['Feature'] == i
    boot_mean = df_means.loc[matching_rows, 'Bootstrap_Mean'].iloc[0]

    print(i)
    print('Bootstrap mean:', boot_mean)

    sns.histplot(df[i])
    plt.axvline(boot_mean, color='red')
    plt.show()

## Spearman correlation <a class="anchor" id="correlation"></a>
    Use the Spearman correlation since the features are not normally distributed.

In [None]:
# Rank-based correlation matrix of the scaled features, rounded to two decimals
spearman_corr = df_scaled[features].corr(method='spearman').round(2)
spearman_corr.style.background_gradient(cmap='RdYlGn')

## Multiple linear regression <a class="anchor" id="multiple_linear_regression"></a>

In [None]:
# Regress every feature on all the other features and record the R2 score
# and the residuals of each fit.
response_variable = []
r2score = []
residual_columns = {}

# Add an intercept once, outside the loop. The robust scaler centers on the
# median, not the mean, so a regression forced through the origin would be
# biased and its R2 score misleading.
design_matrix = sm.add_constant(df_scaled[features])

for i in features:
    X = design_matrix.drop(columns=i)
    y = df_scaled[i]
    model = sm.OLS(y, X).fit()
    y_pred = model.predict(X)
    score = r2_score(y, y_pred)

    residual_columns[i] = y - y_pred

    response_variable.append(i)
    r2score.append(score)

# Build the residuals frame in one step instead of inserting columns one by one
residuals = pd.DataFrame(residual_columns)

regression_results = pd.DataFrame({'Response_variable': response_variable
                                   , 'R2_Score': r2score})

regression_results.sort_values('R2_Score', ascending=False, inplace=True)
plt.figure(figsize=(5, 10))
sns.barplot(y = 'Response_variable',
            x = 'R2_Score',
            orient = 'h',
            data=regression_results)
plt.title('R2 Score For Each Feature Using The Other Features As Predictors', fontsize=18)
plt.xlabel('R2 Score')
plt.ylabel('Feature')
plt.show()

In [None]:
# Histogram of the residuals of every regression, titled with their skewness
# (multiple linear regression assumes normally distributed residuals)
for i, column_residuals in residuals.items():
    skew = round(column_residuals.skew(), 1)
    title = 'Distribution of Residuals: ' + str(i) + ' --- Skew ' + str(skew)
    sns.histplot(column_residuals)
    plt.title(title, fontsize=14)
    plt.show()

# Modeling <a class="anchor" id="modeling"></a>

## K-means clustering <a class="anchor" id="cluster"></a>

### Find optimal number of clusters <a class="anchor" id="elbow"></a>

In [None]:
# Elbow method: fit K-means for a range of k values and let yellowbrick
# pick the elbow of the distortion curve
X = df_scaled[features]
kmeans = KMeans(random_state=1)
kmeans_vis = KElbowVisualizer(kmeans, k=(1,15), metric='distortion', timings=False)
kmeans_vis.fit(X)
print('Optimal number of clusters:', kmeans_vis.elbow_value_)

### Cluster customers <a class="anchor" id="cluster_customers"></a>

In [None]:
# Assign every customer to a cluster, using the k found by the elbow method
final_kmeans = KMeans(n_clusters=kmeans_vis.elbow_value_, random_state=1)
df_scaled['CLUSTERS'] = final_kmeans.fit_predict(X)

# Distinct customers per cluster, and each cluster's share of all customers
count_clusters = df_scaled.groupby('CLUSTERS')[['CUST_ID']].nunique()
customers = df_scaled['CUST_ID'].nunique()
count_clusters['Percent_of_Customers'] = count_clusters['CUST_ID'] / customers
count_clusters

In [None]:
# Plot the average feature value for each cluster
# Aggregate only the feature columns: df_scaled also holds the string CUST_ID
# column, and taking its mean raises a TypeError on pandas >= 2.0.
describe_clusters = df_scaled.groupby('CLUSTERS')[features].mean()

for i in features:
    sns.barplot(x = describe_clusters.index
               , y = i
               , data = describe_clusters)
    plt.title('Mean ' + str(i) + ' by Cluster')
    plt.show()

In [None]:
# Cluster means as a table: one row per column of describe_clusters, one column per cluster
describe_clusters.T.style.background_gradient(cmap='RdYlGn')

## SMOTE <a class="anchor" id="SMOTE"></a>
Since the clusters are not equally sized, use the synthetic minority oversampling technique (SMOTE) to oversample the smaller clusters.

In [None]:
# SMOTE: oversample the smaller clusters and report class sizes before and after
# NOTE(review): resampling happens before the later train/test splits, so
# synthetic rows derived from test rows may land in training data - confirm this is intended.
y = df_scaled['CLUSTERS']
X = df_scaled[features]

counter = Counter(y)
sizes_before = {label: counter[label] for label in sorted(counter)}
print('Before SMOTE:')
print(sizes_before)
print('='*50)

oversample = SMOTE(random_state=1)
X_SMOTE, y_SMOTE = oversample.fit_resample(X, y)

counter = Counter(y_SMOTE)
sizes_after = {label: counter[label] for label in sorted(counter)}
print('After SMOTE:')
print(sizes_after)

# Target column first, then the resampled features
df_scaled_SMOTE = pd.concat([y_SMOTE, X_SMOTE], axis=1)

## Model selection <a class="anchor" id="model_selection"></a>

### Decision tree classifier <a class="anchor" id="decision_tree"></a>

In [None]:
# Features and target from the oversampled data
y = df_scaled_SMOTE['CLUSTERS']
X = df_scaled_SMOTE[features]

# Stratified split that holds out 33% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, stratify=y, random_state=1
)

# Train an unconstrained decision tree on the training rows
tree_model = DecisionTreeClassifier(random_state=1)
tree_model.fit(X_train, y_train)

# Score the held-out rows
y_pred = tree_model.predict(X_test)
tree_accuracy = accuracy_score(y_test, y_pred)

# Accuracy followed by the per-class report
print('Accuracy:', tree_accuracy)
print('Decision Tree Classifier')
print(classification_report(y_test, y_pred))

### Multinomial logistic regression <a class="anchor" id="logistic_regression"></a>

In [None]:
# The multinomial logistic regression model assumes that the predictors are not strongly correlated with one another.
# Use the variance inflation factor (VIF) to check for multicollinearity
X = df_scaled_SMOTE[features]

# variance_inflation_factor regresses one column on the others exactly as
# given and does not add an intercept itself, so include a constant column;
# without it the VIFs of features that are not mean-centered are distorted.
X_vif = sm.add_constant(X, has_constant='add')

vif = pd.DataFrame()
vif["feature"] = X.columns
# Compute the VIF of the real features only (the constant is skipped)
vif["VIF"] = [variance_inflation_factor(X_vif.values, X_vif.columns.get_loc(col))
              for col in X.columns]

# Create list of features with a VIF >= 10
high_vif = list(vif[vif["VIF"] >= 10]["feature"])

# Remove features with a high VIF
remove_high_vif = [x for x in features if x not in high_vif]

vif.sort_values("VIF", ascending=False)

In [None]:
# Multinomial logistic regression

# Separate the feature and target variables
# No explicit constant column is added: LogisticRegression fits its own
# intercept by default, so an extra constant would only duplicate it.
X = df_scaled_SMOTE[remove_high_vif]
y = df_scaled_SMOTE['CLUSTERS']

# Create train and test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.33, random_state=1)

# Fit the logistic model with the training data
# 'lbfgs' optimizes the multinomial loss for a multiclass target, whereas
# 'liblinear' only fits one-vs-rest binary models. max_iter is raised so the
# solver has room to converge.
logit_model = LogisticRegression(random_state=1, solver='lbfgs', max_iter=1000).fit(X_train, y_train)

# Predict test values
y_pred = logit_model.predict(X_test)

# Evaluate the model with accuracy on the test dataset
MNLogit_accuracy = accuracy_score(y_test, y_pred)

# Classification report
print('Accuracy:', MNLogit_accuracy)
print('Multinomial Logistic Regression')
print(classification_report(y_test, y_pred))

# In the original analysis the decision tree was more accurate than the
# logistic model; compare the two printed accuracies after re-running.

### Optimize decision tree hyperparameters

In [None]:
# Sweep the max depth of the decision tree from 1 to 15 and plot test accuracy
# NOTE(review): the depth is chosen on the same test split that is later used
# to report accuracy - consider a separate validation split or cross-validation.
y = df_scaled_SMOTE['CLUSTERS']
X = df_scaled_SMOTE[features]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, stratify=y, random_state=1
)

depth = list(range(1, 16))
acc = []
for i in depth:
    tree_model = DecisionTreeClassifier(max_depth=i, random_state=1)
    tree_model.fit(X_train, y_train)
    y_pred = tree_model.predict(X_test)
    tree_accuracy = accuracy_score(y_test, y_pred)
    acc.append(tree_accuracy)

tree_depth = pd.DataFrame({'Max_Depth': depth, 'Accuracy': acc})
sns.lineplot(data=tree_depth, x='Max_Depth', y='Accuracy')
plt.title('Max Depth of Decision Tree and its Accuracy', fontsize=14)

In [None]:
# Accuracy no longer improves once the tree is deeper than 10,
# so refit the classifier with that depth as the limit
optimal_max_depth = 10

y = df_scaled_SMOTE['CLUSTERS']
X = df_scaled_SMOTE[features]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, stratify=y, random_state=1
)

tree_model = DecisionTreeClassifier(max_depth=optimal_max_depth, random_state=1)
tree_model.fit(X_train, y_train)

y_pred = tree_model.predict(X_test)
tree_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', tree_accuracy)

## Feature importance <a class="anchor" id="feature_importance"></a>

In [None]:
# Importance of every feature in the fitted tree, most important first
importance_values = tree_model.feature_importances_
tree_importance = pd.DataFrame({'Feature': features, 'Importance': importance_values})
tree_importance = tree_importance.sort_values('Importance', ascending=False)

plt.figure(figsize=(5, 7))
sns.barplot(data=tree_importance, x='Importance', y='Feature', orient='h')
plt.title('Feature Importance in the Decision Tree Classifier', fontsize=14)
plt.show()

In [None]:
# PURCHASES is the most important feature, so plot its distribution per cluster,
# giving each cluster label its own colour
unique = sorted(df_scaled_SMOTE['CLUSTERS'].unique())
cluster_colors = sns.color_palette(n_colors=len(unique))
palette = {label: color for label, color in zip(unique, cluster_colors)}

sns.histplot(data=df_scaled_SMOTE,
             x='PURCHASES',
             hue='CLUSTERS',
             element='poly',
             palette=palette)
plt.title('Distribution of PURCHASES by CLUSTER', fontsize=14)

## Visualize decision tree <a class="anchor" id="visualize"></a>

In [None]:
# Class labels for the tree plot: 'Cluster_<label>' in ascending label order
cluster_labels = sorted(df_scaled_SMOTE['CLUSTERS'].unique())
targets = ['Cluster_' + str(label) for label in cluster_labels]

# Draw the fitted tree on a very wide canvas, save it, then display it
plt.figure(figsize=(175, 25))
plot_tree(tree_model,
          feature_names=features,
          class_names=targets,
          filled=True,
          fontsize=10)

plt.savefig('customer_segmentation_decision_tree.jpg')
plt.show()

# Summarize insights from analysis <a class="anchor" id="summary"></a>

In [None]:
def summarize_clusters(cluster):
    """Print a hard-coded profile for every cluster label in `cluster`.

    `cluster` is an iterable of cluster labels (one per customer). The profile
    printed for a label is selected purely by how many times that label occurs:
    the first size threshold the count reaches wins, and counts below every
    threshold get the last profile. Labels are printed in ascending order.
    Nothing is returned."""
    # (minimum cluster size, profile), checked from the largest threshold down
    size_profiles = [
        (5400, ['Average credit card customer (each of their features are within one standard deviation from the mean)',
                'Not the highest or lowest for any feature']),
        (1400, ['Lowest number of transactions made with cash in advanced',
                'Lowest amount of cash in advance transactions',
                'Lowest number of purchases',
                'Lowest amount of purchases',
                'Lowest amount purchases done in installment',
                'Lowest balance frequency',
                'Lowest balance amount left in their account to make purchases',
                'Lowest tenure',
                'Lowest credit card limit',
                'Lowest amount of minimum payments']),
        (1190, ['Highest number of transactions made with cash in advanced',
                'Highest amount of cash in advance transactions',
                'Highest frequency of cash in advance transactions',
                'High amount of minimum payments',
                'Low tenure',
                'Low balance frequency',
                'Low purchase freqency']),
        (700, ['High number of purchases',
               'High amount of purchases',
               'High amount of purchases done in installment',
               'High amount of one-off purchases',
               'High frequency of one-off purchases',
               'High amount of payments',
               'High amount of full payments']),
        (30, ['Highest balance frequency',
              'Highest tenure',
              'Highest amount of minimum payments',
              'Lowest amount of full payments',
              'Lowest amount of one-off purchases',
              'Lowest frequency of one-off purchases']),
    ]
    # Profile for clusters smaller than every threshold above
    smallest_profile = ['Highest number of purchases',
                        'Highest amount of purchases',
                        'Highest frequency of purchases',
                        'Highest amount of one-off purchases',
                        'Highest frequency of one-off purchases',
                        'Highest amount of purchases done in installment',
                        'Highest amount of payments',
                        'Highest amount of full payments',
                        'Highest credit limit',
                        'Highest balance amount left in their account to make purchases',
                        'Lowest frequency of cash in advance transactions']

    sizes = Counter(cluster)
    for label in sorted(sizes):
        size = sizes[label]
        profile = next((text for floor, text in size_profiles if size >= floor),
                       smallest_profile)
        print('Cluster:', label)
        print(profile)
        print('-'*100)


# Print the profile of each cluster, based on the cluster assignments of the
# original (non-oversampled) customers.
clusters = df_scaled['CLUSTERS']
summarize_clusters(clusters)