In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Gather the full path of every file found under the Kaggle input directory,
# then print them one per line
input_paths = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Hello. In this notebook I build a clustering model on the credit card dataset, which has a very interesting characteristic: most of the data is clumped together, so some data wrangling is necessary to obtain satisfactory results

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the raw credit card dataset and preview its first rows
CSV_PATH = "/kaggle/input/ccdata/CC GENERAL.csv"
data = pd.read_csv(CSV_PATH)
data.head()

In [None]:
# Column dtypes and non-null counts of the raw frame (shows which columns contain nulls)
data.info()

# Removing null values and resetting the index after the drops

In [None]:
# Discard rows with missing values, renumber the index from 0 and remove the
# customer identifier column
data = (
    data
    .dropna()
    .reset_index(drop=True)
    .drop(columns="CUST_ID")
)

In [None]:
# Re-check dtypes and non-null counts after dropping null rows and CUST_ID
data.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of every numeric column
data.describe()

# In the plots below we can see the data clumping: a high number of occurrences are found at the extremes of the ranges

In [None]:
# Draw the distribution of every column except the last one, one figure per column.
# sns.distplot is deprecated (seaborn >= 0.11) and scheduled for removal;
# histplot with kde=True and stat="density" produces the same kind of chart
# (density-normalised histogram with a KDE curve on top).
for col in list(data.columns)[:-1]:
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.histplot(data[col], bins=100, kde=True, stat="density", ax=ax)
    ax.set_title(f"Distribution of {col}")
    plt.show()

# Tenure is a feature that doesn't tell us much (85% of the customers have a 12-month tenure), so we will remove it from the modeling

In [None]:
# Share of customers whose tenure is exactly 12
tenure_is_12 = data["TENURE"] == 12
int(tenure_is_12.sum()) / len(data)

# We can apply a pre-clustering encoding in order to make the groups more distinct from one another

In [None]:
# Feature groups: columns that share the same order of magnitude are kept together
# so that each group can later be binned with the same rule.
v1 = [
    'BALANCE',
    'PURCHASES',
    'ONEOFF_PURCHASES',
    'INSTALLMENTS_PURCHASES',
    'CASH_ADVANCE',
    'CREDIT_LIMIT',
    'PAYMENTS',
    'MINIMUM_PAYMENTS',
]
v2 = [
    'BALANCE_FREQUENCY',
    'PURCHASES_FREQUENCY',
    'ONEOFF_PURCHASES_FREQUENCY',
    'PURCHASES_INSTALLMENTS_FREQUENCY',
    'CASH_ADVANCE_FREQUENCY',
    'PRC_FULL_PAYMENT',
]
v3 = [
    'PURCHASES_TRX',
    'CASH_ADVANCE_TRX',
]

In [None]:
# Summary statistics restricted to the v1 feature group
data[v1].describe()

In [None]:
# Summary statistics restricted to the v2 feature group
data[v2].describe()

In [None]:
# Standard deviation of BALANCE, read from the "std" row of the summary table
data.describe().loc["std", "BALANCE"]

# For the v1 and v3 groups we can see that the mean is close to (and sometimes above) the third quartile of the data, which indicates a large number of high-valued outliers. For these features, we will drop every row that lies more than 3 standard deviations above the mean (an arbitrary threshold) to limit the influence of the outliers

In [None]:
def cutoff_function(col, data, metrics, n_std=3):
    """Keep only the rows whose `col` value is not an upper outlier.

    A row is kept when its value is less than or equal to
    ``mean + n_std * std`` of the column; there is no lower bound.

    Parameters
    ----------
    col : str
        Name of the column to filter on.
    data : pandas.DataFrame
        Frame to filter; it is not modified.
    metrics : mapping
        Per-column statistics indexable as ``metrics[col]["mean"]`` and
        ``metrics[col]["std"]`` (for example the result of ``data.describe()``).
    n_std : float, default 3
        Number of standard deviations above the mean used as the cutoff.

    Returns
    -------
    pandas.DataFrame
        The filtered rows, keeping their original index labels.
    """
    upper_limit = metrics[col]["mean"] + n_std * metrics[col]["std"]
    return data[data[col] <= upper_limit]

In [None]:
# The statistics are computed once, before any filtering, so every column is
# trimmed against the thresholds of the untrimmed data.
metrics = data.describe()
for col in v1 + v3:
    data = cutoff_function(col, data, metrics)
# Renumber the rows from 0 after the drops
data = data.reset_index(drop=True)

In [None]:
# Check how many rows remain after the outlier cutoff
data.info()

In [None]:
# Re-draw the distribution of every column except the last one, now that the
# upper outliers have been removed.
# sns.distplot is deprecated (seaborn >= 0.11) and scheduled for removal;
# histplot with kde=True and stat="density" produces the same kind of chart
# (density-normalised histogram with a KDE curve on top).
for col in list(data.columns)[:-1]:
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.histplot(data[col], bins=100, kde=True, stat="density", ax=ax)
    ax.set_title(f"Distribution of {col} (after outlier cutoff)")
    plt.show()

In [None]:
# Summary statistics after the outlier cutoff
data.describe()

# For the v1 and v3 features we are going to take the following approach: each value is divided by the column's standard deviation and assigned to one of 10 ranges, each half a standard deviation wide (values above 5 standard deviations fall into an extra 11th group). Most of the values will be categorized in the first ranges, which cover most of the customers, and the remaining high-spending customers will be separated into different groups.

In [None]:
def range_function(col, data, metrics):
    """Bin a column into half-standard-deviation ranges.

    Each value of ``data[col]`` is divided by ``metrics[col]["std"]`` and the
    ratio is mapped to an integer range code by ``apply_funct``.

    Parameters
    ----------
    col : str
        Name of the column to bin.
    data : pandas.DataFrame
        Frame holding the column; it is not modified.
    metrics : mapping
        Per-column statistics indexable as ``metrics[col]["std"]``
        (for example the result of ``data.describe()``).

    Returns
    -------
    pandas.Series
        Range codes carrying the same index as ``data``, so the result can be
        assigned straight back as a new column without misalignment.
    """
    std = metrics[col]["std"]
    # Divide the Series itself rather than a Python list: `list / std` only works
    # when std happens to be a numpy scalar and throws away the row index.
    scaled = data[col] / std
    return scaled.apply(apply_funct)

def apply_funct(val):
    """Map a value to its half-unit range code.

    Code k (1 to 10) is returned for the first k such that ``val <= 0.5 * k``;
    anything that satisfies none of those bounds gets code 11.
    """
    for range_code in range(1, 11):
        if val <= 0.5 * range_code:
            return range_code
    return 11

In [None]:
# Recompute the statistics so the bins use the standard deviation of the
# trimmed (post-cutoff) data.
metrics = data.describe()

# Names of the derived "_RANGE" columns for the v1 and v3 groups.
# They are initialised here because nothing earlier in the notebook defines
# them (appending raised NameError on a fresh kernel), and resetting them also
# keeps the lists free of duplicates if this cell is re-run.
v4 = []
v5 = []
for source_cols, range_cols in ((v1, v4), (v3, v5)):
    for col in source_cols:
        col_name = col + "_RANGE"
        range_cols.append(col_name)
        data[col_name] = range_function(col, data, metrics)

# For the v2 group, we will simply group them in 0.1 ranges

In [None]:
def v2_range(val):
    """Map a value to its 0.1-wide range code.

    Code k (1 to 9) is returned for the first upper bound 0.1 * k that ``val``
    does not exceed; anything that satisfies none of those bounds gets code 10.
    """
    upper_bounds = (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)
    for range_code, bound in enumerate(upper_bounds, start=1):
        if val <= bound:
            return range_code
    return 10

In [None]:
# Add one "<name>_RANGE" column per v2 feature, holding its 0.1-wide range code
for source_col in v2:
    data[source_col + "_RANGE"] = data[source_col].map(v2_range)

In [None]:
# Preview the frame with the new _RANGE columns appended
data.head()

# Dropping all original features, including Tenure, and leaving only the new range features behind

In [None]:
# Keep only the derived _RANGE columns: every original feature, TENURE included, is removed
original_columns = v1 + v2 + v3 + ["TENURE"]
data = data.drop(columns=original_columns)

In [None]:
# Preview the frame after the original columns have been dropped
data.head()

# All features are now integer range codes on a comparable scale, so no further scaling is needed and we can move on to the clustering methods

In [None]:
from sklearn.cluster import KMeans

# Given dataset size and amount of clusters, only KMeans will be implemented on this kernel

# We'll be using the elbow method to determine the best K value

In [None]:
# Fit K-Means for K = 1..19 and record the inertia of each fit for the elbow plot.
# A "Clusters" label column may be left over from a previous run of the cells
# below; it must never be used as an input feature.
features = data.drop(columns="Clusters", errors="ignore")

inertia = []
for k in range(1, 20):
    # random_state makes the curve reproducible between runs; n_init is spelled
    # out because its default differs between scikit-learn versions.
    km = KMeans(n_clusters=k, n_init=10, random_state=42)
    km.fit(features)
    inertia.append(km.inertia_)

In [None]:
# Inertia against the number of clusters: the "elbow" suggests a suitable K
fig, ax = plt.subplots()
ax.plot(range(1, 20), inertia, "--o")
ax.set_title("Elbow method")
ax.set_xlabel("K")
ax.set_ylabel("Inertia")

# The elbow is around 4 or 5 depending on the user's evaluation

In [None]:
# Cluster with K=4 and store the label of every row in data["Clusters"].
# The label column is excluded from the inputs so that re-running this cell
# does not feed earlier labels back into the model.
features = data.drop(columns="Clusters", errors="ignore")

# random_state keeps the cluster numbering stable between runs; n_init is
# spelled out because its default differs between scikit-learn versions.
km = KMeans(n_clusters=4, n_init=10, random_state=42)
# fit_predict already fits the model, so a separate km.fit() call is not needed
clusters = km.fit_predict(features)
data["Clusters"] = clusters

In [None]:
# One row of histograms per feature, one panel per cluster (K=4).
# The "Clusters" column itself is skipped: plotting the labels against the
# labels carries no information.
feature_columns = [name for name in data.columns if name != "Clusters"]
for column in feature_columns:
    g = sns.FacetGrid(data, col="Clusters")
    # histplot replaces the deprecated distplot(kde=False); discrete=True
    # centres one bar on each integer range code.
    g.map(sns.histplot, column, discrete=True)

In [None]:
# Cluster with K=5 and overwrite data["Clusters"] with the new labels.
# At this point `data` still carries the "Clusters" column written by the K=4
# run; it is removed from the inputs so the previous labels are not used as a
# feature of the new model.
features = data.drop(columns="Clusters", errors="ignore")

# random_state keeps the cluster numbering stable between runs; n_init is
# spelled out because its default differs between scikit-learn versions.
km = KMeans(n_clusters=5, n_init=10, random_state=42)
# fit_predict already fits the model, so a separate km.fit() call is not needed
clusters = km.fit_predict(features)
data["Clusters"] = clusters

In [None]:
# One row of histograms per feature, one panel per cluster (K=5).
# The "Clusters" column itself is skipped: plotting the labels against the
# labels carries no information.
feature_columns = [name for name in data.columns if name != "Clusters"]
for column in feature_columns:
    g = sns.FacetGrid(data, col="Clusters")
    # histplot replaces the deprecated distplot(kde=False); discrete=True
    # centres one bar on each integer range code.
    g.map(sns.histplot, column, discrete=True)

# With K=5 the customer segments are easier to tell apart:

* Cluster 0: Customers who make a lot of low value purchases and tend to make higher value purchases with installments
* Cluster 1: High spending customers, making a lot of high value purchases (smaller cluster)
* Cluster 2: Low spending customers
* Cluster 3: Customers who don't use their credit cards so often (the wide range in balance and limit differentiate this cluster from cluster 2)
* Cluster 4: Customers who tend to make all their purchases in one go, and tend to make fewer transactions

# The analysis above considers only the most prominent features in each cluster; a different segmentation of the ranges could improve the contrast between clusters.