In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Read the "CC GENERAL" CSV from the relative ../input directory into `data`,
# the frame every later cell works from.
data=pd.read_csv('../input/ccdata/CC GENERAL.csv')

In [None]:
data.head()

# Data Pre-processing

In [None]:
data.info()

In [None]:
data.shape

In [None]:
# Remove the CUST_ID column (presumably just a customer identifier) before the
# numeric analysis. Reassigning instead of using inplace=True, together with
# errors="ignore", keeps the cell idempotent: re-running it once the column is
# already gone no longer raises KeyError.
data = data.drop(columns="CUST_ID", errors="ignore")

In [None]:
data.isna().sum()

In [None]:
# Impute missing MINIMUM_PAYMENTS with the column mean. The result is assigned
# back to the frame: calling fillna(..., inplace=True) on the attribute-accessed
# column is a chained in-place operation, which is deprecated and does not
# modify `data` when pandas Copy-on-Write is enabled.
data['MINIMUM_PAYMENTS'] = data['MINIMUM_PAYMENTS'].fillna(data['MINIMUM_PAYMENTS'].mean())

In [None]:
# Impute missing CREDIT_LIMIT with the column mean. Assigning the result back
# avoids the chained `Series.fillna(..., inplace=True)` pattern, which is
# deprecated and does not modify `data` when pandas Copy-on-Write is enabled.
data['CREDIT_LIMIT'] = data['CREDIT_LIMIT'].fillna(data['CREDIT_LIMIT'].mean())

In [None]:
data.isna().sum()

In [None]:
data.describe()

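Balance is more strongly correlated with Cash Advance, Cash Advance Frequency and Credit Limit than with the other variables. The Payments variable is highly correlated with Purchases and One-off Purchases. Tenure is negatively correlated with the Cash Advance and Cash Advance Frequency variables. (These observations refer to the correlation matrix plotted in the Exploratory Data Analysis section below.)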

In [None]:
# Per-column quartiles and the resulting 1.5 * IQR fences:
# `ll` is the lower limit and `ul` the upper limit for each column.
q1, q3 = data.quantile(0.25), data.quantile(0.75)
iqr = q3 - q1
ll, ul = q1 - 1.5 * iqr, q3 + 1.5 * iqr

In [None]:
# Columns of `data` for which a lower fence exists in `ll`, in frame order.
num = [column for column in data.columns if column in ll.index]

In [None]:
# Restrict `data` to the columns collected in `num`, i.e. those that have an
# entry in the fence index computed above.
data=data[num]

In [None]:
# Drop every row that has at least one value outside the 1.5 * IQR fences.
# `ll` / `ul` are the lower / upper limits already computed from q1, q3 and iqr.
outlier_rows = ((data < ll) | (data > ul)).any(axis=1)

# .copy() makes `df` an independent frame, so the KPI columns assigned to it
# further down do not trigger SettingWithCopyWarning against `data`.
df = data[~outlier_rows].copy()

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
# Variables to draw, in plotting order (one box plot each).
graph_by_variables = [
    'BALANCE',
    'BALANCE_FREQUENCY',
    'PURCHASES',
    'ONEOFF_PURCHASES',
    'INSTALLMENTS_PURCHASES',
    'CASH_ADVANCE',
    'PURCHASES_FREQUENCY',
    'ONEOFF_PURCHASES_FREQUENCY',
    'PURCHASES_INSTALLMENTS_FREQUENCY',
    'CASH_ADVANCE_FREQUENCY',
    'CASH_ADVANCE_TRX',
    'PURCHASES_TRX',
    'CREDIT_LIMIT',
    'PAYMENTS',
    'MINIMUM_PAYMENTS',
    'PRC_FULL_PAYMENT',
    'TENURE',
]

In [None]:
# One box plot per listed variable, laid out on a 6 x 3 grid.
plt.figure(figsize=(20, 35))

for position, column in enumerate(graph_by_variables, start=1):
    plt.subplot(6, 3, position)
    plt.boxplot(df[column].dropna())
    plt.title(column)

# Exploratory Data Analysis

In [None]:
# Heat map of the pairwise correlations between the columns of `df`.
fig, ax = plt.subplots(figsize=(9, 7))
sns.heatmap(df.corr(), cmap='coolwarm', ax=ax)

ax.set_title('Correlation Matrix')

1. Balance is more strongly correlated with Cash Advance, Cash Advance Frequency and Credit Limit than with the other variables
2. The Payments variable is highly correlated with Purchases and One-off Purchases
3. Tenure is negatively correlated with the Cash Advance and Cash Advance Frequency variables

In [None]:
# Histogram (red) with a KDE overlay (blue) for every numeric column of `df`.
#
# - `sns.distplot` and the `bw` KDE keyword are deprecated in seaborn; the
#   same picture is drawn with `histplot` + `kdeplot(bw_method=...)`.
# - The columns are taken from the frame itself instead of a hard-coded 17, so
#   the cell still works after further (numeric) columns have been added, and
#   non-numeric columns are skipped rather than crashing the plot.
numeric_columns = df.select_dtypes(include='number').columns

fig, axes = plt.subplots(len(numeric_columns), 1, figsize=(10, 60))
for ax, column in zip(np.ravel(axes), numeric_columns):
    # stat='density' puts the bars on the same scale as the KDE curve.
    sns.histplot(df[column], stat='density', color='r', alpha=0.4, ax=ax)
    sns.kdeplot(df[column], bw_method=0.1, color='b', linewidth=3, label='KDE', ax=ax)
    ax.set_title(column)
fig.tight_layout()

1. 'BALANCE_FREQUENCY' is ~ 1 for most customers, i.e. their balance is updated frequently
2. Only a very small number of customers pay their balance in full ('PRC_FULL_PAYMENT' ~ 0)
3. Most customers have a tenure of ~ 12 months (the KPIs below divide by 'TENURE' to obtain monthly averages)
4. For 'PURCHASES_FREQUENCY', there are two distinct groups of customers
5. For 'ONEOFF_PURCHASES_FREQUENCY' and 'PURCHASES_INSTALLMENTS_FREQUENCY', most users don't make one-off purchases or installment purchases frequently

# KPIs

###  Monthly average purchase 

In [None]:
# Purchases per unit of tenure, stored as a new KPI column.
df['Monthly_avg_purchase'] = df['PURCHASES'].div(df['TENURE'])

In [None]:
# Show the new KPI next to the two columns it is derived from. A single frame
# as the cell's last expression renders as an aligned table; the previous
# print() of three separate Series produced run-together plain text.
df[['Monthly_avg_purchase', 'TENURE', 'PURCHASES']].head()

### Cash advance amount

In [None]:
# Cash advance per unit of tenure, stored as a new KPI column.
df['Monthly_cash_advance'] = df['CASH_ADVANCE'].div(df['TENURE'])

In [None]:
# Number of rows whose ONEOFF_PURCHASES is exactly zero.
(df['ONEOFF_PURCHASES'] == 0).sum()

### Purchases by type (one-off, installments)

In [None]:
# The two amount columns that the purchase-type label is derived from.
df[['ONEOFF_PURCHASES', 'INSTALLMENTS_PURCHASES']]

In [None]:
def purchase(df):
    """Label one customer row by the kind of purchases it contains.

    Parameters
    ----------
    df : mapping-like row
        Must expose 'ONEOFF_PURCHASES' and 'INSTALLMENTS_PURCHASES' by key.
        Despite the name, this is a single row (the function is applied with
        ``DataFrame.apply(..., axis=1)``), not a whole frame.

    Returns
    -------
    str or None
        'none', 'both_oneoff_installment', 'one_off' or 'installment'.
        ``None`` when either amount is neither zero nor positive
        (i.e. negative or NaN).
    """
    oneoff = df['ONEOFF_PURCHASES']
    installments = df['INSTALLMENTS_PURCHASES']

    # Amounts that are neither == 0 nor > 0 match none of the four categories.
    if not (oneoff >= 0 and installments >= 0):
        return None

    # Keyed by (has one-off purchases, has installment purchases).
    labels = {
        (False, False): 'none',
        (True, True): 'both_oneoff_installment',
        (True, False): 'one_off',
        (False, True): 'installment',
    }
    return labels[(bool(oneoff > 0), bool(installments > 0))]

In [None]:
# Label every row with its purchase type. Only the two columns that
# `purchase` reads are handed to apply().
purchase_columns = ['ONEOFF_PURCHASES', 'INSTALLMENTS_PURCHASES']
df['purchase_type'] = df[purchase_columns].apply(purchase, axis=1)

In [None]:
df['purchase_type'].value_counts()

### Limit usage (balance to credit limit ratio)

In [None]:
# Balance-to-credit-limit ratio, computed column-wise like the monthly KPIs
# above. The previous row-wise apply() built one mixed-dtype (object) row per
# customer, which is slow and would raise ZeroDivisionError on a zero credit
# limit; the vectorised division yields inf/NaN for such rows instead.
df['limit_usage'] = df['BALANCE'] / df['CREDIT_LIMIT']

In [None]:
df['limit_usage'].head()

### Payments to minimum payments ratio etc (think of more types of similar analysis)

In [None]:
# Missing-value count for both inputs of the payment ratio. Previously the
# PAYMENTS check was not the cell's last expression, so its result was
# computed and then discarded without being shown.
df[['PAYMENTS', 'MINIMUM_PAYMENTS']].isnull().sum()

In [None]:
df['MINIMUM_PAYMENTS'].describe()

In [None]:
# Payments-to-minimum-payments ratio, computed column-wise. The previous
# row-wise apply() built one mixed-dtype (object) row per customer, which is
# slow and would raise ZeroDivisionError on a zero minimum payment; the
# vectorised division yields inf/NaN for such rows instead.
df['payment_minpay'] = df['PAYMENTS'] / df['MINIMUM_PAYMENTS']

In [None]:
df['payment_minpay']

## Gain insight on the customer profiles

In [None]:
# Mean limit usage per purchase type. Selecting the column before aggregating
# replaces groupby(...).apply(lambda ...) over whole groups, which is
# deprecated in recent pandas and slower than the built-in mean.
df.groupby('purchase_type')['limit_usage'].mean().plot.barh()
# A title so the figure can be read on its own, like the other bar charts.
plt.title('Average limit usage (BALANCE / CREDIT_LIMIT) by purchase type');

Customers who make neither one-off nor installment purchases take more cash advances

In [None]:
# Mean monthly cash advance per purchase type. Selecting the column before
# aggregating replaces groupby(...).apply(lambda ...) over whole groups, which
# is deprecated in recent pandas and slower than the built-in mean.
df.groupby('purchase_type')['Monthly_cash_advance'].mean().plot.barh()
plt.title('Average cash advance taken by customers of different Purchase type : Both, None,Installment,One_Off')

Customers with installment purchases are paying dues

In [None]:
# Mean payment / minimum-payment ratio per purchase type; `x` is reused by the
# bar chart in the next cell. The stray `type(x)` expression was removed: it
# was not the last expression of the cell, so it never displayed anything.
x = df.groupby('purchase_type')['payment_minpay'].mean()
# Show the Series itself so each value appears next to its purchase type.
x

In [None]:
# Horizontal bar chart of the group means held in `x`, one bar per purchase type.
fig, ax = plt.subplots()
ax.barh(y=range(len(x)), width=x.values, align='center')
ax.set(yticks=np.arange(len(x)), yticklabels=x.index)
# Fixes the "purchse" typo in the rendered title.
ax.set_title('Mean payment_minpayment ratio for each purchase type');

Average payment_minpayment ratio for each purchase type

# Clustering

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import sklearn.cluster as cluster
from sklearn.decomposition import FactorAnalysis

In [None]:
# Standardise every column of `data`. fit_transform returns a bare NumPy
# array, so the column names and row index are attached at construction time
# rather than patched on afterwards one column at a time.
scaler = StandardScaler()
df_std = pd.DataFrame(scaler.fit_transform(data), columns=data.columns, index=data.index)

In [None]:
# Map positional column labels 0..n-1 onto the column names of `data`.
df_std.rename(columns=dict(enumerate(data.columns)), inplace=True)

In [None]:
df_std.head()

In [None]:
# Factor-analysis model with 10 components; random_state is fixed so that
# repeated runs give the same result. It is fitted on `df_std` in the next cell.
transformer = FactorAnalysis(n_components=10, random_state=0)

In [None]:
transformer.fit(df_std)

In [None]:
# One row per fitted component; positional column labels 0..n-1 are mapped
# onto the column names of `data`.
components_df = pd.DataFrame(transformer.components_)
components_df.rename(columns=dict(enumerate(data.columns)), inplace=True)

In [None]:
# Bar chart of the first component's entry for every variable.
fig, ax = plt.subplots(figsize=(10, 8))
components_df.loc[0].plot.bar(ax=ax)
ax.grid(True)

In [None]:
components_df.loc[0].sort_values()

In [None]:
# Features used for K-means.
clustering_features = [
    'PURCHASES',
    'ONEOFF_PURCHASES',
    'PURCHASES_TRX',
    'ONEOFF_PURCHASES_FREQUENCY',
    'INSTALLMENTS_PURCHASES',
    'PURCHASES_FREQUENCY',
    'PURCHASES_INSTALLMENTS_FREQUENCY',
    'CREDIT_LIMIT',
    'PAYMENTS',
    'CASH_ADVANCE_FREQUENCY',
]

# Take the features from the standardised frame `df_std`, not from raw `data`.
# K-means relies on Euclidean distance, so with unscaled inputs the
# large-magnitude amount columns (PURCHASES, CREDIT_LIMIT, PAYMENTS, ...)
# dominate and the frequency columns have almost no influence on the clusters.
# `df_std` was built for this section but was not used for the clustering.
# Consequence: the per-cluster summaries further down are expressed in
# standardised units.
cluster_df = df_std[clustering_features]

In [None]:
cluster_df.head()

In [None]:
# Elbow method: fit K-means for k = 1..29 and record the inertia of each fit.
distortions = []
K = range(1, 30)
for k in K:
    # random_state makes the curve reproducible between runs; n_init is set
    # explicitly because its default differs across scikit-learn versions.
    kmeanModel = KMeans(n_clusters=k, n_init=10, random_state=0)
    kmeanModel.fit(cluster_df)
    distortions.append(kmeanModel.inertia_)
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()

Let's choose n=8 clusters

In [None]:
# Final model with 8 clusters. random_state / n_init make the labelling
# reproducible between runs.
km = KMeans(init="random", n_clusters=8, n_init=10, random_state=0)
# fit_predict returns the label of every row. The previous `km.fit(...)`
# returns the estimator itself, so `y_pred` held a KMeans object rather than
# labels and produced a useless "cluster" column when assigned to the frame.
y_pred = km.fit_predict(cluster_df)

In [None]:
# Cluster label assigned by the fitted `km` model to each row it was fitted on.
labels=km.labels_

In [None]:
# Convert the labels to a plain Python list (wrapped in a Series in the next cell).
labels=labels.tolist()

In [None]:
# Wrap the labels in a Series; the default index is 0..n-1, i.e. row positions.
labels = pd.Series(labels)

In [None]:
# labels_x[i] holds the entries of `labels` equal to i; its index gives the
# row positions belonging to cluster i. The range follows the number of
# clusters of the fitted model instead of a hard-coded 10, which produced two
# always-empty trailing entries for the 8-cluster model.
labels_x = [labels[labels.values == i] for i in range(km.n_clusters)]

### Cluster 1

In [None]:
# Summary statistics (transposed: one row per feature) of the rows labelled 0.
# labels_x[0].index holds their row positions, hence the positional .iloc.
cluster_df.iloc[labels_x[0].index,:].describe().T

### Cluster 2

In [None]:
# Summary statistics (transposed: one row per feature) of the rows labelled 1.
# labels_x[1].index holds their row positions, hence the positional .iloc.
cluster_df.iloc[labels_x[1].index,:].describe().T

### Cluster 3

In [None]:
# Summary statistics (transposed: one row per feature) of the rows labelled 2.
# labels_x[2].index holds their row positions, hence the positional .iloc.
cluster_df.iloc[labels_x[2].index,:].describe().T

### Cluster 4

In [None]:
# Summary statistics (transposed: one row per feature) of the rows labelled 3.
# labels_x[3].index holds their row positions, hence the positional .iloc.
cluster_df.iloc[labels_x[3].index,:].describe().T

### Cluster 5

In [None]:
# Summary statistics (transposed: one row per feature) of the rows labelled 4.
# labels_x[4].index holds their row positions, hence the positional .iloc.
cluster_df.iloc[labels_x[4].index,:].describe().T

### Cluster 6

In [None]:
# Summary statistics (transposed: one row per feature) of the rows labelled 5.
# labels_x[5].index holds their row positions, hence the positional .iloc.
cluster_df.iloc[labels_x[5].index,:].describe().T

### Cluster 7

In [None]:
# Summary statistics (transposed: one row per feature) of the rows labelled 6.
# labels_x[6].index holds their row positions, hence the positional .iloc.
cluster_df.iloc[labels_x[6].index,:].describe().T

### Cluster 8

In [None]:
# Summary statistics (transposed: one row per feature) of the rows labelled 7.
# labels_x[7].index holds their row positions, hence the positional .iloc.
cluster_df.iloc[labels_x[7].index,:].describe().T

In [None]:
# Attach the cluster label of every row to `data`. The labels are read from
# the fitted model (`km.labels_`) rather than from `y_pred`, which at this
# point holds the return value of `km.fit(...)`, i.e. the estimator object
# itself, not a label array.
data["cluster"] = km.labels_
cols = list(data.columns)

# Pairwise scatter plots coloured by cluster. `bw_method` replaces the
# deprecated `bw` keyword of seaborn's KDE on the diagonal.
sns.pairplot(data[cols], hue="cluster", diag_kws={'bw_method': 0.2})

In [None]:
# Second clustering, restricted to six amount columns.
best_cols = ["BALANCE", "PURCHASES", "CASH_ADVANCE", "CREDIT_LIMIT", "PAYMENTS", "MINIMUM_PAYMENTS"]
# random_state makes the labelling reproducible between runs.
kmeans = KMeans(n_clusters=8, init="k-means++", n_init=10, max_iter=300, random_state=0)
# Fit on ALL listed columns. The previous `.iloc[:, 1:]` skipped the first
# one, so BALANCE was plotted as a "best" column but never used for the fit.
best_vals = data[best_cols].values
y_pred = kmeans.fit_predict(best_vals)

data["cluster"] = y_pred
best_cols.append("cluster")
# `bw_method` replaces the deprecated `bw` keyword of seaborn's KDE.
sns.pairplot(data[best_cols], hue="cluster", diag_kws={'bw_method': 0.2})

1. Big Spenders with large Payments - they make expensive purchases and have a credit limit that is between average and high. This is only a small group of customers
2. Cash Advances with large Payments - this group takes the most cash advances. They make large payments, but this appears to be a small group of customers
3. Medium Spenders with third highest Payments - the second highest Purchases group
4. Highest Credit Limit but Frugal - this group doesn't make a lot of purchases. It looks like the 3rd largest group of customers
5. Cash Advances with Small Payments - this group likes taking cash advances, but makes only small payments
6. Small Spenders and Low Credit Limit - they have the smallest Balances after the Smallest Spenders, their Credit Limit is in the bottom 3 groups, the second largest group of customers
7. Smallest Spenders and Lowest Credit Limit - this is the group with the lowest credit limit but they don't appear to buy much. Unfortunately this appears to be the largest group of customers
8. Highest Min Payments - this group has the highest minimum payments

So a marketing strategy that targeted the first five groups might be effective.