# Step 1 (Importing Libraries and Data)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the credit-card CSV from the Kaggle input directory.
raw_csv_path = "/kaggle/input/ccdata/CC GENERAL.csv"
credit_df = pd.read_csv(raw_csv_path)

# Snapshot of the frame exactly as loaded, taken before any later edits
# to credit_df.
df = credit_df.copy()

# Preview the first rows.
credit_df.head()

# Step 2: EDA

In [None]:
# Print the frame summary: column names, dtypes and non-null counts.
credit_df.info()

In [None]:
# Print the (rows, columns) size of the dataset.
print(credit_df.shape)

### We can see that the attributes CREDIT_LIMIT and MINIMUM_PAYMENTS have missing values. Let's look at them.

In [None]:
# Count the missing (NaN) entries in every column.
print(credit_df.isna().sum()) # Number of missing values per column.

### Let's first check out the distribution of CREDIT_LIMIT and MINIMUM_PAYMENTS to get an idea which method to use for filling null values.

In [None]:
# Side-by-side distribution plots of the two columns that contain nulls;
# missing entries are dropped before plotting.
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
for axis, column in zip(axes, ('MINIMUM_PAYMENTS', 'CREDIT_LIMIT')):
    sns.distplot(credit_df[column].dropna(), color='#fdc029', ax=axis)
plt.show()

### We can see that distribution is skewed so we will use Median to fill these values.

In [None]:
# Impute the two incomplete columns with their median. The notebook's stated
# plan is median imputation because the distributions are skewed; the mean
# would be pulled towards the long tail.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on a column selection, which may act on a temporary
# copy and leave credit_df unchanged under pandas Copy-on-Write.
for column in ('MINIMUM_PAYMENTS', 'CREDIT_LIMIT'):
    credit_df[column] = credit_df[column].fillna(credit_df[column].median())

# Confirm that neither column has any nulls left.
print(credit_df[['MINIMUM_PAYMENTS','CREDIT_LIMIT']].isna().sum())

##### Now we are set with missing Values.

In [None]:
# Summary statistics, transposed so that each row describes one column.
credit_df.describe().T

In [None]:
# One box per column on a very wide canvas so the boxes do not overlap.
fig, ax = plt.subplots(figsize=(45, 7))
sns.boxplot(data=credit_df, ax=ax)
plt.show()

#### The boxplot shows many outliers, but we will not treat them: we want to analyse all types of customers, so the extreme ones are kept.

In [None]:
# Show the current column names.
credit_df.columns

### Let's derive some KPIs
     1)  Monthly average purchase->(month_avg_purchase)-> PURCHASES/TENURE
     2)  Monthly average cash advance amount->(cash_advance_amt)-> CASH_ADVANCE/TENURE

In [None]:
# KPI column -> total column that is divided by TENURE to obtain it.
# (presumably TENURE is measured in months, per the "monthly average" naming
# -- verify against the dataset description)
per_tenure_kpis = {
    'month_avg_purchase': 'PURCHASES',
    'cash_advance_amt': 'CASH_ADVANCE',
}
for kpi_name, total_column in per_tenure_kpis.items():
    credit_df[kpi_name] = credit_df[total_column].div(credit_df['TENURE'])

# Preview the two new columns.
credit_df[list(per_tenure_kpis)].head()


###### 3) Purchase_type
    The data contains two types of purchases, i.e. ONEOFF_PURCHASES and INSTALLMENTS_PURCHASES. We will check whether there is any relation between these two fields.

In [None]:
# Overlay one-off and installment purchase amounts against BALANCE.
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally, while keyword arguments work on older releases too.
plt.figure(figsize=(10,4))
for purchase_column, legend_label in (
    ('ONEOFF_PURCHASES', 'Oneoff'),
    ('INSTALLMENTS_PURCHASES', "Installment"),
):
    sns.lineplot(
        x=credit_df['BALANCE'],
        y=credit_df[purchase_column],
        label=legend_label,
    )
plt.show()

In [None]:
# Boolean masks for "no spend" / "some spend" on each purchase kind.
no_oneoff = credit_df['ONEOFF_PURCHASES'] == 0
has_oneoff = credit_df['ONEOFF_PURCHASES'] > 0
no_installment = credit_df['INSTALLMENTS_PURCHASES'] == 0
has_installment = credit_df['INSTALLMENTS_PURCHASES'] > 0

# Print the shape of each of the four segments, in the order:
# neither, installment only, one-off only, both.
segment_masks = (
    no_oneoff & no_installment,
    no_oneoff & has_installment,
    has_oneoff & no_installment,
    has_oneoff & has_installment,
)
for segment_mask in segment_masks:
    print(credit_df[segment_mask].shape)

###### Adding up the rows of these four groups gives 8950, i.e. the total number of rows, so every customer falls into exactly one of the following groups:
        1) Customers preferring One-Off Purchases.
        2) Customers preferring Installments Purchases.
        3) Customers preferring both.
        4) Customers preferring none.

In [None]:
def purchase_by_type(credit_df):
    """Label a single customer record by the kinds of purchases it contains.

    Despite its name, ``credit_df`` is one record, not the whole frame: the
    notebook applies this function row-wise (``axis=1``). Any object indexable
    by 'ONEOFF_PURCHASES' and 'INSTALLMENTS_PURCHASES' works.

    Returns:
        'none'        -- both amounts are zero
        'dual'        -- both amounts are positive
        'oneoff'      -- only the one-off amount is positive
        'installment' -- only the installment amount is positive
        None          -- either amount is neither zero nor positive
                         (e.g. negative or NaN)
    """
    oneoff = credit_df['ONEOFF_PURCHASES']
    installments = credit_df['INSTALLMENTS_PURCHASES']

    # Only amounts that are exactly zero or strictly positive can be labelled.
    if not all(amount == 0 or amount > 0 for amount in (oneoff, installments)):
        return None

    # Keyed by (one-off amount is positive, installment amount is positive).
    labels = {
        (False, False): 'none',
        (True, True): 'dual',
        (True, False): 'oneoff',
        (False, True): 'installment',
    }
    return labels[(bool(oneoff > 0), bool(installments > 0))]

# Classify every row with purchase_by_type and store the label as a new column.
purchase_labels = credit_df.apply(purchase_by_type, axis=1)
credit_df['purchase_by_type'] = purchase_labels
credit_df.head()

In [None]:
# KPI column -> (numerator column, denominator column).
ratio_kpis = {
    'limit_usage': ('BALANCE', 'CREDIT_LIMIT'),
    'minimum_payment': ('PAYMENTS', 'MINIMUM_PAYMENTS'),
}
for kpi_name, (numerator, denominator) in ratio_kpis.items():
    credit_df[kpi_name] = credit_df[numerator].div(credit_df[denominator])

# Preview the two new ratio columns.
credit_df[list(ratio_kpis)].head()

In [None]:
# Remove the CUST_ID column in place.
credit_df.drop(columns=['CUST_ID'], inplace=True)

### As we need to define a marketing strategy, let's think about how a bank benefits. I believe a bank makes money by keeping customers for longer periods of time as well as from the purchases they make. I think purchases, balance, and payments are the best measures to look at because they exemplify the baseline services of a bank (getting people to hold more with the bank, pay the bank more, and spend more overall that they then have to pay back) without those key problems.

In [None]:
def scatter_tenure(field):
    """Draw ``field`` against TENURE in a new 8x4 figure and show it.

    Note that, despite the name, the chart is a seaborn line plot rather than
    a scatter plot. The data is read from the module-level ``credit_df``.

    Args:
        field: name of the ``credit_df`` column to put on the y-axis.
    """
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.lineplot(data=credit_df, x='TENURE', y=field, ax=ax)
    plt.show()
# PURCHASES against TENURE. The helper already draws and shows this chart, so
# the extra bare sns.lineplot call that redrew the identical data was removed.
scatter_tenure('PURCHASES')

##### We can see that as tenure increases the purchase amount also increases.

In [None]:
# Same tenure chart for the other two columns of interest.
for tenure_field in ('BALANCE', 'PAYMENTS'):
    scatter_tenure(tenure_field)

##### The same phenomenon is observed here.

In [None]:
# Mean payment-to-minimum-payment ratio for each purchase type.
# Selecting the column and calling .mean() replaces groupby.apply with a
# lambda: it gives the same per-group values, is vectorised, and avoids the
# pandas deprecation warning about apply operating on the grouping column.
ratio = credit_df.groupby('purchase_by_type')['minimum_payment'].mean()
ratio.values

In [None]:
# Horizontal bars: one per purchase type, bar length = mean ratio.
fig, ax = plt.subplots()
bar_positions = np.arange(len(ratio))
ax.barh(y=bar_positions, width=ratio.values)
ax.set_yticks(bar_positions)
ax.set_yticklabels(ratio.index)
ax.set_title("Minimum_payment ratio for each purchase type")
plt.show()

In [None]:
# CREDIT_LIMIT per purchase type, first as bars and then as a black line.
limit_by_type = dict(x='purchase_by_type', y='CREDIT_LIMIT', data=credit_df)

sns.barplot(**limit_by_type)
plt.show()

sns.lineplot(color='black', **limit_by_type)
plt.show()

##### Customers who do installment purchases have good credit scores.

In [None]:
# CASH_ADVANCE per purchase type as a bar chart.
sns.barplot(
    data=credit_df,
    x='purchase_by_type',
    y='CASH_ADVANCE',
)
plt.show()

##### Customers who make neither installment nor one-off purchases generally take more cash in advance.

## Step 3 (Transformation)

In [None]:
# Show the dtype of every column.
credit_df.dtypes

#### As only the 'purchase_by_type' field is of object type, we will create dummy variables for it.

In [None]:
# One-hot encode the frame. No column list is given, so pandas picks the
# columns to encode itself; numeric columns pass through unchanged.
credit_df=pd.get_dummies(credit_df)
credit_df.head()

In [None]:
# Heatmap of the pairwise correlation matrix of the columns.
sns.heatmap(credit_df.corr())

### As there is a lot of collinearity between the variables, we will use PCA to reduce the number of attributes.

In [None]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardise the data: learn the scaling from credit_df, then apply it to
# the same frame.
scaler = StandardScaler()
scaler.fit(credit_df)
scaler_credit_df = scaler.transform(credit_df)

In [None]:
from sklearn.decomposition import PCA

# Total explained-variance ratio for every component count from 4 to 14,
# keyed by the number of components.
ratio_arr = {
    n_components: sum(
        PCA(n_components=n_components)
        .fit(scaler_credit_df)
        .explained_variance_ratio_
    )
    for n_components in range(4, 15)
}
print(ratio_arr)

# Line chart of explained variance against the number of components.
pd.Series(ratio_arr).plot()

#### We can see that 9 components are explaining more than 80% variance so we select 9 components.

In [None]:
# Project the standardised data onto 9 principal components.
# fit_transform both fits the model and returns the projection, so the
# separate .fit() call that preceded it (a redundant second decomposition)
# has been removed.
pc = PCA(n_components=9)
pca_reduced_final = pc.fit_transform(scaler_credit_df)

final_df = pd.DataFrame(pca_reduced_final)
print(final_df.shape)

# Original feature names, kept to label the component loadings.
col_list = credit_df.columns
col_list


In [None]:
# Loadings table: one row per original feature, one column per component.
# The column count comes from the fitted model rather than a hard-coded 9,
# so the table stays valid if the number of components is changed.
pd.DataFrame(
    pc.components_.T,
    columns=['Component_' + str(i) for i in range(pc.n_components_)],
    index=col_list,
)

#### We will use the KMeans algorithm for clustering and the Elbow method to verify the number of clusters.

In [None]:
from sklearn.metrics import silhouette_score

# Exclusive upper bound of the scan: k runs from 1 to n_clusters - 1.
n_clusters = 10
cluster_counts = range(1, n_clusters)

# Inertia (within-cluster sum of squares) for each candidate k.
cost = []
for k in cluster_counts:
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(pca_reduced_final)
    cost.append(kmeans.inertia_)

# Plot against the actual k values. Plotting the list alone would put the
# 0-based list index on the x-axis, so the elbow would be read one cluster
# too low.
plt.plot(cluster_counts, cost, 'bx-')
plt.xlabel("Number of clusters (k)")
plt.ylabel("Inertia")
plt.show()

#### The elbow point is at 5, so we choose 5 clusters.

In [None]:
# Cluster the PCA-reduced data into 5 groups and score the separation with
# the silhouette coefficient.
km = KMeans(n_clusters=5).fit(pca_reduced_final)
score = silhouette_score(pca_reduced_final, km.labels_)

In [None]:
# Display the silhouette score of the 5-cluster KMeans fit.
score

In [None]:
# Columns summarised per cluster, assembled from three groups.

# Columns present in the raw dataset.
raw_columns = [
    'BALANCE',
    'BALANCE_FREQUENCY',
    'PURCHASES',
    'ONEOFF_PURCHASES',
    'INSTALLMENTS_PURCHASES',
    'CASH_ADVANCE',
    'PURCHASES_FREQUENCY',
    'ONEOFF_PURCHASES_FREQUENCY',
    'PURCHASES_INSTALLMENTS_FREQUENCY',
    'CASH_ADVANCE_FREQUENCY',
    'CASH_ADVANCE_TRX',
    'PURCHASES_TRX',
    'CREDIT_LIMIT',
    'PAYMENTS',
    'MINIMUM_PAYMENTS',
    'PRC_FULL_PAYMENT',
    'TENURE',
]

# KPI columns derived earlier in the notebook.
derived_kpis = [
    'month_avg_purchase',
    'cash_advance_amt',
    'limit_usage',
    'minimum_payment',
]

# Dummy columns generated from 'purchase_by_type'.
purchase_type_dummies = [
    'purchase_by_type_' + kind
    for kind in ('dual', 'installment', 'none', 'oneoff')
]

col_kpi = raw_columns + derived_kpis + purchase_type_dummies

#### Here we see that the suggested number of clusters is 5. The PCA transformation gives us better results, so we create a pipeline that chains StandardScaler, PCA and KMeans together.

In [None]:
from sklearn.pipeline import Pipeline

# Stage 1: standardise the features, then keep 9 principal components.
preprocess = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("pca", PCA(n_components=9, random_state=42)),
])

# Stage 2: KMeans with 5 clusters, k-means++ seeding and 50 restarts.
cluster = Pipeline(steps=[
    ("km", KMeans(n_clusters=5, init="k-means++", n_init=50, random_state=42)),
])

# Full pipeline: preprocessing followed by clustering.
pipe = Pipeline(steps=[
    ("preprocess", preprocess),
    ("cluster", cluster),
])

pipe.fit(credit_df)

In [None]:
# Re-create the PCA features the KMeans step was fitted on, fetch the cluster
# label assigned to each row, and score the clustering.
preprocess_stage = pipe.named_steps['preprocess']
kmeans_step = pipe.named_steps['cluster'].named_steps['km']

preprocessed_data = preprocess_stage.transform(credit_df)
predicted_labels = kmeans_step.labels_
silhouette_score(preprocessed_data, predicted_labels)

### N_components=9 and Cluster=5

In [None]:
# KPI columns plus the cluster label assigned to each row.
# assign() attaches the label array positionally. Concatenating a freshly
# built Series instead aligns on index labels and would silently produce NaNs
# and extra rows if credit_df's index were not the default 0..n-1.
cluster_df = credit_df[col_kpi].assign(Cluster=pipe['cluster']['km'].labels_)
cluster_df.head()

In [None]:
# Mean of every KPI per cluster, transposed so that rows are KPIs and columns
# are cluster ids. Selecting the columns and calling .mean() replaces
# groupby.apply with a lambda: same table, vectorised, and no pandas
# deprecation warning about apply touching the grouping column.
five_cluster = cluster_df.groupby('Cluster')[col_kpi].mean().T
five_cluster

In [None]:
# Grouped bar chart comparing the clusters on six per-cluster mean KPIs.
fig, ax = plt.subplots(figsize=(15, 10))
index = np.arange(len(five_cluster.columns))
bar_width = .10

# (row of five_cluster, bar colour, legend label, transform for the values).
# The two monetary KPIs are plotted on a natural-log scale.
# NOTE(review): np.log yields -inf if a cluster mean is exactly 0 -- confirm
# this cannot occur for the fitted clusters.
bar_specs = [
    ('cash_advance_amt', 'b', 'cash_advance_amt', np.log),
    ('limit_usage', 'm', 'Credit_score', None),
    ('month_avg_purchase', 'k', 'Avg purchase', np.log),
    ('minimum_payment', 'c', 'Payment-minpayment ratio', None),
    ('purchase_by_type_installment', 'r', 'installment', None),
    ('purchase_by_type_oneoff', 'g', 'One_off purchase', None),
]

for position, (kpi, colour, legend_label, transform) in enumerate(bar_specs):
    heights = five_cluster.loc[kpi, :].values
    if transform is not None:
        heights = transform(heights)
    ax.bar(
        index + position * bar_width,
        heights,
        color=colour,
        label=legend_label,
        width=bar_width,
    )

ax.set_xlabel("Cluster")
ax.set_title("Insights")

# Centre each tick under its group of bars and derive the labels from the
# cluster ids, so the chart stays correct if the number of clusters changes.
ax.set_xticks(index + bar_width * (len(bar_specs) - 1) / 2)
ax.set_xticklabels(['Cl-{}'.format(cluster_id) for cluster_id in five_cluster.columns])

ax.legend()

### Cluster0:
    This group performs best of all, as its customers maintain a good credit score and pay their dues on time. -- Giving reward points will encourage them to make more purchases.
### Cluster1:
    This group is risky, as the percentage of full payments made by its users is low.
### Cluster2:
    This group has the lowest payment ratio and uses the card for both types of transactions. It also has the highest cash advance amount. This group is risky.
### Cluster3:
    These are potential customers who pay their dues, make purchases and maintain a comparatively good credit score.
### Cluster4:
    This group of users has maintained a good credit score and can be attracted by offering more incentives for installment purchases.