In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Read the credit-card customer CSV from the input directory and show the
# resulting frame as the cell output.
csv_path = "../input/ccdata/CC GENERAL.csv"
data = pd.read_csv(csv_path)
data

In [None]:
# Summary statistics of the loaded frame, displayed as the cell output.
data.describe()

In [None]:
# Count of missing values per column before any imputation.
data.isna().sum()

In [None]:
# Drop the customer identifier BEFORE computing medians: CUST_ID is a
# non-numeric label, and DataFrame.median() raises a TypeError on object
# columns in pandas >= 2.0. errors='ignore' keeps the cell re-runnable after
# `data` has already been overwritten without the column.
data = data.drop(columns='CUST_ID', errors='ignore')

# Impute each numeric column's missing values with that column's median.
data = data.fillna(data.median(numeric_only=True))

# Sanity check: every column should now report zero missing values.
data.isnull().sum()

In [None]:
import seaborn as sns

# One box per feature on a single wide axis, to eyeball the spread and the
# outliers of the imputed data.
boxplot_style = dict(width=0.5, fliersize=3)
fig, ax = plt.subplots(figsize=(25, 5))
sns.boxplot(data=data, ax=ax, **boxplot_style)

In [None]:
from scipy import stats

# Absolute z-score of every value, computed column-wise.
z = np.abs(stats.zscore(data))

# Keep a row only when ALL of its features lie within 3 standard deviations.
# The filtered frame keeps the original (now non-contiguous) row index.
rows_within_3_sigma = (z < 3).all(axis=1)
data_outlier_free = pd.DataFrame(data[rows_within_3_sigma], columns=data.columns)

In [None]:
# Show (rows, columns) before and after the outlier filter, in that order.
for frame in (data, data_outlier_free):
    print(frame.shape)

In [None]:
# Same per-feature boxplot as for the raw data, now on the filtered frame.
boxplot_style = dict(width=0.5, fliersize=3)
fig, ax = plt.subplots(figsize=(25, 5))
sns.boxplot(data=data_outlier_free, ax=ax, **boxplot_style)

In [None]:
# Pairwise correlation of the outlier-free features, drawn as a square-celled
# heatmap with a diverging palette.
fig, ax = plt.subplots(figsize=(40, 10))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
correlations = data_outlier_free.corr()
s = sns.heatmap(
    correlations,
    cmap=cmap,
    square=True,
    cbar_kws={'shrink': .9},
    ax=ax,
)

In [None]:
from sklearn.preprocessing import StandardScaler, normalize

# NOTE: sklearn's `normalize` rescales each ROW (customer) to unit L2 norm by
# default; it does not standardise columns the way StandardScaler would.
# Wrapping the result in a DataFrame restores the column names and gives the
# frame a fresh 0..n-1 index.
df_scaled = pd.DataFrame(
    normalize(data_outlier_free),
    columns=data_outlier_free.columns,
)
df_scaled

In [None]:
from sklearn.cluster import KMeans

# Elbow method: fit KMeans for k = 1..29 and record the within-cluster sum of
# squares (inertia) for each k.
cluster_counts = range(1, 30)
wcss = []
for ii in cluster_counts:
    # random_state pins the initialisation so the curve is reproducible.
    kmeans = KMeans(n_clusters=ii, init="k-means++", n_init=10, max_iter=300,
                    random_state=50)
    kmeans.fit(df_scaled)  # only inertia_ is needed, not the predicted labels
    wcss.append(kmeans.inertia_)

# Plot against the actual k values; plotting `wcss` alone would start the
# x-axis at 0 and shift every point one cluster to the left.
plt.plot(cluster_counts, wcss, 'ro-', label="WCSS")
plt.title("Computing WCSS for KMeans++")
plt.xlabel("Number of clusters")
plt.ylabel("WCSS")
plt.show()

In [None]:
# Cluster the row-normalised features into 5 groups. random_state fixes the
# k-means++ initialisation so cluster ids (and therefore plot colors) are the
# same on every run.
kmeans = KMeans(n_clusters=5, init="k-means++", n_init=10, max_iter=300,
                random_state=50)
y_pred = kmeans.fit_predict(df_scaled)

In [None]:
import seaborn as sns

# Attach the cluster id to each row, then draw every feature against every
# other feature, colored by cluster.
df_scaled["cluster"] = y_pred
cols = df_scaled.columns.tolist()
sns.pairplot(df_scaled[cols], hue="cluster", palette='deep')

In [None]:
# Re-cluster using only the monetary features, which are easier to read in a
# pairplot than the full feature set.
best_cols = ["BALANCE", "PURCHASES", "CASH_ADVANCE","CREDIT_LIMIT", "PAYMENTS", "MINIMUM_PAYMENTS"]

# random_state fixes the initialisation so the cluster ids / colors are stable
# between runs; the written interpretation of the groups depends on that.
kmeans = KMeans(n_clusters=5, init="k-means++", n_init=10, max_iter=300,
                random_state=50)
y_pred = kmeans.fit_predict(df_scaled[best_cols])

# Store the labels next to the features and include the label column in the
# plotted selection so it can be used as the hue.
df_scaled["cluster2"] = y_pred
best_cols.append("cluster2")
sns.pairplot(df_scaled[best_cols], hue="cluster2", palette='deep')

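The goal was to segment the customers in order to define a marketing strategy. Because KMeans starts from a random initialisation, the cluster numbering (and therefore the colors in the plots) can change each time this kernel is rerun unless a `random_state` is fixed, so the groups are described below by their behaviour rather than by color: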

**Big Spenders with large Payments** - they make expensive purchases and have a credit limit that is between average and high. This is only a small group of customers.

**Cash Advances with large Payments** - this group takes the most cash advances. They make large payments, but this appears to be a small group of customers.

**Medium Spenders with third highest Payments** - the second highest Purchases group (after the Big Spenders).

**Highest Credit Limit but Frugal** - this group doesn't make a lot of purchases. It looks like the 3rd largest group of customers.

**Cash Advances with Small Payments** - this group likes taking cash advances, but makes only small payments.

**Small Spenders and Low Credit Limit** - they have the smallest Balances after the Smallest Spenders, their Credit Limit is in the bottom 3 groups, the second largest group of customers.

**Smallest Spenders and Lowest Credit Limit** - this is the group with the lowest credit limit but they don't appear to buy much. Unfortunately this appears to be the largest group of customers.

**Highest Min Payments** - this group has the highest minimum payments (which presumably refers to "Min Payment Due" on the monthly statement). This might be a reflection of the fact that they have the second lowest Credit Limit of the groups, so it looks like the bank has identified them as higher risk.

So a marketing strategy that targeted the first five groups might be effective.

## Method 2 with PCA

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise every feature (zero mean, unit variance) of the outlier-free data.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data_outlier_free)

# Project the standardised features onto their first two principal components.
pca = PCA(n_components=2)
principal_comp = pca.fit_transform(scaled_data)

# NOTE(review): `labels` comes from whichever `kmeans` was fitted most recently
# in an earlier cell, not from a model fitted on `scaled_data` - confirm this
# is the intended coloring.
labels = kmeans.labels_
pca_df = pd.DataFrame(principal_comp, columns=['pca1', 'pca2']).assign(cluster=labels)

plt.figure(figsize=(10, 10))
plt.style.use('ggplot')
cluster_palette = ['red', 'green', 'blue', 'black', 'yellow']
ax = sns.scatterplot(data=pca_df, x="pca1", y="pca2", hue="cluster", palette=cluster_palette)
plt.show()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Six-cluster KMeans on the standardised, outlier-free features.
# random_state makes the cluster ids reproducible between runs.
kmean = KMeans(n_clusters=6, random_state=50)
kmean.fit(scaled_data)
labels = kmean.labels_

# Attach the labels to the rows they were computed from. `scaled_data` was
# built from `data_outlier_free`, so the labels must be joined to that frame;
# concatenating them to the unfiltered `data` would align on index and pair
# labels with the wrong customers (and leave NaNs for the surplus rows).
clusters = data_outlier_free.assign(cluster=labels)

# Pairwise cosine distance between all customers. This is an n x n dense
# matrix, so memory grows quadratically with the number of rows.
dist = 1 - cosine_similarity(scaled_data)

# Reduce the distance matrix to two dimensions for plotting.
pca = PCA(2)
pca.fit(dist)
X_PCA = pca.transform(dist)

In [None]:
# 2-D coordinates of every customer in the PCA-reduced cosine-distance space.
x, y = X_PCA[:, 0], X_PCA[:, 1]

# Plot color for each cluster id.
colors = {0: 'red',
          1: 'blue',
          2: 'green',
          3: 'yellow',
          4: 'orange',
          5:'purple'}

# Legend text for each cluster id.
# NOTE(review): these descriptions are tied to specific cluster ids, so they
# only stay meaningful while the clustering that produced `labels` is
# reproducible - confirm they match the current clusters.
names = {0: 'who make all type of purchases',
         1: 'more people with due payments',
         2: 'who purchases mostly in installments',
         3: 'who take more cash in advance',
         4: 'who make expensive purchases',
         5:'who don\'t spend much money'}

df = pd.DataFrame({'x': x, 'y':y, 'label':labels})
groups = df.groupby('label')

fig, ax = plt.subplots(figsize=(20, 13))

# One marker-only series per cluster so each gets its own color and legend entry.
for name, group in groups:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=5,
            color=colors[name],label=names[name], mec='none')

# Axis cosmetics are independent of the cluster, so apply them once.
ax.set_aspect('auto')
# The principal-component axes have no interpretable units, so hide ticks and
# tick labels. Booleans are required here: the string 'off' is truthy and
# would leave the ticks visible.
ax.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
ax.tick_params(axis='y', which='both', left=False, right=False, labelleft=False)

ax.legend()
ax.set_title("Customers Segmentation based on their Credit Card usage bhaviour.")
plt.show()

In [None]:
from scipy import stats
import numpy as np

# Repeat the z-score outlier filter on `data` (df is an alias, not a copy).
df = data
z = np.abs(stats.zscore(df))
threshold = 3

# Positions (row indices, column indices) of every value beyond the threshold.
print(np.where(z > threshold))

# Keep only rows in which every feature is within the threshold.
rows_within_threshold = (z < threshold).all(axis=1)
df1 = df[rows_within_threshold]

boxplot_style = dict(width=0.5, fliersize=3)
fig, ax = plt.subplots(figsize=(25, 10))
sns.boxplot(data=df1, ax=ax, **boxplot_style)

In [None]:
# Standardise the outlier-free features and cluster them into 5 groups with a
# fixed seed so the result is reproducible.
X = scaler.fit_transform(df1)
kmeans = KMeans(n_clusters = 5, init = 'k-means++', random_state = 50)
y_kmeans = kmeans.fit_predict(X)
labels = kmeans.labels_

# Project the same standardised features onto two principal components for plotting.
pca = PCA(2)
principalComponents = pca.fit_transform(X)
x, y = principalComponents[:, 0], principalComponents[:, 1]
print(principalComponents.shape)

# Plot color for each cluster id.
colors = {0: 'red', 1: 'blue', 2: 'green', 3: 'yellow', 4: 'purple'}
final_df = pd.DataFrame({'x': x, 'y':y, 'label':labels})
groups = final_df.groupby('label')
fig, ax = plt.subplots(figsize=(15, 10))

# One marker-only series per cluster so each gets its own color.
for name, group in groups:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=5, color=colors[name], mec='none')

# Axis cosmetics do not depend on the cluster, so apply them once.
ax.set_aspect('auto')
# Hide ticks and tick labels. Booleans are required: the string 'off' is
# truthy and would leave the ticks visible.
ax.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
ax.tick_params(axis='y', which='both', left=False, right=False, labelleft=False)

ax.set_title("Customer Segmentation based on Credit Card usage")
plt.show()