In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pylab as plt
%matplotlib inline

from mpl_toolkits.mplot3d import Axes3D

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.cluster import AgglomerativeClustering 

from scipy.stats import zscore
from scipy import stats
from scipy.spatial.distance import cdist, pdist  #Pairwise distribution between data points
from scipy.cluster.hierarchy import cophenet, dendrogram, linkage, fcluster

#import sklearn.metrics
# from sklearn.model_selection  import train_test_split

In [None]:
df = pd.read_excel("Credit Card Customer Data.xlsx")

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.shape

In [None]:
df.describe()

#### Checking for null values. Based on the two statements below, there are no null values.

In [None]:
df.isnull().sum()

In [None]:
df.isna().sum()

In [None]:
df['Total_Credit_Cards'].unique()

#### Checking for unique values in the columns.

In [None]:
for c in df.columns[2:7]:
    print(f'Column Name and unique values :  {c, df[c].unique()}')

#### Checking for duplicate data (rows). No duplicate data (rows)

In [None]:
# Flag every row that is an exact repeat of an earlier row, then report the total.
dups = df.duplicated()
n_duplicate_rows = dups.sum()
print('Number of duplicate rows = %d' % n_duplicate_rows)


In [None]:
# NOTE(review): `equals` is referenced here but never called, so this cell only
# displays the bound method; no comparison is performed. Confirm what was meant
# to be compared, or remove the cell.
df.equals

In [None]:
df.loc[(df['Customer Key'] == 0)].count()

#### Checking below to see if there are any question marks. No bad data such as "?" was found.

In [None]:
# Count literal "?" placeholders in every column, not only 'Customer Key'.
# Values are compared as strings: an equality test between a numeric column and
# '?' is False for every row, so it can never flag anything.
df.astype(str).eq('?').sum()

#### Graphical analysis. Using box plots to see if there are any outliers. Two columns have outliers.
#### 1. Avg_Credit_Limit
#### 2. Total_visits_online

In [None]:
sns.boxplot(x=df['Customer Key'])   # This shows no outliers.

In [None]:
sns.boxplot(x=df['Avg_Credit_Limit'])   # This shows outliers. We see outliers, but in real life it is possible having very
# limited customers with high credit limit. It is possible to have this type of scenario in real life. Let's do Z score and see
# if it can fix thisissue, so that we do't have to worry about it. Tied Z sore and it didn't fix it.

In [None]:
df.loc[(df['Avg_Credit_Limit'] > 100000)].count()  # Lookslike 39 outliers.

In [None]:
df.loc[df['Avg_Credit_Limit'] > 100000]

In [None]:
df_z = df.apply(zscore)
sns.boxplot(x=df_z['Avg_Credit_Limit']) 
# We do see outliers with Z score also, so need to see what we can do with outliers.

In [None]:
sns.boxplot(x=df['Total_Credit_Cards'])   # This shows no outliers.

In [None]:
sns.boxplot(x=df['Total_visits_bank'])   # This shows no outliers.

In [None]:
sns.boxplot(x=df['Total_visits_online'])   # This shows outliers.

In [None]:
df.loc[(df['Total_visits_online'] > 8)].count()   # There are 37 outliers.

In [None]:
sns.boxplot(x=df['Total_calls_made'])   # This shows no outliers.

#### Trying to see how data is distributed.

In [None]:
plt.hist(df['Avg_Credit_Limit'], bins= 20, facecolor= 'tan')
plt.xlabel('Avg_Credit_Limit')
plt.ylabel('Distribution')
plt.show()

In [None]:
plt.hist(df['Total_visits_online'], bins= 15, facecolor= 'tan')
plt.xlabel('Total_visits_online')
plt.ylabel('Distribution')
plt.show()

In [None]:
plt.hist(df['Total_visits_bank'], bins= 15, facecolor= 'tan')
plt.xlabel('Total_visits_bank')
plt.ylabel('Distribution')
plt.show()

In [None]:
# Draw one Gaussian bump per observation of Total_Credit_Cards, plus a rug plot
# of the raw values, to show what a kernel density estimate is built from.
x = df.Total_Credit_Cards 
# Bandwidth from the sample: 1.06 * standard deviation * n^(-1/5).
bandwidth = 1.06 * x.std() * x.size ** (-1 / 5.)
# Evaluation grid for the curves: 200 evenly spaced points on [-1, 15].
support = np.linspace(-1, 15, 200)

# `kernels` collects the evaluated curves; this cell itself only plots them.
kernels = []
for x_i in x:

    # Normal pdf centred on this observation, with the shared bandwidth as its scale.
    kernel = stats.norm(x_i, bandwidth).pdf(support)
    kernels.append(kernel)
    plt.plot(support, kernel, color="r")

# Trailing ';' suppresses the Axes repr in the cell output.
sns.rugplot(x, color=".2", linewidth=3);

In [None]:
df.hist(figsize=(16, 20), bins=50, xlabelsize=8, ylabelsize=8); # ; avoid having the matplotlib verbose informations

In [None]:
sns.heatmap(df.corr(), annot=True)

## Based on above graphs we can say 
1. Avg_Credit_Limit
2. Total_visits_online both are right skewed. There are outliers in both of them.
3. There are no null values.
4. Rest of the data looks fine.
5. Even after using Z score outliers stayed. So we need to fix the outliers.
6. Avg_Credit_Limit has 39 outliers. In real life this is possible, as only a select few customers will have a high credit limit.
7. Total_visits_online has 37 outliers.
8. If you look at my output in cell 16, 24 of the outliers appear in both columns.
9. Data also doesn't contain "?" .

## KDE plot for all the features to see how many of it can show separate clusters.

In [None]:
# Customer key seems to have minimum two clusters.
# Avg_credit_limit seems to have two clusters and data is right skewed because ofoutliers.
# Total_credit_cards and total_visits_to_the banks seems to have 4 clusters.
# Total_visits_online also seems to have 4 clusters with data being right skewed.
# 


In [None]:
# One KDE curve per column, for every column except 'Sl_No'.
# sns.kdeplot replaces the deprecated sns.distplot(..., hist=False), which warns
# on every call and is scheduled for removal from seaborn.
for i in df.columns[df.columns != 'Sl_No']:
    sns.kdeplot(df[i])
    plt.show()

In [None]:
sns.pairplot(df,diag_kind = 'kde') 

In [None]:
## If you look at these graphs. We are seeing minimum 2 clusters, some are showing 4 clusters.
## I don't see any co-relation betwen columns

## Fixing outliers in Avg_credit_limit and Total_visits_online.

In [None]:
df1 = df.copy()

In [None]:
sns.boxplot(x=df1['Avg_Credit_Limit'])  

In [None]:
sns.distplot(df1.Avg_Credit_Limit, hist=False)  # To make sure we are not creating gausian distributions while fixing outliers.

In [None]:
# Cap every Avg_Credit_Limit above 100000 at the largest value that is not above it.
df2 = df.copy()
df2 = df2.drop(df2[df2.Avg_Credit_Limit > 100000].index)
# Assign through .loc. Writing into `.values` only works while that array happens
# to be a writable view of the frame, which pandas does not guarantee (it is
# read-only under Copy-on-Write), so the replacement could silently not happen or fail.
df1.loc[df1['Avg_Credit_Limit'] > 100000, 'Avg_Credit_Limit'] = df2.Avg_Credit_Limit.max()
df2.Avg_Credit_Limit.max()

In [None]:
sns.boxplot(x=df1['Avg_Credit_Limit'])  

In [None]:
sns.distplot(df1.Avg_Credit_Limit, hist=False)  # We did crete a gausian curve. ALmost got a new cluster.

In [None]:
df1.Avg_Credit_Limit.max()

In [None]:
# Start again from the raw data and replace Avg_Credit_Limit values above 100000
# with the column median (computed before any value is replaced).
df1 = df.copy()
# .loc assignment instead of writing into `.values`, which is not guaranteed to
# be a writable view of the frame.
df1.loc[df1['Avg_Credit_Limit'] > 100000, 'Avg_Credit_Limit'] = df1.Avg_Credit_Limit.median()
df1.Avg_Credit_Limit.max()  # Largest value left after the median replacement.

In [None]:
sns.boxplot(x=df1['Avg_Credit_Limit'])  # Median is creating new outliers.

In [None]:
sns.distplot(df1.Avg_Credit_Limit, hist=False) 

In [None]:
# Start again from the raw data and replace Avg_Credit_Limit values above 100000
# with the column mode.
df1 = df.copy()
# Series.mode() returns a Series (possibly several values on a tie), so take the
# first entry explicitly instead of assigning the whole Series. Assign through
# .loc rather than `.values`, which is not guaranteed to be a writable view.
df1.loc[df1['Avg_Credit_Limit'] > 100000, 'Avg_Credit_Limit'] = df1.Avg_Credit_Limit.mode().iloc[0]
df1.Avg_Credit_Limit.max()  # Largest value left after the mode replacement.

In [None]:
sns.boxplot(x=df1['Avg_Credit_Limit'])  # Median is creating new outliers.

In [None]:
# Start again from the raw data and cap Avg_Credit_Limit values above 100000 at a
# fixed 75000. This is the variant the rest of the notebook carries forward.
df1 = df.copy()
# .loc assignment instead of writing into `.values`, which is not guaranteed to
# be a writable view of the frame.
df1.loc[df1['Avg_Credit_Limit'] > 100000, 'Avg_Credit_Limit'] = 75000
df1.Avg_Credit_Limit.max()  # Largest value left after capping.

In [None]:
sns.boxplot(x=df1['Avg_Credit_Limit'])  # This fixed the outlier issue.

In [None]:
sns.distplot(df1.Avg_Credit_Limit, hist=False)  # Even before wehad a gausian curve at 75000, this just increased the size.
# This should hopefully work fine.

In [None]:
sns.boxplot(x=df1['Total_visits_online'])  

In [None]:
sns.distplot(df1.Total_visits_online, hist=False)

In [None]:
# Cap Total_visits_online at 8. Assign through .loc rather than `.values`, which
# is not guaranteed to be a writable view of the frame.
df1.loc[df1['Total_visits_online'] > 8, 'Total_visits_online'] = 8
df1.Total_visits_online.max()  # Should now be 8.

In [None]:
sns.boxplot(x=df1['Total_visits_online'])  

In [None]:
sns.distplot(df1.Total_visits_online, hist=False)  # This created a gausian curve,

In [None]:
# df1['Total_visits_online'].values[df1['Total_visits_online'] > 8] = df1.Total_visits_online.median()
# df1.Total_visits_online.max()  # This created a gusian curves (clusters).

In [None]:
# sns.distplot(df1.Total_visits_online, hist=False)

In [None]:
# df1['Total_visits_online'].values[df1['Total_visits_online'] > 8] = df1.Total_visits_online.mode()
# df1.Total_visits_online.max()  # This created a gusian curve.

In [None]:
# sns.distplot(df1.Total_visits_online, hist=False)  # Got a gausian curve.

In [None]:
#df1['Total_visits_online'].values[df1['Total_visits_online'] > 8] = df1.Total_visits_online.mean()
#df1.Total_visits_online.max()  # I think median is not creatinggusian curves or clusters.

In [None]:
# sns.distplot(df1.Total_visits_online, hist=False) 

In [None]:
# If you look at the original looks like it got 4 clusters ( one at almost 15)
# So let's try to get only four clusters.......
# I feel when you change the outliers to 8, eventhough itcreated a gasian curve, it just moved it from 15 to 8. 
# So that should work.


In [None]:
# Se we are going ahead with equating the first one with Avg_Credit_Limit> 100000 = 75000
# Total_visits_online > 8 = 8

In [None]:
# Feature frame: everything from the third column onward of the capped data.
# .copy() makes df2 an independent frame; later cells add a "GROUP" column to it,
# which on a bare iloc slice triggers SettingWithCopyWarning and may not stick.
df2 = df1.iloc[:, 2:].copy()
# Standardise each feature column with scipy's zscore.
df3 = df2.apply(zscore)
# Pristine copy of the scaled features; later sections reset df3 from it.
df3_z_gold = df3.copy()
sns.pairplot(df3, diag_kind='kde')

In [None]:
df3.shape[0]

In [None]:
df3.head()

In [None]:
df3_z_gold.head()

In [None]:
# Finding the optimal number of clusters with the elbow method.

clusters = range(1, 10)
meanDistortions = []

for k in clusters:
    # Fixed seed and explicit n_init so the curve is identical on every run and
    # does not depend on the scikit-learn version's default number of restarts.
    model = KMeans(n_clusters=k, n_init=10, random_state=42)
    model.fit(df3)
    # Distance from every sample to its nearest centroid, averaged over all samples.
    nearest = np.min(cdist(df3, model.cluster_centers_, 'euclidean'), axis=1)
    meanDistortions.append(nearest.sum() / df3.shape[0])


plt.plot(clusters, meanDistortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Average distortion')
plt.title('Selecting k with the Elbow Method')

## KMeans clustering with 4 clusters -->

In [None]:
# Let us first start with K = 4
# Reset df3 from the pristine scaled features (as the K = 3 cell does), so that
# re-running this cell never fits on a frame that already holds a "GROUP" column.
df3 = df3_z_gold.copy()
# Fixed seed and explicit n_init keep the cluster labels stable across runs and
# scikit-learn versions.
final_model = KMeans(n_clusters=4, n_init=10, random_state=42)
final_model.fit(df3)
prediction = final_model.predict(df3)

# Append the prediction to both the unscaled (df2) and the scaled (df3) frame.
df2["GROUP"] = prediction
df3["GROUP"] = prediction
print("Groups Assigned : \n")
df3.head()

In [None]:
df2Clust = df2.groupby(['GROUP'])
df2Clust.mean()

In [None]:
df3Clust = df3.groupby(['GROUP'])
df3Clust.mean()

In [None]:
df3.boxplot(by='GROUP', layout = (2,4),figsize=(15,10))

In [None]:
centroids = final_model.cluster_centers_

In [None]:
centroids

In [None]:
centroid_df = pd.DataFrame(centroids, columns=list(df3_z_gold))

In [None]:
centroid_df

In [None]:
final_model.labels_

In [None]:
# When we choose clusters = 4, which the elbow plot points out. It seems we got the rightclusters.
# If we look at the labels, even though 0, 1, in the end 3s came together. The clustering seems to be mixed and across the whole
# data set.

In [None]:
df2.boxplot(by='GROUP', layout = (2,4),figsize=(15,10))

In [None]:
# Put the fitted cluster labels in a one-column frame, store them as a
# categorical, and attach the first five columns of df3 by index.
df_labels = pd.DataFrame({'labels': final_model.labels_})
df_labels = df_labels.astype({'labels': 'category'})
df_labeled = df_labels.join(df3.iloc[:, :5])
df_labeled

In [None]:
plt.scatter(df_labeled['Avg_Credit_Limit'],df_labeled['Total_Credit_Cards'], c=final_model.labels_ )
plt.show()

In [None]:
silhouette_score(df_labeled.drop('labels',axis=1),df_labeled['labels'] )   # We gor score of 39 with four clusters.

## KMeans clustering with 3 clusters -->

In [None]:
# Let us try with K = 3
# Start from the pristine scaled features so no earlier "GROUP" column leaks in.
df3 = df3_z_gold.copy()
# Fixed seed and explicit n_init keep the cluster labels stable across runs and
# scikit-learn versions.
final_model = KMeans(n_clusters=3, n_init=10, random_state=42)
final_model.fit(df3)
prediction = final_model.predict(df3)

# Append the prediction to both the unscaled (df2) and the scaled (df3) frame.
df2["GROUP"] = prediction
df3["GROUP"] = prediction
print("Groups Assigned : \n")
df3.head()

In [None]:
df3.boxplot(by='GROUP', layout = (2,4),figsize=(15,10))

In [None]:
centroids = final_model.cluster_centers_

In [None]:
centroids

In [None]:
centroid_df = pd.DataFrame(centroids, columns=list(df3_z_gold))

In [None]:
centroid_df

In [None]:
final_model.labels_

In [None]:
final_model.labels_.shape

In [None]:
pd.DataFrame(final_model.labels_, columns = list(['labels']))

In [None]:
## 3D plot of the K = 3 clusters over the first three scaled features

fig = plt.figure(figsize=(8, 6))
# Create the 3D axes through the figure: constructing Axes3D(fig, ...) directly no
# longer attaches the axes to the figure in current matplotlib (empty plot).
ax = fig.add_subplot(projection='3d')
ax.view_init(elev=20, azim=60)
# Colour by the labels of the K = 3 model fitted above. The previous version
# re-fitted KMeans(4) here, on a frame that already contained the "GROUP" label
# column, so it showed neither the right number of clusters nor a fit on features only.
labels = final_model.labels_
# Builtin float: the np.float alias has been removed from NumPy.
ax.scatter(df3.iloc[:, 0], df3.iloc[:, 1], df3.iloc[:, 2], c=labels.astype(float), edgecolor='k')
# ax.xaxis / yaxis / zaxis replace the removed w_xaxis / w_yaxis / w_zaxis aliases.
ax.xaxis.set_ticklabels([])
ax.yaxis.set_ticklabels([])
ax.zaxis.set_ticklabels([])
ax.set_xlabel('Avg_Credit_Limit')
ax.set_ylabel('Total_Credit_Cards')
ax.set_zlabel('Total_visits_bank')
# The old title named a feature ('Total_visits_online') that is not on any axis.
ax.set_title('K-Means clusters (K = 3)')

In [None]:
df4=df3.iloc[:,0:5]
df4.head()

In [None]:
df_labels = pd.DataFrame(final_model.labels_, columns = list(['labels']))
df_labels['labels'] = df_labels['labels'].astype('category')
df_labeled = df_labels.join(df3.iloc[:,0:5])
df_labeled

In [None]:
plt.scatter(df_labeled['Avg_Credit_Limit'],df_labeled['Total_Credit_Cards'], c=final_model.labels_ )
plt.show()

In [None]:

silhouette_score(df_labeled.drop('labels',axis=1),df_labeled['labels'] )   # We got a score of 50 with three clusters which is
# better than the previous 39.

In [None]:
# As per the elbow curve, 4 seems to be the resonable clusters.
# When you look at box plot with clusters = 4. We do see little ovelap of Total_Credit_Cards, Total_Calls_made.
# If we look at clusters = 3 , not that much overlap.
# But the elbo plot, labels and box plots put together 4 seems to be reasonable.
# But if we lookat the silhoutte score 3 clusters got a better value.
# So finally 3 clusters seems to be GOOD.

## Hierarchical clustering -->

In [None]:
df3 = df3_z_gold.copy()
df3.head()

#### Use Ward as the linkage method and Euclidean as the distance metric

In [None]:
Z = linkage(df3, 'ward', metric='euclidean')
Z.shape

In [None]:
Z[:]

In [None]:
# Dendrogram of the Ward-linkage hierarchy built on df3.
plt.figure(figsize=(18, 16))
plt.title('Agglomerative Hierarchical Clustering Dendogram')
plt.xlabel('sample index')
plt.ylabel('Distance')
# Recomputes the Ward linkage; scipy's default metric is Euclidean, so this is the
# same hierarchy as the earlier cell that passes metric='euclidean' explicitly.
Z = linkage(df3, 'ward')
# truncate_mode='level' with p=5 draws only the top levels of the tree;
# color_threshold=52 is the same distance used as the fcluster cut-off (max_d) below.
dendrogram(Z,leaf_rotation=90.0,p=5,color_threshold=52,leaf_font_size=10,truncate_mode='level')
plt.tight_layout()

In [None]:
# Use truncate_mode='lastp' attribute in dendrogram function to arrive at dendrogram
dendrogram(
    Z,
    truncate_mode='lastp',  # show only the last p merged clusters
    p=2,  # show only the last p merged clusters
)
plt.show()

In [None]:
max_d = 52
clusters = fcluster(Z, max_d, criterion='distance')
clusters

In [None]:
# Calculate Silhoutte Score for Ward linkage
silhouette_score(df3,clusters)

In [None]:
df_clusters = pd.DataFrame(data = clusters, columns= ['GROUP'])
df_clusters 
# df_labels.join(df3.iloc[:,0:5])
df4 = df3.join(df_clusters)

In [None]:
df4.boxplot(by='GROUP', layout = (2,4),figsize=(15,10))

In [None]:
# cophenet index is a measure of the correlation between the distance of points in feature space and distance on dendrogram
# closer it is to 1, the better is the clustering

Z = linkage(df3, metric='euclidean', method='ward')
c, coph_dists = cophenet(Z , pdist(df3))

c

#### Use average as the linkage method and Euclidean as the distance metric

In [None]:
## Did with three clusters. Got score 36 and one cluster zero rows. So changed to 2.

In [None]:
df3 = df3_z_gold.copy()

In [None]:
# Average-linkage agglomerative clustering into 2 clusters.
# The `affinity` keyword was deprecated in scikit-learn 1.2 and removed in 1.4.
# Euclidean distance is the default in every version, so omitting the argument
# keeps the behaviour and works on old and new releases alike.
model = AgglomerativeClustering(n_clusters=2, linkage='average')

In [None]:
model.fit(df3)

In [None]:
L=model.labels_
L

In [None]:
# Calculate Avg Silhoutte Score
silhouette_score(df3,L)

In [None]:
Z = linkage(df3, 'average', metric='euclidean')

In [None]:
# Use truncate_mode='lastp' attribute in dendrogram function to arrive at dendrogram
dendrogram(
    Z,
    truncate_mode='lastp',  # show only the last p merged clusters
    p=2,  # show only the last p merged clusters
)
plt.show()

In [None]:
# Dendrogram of the average-linkage hierarchy built on df3.
plt.figure(figsize=(18, 16))
plt.title('Agglomerative Hierarchical Clustering Dendogram')
plt.xlabel('sample index')
plt.ylabel('Distance')
# Recomputes the average linkage (scipy's default metric is Euclidean).
Z = linkage(df3, 'average')
# truncate_mode='level' with p=5 draws only the top levels of the tree.
# NOTE(review): color_threshold=52 was carried over from the Ward dendrogram; merge
# heights under average linkage are presumably much smaller, in which case the
# whole tree gets a single colour. Confirm against Z[:, 2].max().
dendrogram(Z,leaf_rotation=90.0,p=5,color_threshold=52,leaf_font_size=10,truncate_mode='level')
plt.tight_layout()

In [None]:
df_clusters = pd.DataFrame(data = L , columns= ['GROUP'])
df_clusters 
df4 = df3.join(df_clusters)

In [None]:
df4.boxplot(by='GROUP', layout = (2,4),figsize=(15,10))

In [None]:
# cophenet index is a measure of the correlation between the distance of points in feature space and distance on dendrogram
# closer it is to 1, the better is the clustering

Z = linkage(df3, metric='euclidean', method='average')
c, coph_dists = cophenet(Z , pdist(df3))

c

In [None]:
# This is the highest we got in the entire analysis close to 1.

#### Use complete as the linkage method and Euclidean as the distance metric

In [None]:
df3 = df3_z_gold.copy()

In [None]:
# Complete-linkage agglomerative clustering into 3 clusters.
# `affinity` was deprecated in scikit-learn 1.2 and removed in 1.4; Euclidean is
# the default distance in every version, so the argument is simply omitted.
model = AgglomerativeClustering(n_clusters=3, linkage='complete')

model.fit(df3)

# Cluster label of every row of df3, in row order.
L = model.labels_
L

In [None]:
# Calculate Avg Silhoutte Score
silhouette_score(df3,L)

In [None]:
# cophenet index is a measure of the correlation between the distance of points in feature space and distance on dendrogram
# closer it is to 1, the better is the clustering

Z = linkage(df3, metric='euclidean', method='complete')
c, coph_dists = cophenet(Z , pdist(df3))

c

In [None]:
# Z = linkage(df3, 'complete', metric='euclidean')

In [None]:
# Use truncate_mode='lastp' attribute in dendrogram function to arrive at dendrogram
dendrogram(
    Z,
    truncate_mode='lastp',  # show only the last p merged clusters
    p=3,  # show only the last p merged clusters
)
plt.show()

In [None]:
plt.figure(figsize=(18, 16))
plt.title('Agglomerative Hierarchical Clustering Dendogram')
plt.xlabel('sample index')
plt.ylabel('Distance')
Z = linkage(df3, 'complete')
dendrogram(Z,leaf_rotation=90.0,p=5,color_threshold=52,leaf_font_size=10,truncate_mode='level')
plt.tight_layout()

In [None]:
df_clusters = pd.DataFrame(data = L , columns= ['GROUP'])
df_clusters 
df4 = df3.join(df_clusters)

In [None]:
df4.boxplot(by='GROUP', layout = (2,4),figsize=(15,10))

#### Use single (shortest-distance) as the linkage method and Euclidean as the distance metric

In [None]:
## Tried 3 clusters. it gave me one row in a cluster. the silhotte came aroung 31. Changed to two clusters 
## then the score went to 47.

In [None]:
# Reset to the pristine scaled features before fitting.
df3 = df3_z_gold.copy()

# Single-linkage agglomerative clustering into 2 clusters.
# `affinity` was deprecated in scikit-learn 1.2 and removed in 1.4; Euclidean is
# the default distance in every version, so the argument is simply omitted.
model = AgglomerativeClustering(n_clusters=2, linkage='single')

model.fit(df3)

In [None]:
L=model.labels_
L

In [None]:
# Calculate Avg Silhoutte Score
silhouette_score(df3,L)

In [None]:
# cophenet index is a measure of the correlation between the distance of points in feature space and distance on dendrogram
# closer it is to 1, the better is the clustering

Z = linkage(df3, metric='euclidean', method='single')
c, coph_dists = cophenet(Z , pdist(df3))

c

In [None]:
#Z = linkage(df3, 'single', metric='euclidean')

In [None]:
# Use truncate_mode='lastp' attribute in dendrogram function to arrive at dendrogram
dendrogram(
    Z,
    truncate_mode='lastp',  # show only the last p merged clusters
    p=2,  # show only the last p merged clusters
)
plt.show()

In [None]:
plt.figure(figsize=(18, 16))
plt.title('Agglomerative Hierarchical Clustering Dendogram')
plt.xlabel('sample index')
plt.ylabel('Distance')
Z = linkage(df3, 'single')
dendrogram(Z,leaf_rotation=90.0,p=5,color_threshold=52,leaf_font_size=10,truncate_mode='level')
plt.tight_layout()

In [None]:
df_clusters = pd.DataFrame(data = L , columns= ['GROUP'])
df_clusters 
df4 = df3.join(df_clusters)

In [None]:
df4.boxplot(by='GROUP', layout = (2,4),figsize=(15,10))

In [None]:
df1

### So far the analysis was done after fixing the outliers, and the results were not that great. So I started afresh and did the analysis
### without fixing the outliers, and got better results. That is what is used as the final analysis.
### It is what is used to compare the clusters between KMeans and Hierarchical clustering.
### It is also what is used to compare the clusters among themselves.
### It is what is used to create the recommendations for the bank.
### Please note that I provided comments here, did the analysis here, and also created a separate Word document for the analysis.
### I didn't add the 25%, 50%, etc. results from here to the analysis, as they are self-explanatory.
### So this notebook and the document together form the complete solution.

# Trying Hierarchical clustering with Complete Linkage without modifying outliers.

In [None]:
## Let's try not modifying the outliers and see if we can get better results.
## Deleting is not good as the best customers(avg_vredit_limit pretty high will go away)
## Also we have only 660 records.
## Tried fixing outliers and didn't get any good results.
## So trying without fixing outliers......

# df_del.drop(df_del[df_del['Avg_Credit_Limit'] > 100000].index, inplace = True)
# df_del[df_del['Avg_Credit_Limit'] > 100000].count()

In [None]:
df1.head()

In [None]:
df_del = df.iloc[:,2:7]
df_del

In [None]:
df_z = df_del.apply(zscore)

In [None]:
df_z.head()

In [None]:
df_z_cp = df_z.copy()

In [None]:
# Complete-linkage agglomerative clustering on the data with outliers left as-is.
# With three clusters the silhouette score was 50, so going with 2 clusters.
# `affinity` was deprecated in scikit-learn 1.2 and removed in 1.4; Euclidean is
# the default distance in every version, so the argument is simply omitted.
model = AgglomerativeClustering(n_clusters=2, linkage='complete')

In [None]:
model.fit(df_z_cp)

In [None]:
L=model.labels_
L

In [None]:
# The cophenetic correlation measures how well the dendrogram distances preserve the
# pairwise distances in feature space; the closer to 1, the better the clustering.
# The linkage must be built from the same frame whose pairwise distances it is
# compared against. It was previously built from df3 (the outlier-capped data from
# the earlier section) while pdist used df_z_cp, so the coefficient and the
# dendrogram drawn from Z described the wrong data set.
Z = linkage(df_z_cp, metric='euclidean', method='complete')
c, coph_dists = cophenet(Z, pdist(df_z_cp))

c

In [None]:
# Calculate Avg Silhoutte Score
silhouette_score(df_z_cp,L)

In [None]:
# Use truncate_mode='lastp' attribute in dendrogram function to arrive at dendrogram
dendrogram(
    Z,
    truncate_mode='lastp',  # show only the last p merged clusters
    p=2,  # show only the last p merged clusters
)
plt.show()

In [None]:
df_clusters = pd.DataFrame(data = L , columns= ['GROUP'])
df_clusters 
df4 = df_z_cp.join(df_clusters)

In [None]:
df4.boxplot(by='GROUP', layout = (2,4),figsize=(15,10))

In [None]:
df4.head(10)

In [None]:
df_del_cp = df_del.copy()

In [None]:
df5 = df_del_cp.join(df_clusters)

In [None]:
df5.head(10)

In [None]:
df5.info()

In [None]:
df5.describe()

In [None]:
df5[df5['GROUP'] == 0].head(10)

In [None]:
df5.groupby(by = 'GROUP').min()  # Getting min values to compare clusters and for analysis

In [None]:
df5.groupby(by = 'GROUP').max() # Getting max values to compare clusters and for analysis

In [None]:
group_0 = df5[df5['GROUP'] == 0].groupby(by =  'GROUP')    # Also various details about the features for cluster = 0
group_0.describe().T

In [None]:
group_1 = df5[df5['GROUP'] == 1].groupby(by =  'GROUP')    # Also various details about the features for cluster= 1.
group_1.describe().T

In [None]:
df5[df5['GROUP'] == 0]['Avg_Credit_Limit'].min()

In [None]:
df5[df5['GROUP'] == 0]['Avg_Credit_Limit'].max()

In [None]:
col = df5.columns[0:5]
col

In [None]:
# Print the min and max of every feature column for each of the two hierarchical clusters.
# The bounds are stored in `lo` / `hi`: assigning to `min` / `max` at notebook top
# level replaces the builtins for every later cell in the kernel session.
for g in 0, 1:
    for c in col:
        group_values = df5.loc[df5['GROUP'] == g, c]
        lo, hi = group_values.min(), group_values.max()
        print(f'Group and Column , min and max values are : {g,c,lo,hi}')

#### Let's try KMeans clustering with three clusters on the data without modifying the outliers, as above.

In [None]:
df_z_cp = df_z.copy()    # Got a score of 41 with 2 clusters so sticking with three.

In [None]:
# Let us try with K = 3 on the scaled data whose outliers were left untouched.
df3 = df_z_cp.copy()
# Fixed seed and explicit n_init keep the cluster labels stable across runs and
# scikit-learn versions.
final_model = KMeans(n_clusters=3, n_init=10, random_state=42)
final_model.fit(df3)
prediction = final_model.predict(df3)

# Append the prediction.
# NOTE(review): df2 was built from the outlier-capped frame (df1), while these labels
# come from a model fitted on the uncapped data. Confirm that labelling df2 here is
# intended; df3 is the frame that actually matches this model.
df2["GROUP"] = prediction
df3["GROUP"] = prediction
print("Groups Assigned : \n")
df3.head()

In [None]:
df_labels = pd.DataFrame(final_model.labels_, columns = list(['labels']))
df_labels['labels'] = df_labels['labels'].astype('category')
df_labeled = df_labels.join(df3.iloc[:,0:5])
df_labeled

In [None]:
silhouette_score(df_labeled.drop('labels',axis=1),df_labeled['labels'] )   # We got a score of 50 with three clusters which is
# better than the previous 39.

In [None]:
df3.boxplot(by='GROUP', layout = (2,4),figsize=(15,10))

In [None]:
df_join = df_labels.join(df_del)

In [None]:
# Print the min and max of every feature column for each of the three K-Means clusters.
# The bounds are stored in `lo` / `hi`: assigning to `min` / `max` at notebook top
# level replaces the builtins for every later cell in the kernel session.
for g in 0, 1, 2:
    for c in col:
        cluster_values = df_join.loc[df_join['labels'] == g, c]
        lo, hi = cluster_values.min(), cluster_values.max()
        print(f'Group and Column , min and max values are : {g,c,lo,hi}')

In [None]:
df_join.groupby(by = 'labels').min()  # Getting min value for all clusters

In [None]:
df_join.groupby(by = 'labels').max()  # Getting max values for all clusters.

In [None]:
df_join.info()   # There is a category column so the describe on the dataframe is giving some values as NAN 

In [None]:
# Work on a copy so df_join keeps its categorical 'labels' column.
df_join1 = df_join.copy()  # Converting the category column to integer so that describe on dataframe works fine.
# Names of all category-dtype columns in the copy.
cat_columns = df_join1.select_dtypes(['category']).columns
# Not the last expression in the cell, so this line displays nothing.
cat_columns
# Replace each categorical column with its integer category codes.
df_join1[cat_columns] = df_join1[cat_columns].apply(lambda x: x.cat.codes)

In [None]:
df_join1.info()  # Converted the labels column from type category to integer.

In [None]:
df_join1.describe()

In [None]:
df_join1.head(10)

In [None]:
df_join1.isna().sum()

In [None]:
df_join1[df_join1['labels'] == 0]

In [None]:
df_join1[df_join1['labels'] == 1]

In [None]:
df_join1.describe()

In [None]:
kgroup_0 = df_join1[df_join1['labels'] == 0].groupby(by =  'labels')
kgroup_0.describe().T     # Also various details about the features for cluster = 0

In [None]:
kgroup_1 = df_join1[df_join1['labels'] == 1].groupby(by =  'labels')
kgroup_1.describe().T               # Also various details about the features for cluster = 1

In [None]:
kgroup_2 = df_join1[df_join1['labels'] == 2].groupby(by =  'labels')
kgroup_2.describe().T                  # Also various details about the features for cluster = 2