In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from the plotting/ML libraries used below.
warnings.filterwarnings('ignore')

# Apply seaborn's 'darkgrid' style to all figures drawn in this notebook.
sns.set_style('darkgrid')
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Collect the full path of every file below the Kaggle input directory, then print one per line.
input_paths = [
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load Data

In [None]:
# Load the CSV from the Kaggle input directory and preview the first rows.
data_path = '/kaggle/input/ccdata/CC GENERAL.csv'
df = pd.read_csv(data_path)
df.head()

In [None]:
# Summarise column dtypes and non-null counts of the freshly loaded frame.
df.info()

In [None]:
# Remove the CUST_ID column before any numeric analysis.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps the cell
# re-runnable: a second execution is a no-op rather than a KeyError.
df = df.drop(columns='CUST_ID', errors='ignore')

# Missing Values

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Discard the rows that have no CREDIT_LIMIT value.
df = df.dropna(subset=['CREDIT_LIMIT'])

In [None]:
# Re-count missing values per column after dropping rows without CREDIT_LIMIT.
df.isna().sum()

In [None]:
# Re-check row count, dtypes and non-null counts after the row drop.
df.info()

In [None]:
# Impute missing MINIMUM_PAYMENTS with the column median.
# The result is assigned back to the column: calling fillna(inplace=True) on the intermediate
# Series returned by df[...] is chained assignment, which pandas does not guarantee will
# update `df` (under Copy-on-Write it silently leaves `df` unchanged).
minimum_payments_median = df['MINIMUM_PAYMENTS'].median()
df['MINIMUM_PAYMENTS'] = df['MINIMUM_PAYMENTS'].fillna(minimum_payments_median)

In [None]:
# Confirm that no missing values remain after filling MINIMUM_PAYMENTS.
df.isna().sum()

# Exploratory Data Analysis

In [None]:
# Plot the distribution of every column on a two-column grid.
# - sns.histplot replaces the deprecated sns.distplot; kde=True / stat='density' / cut=3
#   reproduce distplot's histogram-plus-KDE look, and bins=50 mirrors distplot's bin cap.
# - The grid height is derived from the number of columns instead of a hard-coded 9 rows,
#   so adding or removing features cannot overflow the subplot grid.
n_grid_cols = 2
n_grid_rows = -(-len(df.columns) // n_grid_cols)  # ceiling division
fig, axes = plt.subplots(n_grid_rows, n_grid_cols, figsize=(20, 40))
axes = np.atleast_1d(axes).ravel()

for ax, col in zip(axes, df.columns):
    sns.histplot(df[col], kde=True, stat='density', bins=50, kde_kws={'cut': 3}, ax=ax)

# Hide any panel left over when the number of columns does not fill the grid.
for ax in axes[len(df.columns):]:
    ax.set_visible(False)

plt.show()

In [None]:
# Report the skewness of every column.
# The original wrapped the value in `{...}` without an f-string, which builds and prints a
# one-element set (e.g. "{2.39}") rather than the formatted number.
for col in df.columns:
    print(f"{col}: {df[col].skew():.3f}")

**It seems that our data is skewed**

To reduce the skewness, we apply a square-root transform to every column.

In [None]:
# Square-root transform every column to reduce skewness, then re-plot the distributions.
# The transform is done once, up front, instead of being hidden inside the plotting loop.
# NOTE: `df` is overwritten here, so running this cell twice applies the square root twice;
# re-run the notebook from the data-loading cell if this cell needs to be executed again.
df = np.sqrt(df)

# Same plotting approach as for the untransformed data: sns.histplot replaces the deprecated
# sns.distplot (kde=True / stat='density' / cut=3 / bins=50 keep the same look), and the grid
# height is derived from the number of columns rather than hard-coded.
n_grid_cols = 2
n_grid_rows = -(-len(df.columns) // n_grid_cols)  # ceiling division
fig, axes = plt.subplots(n_grid_rows, n_grid_cols, figsize=(20, 40))
axes = np.atleast_1d(axes).ravel()

for ax, col in zip(axes, df.columns):
    sns.histplot(df[col], kde=True, stat='density', bins=50, kde_kws={'cut': 3}, ax=ax)

# Hide any panel left over when the number of columns does not fill the grid.
for ax in axes[len(df.columns):]:
    ax.set_visible(False)

plt.show()

**There is still skewness, but it is better than before**

In [None]:
# Inspect the frame after the square-root transform.
df

**Plotting a correlation heatmap to see whether many features are correlated**

In [None]:
# Annotated heatmap of the pairwise correlations between all columns, on a 12x12-inch figure.
fig, heatmap_ax = plt.subplots(figsize=(12, 12))
corr_matrix = df.corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=heatmap_ax)

**Many of the features are correlated with each other**

# Dimensionality Reduction

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardise every feature with StandardScaler before PCA.
# fit_transform returns a bare ndarray; wrapping it back into a DataFrame keeps the column
# names and row index, so `df` stays the same kind of object throughout the notebook and
# later inspections of `df` remain readable.
ss = StandardScaler()
df = pd.DataFrame(ss.fit_transform(df), columns=df.columns, index=df.index)

# Fit a PCA with all components so the explained-variance curve can be examined next.
pca = PCA()
pca.fit(df)


In [None]:
# Cumulative explained variance as a function of the number of principal components.
# The x-axis starts at 1 so a value can be read directly as "number of components kept"
# (plotting against the default 0-based index makes every reading off by one).
cumulative_variance = pca.explained_variance_ratio_.cumsum()
component_counts = np.arange(1, len(cumulative_variance) + 1)

fig, ax = plt.subplots()
ax.plot(component_counts, cumulative_variance, marker='o')
ax.set(
    xlabel='Number of principal components',
    ylabel='Cumulative explained variance ratio',
    title='PCA cumulative explained variance',
)
plt.show()

In [None]:
# Project the standardised data onto the first 6 principal components; X is the clustering input.
# NOTE(review): 6 was presumably chosen from the cumulative explained-variance curve — confirm.
# random_state pins the result when scikit-learn selects a randomized SVD solver.
pca = PCA(n_components=6, random_state=42)
X = pca.fit_transform(df)

# KMeans Clustering

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Sweep the number of clusters k = 2..29, recording for each k the inertia (for the elbow
# plot below) and the silhouette score (plotted in the next cell via `sil_scores`).
# - algorithm='lloyd' is the current name of the classic EM-style algorithm; the old alias
#   'full' was deprecated in scikit-learn 1.1 and later removed.
# - random_state makes the centroid initialisation, and therefore both curves, reproducible.
cluster_counts = range(2, 30)
distortions = []
sil_scores = []
for i in cluster_counts:
    kmeans = KMeans(
        n_clusters=i,
        n_init=10,
        init='k-means++',
        algorithm='lloyd',
        max_iter=300,
        random_state=42,
    )
    kmeans.fit(X)
    distortions.append(kmeans.inertia_)
    sil_scores.append(silhouette_score(X, kmeans.labels_))

# Elbow plot: inertia versus number of clusters.
plt.figure(figsize=(15, 10))
plt.plot(cluster_counts, distortions, alpha=0.5)
plt.plot(cluster_counts, distortions, 'o', alpha=0.5)
plt.xlabel('Number of clusters (k)')
plt.ylabel('Inertia (within-cluster sum of squares)')
plt.title('KMeans elbow plot')
plt.show()

In [None]:
# Silhouette score for each candidate number of clusters (k = 2..29).
fig, sil_ax = plt.subplots(figsize=(15, 10))
sil_ax.plot(list(range(2, 30)), sil_scores)
plt.show()

**5 looks like the right number of clusters for this problem**

In [None]:
# Inspect `df`, which at this point holds the output of StandardScaler.fit_transform.
df

In [None]:
# Final clustering with 5 clusters on the 6-component PCA projection.
# algorithm='lloyd' replaces the removed alias 'full' (same algorithm, scikit-learn >= 1.1);
# random_state keeps the cluster assignments stable between runs.
kmeans = KMeans(
    n_clusters=5,
    n_init=10,
    init='k-means++',
    algorithm='lloyd',
    max_iter=300,
    random_state=42,
)
kmeans.fit(X)
labels = kmeans.labels_

In [None]:
# Inspect the standardised data once more before projecting it to two components.
df

**We use PCA again so that we can reduce the data to 2 components, in order to visualize our clusters better**

In [None]:
# Two-component PCA of the standardised data, used only to draw the clusters in 2-D.
# random_state pins the projection when scikit-learn selects a randomized SVD solver.
pca = PCA(n_components=2, random_state=42)
X2 = pca.fit_transform(df)

In [None]:
# Pair each 2-D PCA coordinate with its cluster label and preview the result.
pca_df = pd.DataFrame(X2, columns=['pca1', 'pca2']).assign(labels=labels)
pca_df.head()

In [None]:
# Scatter the two PCA coordinates, coloured by cluster label, on a 10x10-inch figure.
fig, canvas = plt.subplots(figsize=(10, 10))
ax = sns.scatterplot(data=pca_df, x='pca1', y='pca2', hue='labels', palette='bright', ax=canvas)

**5 clusters look good. Let's try using 3 clusters also**

In [None]:
# Alternative clustering with 3 clusters on the same 6-component PCA projection.
# algorithm='lloyd' replaces the removed alias 'full' (same algorithm, scikit-learn >= 1.1);
# random_state keeps the cluster assignments stable between runs.
kmeans = KMeans(
    n_clusters=3,
    n_init=10,
    init='k-means++',
    algorithm='lloyd',
    max_iter=300,
    random_state=42,
)
kmeans.fit(X)
labels = kmeans.labels_

In [None]:
# Recompute the two-component PCA of the standardised data for the 3-cluster plot.
# random_state pins the projection when scikit-learn selects a randomized SVD solver,
# so the 2-D coordinates do not shift between runs.
pca = PCA(n_components=2, random_state=42)
X2 = pca.fit_transform(df)

In [None]:
# Rebuild the plotting frame: 2-D PCA coordinates plus the 3-cluster labels, then preview it.
pca_df = pd.DataFrame(X2, columns=['pca1', 'pca2']).assign(labels=labels)
pca_df.head()

In [None]:
# Scatter the two PCA coordinates, coloured by the 3-cluster labels, on a 10x10-inch figure.
fig, canvas = plt.subplots(figsize=(10, 10))
ax = sns.scatterplot(data=pca_df, x='pca1', y='pca2', hue='labels', palette='bright', ax=canvas)

**We can group them using 3 or 5 clusters depending upon our use. Both can serve different purposes**

# Upvote and Comment if you liked my notebook :)