## Libraries

In [None]:
import numpy as np
import pandas as pd 

import os
# Kaggle boilerplate: print the full path of every file found under the input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.mixture import GaussianMixture 

from sklearn.decomposition import PCA
from sklearn.preprocessing import RobustScaler

from scipy.spatial.distance import cdist
from matplotlib.patches import Ellipse

sns.set()

## Reading and preprocessing

Read

In [None]:
# Load the raw CSV into a DataFrame and preview the first three rows.
df = pd.read_csv('/kaggle/input/ccdata/CC GENERAL.csv')
df.head(3)

In [None]:
# Use the CUST_ID column as the row index, then preview the result.
df = df.set_index('CUST_ID')
df.head(3)

Missing values (NaNs)

In [None]:
# Missing-value count per column, largest first, restricted to columns that
# actually contain missing values.
nan_sample = (
    df.isnull()
      .sum()
      .sort_values(ascending=False)
      .loc[lambda counts: counts > 0]
)
nan_sample

In [None]:
# Fill the gaps of every affected column with that column's mean.
for column in nan_sample.index:
    df[column] = df[column].fillna(df[column].mean())

In [None]:
# Re-count missing values per column after the imputation step.
df.isnull().sum().sort_values(ascending=False)

Duplicated rows

In [None]:
# Count how many rows are flagged as duplicates (True) versus not (False).
df.duplicated().value_counts()

Outliers in data (log scale)

In [None]:
# Box plot of every column on a logarithmic y-axis, with slanted x tick labels.
fig, ax = plt.subplots(figsize=(14, 10))
df.boxplot(ax=ax)
ax.set_yscale('log')
plt.setp(ax.get_xticklabels(), rotation=50)
plt.show()

In [None]:
# Summary statistics for every column.
df.describe()

There are outliers, so the features are scaled with `RobustScaler`.

In [None]:
# Fit the robust scaler on the feature matrix, then transform that same matrix.
scaler = RobustScaler()
scaler.fit(df.values)
scaled = scaler.transform(df.values)
scaled

## PCA

Number of PCA components needed to explain 95% of the variance

In [None]:
# A fractional n_components asks PCA for a 0.95 share of explained variance
# rather than a fixed number of components.
pca = PCA(n_components=0.95, svd_solver='full')
pca_values = pca.fit_transform(scaled)

# Number of components that were kept and the total variance ratio they explain.
pca.n_components_, np.sum(pca.explained_variance_ratio_)

PCA with 2 components (for visualization)

In [None]:
# Project onto the first two principal components for plotting and clustering.
# This overwrites the 95%-variance projection computed above. A fixed
# random_state keeps the projection identical between runs when scikit-learn
# selects a randomized SVD solver for this input shape.
pca_values = PCA(n_components=2, random_state=42).fit_transform(scaled)

Elbow method: sum of squared errors (SSE) for each number of clusters

## KMeans

In [None]:
# Inertia (within-cluster sum of squared distances) for k = 1..19.
# random_state makes the curve reproducible between runs.
sse = {
    k: KMeans(n_clusters=k, max_iter=10000, random_state=42).fit(pca_values).inertia_
    for k in range(1, 20)
}

# Materialise the dict views once so the bar and line plots receive plain lists.
cluster_counts = list(sse.keys())
sse_values = list(sse.values())

plt.bar(x=cluster_counts, height=sse_values, width=1, edgecolor='k', facecolor='orange')
plt.plot(cluster_counts, sse_values, 'ro-')
plt.xlabel("Number of cluster")
plt.ylabel("SSE")
plt.show()

Visualize KMeans, number of clusters $\overline{2, 10}$

In [None]:
def graph_clusters(method_name, array2d, nmin=2, nmax=9, style=plt.cm.plasma):
    """Plot a grid of scatter plots, one per cluster count from nmin to nmax.

    Parameters
    ----------
    method_name : callable
        Estimator class; it is called as ``method_name(n_clusters=k)`` and the
        fitted object must expose ``labels_``.
    array2d : array
        Data to cluster; columns 0 and 1 are used as the x and y coordinates.
    nmin, nmax : int
        Smallest and largest number of clusters to try (both inclusive).
    style : colormap
        Colormap used to colour points by cluster label.
    """
    count_axis_x = 3
    n_plots = nmax - nmin + 1
    # Ceiling division: exactly as many rows as needed, and at least one so the
    # figure keeps a positive height even when there is nothing to draw.
    count_axis_y = max(1, -(-n_plots // count_axis_x))

    f = plt.figure(figsize=(count_axis_x * 6, count_axis_y * 5))

    # The subplot position is counted from 1 independently of nmin, so ranges
    # that do not start at 2 still fill the grid from its first cell.
    for position, i in enumerate(range(nmin, nmax + 1), start=1):
        model = method_name(n_clusters=i).fit(array2d)
        ax = f.add_subplot(count_axis_y, count_axis_x, position)
        ax.scatter(array2d[:, 0], array2d[:, 1], s=10, cmap=style, c=model.labels_,
                   label="number of\nclusters = " + str(i))
        ax.legend()

    plt.show()


graph_clusters(KMeans, pca_values, 2, 10, plt.cm.viridis)

Visualize, number of clusters = 4

In [None]:
# Final 4-cluster KMeans on the 2-D PCA projection. The fixed seed keeps the
# cluster numbering stable between runs; these labels are reused further down.
kmeans = KMeans(n_clusters=4, max_iter=1000, random_state=42).fit(pca_values)

fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(x="Pca1", y="Pca2", hue="Cluster", 
                     data=pd.DataFrame({'Pca1': pca_values[:, 0],
                                        'Pca2': pca_values[:, 1],
                                        'Cluster': kmeans.labels_}), palette=plt.cm.tab20, s=100, 
                     alpha=1, edgecolor='k', linewidth=1.2, ax=ax)

centers = kmeans.cluster_centers_

# Radius of each cluster: distance from its centre to its farthest member.
r = [cdist(pca_values[kmeans.labels_ == i], [center]).max() for i, center in enumerate(centers)]

ax.scatter(centers[:, 0], centers[:, 1], facecolor='green', marker='H', s=120, edgecolor='k')
for c, rad in zip(centers, r):
    ax.add_artist(plt.Circle(c, rad, facecolor='darkgreen', lw=3, alpha=0.25, zorder=10))

# Equal axis scaling so the circles are not drawn as ellipses.
ax.axis('equal')
plt.show()

Add cluster number and grouping by features

In [None]:
# Copy the features and put each row's KMeans label in front as 'Cluster'.
data = df.copy()
data.insert(0, 'Cluster', kmeans.labels_)

# One row of histograms per feature, with one panel per cluster.
for feature in df.columns:
    facets = sns.FacetGrid(data, col='Cluster', height=3, aspect=1.3)
    facets.map(plt.hist, feature, bins=20, edgecolor='k')
    facets.set_xticklabels(rotation=40)

# Cluster sizes.
data['Cluster'].value_counts().to_frame()

The clusters above are unbalanced in size and their feature distributions are hard to tell apart. Sadly.

## AgglomerativeClustering

Visualize AgglomerativeClustering, number of clusters $\overline{2, 10}$

In [None]:
# Draw the grid for cluster counts 2 through 10 (the last argument, nmax, is inclusive).
graph_clusters(AgglomerativeClustering, pca_values, 2, 10)

Visualize, number of clusters = 4

In [None]:
# Ward-linkage agglomerative clustering into 4 groups on the 2-D PCA projection.
# The distance metric is left at its default: Ward linkage only supports
# Euclidean distance, and the old `affinity` keyword is no longer accepted by
# recent scikit-learn releases.
ag = AgglomerativeClustering(n_clusters=4, 
                             linkage='ward').fit(pca_values)

plt.subplots(figsize=(10, 8))
sns.scatterplot(x="Pca1", y="Pca2", hue="Cluster", 
                     data=pd.DataFrame({'Pca1': pca_values[:, 0],
                                        'Pca2': pca_values[:, 1],
                                        'Cluster': ag.labels_}), palette=plt.cm.tab20, s=100, 
                     alpha=1, edgecolor='k', linewidth=1.2)

plt.show()

Add cluster number and grouping by features

In [None]:
# Agglomerative labels as a named Series aligned with df, placed in front of the features.
labels_ag = pd.Series(ag.labels_, index=df.index, name='Cluster_ag')
data = pd.concat([labels_ag, df], axis=1)

# One row of histograms per feature, with one panel per cluster.
for feature in data.columns.drop('Cluster_ag'):
    facets = sns.FacetGrid(data, col='Cluster_ag', height=3, aspect=1.3)
    facets.map(plt.hist, feature, bins=20, edgecolor='k')
    facets.set_xticklabels(rotation=40)

# Cluster sizes.
data['Cluster_ag'].value_counts().to_frame()

The result is approximately the same as with KMeans.

## GaussianMixture

Visualize GaussianMixture, number of clusters $\overline{2, 9}$

In [None]:
# 3x3 grid of scatter plots: one Gaussian mixture per component count from 2 to 9.
f = plt.figure(figsize=(18, 15))

for n_components in range(2, 10):
    mixture = GaussianMixture(n_components=n_components).fit(pca_values)
    assigned = mixture.predict(pca_values)

    panel = f.add_subplot(3, 3, n_components - 1)
    panel.scatter(pca_values[:, 0], pca_values[:, 1], s=10,
                  cmap=plt.cm.magma_r, c=assigned,
                  label="number of\nclusters = " + str(n_components))
    panel.legend()

plt.show()

Draw ellipse from: https://jakevdp.github.io/PythonDataScienceHandbook/05.12-gaussian-mixtures.html

In [None]:
def draw_ellipse(position, covariance, ax=None, **kwargs):
    """Draw 1-, 2- and 3-sigma ellipses for a given position and covariance.

    Parameters
    ----------
    position : pair of floats
        Centre of the ellipses.
    covariance : array
        Either a full (2, 2) covariance matrix or a pair of per-axis variances.
    ax : matplotlib Axes, optional
        Axes to draw on; the current axes are used when omitted.
    **kwargs
        Forwarded to every ``Ellipse`` patch (for example ``alpha``).
    """
    ax = ax or plt.gca()

    # Convert covariance to principal axes
    if covariance.shape == (2, 2):
        U, s, Vt = np.linalg.svd(covariance)
        angle = np.degrees(np.arctan2(U[1, 0], U[0, 0]))
        width, height = 2 * np.sqrt(s)
    else:
        angle = 0
        width, height = 2 * np.sqrt(covariance)

    # One random viridis colour per ring; both the palette lookup and the random
    # draw are loop-invariant, so they are done once up front.
    palette = plt.cm.viridis.colors
    color_indices = np.random.randint(255, size=3)

    # Draw the Ellipse
    for nsig, color_index in zip(range(1, 4), color_indices):
        # `angle` is passed by keyword: recent matplotlib releases no longer
        # accept it as the fourth positional argument.
        ax.add_patch(Ellipse(position, nsig * width, nsig * height,
                             angle=angle, facecolor=palette[color_index],
                             edgecolor='k', **kwargs))

In [None]:
# 4-component Gaussian mixture on the 2-D PCA projection; the fixed seed keeps
# the component numbering stable between runs.
gmm = GaussianMixture(n_components=4, init_params='kmeans', covariance_type='full',
                      random_state=42).fit(pca_values)

f, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(12, 18))

# Top panel: points coloured by their predicted mixture component.
sns.scatterplot(x="Pca1", y="Pca2", hue="Cluster", 
                     data=pd.DataFrame({'Pca1': pca_values[:, 0],
                                        'Pca2': pca_values[:, 1],
                                        'Cluster': gmm.predict(pca_values)}), palette=plt.cm.Spectral, s=100, 
                     alpha=1, edgecolor='k', linewidth=1.2, ax=ax1)


# Bottom panel: reuse the top panel's y ticks and label, then draw one set of
# ellipses per mixture component.
ax2.set_yticks(ax1.get_yticks())
ax2.set_ylabel(ax1.get_ylabel())

for pos, covar in zip(gmm.means_, gmm.covariances_):
    draw_ellipse(pos, covar, alpha=0.2, ax=ax2)

# plt.show() is used here like in the other plotting cells.
plt.show()

Add cluster number and grouping by features

In [None]:
# Label every row with its Gaussian-mixture component. The labels must come from
# the fitted `gmm` model (not from the agglomerative model), otherwise this
# section would simply repeat the agglomerative analysis.
gmm_labels = gmm.predict(pca_values)
data = pd.concat([df, pd.DataFrame(gmm_labels, columns=['Cluster_gmm'], index=df.index)], axis=1)
data = data[['Cluster_gmm'] + [col for col in data.columns if col != 'Cluster_gmm']]

# One row of histograms per feature, with one panel per mixture component.
for c in data.columns[1:]:
    grid = sns.FacetGrid(data, col='Cluster_gmm', height=3, aspect=1.3)
    grid.map(plt.hist, c, bins=20, edgecolor='k')
    grid.set_xticklabels(rotation=40)

# Component sizes.
pd.DataFrame(data['Cluster_gmm'].value_counts())

As a result, we can say that the three methods give only slightly different clusterings. Replacing PCA is more likely to improve the results.