# Clustering

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import AgglomerativeClustering
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the prepared feature table; the first CSV column becomes the index.
csv_path = './output/df_mhc.csv'
df_tsne = pd.read_csv(csv_path, index_col=0, sep=',')

In [3]:
# Split the table: "Periods" is kept aside as y, every other column
# (in its original order) is a feature in X.
cols = [name for name in df_tsne.columns if name != "Periods"]
X = df_tsne.loc[:, cols]
y = df_tsne["Periods"]

In [4]:
# Standardize the feature columns before distance-based clustering.
# NOTE: with scikit-learn's default output settings fit_transform returns a
# NumPy array, so X stops being a DataFrame after this cell.
scaler = StandardScaler()
X = scaler.fit_transform(X)

### KMeans

In [5]:
# Partition the scaled samples into 4 clusters and report how well separated
# they are (silhouette score; higher is better).
# NOTE: no random_state is passed, so the score can differ between runs.
kmeans = KMeans(n_clusters=4).fit(X)  # fit() returns the estimator itself
kmeans_assignments = kmeans.predict(X)
kmeans_silhouette = silhouette_score(X, kmeans_assignments)
print('Silhouette Score for 4 clusters:', kmeans_silhouette)

Silhouette Score for 4 clusters: 0.49663063468857294


In [6]:
# Wrap the scaled values in a DataFrame so the columns can be addressed by name.
feature_names = [
    'DayWeek',
    'M001', 'M002', 'M003', 'M004', 'M005', 'M006', 'M007', 'M009',
    'M010', 'M011', 'M012', 'M013', 'M014', 'M015', 'M016', 'M018', 'M020',
    'M021', 'M022', 'M023', 'M024', 'M025', 'M026', 'M027', 'M028', 'M029',
    'T002', 'P001', 'D001', 'D002', 'D003',
]
X = pd.DataFrame(X, columns=feature_names)

In [7]:
# Store each sample's KMeans cluster id next to its features; the plotting
# cells use this column for coloring.
kmeans_cluster_ids = kmeans.labels_
X['labels'] = kmeans_cluster_ids

In [None]:
%%time
# Visualize the KMeans clusters with t-SNE at several perplexities.
# The 'labels' column holds the cluster assignment, so it is used only for
# coloring and is excluded from the embedding input: feeding it to t-SNE would
# let the labels themselves pull same-cluster points together and make the
# clusters look better separated than the features justify.
features = X.drop(columns='labels')
perplexities = [5, 30, 50, 100]
fig, ax = plt.subplots(1, len(perplexities), figsize=(15, 7))
for axis, perp in zip(ax, perplexities):
    tsne = TSNE(perplexity=perp)
    x_embedded = tsne.fit_transform(features)
    axis.scatter(x_embedded[:, 0], x_embedded[:, 1], c=X['labels'], cmap='Paired')
    axis.set_title("Perplexity = {}".format(perp))
plt.savefig('tsne.png')
plt.show()

In [None]:
# Pairwise scatter-plot grid of X, with points colored by the 'labels' column.
sns.set(style="ticks")
sns.pairplot(data=X, hue='labels')

### Agglomerative Clustering 

In [None]:
# Hierarchical (agglomerative) clustering on the scaled features only.
# X may still carry the 'labels' column added for the KMeans plots; drop it so
# the earlier cluster assignments influence neither this fit nor its
# silhouette score (errors='ignore' keeps the cell runnable if it is absent).
agglo_features = X.drop(columns='labels', errors='ignore')
aggloclust = AgglomerativeClustering()  # library-default number of clusters
aggloclust.fit(agglo_features)
# AgglomerativeClustering has no predict(); the assignments for the fitted
# samples are exposed through the labels_ attribute.
print('Silhouette Score for Agglomerative Clustering:',
      silhouette_score(agglo_features, aggloclust.labels_))

In [None]:
# Rebuild X restricted to the named feature columns. When X is already a
# DataFrame, passing `columns` selects just these columns, so a previously
# added 'labels' column is not carried over.
feature_names = [
    'DayWeek',
    'M001', 'M002', 'M003', 'M004', 'M005', 'M006', 'M007', 'M009',
    'M010', 'M011', 'M012', 'M013', 'M014', 'M015', 'M016', 'M018', 'M020',
    'M021', 'M022', 'M023', 'M024', 'M025', 'M026', 'M027', 'M028', 'M029',
    'T002', 'P001', 'D001', 'D002', 'D003',
]
X = pd.DataFrame(X, columns=feature_names)

In [None]:
# Store each sample's agglomerative cluster id next to its features; the
# plotting cells use this column for coloring.
agglo_cluster_ids = aggloclust.labels_
X['labels'] = agglo_cluster_ids

In [None]:
%%time
# Visualize the agglomerative clusters with t-SNE at several perplexities.
# The 'labels' column holds the cluster assignment, so it is used only for
# coloring and is excluded from the embedding input: feeding it to t-SNE would
# let the labels themselves pull same-cluster points together.
features = X.drop(columns='labels')
perplexities = [5, 30, 50, 100]
fig, ax = plt.subplots(1, len(perplexities), figsize=(15, 7))
for axis, perp in zip(ax, perplexities):
    tsne = TSNE(perplexity=perp)
    x_embedded = tsne.fit_transform(features)
    axis.scatter(x_embedded[:, 0], x_embedded[:, 1], c=X['labels'], cmap='Paired')
    axis.set_title("Perplexity = {}".format(perp))
# Saved under its own name so it does not overwrite the KMeans figure, which
# is written to 'tsne.png'.
plt.savefig('tsne_agglomerative.png')
plt.show()

In [None]:
# Pairwise scatter-plot grid of X, with points colored by the 'labels' column.
sns.set(style="ticks")
sns.pairplot(data=X, hue='labels')