In [1]:
from pathlib import Path
import os
import pandas as pd

import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score,v_measure_score
from yellowbrick.cluster import SilhouetteVisualizer
import matplotlib.pyplot as plt

# Every cache location hangs off $APP_DIR/.cache. Fail with an actionable
# message when APP_DIR is missing instead of the opaque TypeError that
# Path(None) would raise.
_app_dir = os.getenv('APP_DIR')
if _app_dir is None:
    raise RuntimeError(
        "The APP_DIR environment variable is not set; "
        "it must point at the directory that contains '.cache'."
    )

CACHE_DIR = Path(_app_dir, '.cache')
# Sub-directories of the cache root.
NANOHUB_CACHE = CACHE_DIR / 'nanohub'
NANOHUB_METRICS_CACHE = CACHE_DIR / 'nanohub_metrics'


def create_silhouette_visual(sample, kmeans: KMeans, clusters: list = [2,3,4,5]):
    """Draw one silhouette plot per candidate cluster count.

    The plots are laid out in a two-column grid, in the order given by
    ``clusters``.

    Args:
        sample: Data passed to ``SilhouetteVisualizer.fit`` for every plot.
        kmeans: Template estimator. It is not fitted or modified; for each
            entry in ``clusters`` an unfitted copy is made and its
            ``n_clusters`` is set to that entry, so each subplot really shows
            a different number of clusters.
        clusters: Cluster counts to visualise. The default list is only read,
            never mutated.

    Returns:
        None. The figure is created through ``plt.subplots``.
    """
    # Function-scope import: scikit-learn is already a dependency of this
    # notebook, and clone() is only needed here.
    from sklearn.base import clone

    if not clusters:
        return

    # Size the grid from the number of requested plots rather than assuming
    # exactly four of them; squeeze=False keeps `ax` two-dimensional even for
    # a single row.
    n_rows = (len(clusters) + 1) // 2
    fig, ax = plt.subplots(n_rows, 2, figsize=(15, 4 * n_rows), squeeze=False)

    for position, n_clusters in enumerate(clusters):
        row, col = divmod(position, 2)
        model = clone(kmeans).set_params(n_clusters=n_clusters)

        visualizer = SilhouetteVisualizer(model, colors='yellowbrick', ax=ax[row][col])
        visualizer.fit(sample)

    # With an odd number of plots the last grid slot stays empty; hide it.
    for spare_ax in ax.flat[len(clusters):]:
        spare_ax.set_visible(False)
        
        
def display_cluster_info(X_scaled, clusters: list = [2,3,4,5,6,7,8,9,10], random_state=None):
    """Fit KMeans for each candidate cluster count and report its quality.

    For every entry in ``clusters`` this prints the ``KMeans.score`` value,
    the silhouette score, the Davies-Bouldin score and the cluster centroids,
    and draws a silhouette plot into a two-column grid (one slot per entry,
    in the order given).

    Args:
        X_scaled: Feature matrix to cluster.
        clusters: Cluster counts to evaluate. The default list is only read,
            never mutated.
        random_state: Forwarded to ``KMeans`` so that results can be made
            reproducible. ``None`` (the default) leaves KMeans unseeded.

    Returns:
        None. All results are printed or plotted.
    """
    if not clusters:
        return

    # Size the grid from the number of candidates; squeeze=False keeps `ax`
    # two-dimensional even for a single row. Height scales with the row count
    # so that each silhouette plot stays readable.
    n_rows = (len(clusters) + 1) // 2
    fig, ax = plt.subplots(n_rows, 2, figsize=(15, 4 * n_rows), squeeze=False)

    for position, n_clusters in enumerate(clusters):
        kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
        preds = kmeans.fit_predict(X_scaled)

        # Compute the score once and reuse it.
        score = kmeans.score(X_scaled)
        print("Score for number of cluster(s) {}: {}".format(n_clusters, score))

        silhouette = silhouette_score(X_scaled, preds)
        print("Silhouette score for number of cluster(s) {}: {}".format(n_clusters, silhouette))

        db = davies_bouldin_score(X_scaled, preds)
        print("Davies Bouldin score for number of cluster(s) {}: {}".format(n_clusters, db))

        display("Centroids - ")
        print(kmeans.cluster_centers_)

        # Place plots by position in `clusters`, not by the cluster count
        # itself, so that arbitrary candidate lists fit the grid.
        row, col = divmod(position, 2)
        visualizer = SilhouetteVisualizer(kmeans, colors='yellowbrick', ax=ax[row][col])
        visualizer.fit(X_scaled)

        print("-"*100)

    # With an odd number of candidates the last grid slot stays empty; hide it.
    for spare_ax in ax.flat[len(clusters):]:
        spare_ax.set_visible(False)



In [2]:
# Read unclassified_user_data.csv from the cache directory.
# index_col=False stops pandas from using the first column as the index.
unclassified_csv = CACHE_DIR / 'unclassified_user_data.csv'
df = pd.read_csv(unclassified_csv, index_col=False)

In [3]:
# One row per distinct value of df['user'].
users = pd.DataFrame({'user': df['user'].unique()})

In [4]:
# 'Monetary' mirrors job_count under the column name the clustering cells read.
monetary_df = df[['user', 'job_count']].assign(
    Monetary=lambda frame: frame['job_count']
)

# Merge onto the bare 'user' column so this cell can be re-run safely:
# merging onto the full `users` frame a second time would produce
# job_count_x / job_count_y / Monetary_x / Monetary_y instead of the
# columns later cells expect.
users = pd.merge(users[['user']], monetary_df, on='user')

In [5]:

# Seed KMeans so the cluster labels (and therefore the per-cluster summary and
# the CSV written from `users`) are the same on every Restart & Run All.
kmeans = KMeans(n_clusters=8, random_state=42)

# fit_predict is equivalent to fit(X) followed by predict(X) on the same data.
users['MonetaryCluster'] = kmeans.fit_predict(users[['Monetary']])

In [6]:
# Summary statistics of job_count within each MonetaryCluster.
cluster_job_counts = users.groupby('MonetaryCluster')['job_count']
display(cluster_job_counts.describe())

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
MonetaryCluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,36019.0,330.8406,1141.794981,2.0,10.0,25.0,95.0,11869.0
1,1.0,8942496.0,,8942496.0,8942496.0,8942496.0,8942496.0,8942496.0
2,3.0,1292057.0,78821.036052,1201055.0,1268639.5,1336224.0,1337557.5,1338891.0
3,11.0,261431.5,49594.40484,203289.0,229217.0,242015.0,283512.0,361523.0
4,33.0,112823.0,27871.14014,70478.0,84326.0,118089.0,134210.0,170888.0
5,7.0,503236.9,103539.557393,408693.0,447209.5,456760.0,535614.5,691557.0
6,463.0,23458.38,11482.748141,11970.0,14945.0,19115.0,28594.5,65359.0
7,1.0,1585184.0,,1585184.0,1585184.0,1585184.0,1585184.0,1585184.0


In [7]:
# Persist the clustered users table to monetary.csv in the cache directory,
# without writing the DataFrame index.
monetary_csv = CACHE_DIR / "monetary.csv"
users.to_csv(monetary_csv, index=False, index_label=False)

In [8]:
# Rescale the Monetary column with MinMaxScaler before clustering.
monetary_features = users[['Monetary']]
scaler = MinMaxScaler().fit(monetary_features)
X_scaled = scaler.transform(monetary_features)

In [None]:
# Report clustering metrics and silhouette plots for the default range of
# cluster counts on the scaled Monetary feature.
display_cluster_info(X_scaled=X_scaled)

Score for number of cluster(s) 2: -0.13653138165288012
Silhouette score for number of cluster(s) 2: 0.9997457019946906
Davies Bouldin score for number of cluster(s) 2: 0.0001977298740985461


'Centroids - '

[[1.17183604e-04]
 [1.00000000e+00]]
----------------------------------------------------------------------------------------------------
Score for number of cluster(s) 3: -0.04153968472884043
Silhouette score for number of cluster(s) 3: 0.9983732993507506
Davies Bouldin score for number of cluster(s) 3: 0.20339215106435704


'Centroids - '

[[9.65192790e-05]
 [1.00000000e+00]
 [1.25931927e-01]]
----------------------------------------------------------------------------------------------------
Score for number of cluster(s) 4: -0.015108412961479235
Silhouette score for number of cluster(s) 4: 0.9957013134801992
Davies Bouldin score for number of cluster(s) 4: 0.2240664159429704


'Centroids - '

[[8.09343359e-05]
 [1.00000000e+00]
 [1.52679610e-01]
 [3.97500903e-02]]
----------------------------------------------------------------------------------------------------
Score for number of cluster(s) 5: -0.00856090976041935
Silhouette score for number of cluster(s) 5: 0.9909895236798973
Davies Bouldin score for number of cluster(s) 5: 0.268529485184249


'Centroids - '

[[7.06923783e-05]
 [1.00000000e+00]
 [1.52679610e-01]
 [1.67724750e-02]
 [5.23205023e-02]]
----------------------------------------------------------------------------------------------------
Score for number of cluster(s) 6: -0.005215387264278033
Silhouette score for number of cluster(s) 6: 0.9668109342187621
Davies Bouldin score for number of cluster(s) 6: 0.3217432283548465


'Centroids - '

[[3.93500146e-05]
 [1.00000000e+00]
 [1.52679610e-01]
 [5.23205023e-02]
 [1.84304401e-02]
 [3.00744253e-03]]
----------------------------------------------------------------------------------------------------
Score for number of cluster(s) 7: -0.0037329547056020283
Silhouette score for number of cluster(s) 7: 0.964453118697194
Davies Bouldin score for number of cluster(s) 7: 0.35685132541326203


'Centroids - '

[[3.68089388e-05]
 [1.00000000e+00]
 [1.52679610e-01]
 [2.92345229e-02]
 [5.62745535e-02]
 [2.62580514e-03]
 [1.26162825e-02]]
