In [1]:
from pathlib import Path
import os
import pandas as pd

import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score,v_measure_score
from yellowbrick.cluster import SilhouetteVisualizer
import matplotlib.pyplot as plt

# All cache locations live under '<APP_DIR>/.cache', where APP_DIR is read from
# the environment.
_app_dir = os.getenv('APP_DIR')
if _app_dir is None:
    # Path(None) would otherwise fail with an opaque TypeError that does not
    # mention the missing variable.
    raise RuntimeError(
        "The APP_DIR environment variable is not set; "
        "it must name the directory that contains the '.cache' folder."
    )

CACHE_DIR = Path(_app_dir) / '.cache'
NANOHUB_CACHE = CACHE_DIR / 'nanohub'
NANOHUB_METRICS_CACHE = CACHE_DIR / 'nanohub_metrics'


def create_silhouette_visual(sample, kmeans: KMeans, clusters: list = None):
    """Draw one silhouette plot per candidate cluster count.

    For every ``k`` in ``clusters`` an unfitted copy of ``kmeans`` is created
    with ``n_clusters=k``, wrapped in a yellowbrick ``SilhouetteVisualizer``
    and fitted on ``sample``. Panels are laid out two per row, in the order
    given by ``clusters``.

    Args:
        sample: Data the visualizers are fitted on.
        kmeans: Template estimator. Its hyper-parameters (other than
            ``n_clusters``) are reused for every panel; the instance passed in
            is not modified.
        clusters: Cluster counts to plot. Defaults to ``[2, 3, 4, 5]``.

    Returns:
        None. The figure is created through ``plt.subplots``.
    """
    # Local import so the helper does not depend on an extra top-level import.
    from sklearn.base import clone

    if clusters is None:
        clusters = [2, 3, 4, 5]
    clusters = list(clusters)
    if not clusters:
        return

    n_cols = 2
    n_rows = (len(clusters) + n_cols - 1) // n_cols
    # squeeze=False keeps `axes` two-dimensional even when there is one row.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 4 * n_rows), squeeze=False)

    for position, k in enumerate(clusters):
        # Index by position in the list rather than by k, so any list of
        # cluster counts fits the grid.
        row, col = divmod(position, n_cols)
        panel = axes[row][col]

        # A fresh clone per k: reusing the same estimator would draw the same
        # clustering in every panel.
        model = clone(kmeans).set_params(n_clusters=k)
        visualizer = SilhouetteVisualizer(model, colors='yellowbrick', ax=panel)
        visualizer.fit(sample)
        panel.set_title("Silhouette plot, {} clusters".format(k))

    # Hide the spare panel left over when the number of plots is odd.
    for spare in axes.ravel()[len(clusters):]:
        spare.set_visible(False)

    fig.tight_layout()
        
        
def display_cluster_info(X_scaled, clusters: list = None, random_state=None):
    """Print clustering quality metrics and draw silhouette plots for several k.

    For each cluster count ``k`` in ``clusters`` a ``KMeans`` model is fitted on
    ``X_scaled`` and the following are emitted, in this order:

    * the value of ``kmeans.score(X_scaled)``,
    * the silhouette score of the predicted labels,
    * the Davies-Bouldin score of the predicted labels,
    * the cluster centroids,
    * a yellowbrick silhouette plot in the next free panel of a two-column grid.

    Args:
        X_scaled: Data to cluster.
        clusters: Cluster counts to evaluate. Defaults to ``[2, 3, ..., 10]``.
            Every value must be at least 2, because the silhouette and
            Davies-Bouldin scores are undefined for a single cluster.
        random_state: Forwarded to ``KMeans``. The default ``None`` keeps the
            unseeded behaviour; pass an int for reproducible output.

    Raises:
        ValueError: If any entry of ``clusters`` is smaller than 2.

    Returns:
        None. Results are printed / displayed / plotted.
    """
    if clusters is None:
        clusters = [2, 3, 4, 5, 6, 7, 8, 9, 10]
    clusters = list(clusters)
    if not clusters:
        return

    # Fail before any fitting or plotting rather than part-way through the loop.
    too_small = [k for k in clusters if k < 2]
    if too_small:
        raise ValueError(
            "Every cluster count must be >= 2, got {}".format(too_small)
        )

    n_cols = 2
    n_rows = (len(clusters) + n_cols - 1) // n_cols
    # squeeze=False keeps `axes` two-dimensional even when there is one row.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 4 * n_rows), squeeze=False)

    for position, k in enumerate(clusters):
        kmeans = KMeans(n_clusters=k, random_state=random_state)
        preds = kmeans.fit_predict(X_scaled)

        # Computed once; the printed value is the raw (un-negated) score.
        score = kmeans.score(X_scaled)
        print("Score for number of cluster(s) {}: {}".format(k, score))

        silhouette = silhouette_score(X_scaled, preds)
        print("Silhouette score for number of cluster(s) {}: {}".format(k, silhouette))

        db = davies_bouldin_score(X_scaled, preds)
        print("Davies Bouldin score for number of cluster(s) {}: {}".format(k, db))

        display("Centroids - ")
        print(kmeans.cluster_centers_)

        # Index by position in the list rather than by k, so any list of
        # cluster counts fits the grid.
        row, col = divmod(position, n_cols)
        panel = axes[row][col]
        visualizer = SilhouetteVisualizer(kmeans, colors='yellowbrick', ax=panel)
        visualizer.fit(X_scaled)
        panel.set_title("Silhouette plot, {} clusters".format(k))

        print("-"*100)

    # Hide the spare panel left over when the number of plots is odd.
    for spare in axes.ravel()[len(clusters):]:
        spare.set_visible(False)

    fig.tight_layout()




In [2]:
# CACHE_DIR and the other cache path constants are defined once in the first
# cell; they are not re-declared here so the two copies cannot drift apart.

# NOTE(review): presumably accounts that should be excluded from the analysis,
# but this list is not applied anywhere in the visible cells — confirm whether
# `df` should be filtered with it.
USERS_TO_REMOVE = ['mmc', 'clarksm', 'gridstat', 'instanton']

# index_col=False: never treat the first CSV column as the index.
df = pd.read_csv(
    CACHE_DIR / 'unclassified_user_data.csv',
    index_col=False
)
display(df)

FileNotFoundError: [Errno 2] No such file or directory: '/home/gekco/nanoHUB/.cache/unclassified_user_data.csv'

In [None]:
# Single-column frame holding each distinct value of df['user'] once.
users = pd.DataFrame(df['user'].unique(), columns=['user'])

In [None]:
# Recency = whole days between today (at midnight) and last_finish_date.
recency_df = df[['user', 'last_finish_date']].copy()
recency_df['last_finish_date'] = pd.to_datetime(recency_df['last_finish_date'])
recency_df['Recency'] = (pd.Timestamp.now().normalize() - recency_df['last_finish_date']).dt.days

# Merge onto the bare 'user' column so that re-running this cell does not
# collide with Recency / last_finish_date columns left by a previous run
# (which would produce Recency_x / Recency_y and break the lookup below).
# NOTE(review): assumes df holds one row per user; with several rows per user
# the merge yields one row per df row — confirm.
users = pd.merge(
    users[['user']],
    recency_df[['user', 'Recency', 'last_finish_date']],
    on='user'
)

# Fixed seed so the cluster assignment is the same on every run.
# NOTE(review): a missing last_finish_date gives a NaN Recency, which KMeans
# will reject — confirm the column is always populated.
kmeans = KMeans(n_clusters=4, random_state=0)
users['RecencyCluster'] = kmeans.fit_predict(users[['Recency']])

In [None]:
# Summary statistics of Recency within each RecencyCluster.
recency_by_cluster = users.groupby('RecencyCluster')['Recency']
display(recency_by_cluster.describe())

In [None]:
# Write the users frame to '<CACHE_DIR>/recency.csv' without the row index.
recency_csv_path = CACHE_DIR / "recency.csv"
users.to_csv(recency_csv_path, index=False, index_label=False)

In [None]:
# Fit a min-max scaler on the Recency column, then transform that same column.
recency_column = users[['Recency']]
scaler = MinMaxScaler()
scaler.fit(recency_column)
X_scaled = scaler.transform(recency_column)
display(X_scaled)

In [None]:
# Report cluster metrics and silhouette plots using the function's default cluster counts.
display_cluster_info(X_scaled=X_scaled)