<a href="https://colab.research.google.com/github/sdbrgo/PERCEUL/blob/umap-hdbscan/PERCEUL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [None]:
!pip install hdbscan

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib

# pipeline
from sklearn.pipeline import Pipeline

# preprocessing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin # used to define NumericSelector(), which is used in preprocessing

# dimensionality reduction
from sklearn.decomposition import PCA
from umap import UMAP

# cluster validation
from sklearn.metrics import silhouette_score

# clustering
from sklearn.cluster import KMeans
import hdbscan

# Set Up Hugging Face & Mount GDrive

In [None]:
# Hugging Face
# Authenticate this session with the Hugging Face Hub. login() is called with
# no arguments, so it presumably prompts interactively for an access token
# (confirm against the huggingface_hub docs).
from huggingface_hub import login
login()

# mount GDrive
# Colab-only: exposes Google Drive under /content/drive so files stored there
# can be accessed by path from this notebook.
from google.colab import drive
drive.mount('/content/drive')

# Import Dataset

In [None]:
# Location of the CSV file to load. Left empty as a placeholder: set it to a
# real path before running this cell.
ds_name = ""
# Raw dataset; the preprocessing cell below reads from `df`.
df = pd.read_csv(ds_name)

# Preprocessing

In [None]:
#=====================================================================
# a custom, dynamic transformer that selects numeric columns only.
# used as the first step of the pipelines
class NumericSelector(BaseEstimator, TransformerMixin):
    """Keep only the numeric columns of a pandas DataFrame.

    The numeric column labels are discovered during ``fit`` and stored in
    ``numeric_cols_``; ``transform`` then returns exactly those columns, so
    data passed later is reduced to the same columns that were seen at fit
    time.
    """

    def fit(self, X, y=None):
        """Record which columns of ``X`` are numeric.

        Parameters
        ----------
        X : pandas.DataFrame
            Input data whose column dtypes are inspected.
        y : ignored
            Present only for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        # "number" matches every numeric dtype, including 8/16-bit and
        # unsigned integers and float16, which `[float, int]` skips.
        # Timedelta is excluded explicitly because pandas would otherwise
        # treat it as a numeric (integer-backed) dtype.
        self.numeric_cols_ = X.select_dtypes(
            include="number", exclude="timedelta"
        ).columns
        return self

    def transform(self, X):
        """Return the columns of ``X`` that were numeric at fit time.

        Raises ``KeyError`` (from pandas) if ``X`` lacks one of those columns.
        """
        return X[self.numeric_cols_]
#=====================================================================
# Keep only the numeric columns first: median imputation and standard scaling
# only accept numeric data, and this matches the 'numeric_selector' step that
# opens both pipelines defined later in this notebook.
df_num = NumericSelector().fit_transform(df)

# Fill missing values with each column's median.
si = SimpleImputer(strategy='median')
df_i = si.fit_transform(df_num)
# fit_transform returns a bare array; restore column labels and row index.
df_i = pd.DataFrame(df_i, columns=df_num.columns, index=df_num.index)

# Standardize every feature.
ss = StandardScaler()
df_i_ss = ss.fit_transform(df_i)
df_p1 = pd.DataFrame(df_i_ss, columns=df_num.columns, index=df_num.index) # will undergo cluster exploration
df_p2 = df_p1.copy() # will undergo final clustering

# Dimensionality Reduction 1

In [None]:
# Reduce the scaled features to 2 components with UMAP so the clusters can be
# explored on a 2-D scatter plot; random_state is fixed at 42.
umap_model = UMAP(n_components=2, random_state=42)
# NOTE: df_p1 is rebound to the fit_transform output from here on, which is
# presumably a NumPy array rather than a DataFrame (confirm against umap docs).
df_p1 = umap_model.fit_transform(df_p1)

# Cluster Exploration


In [None]:
# Cluster the reduced data with HDBSCAN (clusters need at least 5 points).
clusterer1 = hdbscan.HDBSCAN(min_cluster_size=5, gen_min_span_tree=True)
clusterer1.fit(df_p1)

# df_p1 comes from UMAP.fit_transform, i.e. a NumPy array with no `.iloc`.
# np.asarray gives positional indexing that works whether df_p1 is an array
# or a DataFrame.
embedding = np.asarray(df_p1)

# Scatter plot of the two reduced components, coloured by cluster label.
plt.figure(figsize=(10,8))
plt.scatter(embedding[:, 0], embedding[:, 1], c=clusterer1.labels_, cmap='Paired')
plt.title('HDBSCAN Clustering')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()

# Dimensionality Reduction 2

# Final Clustering

# Cluster Interpretation

# Creating the PERCEUL Pipeline

In [None]:
# pipeline for Cluster Exploration
# Steps: keep numeric columns -> median-impute missing values -> standardize
# -> reduce to 2 components with UMAP (seeded with random_state=42).
exploration_pipeline = Pipeline(steps = [
    ('numeric_selector', NumericSelector()),
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('umap_model', UMAP(n_components=2, random_state=42))
])

# pipeline for Final Clustering (Production)
# Same preprocessing as above, but reduces to 2 components with PCA.
core_pipeline = Pipeline(steps = [
    ('numeric_selector', NumericSelector()),
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    # PCA may choose a randomized SVD solver on large inputs; seeding it keeps
    # the production projection reproducible, like the seeded UMAP step above.
    ('pca', PCA(n_components=2, random_state=42))
])