In [1]:
# 01_unsupervised.ipynb

%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import umap.umap_ as umap
from sklearn.cluster import KMeans, DBSCAN

# Import utilities
from setup_utils import (setup_db_connection, plr, save_intermediate_data,
                            plot_histogram, plot_scatter, plot_line, plot_correlation_matrix)

# Open the database connection, then pull the whole derived vital-sign
# table into a DataFrame.
engine = setup_db_connection()
vdf = pd.read_sql_query(
    sql="""select * from mimiciv.mimiciv_derived.vitalsign""",
    con=engine,
)

# Preprocessing: column selections.
# cols_vitals holds the measurement columns; cols_to_use_time prepends the
# timestamp and the subject key to them.
cols_vitals = ['heart_rate', 'resp_rate', 'mbp', 'temperature', 'spo2']
cols_to_use_time = ['charttime', 'subject_id'] + cols_vitals

# Work on a copy so the raw query result (vdf) is left untouched.
df = vdf.loc[:, cols_to_use_time].copy()

# Aggregate daily by subject: parse the timestamps (unparseable values are
# coerced to NaT), truncate them to calendar dates, then average every vital
# per (subject_id, date) pair.
df['charttime'] = pd.to_datetime(df['charttime'], errors='coerce').dt.date
df = df.groupby(['subject_id', 'charttime'], as_index=False)[cols_vitals].mean()

# Sample 5% of subjects for demonstration.
# The legacy global NumPy RNG is seeded before the draw so the same subjects
# are picked on every run.
np.random.seed(42)
unique_subjects = df['subject_id'].unique()
sample_size = int(0.05 * len(unique_subjects))
sampled_subjects = np.random.choice(unique_subjects, replace=False, size=sample_size)

# Keep only rows of the sampled subjects, then discard rows with any missing value.
df = df.loc[df['subject_id'].isin(sampled_subjects)].dropna()

# IQR Outlier Removal
# The fences are computed on the vital-sign columns only. The frame also
# carries 'subject_id' (an identifier) and 'charttime' (date objects); neither
# is a measurement, so they must not take part in the quantile computation or
# in the fence comparison.
Q1 = df[cols_vitals].quantile(0.25)
Q3 = df[cols_vitals].quantile(0.75)
IQR = Q3 - Q1

# A row is dropped when ANY vital lies outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
iqr_outlier_mask = (
    (df[cols_vitals] < (Q1 - 1.5 * IQR)) | (df[cols_vitals] > (Q3 + 1.5 * IQR))
).any(axis=1)

# .copy() yields an independent frame, so the column assignments that follow
# do not write into a slice of the unfiltered data.
df = df.loc[~iqr_outlier_mask].copy()

# Scale: standardise the vital-sign columns with a scaler fitted on this data.
scaler = StandardScaler()
scaled_data = scaler.fit(df[cols_vitals]).transform(df[cols_vitals])

# UMAP: embed the scaled vitals; the fixed random_state keeps the embedding repeatable.
reducer = umap.UMAP(
    n_neighbors=50,
    min_dist=0.3,
    random_state=42,
    n_jobs=1,
)
embedding = reducer.fit_transform(scaled_data)

# Store the first two embedding coordinates alongside the source rows.
df['umap_x'], df['umap_y'] = embedding[:, 0], embedding[:, 1]

# KMeans: partition the scaled vitals into three clusters.
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans_labels = kmeans.fit_predict(scaled_data)
df['kmeans_cluster'] = kmeans_labels

# kmeans.transform returns one distance column per cluster centre. Integer
# array indexing picks, for every row, the column of its own assigned cluster
# in a single vectorised step (no per-row Python loop).
distances = kmeans.transform(scaled_data)
df['distance_to_center'] = distances[np.arange(len(kmeans_labels)), kmeans_labels]

# Rows farther from their centre than the 95th percentile of those distances
# are flagged as KMeans outliers.
threshold = np.percentile(df['distance_to_center'], 95)
df['kmeans_outlier'] = df['distance_to_center'] > threshold

# DBSCAN on the scaled vitals: rows labelled -1 are flagged as outliers.
dbscan = DBSCAN(min_samples=10, eps=0.50)
df['dbscan_label'] = db_labels = dbscan.fit_predict(scaled_data)
df['dbscan_outlier'] = db_labels == -1

# DBSCAN on UMAP: same settings, run on the 2-D embedding instead of the
# scaled vitals.
dbscan_umap = DBSCAN(min_samples=10, eps=0.50)
df['umap_dbscan_label'] = umap_labels = dbscan_umap.fit_predict(embedding)
df['umap_dbscan_outlier'] = umap_labels == -1

def outlier_category(row):
    """Describe which detection methods flagged a row as an outlier.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) supporting ``row[key]``
            for 'kmeans_outlier', 'dbscan_outlier' and 'umap_dbscan_outlier'.

    Returns:
        'No Outlier' when none of the three flags is truthy; otherwise the
        names of the flagging methods joined by ' & ', always in the order
        KMeans, DBSCAN, UMAP_DBSCAN.
    """
    # (flag column, display name) pairs, listed in output order.
    method_flags = (
        ('kmeans_outlier', 'KMeans'),
        ('dbscan_outlier', 'DBSCAN'),
        ('umap_dbscan_outlier', 'UMAP_DBSCAN'),
    )
    flagged_by = [label for column, label in method_flags if row[column]]
    return ' & '.join(flagged_by) if flagged_by else 'No Outlier'

# Label every row with the combination of methods that flagged it.
outlier_labels = df.apply(outlier_category, axis=1)
df['outlier_combination'] = outlier_labels

# Save intermediate data
save_intermediate_data(df, 'data/intermediate_unsupervised.csv')
# NOTE(review): plr() is an opaque helper from setup_utils; the recorded output
# suggests it prints a timestamp line - confirm in setup_utils.
plr()
print("Unsupervised processing complete and data saved.")




@24/12/12 05:38:21
Unsupervised processing complete and data saved.
