In [None]:
# Shell command: enable the `widgetsnbextension` notebook extension for this Jupyter install.
!jupyter nbextension enable --py widgetsnbextension

In [1]:
from common import *

# Pipeline configuration: input/output locations and tuning knobs.
# NOTE(review): in the cells visible here, max_samples_clustering is never read
# (the clustering cell passes a literal), and bundle_size / step_size are only
# read after being redefined further down — confirm whether they are still needed.
directory_path = '../data'            # EEG input: a directory of files, or a single file
output_dir = './models/clustering'    # created by the pipeline cell if it does not exist
bundle_size = 30
step_size = 5
max_files_per_batch = 5               # files preprocessed between garbage collections
max_samples_clustering = 10000
cluster_methods = ['kmeans', 'gmm']
use_gpu = True


In [2]:

# Make sure the output directory exists
os.makedirs(output_dir, exist_ok=True)

# Step 1: Load the raw recordings (directory_path may name a directory or a single file)
print("\nStep 1: Loading EEG data...")
input_is_directory = os.path.isdir(directory_path)
file_dfs, _ = (
    load_eeg_data(directory_path=directory_path)
    if input_is_directory
    else load_eeg_data(single_file=directory_path)
)

# Step 2: Preprocess each file and engineer its features
print("\nStep 2: Preprocessing and feature engineering...")
processed_dfs = {}

# Work through the files max_files_per_batch at a time and collect garbage
# after every batch to limit peak memory.
file_ids = list(file_dfs)

for batch_start in range(0, len(file_ids), max_files_per_batch):
    batch_file_ids = file_ids[batch_start:batch_start + max_files_per_batch]
    batch_end = batch_start + len(batch_file_ids)

    print(f"  Processing files {batch_start+1}-{batch_end} of {len(file_ids)}")

    for file_id in batch_file_ids:
        df_clean = preprocess_eeg_data(file_dfs[file_id])
        df_features = engineer_eeg_features(df_clean)
        processed_dfs[file_id] = df_features

        # The raw frame is not needed once its features are stored; drop it
        file_dfs.pop(file_id)

    # Force garbage collection between batches
    gc.collect()

# Step 3: Build time-series bundles from the engineered frames and write them to disk
# NOTE(review): bundles are written to './eeg_bundles', while a later cell looks for
# bundle_info.pkl under os.path.join(output_dir, 'bundles') — confirm both refer to
# the same bundle set.
print("\nStep 3: Creating time series bundles and saving to disk...")
metadata, bundle_info = create_coherent_time_series_bundles_disk(
    file_dfs=processed_dfs,
    evenly_spaced_samples=10,       # 10 evenly spaced samples per window
    time_window_seconds=1,          # Each window is 1 second long
    step_time_seconds=1,            # Slide the window by 1 second each step
    max_memory_mb=500,              # Target maximum memory usage of 500MB
    chunk_size=100,                 # Process 100 windows at a time
    output_dir='./eeg_bundles'      # Save bundles here
)




Step 1: Loading EEG data...


  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)


Loaded 30 files with 6947837 total rows

Step 2: Preprocessing and feature engineering...
  Processing files 1-5 of 30
Successfully converted timestamps. Remaining rows: 108120


  df_features = df_features.fillna(method='ffill').fillna(method='bfill')


Successfully converted timestamps. Remaining rows: 75660


  df_features = df_features.fillna(method='ffill').fillna(method='bfill')


Successfully converted timestamps. Remaining rows: 129036


  df_features = df_features.fillna(method='ffill').fillna(method='bfill')


Successfully converted timestamps. Remaining rows: 308172


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_clean[column].fillna(median_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_clean[column].fillna(median_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are sett

Successfully converted timestamps. Remaining rows: 291576


  df_features = df_features.fillna(method='ffill').fillna(method='bfill')


  Processing files 6-10 of 30
Successfully converted timestamps. Remaining rows: 291828


  df_features = df_features.fillna(method='ffill').fillna(method='bfill')


Successfully converted timestamps. Remaining rows: 212280


  df_features = df_features.fillna(method='ffill').fillna(method='bfill')


Successfully converted timestamps. Remaining rows: 288996


  df_features = df_features.fillna(method='ffill').fillna(method='bfill')


Successfully converted timestamps. Remaining rows: 211128


  df_features = df_features.fillna(method='ffill').fillna(method='bfill')


Successfully converted timestamps. Remaining rows: 422724


  df_features = df_features.fillna(method='ffill').fillna(method='bfill')


  Processing files 11-15 of 30
Successfully converted timestamps. Remaining rows: 552804


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_clean[column].fillna(median_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_clean[column].fillna(median_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are sett

Successfully converted timestamps. Remaining rows: 359796


  df_features = df_features.fillna(method='ffill').fillna(method='bfill')


Successfully converted timestamps. Remaining rows: 338652


  df_features = df_features.fillna(method='ffill').fillna(method='bfill')


Successfully converted timestamps. Remaining rows: 441108


  df_features = df_features.fillna(method='ffill').fillna(method='bfill')


Successfully converted timestamps. Remaining rows: 249084


  df_features = df_features.fillna(method='ffill').fillna(method='bfill')


  Processing files 16-20 of 30
Successfully converted timestamps. Remaining rows: 78960


  df_features = df_features.fillna(method='ffill').fillna(method='bfill')


Successfully converted timestamps. Remaining rows: 84660


  df_features = df_features.fillna(method='ffill').fillna(method='bfill')


Successfully converted timestamps. Remaining rows: 171408


  df_features = df_features.fillna(method='ffill').fillna(method='bfill')


Successfully converted timestamps. Remaining rows: 203040


  df_features = df_features.fillna(method='ffill').fillna(method='bfill')


Successfully converted timestamps. Remaining rows: 88944


  df_features = df_features.fillna(method='ffill').fillna(method='bfill')


  Processing files 21-25 of 30
Successfully converted timestamps. Remaining rows: 140976


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_clean[column].fillna(median_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_clean[column].fillna(median_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are sett

Successfully converted timestamps. Remaining rows: 128532


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_clean[column].fillna(median_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_clean[column].fillna(median_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are sett

Successfully converted timestamps. Remaining rows: 259068


  df_features = df_features.fillna(method='ffill').fillna(method='bfill')


Successfully converted timestamps. Remaining rows: 235584


  df_features = df_features.fillna(method='ffill').fillna(method='bfill')


Successfully converted timestamps. Remaining rows: 431004


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_clean[column].fillna(median_value, inplace=True)
  df_features = df_features.fillna(method='ffill').fillna(method='bfill')


  Processing files 26-30 of 30
Successfully converted timestamps. Remaining rows: 130008


  df_features = df_features.fillna(method='ffill').fillna(method='bfill')


Successfully converted timestamps. Remaining rows: 116664


  df_features = df_features.fillna(method='ffill').fillna(method='bfill')


Successfully converted timestamps. Remaining rows: 100680


  df_features = df_features.fillna(method='ffill').fillna(method='bfill')


Successfully converted timestamps. Remaining rows: 125880


  df_features = df_features.fillna(method='ffill').fillna(method='bfill')


Successfully converted timestamps. Remaining rows: 367668


  df_features = df_features.fillna(method='ffill').fillna(method='bfill')



Step 3: Creating time series bundles and saving to disk...

Processing files 1-5 of 30
  Processing file: museMonitor_2024-06-02--09-47-17_5812437961079996628.csv
    Creating 422 evenly spaced sample bundles from museMonitor_2024-06-02--09-47-17_5812437961079996628.csv
    Time range: 422.2 seconds, Window: 1s, Step: 1s
    Sample array shape: (10, 83), features: 83
    Sample array shape: (10, 83), features: 83
    Sample array shape: (10, 83), features: 83
    Sample array shape: (10, 83), features: 83
    Sample array shape: (10, 83), features: 83
    Sample array shape: (10, 83), features: 83
    Sample array shape: (10, 83), features: 83
    Sample array shape: (10, 83), features: 83
    Sample array shape: (10, 83), features: 83
    Sample array shape: (10, 83), features: 83
    Sample array shape: (10, 83), features: 83
    Sample array shape: (10, 83), features: 83
    Sample array shape: (10, 83), features: 83
    Sample array shape: (10, 83), features: 83
    Sample array s

In [None]:
# Clear memory: drop the reference to the engineered per-file frames, then
# force a garbage collection (its return value is what the cell displays).
# Re-running this cell on the same kernel raises NameError, because
# processed_dfs no longer exists after the first run.
del processed_dfs
gc.collect()



0

In [4]:
import os
import pickle
import pandas as pd
output_dir = './models/clustering'
# Directory where the bundle batch files and their metadata are stored
bundles_dir = os.path.join(output_dir, 'bundles')

# Prefer the pickled bundle_info stored next to the bundles, if it exists
bundle_info_path = os.path.join(bundles_dir, 'bundle_info.pkl')
if os.path.exists(bundle_info_path):
    # NOTE(security): pickle.load can execute arbitrary code while loading;
    # only load bundle_info.pkl files produced by this project.
    with open(bundle_info_path, 'rb') as f:
        bundle_info = pickle.load(f)
    print(f"Loaded bundle_info from {bundle_info_path}")
else:
    # Otherwise rebuild bundle_info from the metadata CSV. Two file names are
    # accepted; the first one that exists wins.
    metadata_candidates = [
        os.path.join(bundles_dir, 'bundle_metadata.csv'),
        os.path.join(bundles_dir, 'metadata.csv'),  # Alternative name
    ]
    metadata_path = next(
        (candidate for candidate in metadata_candidates if os.path.exists(candidate)),
        None,
    )
    if metadata_path is None:
        # Report every location that was tried, not just the last one
        raise FileNotFoundError(
            f"Could not find bundle metadata; tried: {', '.join(metadata_candidates)}"
        )

    metadata_df = pd.read_csv(metadata_path)

    # Count batch files (bundle_batch_*.npy) in the bundles directory
    batch_files = [
        f for f in os.listdir(bundles_dir)
        if f.startswith('bundle_batch_') and f.endswith('.npy')
    ]
    total_batches = len(batch_files)

    # Reconstruct bundle_info. bundle_size comes from the first row's index span
    # and step_size from the gap between the first two start indices; the
    # fallbacks 30 and 15 apply when the metadata has too few rows.
    bundle_info = {
        'output_dir': bundles_dir,
        'metadata_path': metadata_path,
        'total_bundles': len(metadata_df),
        'total_batches': total_batches,
        'bundle_size': (
            metadata_df['end_index'].iloc[0] - metadata_df['start_index'].iloc[0] + 1
            if len(metadata_df) > 0 else 30
        ),
        'step_size': (
            metadata_df['start_index'].iloc[1] - metadata_df['start_index'].iloc[0]
            if len(metadata_df) > 1 else 15
        ),
    }
    print("Reconstructed bundle_info from metadata")

# Print some info to verify
print(f"Total bundles: {bundle_info['total_bundles']}")
print(f"Total batches: {bundle_info['total_batches']}")
print(f"Bundle size: {bundle_info['bundle_size']}")

Loaded bundle_info from ./models/clustering\bundles\bundle_info.pkl
Total bundles: 454644
Total batches: 459
Bundle size: 30


In [3]:
# Step 4: Normalize the on-disk bundles using 'per_feature' normalization.
# The value returned by normalize_bundles_disk replaces bundle_info, so running
# this cell again passes the already-updated dict back in.
print("\nStep 4: Normalizing bundles on disk...")
bundle_info = normalize_bundles_disk(bundle_info=bundle_info, normalization='per_feature')




Step 4: Normalizing bundles on disk...

First pass: Calculating statistics...


Calculating statistics: 100%|██████████| 18/18 [00:02<00:00,  8.12it/s]



Second pass: Normalizing and saving...


Normalizing: 100%|██████████| 18/18 [00:02<00:00,  8.33it/s]


In [4]:
# Step 5: Fit the configured clustering methods on a subset of the bundles.
# NOTE(review): max_samples is the literal 400000 here rather than the
# max_samples_clustering value from the config cell — confirm which is intended.
# NOTE(review): the recorded run of this cell ended in a ValueError because the
# loaded batches had 0 features; check the bundle directory before re-running.
print("\nStep 5: Clustering a subset of the data...")
clustering_results = memory_efficient_clustering(
    bundle_info=bundle_info, max_samples=400000, methods=cluster_methods, use_gpu=use_gpu
)



Step 5: Clustering a subset of the data...

Performing memory-efficient clustering using max 400000 samples...
Using 27108 bundles for clustering
Loaded batch file 0 with shape (1220, 10, 0)
Loaded batch file 1 with shape (1205, 10, 0)
Loaded batch file 2 with shape (1138, 10, 0)
Loaded batch file 3 with shape (1139, 10, 0)
Loaded batch file 4 with shape (1957, 10, 0)
Loaded batch file 5 with shape (2476, 10, 0)
Loaded batch file 6 with shape (2159, 10, 0)
Loaded batch file 7 with shape (1405, 10, 0)
Loaded batch file 8 with shape (1322, 10, 0)
Loaded batch file 9 with shape (1722, 10, 0)
Loaded batch file 10 with shape (973, 10, 0)
Loaded batch file 11 with shape (1308, 10, 0)
Loaded batch file 12 with shape (1140, 10, 0)
Loaded batch file 13 with shape (1049, 10, 0)
Loaded batch file 14 with shape (1011, 10, 0)
Loaded batch file 15 with shape (2602, 10, 0)
Loaded batch file 16 with shape (1355, 10, 0)
Loaded batch file 17 with shape (1927, 10, 0)
Data shape for clustering: (27108, 0

ValueError: Found array with 0 feature(s) (shape=(27108, 0)) while a minimum of 1 is required by KMeans.

In [10]:

# Step 6: Assign a cluster to every bundle; results are also written to bundle_clusters.json
print("\nStep 6: Predicting clusters for all bundles...")
bundle_clusters = predict_clusters_disk(
    bundle_info=bundle_info,
    clustering_results=clustering_results,
    output_file=os.path.join(output_dir, 'bundle_clusters.json'),
)

# Step 7: Pick the method with the highest silhouette score (a missing score ranks as -1)
# NOTE(review): best_method is chosen here, but the assignments in bundle_clusters come
# from predict_clusters_disk — confirm both refer to the same method before trusting
# the heading printed below.
print("\nStep 7: Analyzing cluster characteristics...")
best_method = max(
    clustering_results,
    key=lambda m: (
        -1
        if clustering_results[m]['eval_metrics']['silhouette_score'] is None
        else clustering_results[m]['eval_metrics']['silhouette_score']
    ),
)

# Tally how many bundles landed in each cluster
cluster_counts = {}
for cluster in bundle_clusters.values():
    cluster_counts[cluster] = cluster_counts.get(cluster, 0) + 1

print(f"\nCluster distribution from {best_method}:")
for cluster, count in sorted(cluster_counts.items()):
    print(f"  Cluster {cluster}: {count} bundles ({count/len(bundle_clusters)*100:.1f}%)")

# Bundle everything up as a 3-tuple for inspection in the next cell
results = (bundle_info, clustering_results, bundle_clusters)


Step 6: Predicting clusters for all bundles...

Using kmeans clustering model to predict all clusters
Predicting clusters for 454644 bundles...
Loaded batch file 0 with shape (1000, 30, 83)
Loaded batch file 1 with shape (1000, 30, 83)
Loaded batch file 2 with shape (1000, 30, 83)
Loaded batch file 3 with shape (1000, 30, 83)
Loaded batch file 4 with shape (1000, 30, 83)
Loaded batch file 5 with shape (1000, 30, 83)
Loaded batch file 6 with shape (1000, 30, 83)
Loaded batch file 7 with shape (1000, 30, 83)
Loaded batch file 8 with shape (1000, 30, 83)
Loaded batch file 9 with shape (1000, 30, 83)
  Processed 10000/454644 bundles
Loaded batch file 10 with shape (1000, 30, 83)
Loaded batch file 11 with shape (1000, 30, 83)
Loaded batch file 12 with shape (1000, 30, 83)
Loaded batch file 13 with shape (1000, 30, 83)
Loaded batch file 14 with shape (1000, 30, 83)
Loaded batch file 15 with shape (1000, 30, 83)
Loaded batch file 16 with shape (1000, 30, 83)
Loaded batch file 17 with shape (

In [11]:
# Display the (bundle_info, clustering_results, bundle_clusters) tuple assembled above.
# The repr includes whole arrays and dicts, so the output can be very large.
results

({'output_dir': './models/clustering\\bundles',
  'metadata_path': './models/clustering\\bundles\\bundle_metadata.csv',
  'total_bundles': 454644,
  'total_batches': 459,
  'bundle_size': 30,
  'step_size': 15,
  'normalized_dir': './models/clustering\\bundles\\normalized',
  'normalization': 'per_feature',
  'norm_params': {'mean': array([[0.84725004, 0.75139831, 0.61285558, ..., 0.19036886, 0.01832293,
           1.00799927],
          [0.84723915, 0.75133593, 0.61281388, ..., 0.1894901 , 0.01951778,
           1.00799949],
          [0.8472173 , 0.7512255 , 0.61278104, ..., 0.18945108, 0.01966462,
           1.00800118],
          ...,
          [0.84728858, 0.75138935, 0.61285356, ..., 0.19036583, 0.01824929,
           1.0079946 ],
          [0.84724058, 0.75130768, 0.61282124, ..., 0.19044908, 0.01823341,
           1.00799505],
          [0.84716895, 0.7512297 , 0.61278041, ..., 0.19052025, 0.0182206 ,
           1.00799348]]),
   'std': array([[0.7357417 , 0.88984594, 0.6378314

In [None]:

# Configuration for the cells below, which use the non-`_disk` pipeline functions
data_path = '../data'
output_dir = './models/clustering'
bundle_size = 30
step_size = 15
reducer_method = 'umap'
use_gpu = True  # plain bool; a trailing comma here would bind the tuple (True,)
cluster_methods = ['kmeans', 'gmm']


# Make sure the output directory exists
os.makedirs(output_dir, exist_ok=True)

# Step 1: Load the data (data_path may name a directory or a single file)
print("\nStep 1: Loading EEG data...")
file_dfs, combined_df = (
    load_eeg_data(directory_path=data_path)
    if os.path.isdir(data_path)
    else load_eeg_data(single_file=data_path)
)

# Step 2: Preprocess every file and engineer its features, keyed by file id.
# The loop variables (df, df_clean, df_features) stay bound to the last file
# once the loop finishes.
print("\nStep 2: Preprocessing and feature engineering...")
processed_dfs = {}

for file_id, df in file_dfs.items():
    df_clean = preprocess_eeg_data(df)
    df_features = engineer_eeg_features(df_clean)
    processed_dfs[file_id] = df_features

# Step 3: Build time series bundles from the engineered frames
print("\nStep 3: Creating time series bundles...")
X_bundles, metadata = create_coherent_time_series_bundles(
    file_dfs=processed_dfs, bundle_size=bundle_size, step_size=step_size
)

In [None]:
# Step 4: Normalize bundles
print("\nStep 4: Normalizing bundles...")
X_normalized = normalize_bundles(X_bundles, normalization='per_feature')

# Step 5: Reduce dimensions for visualization
print("\nStep 5: Reducing dimensions with " + reducer_method.upper() + "...")
# Collapse the time dimension of 3-D input by averaging over axis 1;
# input that is not 3-D is used as-is.
X_flat = X_normalized.mean(axis=1) if X_normalized.ndim == 3 else X_normalized
X_reduced, reducer = reduce_dimensions(
    X_flat, method=reducer_method, n_components=2, use_gpu=use_gpu
)

# Save the reducer (best effort: a failure is reported but does not stop the cell).
# Only Exception is caught so that KeyboardInterrupt/SystemExit still propagate,
# and the underlying error is included in the warning.
try:
    joblib.dump(reducer, os.path.join(output_dir, f'{reducer_method}_reducer.joblib'))
except Exception as e:
    print(
        f"Warning: Could not save the {reducer_method} reducer. "
        f"It may not be serializable. Error: {e}"
    )

# Step 6: Cluster the data
print("\nStep 6: Clustering data...")
clustering_results = cluster_data(
    X_flat, X_reduced, methods=cluster_methods, n_clusters=2, use_gpu=use_gpu
)

# Step 7: Analyze cluster characteristics
print("\nStep 7: Analyzing cluster characteristics...")
# Find the best clustering method based on silhouette score. Only a missing
# (None) score is ranked as -1; a real score of 0.0 must keep its value, which
# `score or -1` would not do.
best_method = max(
    clustering_results,
    key=lambda m: (
        clustering_results[m]['eval_metrics']['silhouette_score']
        if clustering_results[m]['eval_metrics']['silhouette_score'] is not None
        else -1
    )
)

print(f"\nBest clustering method: {best_method}")

# Analyze the best clustering
best_labels = clustering_results[best_method]['labels']

# Get feature names if they are in metadata
if X_normalized.ndim == 3:
    # These are generic feature names since we flattened the time dimension
    feature_names = [f'Feature_{i}' for i in range(X_flat.shape[1])]
else:
    feature_names = None

cluster_profiles = analyze_cluster_characteristics(
    X_normalized, best_labels, feature_names
)

# Save the best clustering model, its labels, and the labelled metadata
# (best effort: any failure is reported as a warning).
try:
    joblib.dump(
        clustering_results[best_method]['clustering'],
        os.path.join(output_dir, f'{best_method}_clustering.joblib')
    )

    # Save labels for later use
    np.save(os.path.join(output_dir, 'cluster_labels.npy'), best_labels)

    # Save metadata with cluster labels
    metadata_with_clusters = metadata.copy()
    metadata_with_clusters['cluster'] = best_labels
    metadata_with_clusters.to_csv(os.path.join(output_dir, 'metadata_with_clusters.csv'), index=False)

except Exception as e:
    print(f"Warning: Could not save clustering results. Error: {e}")

# Collect everything produced by this pipeline run in one dict
results = {
    'X_bundles': X_bundles,
    'X_normalized': X_normalized,
    'X_reduced': X_reduced,
    'metadata': metadata,
    'clustering_results': clustering_results,
    'best_method': best_method,
    'cluster_profiles': cluster_profiles
}


In [None]:
# Preview the first rows of `df`.
# NOTE(review): the only visible assignment of `df` is as the loop variable of the
# preprocessing loop above, which leaves it bound to the last raw file's frame
# (unless `common` also exports a `df`) — confirm which frame this is meant to show.
df.head()

In [None]:
# 1. Run the unsupervised learning pipeline, writing its artifacts under ./models
# NOTE(review): `df` is whatever the kernel currently holds under that name, and this
# rebinds `results` from earlier cells — confirm both are intended.
results = unsupervised_learning_pipeline(df, output_dir='./models')

# 2. Later, apply clustering to new data (example usage, intentionally left commented out)
# new_labels = apply_clustering_to_new_data(
#     new_data,
#     model_path='models/clustering_model_kmeans.joblib',
#     feature_path='models/selected_features_unsupervised.txt',
#     reducer_path='models/umap_reducer.joblib'
# )

# 3. Analyze the distribution of clusters in new data (example usage, commented out)
# print(f"Percentage in cluster 0 (likely relaxed): {(new_labels == 0).mean() * 100:.2f}%")
# print(f"Percentage in cluster 1 (likely focused): {(new_labels == 1).mean() * 100:.2f}%")