In [1]:
# Mount Drive, import libraries, read & filter data, split, and save to CSVs
from google.colab import drive
from sklearn.model_selection import train_test_split
import pandas as pd
import os

# Step 1: Mount Google Drive
drive.mount('/content/drive')

# Step 2: Dataset names and base folder path
dataset_names = ['Brown2019', 'granada', 'lynch', 'tamborlane', 'wadwa']
base_path = '/content/drive/MyDrive/Digital Health/INternalAssignment1/TrainValidationTestSplit/'

# Step 3: Loop through each dataset
for name in dataset_names:
    print(f"📁 Processing: {name}")

    # Full file path
    input_file = os.path.join(base_path, f"{name}_demographic_imputed_labeled.csv")

    # Read CSV
    df = pd.read_csv(input_file)

    # Filter out classes with < 2 samples
    class_counts = df["label"].value_counts()
    valid_classes = class_counts[class_counts >= 2].index
    df_filtered = df[df["label"].isin(valid_classes)]

    # First split: 60% test, 40% remaining (train+val)
    train_val, test = train_test_split(
        df_filtered,
        test_size=0.6,
        random_state=42,
        stratify=df_filtered["label"]
    )

    # Re-filter train_val after split to ensure all labels still have ≥2 samples
    class_counts = train_val["label"].value_counts()
    valid_classes = class_counts[class_counts >= 2].index
    train_val_filtered = train_val[train_val["label"].isin(valid_classes)]

    # Second split: 50/50 of 40% remaining → 20% train, 20% validation
    train, validation = train_test_split(
        train_val_filtered,
        test_size=0.5,
        random_state=42,
        stratify=train_val_filtered["label"]
    )

    # Save all splits to CSV
    train.to_csv(os.path.join(base_path, f"{name.lower()}_train.csv"), index=False)
    validation.to_csv(os.path.join(base_path, f"{name.lower()}_validation.csv"), index=False)
    test.to_csv(os.path.join(base_path, f"{name.lower()}_test.csv"), index=False)

    print(f"✅ Done: {name} → train: {len(train)}, validation: {len(validation)}, test: {len(test)}")
    print("-" * 60)


Mounted at /content/drive
📁 Processing: Brown2019
✅ Done: Brown2019 → train: 29, validation: 30, test: 98
------------------------------------------------------------
📁 Processing: granada
✅ Done: granada → train: 111, validation: 111, test: 337
------------------------------------------------------------
📁 Processing: lynch
✅ Done: lynch → train: 15, validation: 16, test: 63
------------------------------------------------------------
📁 Processing: tamborlane
✅ Done: tamborlane → train: 3, validation: 3, test: 20
------------------------------------------------------------
📁 Processing: wadwa
✅ Done: wadwa → train: 15, validation: 16, test: 52
------------------------------------------------------------
