In [None]:
# Uncomment to install the dependencies into the kernel's environment:
# %pip install ucimlrepo scikit-learn

In [1]:

import pandas as pd
from ucimlrepo import fetch_ucirepo
from sklearn.utils import resample

In [None]:
def load_diabetes_df(dataset_id=891, random_state=42):
    """Fetch a UCI dataset and return it class-balanced and shuffled.

    Features and targets are fetched with ``fetch_ucirepo`` and joined
    column-wise. Every class that has more rows than the rarest class is
    downsampled (without replacement) to the rarest class's row count; the
    combined rows are then shuffled and re-indexed from 0.

    Parameters
    ----------
    dataset_id : int, default 891
        Identifier passed to ``fetch_ucirepo``.
    random_state : int, default 42
        Seed used for both the downsampling and the final shuffle.

    Returns
    -------
    df_balanced : pandas.DataFrame
        Balanced, shuffled rows containing the feature columns and the
        target column.
    feature_cols : list
        Every column of ``df_balanced`` except the target column.
    target_col
        Name of the first column of the fetched targets frame.

    Raises
    ------
    ValueError
        If the target column contains no non-null labels.
    """
    # Fetch dataset
    dataset = fetch_ucirepo(id=dataset_id)

    # Extract features and target
    X = dataset.data.features
    y = dataset.data.targets

    # Only the first target column is used for balancing
    target_col = y.columns[0]

    # Combine into one DataFrame
    df = pd.concat([X, y], axis=1)

    # --- Handle class imbalance (downsample larger classes) ---
    # value_counts() skips null labels, so rows with a missing target are
    # not carried into the balanced frame.
    class_counts = df[target_col].value_counts()
    if class_counts.empty:
        raise ValueError(
            f"Target column {target_col!r} has no non-null labels to balance."
        )
    minority_count = int(class_counts.min())

    # Decide per class by *count*, not via idxmax()/idxmin(): on tied counts
    # both return the same label, which would duplicate one class and drop
    # the other. value_counts() is ordered largest-first, so for an
    # imbalanced two-class target the concat order (downsampled majority,
    # then minority) is the same as selecting them explicitly.
    balanced_parts = []
    for label, count in class_counts.items():
        df_class = df[df[target_col] == label]
        if count > minority_count:
            df_class = resample(
                df_class,
                replace=False,
                n_samples=minority_count,
                random_state=random_state,
            )
        balanced_parts.append(df_class)

    df_balanced = pd.concat(balanced_parts)

    # Shuffle rows and discard the original row labels
    df_balanced = df_balanced.sample(
        frac=1, random_state=random_state
    ).reset_index(drop=True)

    # Get feature columns
    feature_cols = [col for col in df_balanced.columns if col != target_col]

    return df_balanced, feature_cols, target_col

In [3]:
# Build the balanced, shuffled dataset once, then unpack the frame together
# with the feature-column list and the target-column name.
balanced_bundle = load_diabetes_df()
df, feature_cols, target_col = balanced_bundle

In [5]:
# Persist the balanced frame as CSV. `index` is left at its default, so the
# row index is written as the first (unnamed) column of the file.
balanced_csv_path = "../data/balanced_df.csv"
df.to_csv(balanced_csv_path)