# Demo on Stratified Subset of Original Dataset

The notebook reads `data/obesity.csv`, creates a stratified subset, runs feature engineering and models on that subset, and writes all outputs to `demo_data/` and `demo_results/`. Original `data/` and `results/` are not modified.

In [1]:
# Parameters
USE_SUBSET = True            # Set False to cancel (safety)
SUBSET_SIZE = 1500            # number of samples in subset
RANDOM_STATE = 42
ORIG_DATA_PATH = 'data/obesity.csv'
DEMO_DATA_DIR = 'demo_data'
DEMO_RESULTS_DIR = 'demo_results'
import os
os.makedirs(DEMO_DATA_DIR, exist_ok=True)
os.makedirs(DEMO_RESULTS_DIR, exist_ok=True)
print('Demo dirs:', DEMO_DATA_DIR, DEMO_RESULTS_DIR)

Demo dirs: demo_data demo_results


In [2]:
# Create stratified subset from original dataset
import pandas as pd, numpy as np
from sklearn.model_selection import train_test_split
if not os.path.exists(ORIG_DATA_PATH):
    raise FileNotFoundError(f'Original dataset not found at {ORIG_DATA_PATH}')
orig = pd.read_csv(ORIG_DATA_PATH)
print('Original shape:', orig.shape)
target_col = 'NObeyesdad'
if target_col not in orig.columns:
    raise ValueError(f"Target column '{target_col}' missing.")
if not USE_SUBSET:
    raise RuntimeError('USE_SUBSET is False; aborting to avoid accidental full runs.')
if SUBSET_SIZE >= len(orig):
    subset = orig.copy()
    print('SUBSET_SIZE >= original size; using full dataset as subset.')
else:
    # stratified split to get the subset
    _, subset = train_test_split(orig, train_size=SUBSET_SIZE, stratify=orig[target_col], random_state=RANDOM_STATE)
subset = subset.reset_index(drop=True)
subset_path = os.path.join(DEMO_DATA_DIR, 'obesity_subset.csv')
subset.to_csv(subset_path, index=False)
print('Saved subset to', subset_path, 'shape:', subset.shape)

Original shape: (2111, 17)
Saved subset to demo_data\obesity_subset.csv shape: (611, 17)


## Feature Engineering on subset (writes to demo_data/)

In [5]:
# Feature engineering on the subset
import pandas as pd, numpy as np, os, joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
subset_path = os.path.join(DEMO_DATA_DIR, 'obesity_subset.csv')
df = pd.read_csv(subset_path)
target_col = 'NObeyesdad'
y = df[target_col]; X = df.drop(columns=[target_col])
# Derived features
if 'Height' in X.columns and 'Weight' in X.columns:
    X = X.copy(); X['Height_m'] = X['Height']/100; X['BMI'] = X['Weight']/(X['Height_m']**2)
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE)
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
# label encode target
le = LabelEncoder(); y_train = le.fit_transform(y_train_raw); y_test = le.transform(y_test_raw)
os.makedirs(os.path.join(DEMO_DATA_DIR,'artifacts'), exist_ok=True); joblib.dump(le, os.path.join(DEMO_DATA_DIR,'artifacts','label_encoder_target.joblib'))
# one-hot encode
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
if len(categorical_cols)>0:
    X_train_cat = ohe.fit_transform(X_train_raw[categorical_cols]); X_test_cat = ohe.transform(X_test_raw[categorical_cols])
else:
    X_train_cat = np.empty((len(X_train_raw),0)); X_test_cat = np.empty((len(X_test_raw),0))
joblib.dump(ohe, os.path.join(DEMO_DATA_DIR,'artifacts','onehot_encoder.joblib'))
# scale numeric
scaler = StandardScaler()
X_train_num = X_train_raw[numeric_cols].values if len(numeric_cols)>0 else np.empty((len(X_train_raw),0))
X_test_num = X_test_raw[numeric_cols].values if len(numeric_cols)>0 else np.empty((len(X_test_raw),0))
if X_train_num.size==0:
    X_train_num_scaled = X_train_num; X_test_num_scaled = X_test_num
else:
    X_train_num_scaled = scaler.fit_transform(X_train_num); X_test_num_scaled = scaler.transform(X_test_num)
joblib.dump(scaler, os.path.join(DEMO_DATA_DIR,'artifacts','scaler.joblib'))
# combine and save
X_train = np.hstack([X_train_num_scaled, X_train_cat]); X_test = np.hstack([X_test_num_scaled, X_test_cat])
try:
    ohe_cols = ohe.get_feature_names_out(categorical_cols).tolist()
except Exception:
    ohe_cols = []
final_cols = numeric_cols + ohe_cols
X_train_df = pd.DataFrame(X_train, columns=final_cols); X_test_df = pd.DataFrame(X_test, columns=final_cols)
X_train_df.to_csv(os.path.join(DEMO_DATA_DIR,'X_train.csv'), index=False); X_test_df.to_csv(os.path.join(DEMO_DATA_DIR,'X_test.csv'), index=False)
pd.DataFrame(y_train, columns=['target']).to_csv(os.path.join(DEMO_DATA_DIR,'y_train.csv'), index=False); pd.DataFrame(y_test, columns=['target']).to_csv(os.path.join(DEMO_DATA_DIR,'y_test.csv'), index=False)
pd.concat([X_train_df.assign(target=y_train), X_test_df.assign(target=y_test)], ignore_index=True).to_csv(os.path.join(DEMO_DATA_DIR,'obesity_clean.csv'), index=False)
print('Processed subset and saved files to', DEMO_DATA_DIR)

Processed subset and saved files to demo_data


## Run models on subset and save outputs to demo_results/

In [6]:
# PCA + SVM + PCA+LR + RandomForest + SelectKBest demo (updated)
import os
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import itertools

from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

# Ensure these are defined in prior cells; otherwise set defaults
DEMO_DATA_DIR = globals().get('DEMO_DATA_DIR', 'demo_data')
DEMO_RESULTS_DIR = globals().get('DEMO_RESULTS_DIR', 'demo_results')
RANDOM_STATE = globals().get('RANDOM_STATE', 42)

os.makedirs(DEMO_RESULTS_DIR, exist_ok=True)
os.makedirs(DEMO_DATA_DIR, exist_ok=True)

def plot_cm(cm, classes, out_path, title):
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    plt.figure(figsize=(6,5))
    plt.imshow(cm, cmap=plt.cm.Blues)
    plt.title(title)
    plt.colorbar()
    ticks = np.arange(len(classes))
    plt.xticks(ticks, classes, rotation=45)
    plt.yticks(ticks, classes)
    thresh = cm.max() / 2 if cm.size else 0
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], 'd'),
                 ha='center',
                 color='white' if cm[i, j] > thresh else 'black')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.tight_layout()
    plt.savefig(out_path)
    plt.close()

# ---- Load processed demo data ----
X_train = pd.read_csv(os.path.join(DEMO_DATA_DIR, 'X_train.csv'))
X_test  = pd.read_csv(os.path.join(DEMO_DATA_DIR, 'X_test.csv'))
y_train = pd.read_csv(os.path.join(DEMO_DATA_DIR, 'y_train.csv')).squeeze()
y_test  = pd.read_csv(os.path.join(DEMO_DATA_DIR, 'y_test.csv')).squeeze()

# ---- PCA (shared for SVM & LR) ----
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train.values)
X_test_pca  = pca.transform(X_test.values)
os.makedirs(DEMO_RESULTS_DIR, exist_ok=True)
plt.figure(figsize=(6,4))
plt.plot(np.cumsum(pca.explained_variance_ratio_), marker='o')
plt.title('PCA explained variance (demo)')
plt.tight_layout()
plt.savefig(os.path.join(DEMO_RESULTS_DIR, 'pca_explained_variance_demo.png'))
plt.close()

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)  # smaller CV for demo speed

# ---------------- SVM ----------------
print("Running PCA + SVM (demo)...")
svm = SVC()
param_grid = {'C':[0.1,1,10], 'gamma':['scale'], 'kernel':['rbf']}
grid = GridSearchCV(svm, param_grid, cv=cv, scoring='f1_macro', n_jobs=-1)
grid.fit(X_train_pca, y_train)
best = grid.best_estimator_
print('SVM best params:', grid.best_params_)
best.fit(X_train_pca, y_train)
y_pred = best.predict(X_test_pca)
acc = accuracy_score(y_test, y_pred); f1m = f1_score(y_test, y_pred, average='macro'); cm = confusion_matrix(y_test, y_pred)
print('SVM demo -> acc:', acc, 'f1_macro:', f1m)
plot_cm(cm, sorted(map(str, np.unique(y_test))), os.path.join(DEMO_RESULTS_DIR, 'pca_svm_confusion_matrix_demo.png'), 'PCA + SVM (demo)')
pd.DataFrame([{'model':'PCA + SVM (demo)', 'pca_components': X_train_pca.shape[1], 'accuracy':acc, 'macro_f1':f1m}]).to_csv(os.path.join(DEMO_RESULTS_DIR, 'pca_svm_results_demo.csv'), index=False)

# ---------------- Logistic Regression ----------------
print("\nRunning PCA + Logistic Regression (demo)...")
model = LogisticRegression(multi_class='multinomial')
param_grid = {'C':[0.1,1,10], 'max_iter':[200], 'solver':['lbfgs']}
grid = GridSearchCV(model, param_grid, cv=cv, scoring='f1_macro', n_jobs=-1)
grid.fit(X_train_pca, y_train)
print('LogReg best params:', grid.best_params_)
best = grid.best_estimator_
best.fit(X_train_pca, y_train)
y_pred = best.predict(X_test_pca)
acc = accuracy_score(y_test, y_pred); f1m = f1_score(y_test, y_pred, average='macro'); cm = confusion_matrix(y_test, y_pred)
print('LogReg demo -> acc:', acc, 'f1_macro:', f1m)
plot_cm(cm, sorted(map(str, np.unique(y_test))), os.path.join(DEMO_RESULTS_DIR, 'pca_logreg_confusion_matrix_demo.png'), 'PCA + Logistic Regression (demo)')
pd.DataFrame([{'model':'PCA + Logistic Regression (demo)', 'pca_components': X_train_pca.shape[1], 'accuracy':acc, 'macro_f1':f1m}]).to_csv(os.path.join(DEMO_RESULTS_DIR, 'pca_logreg_results_demo.csv'), index=False)

# ---------------- Random Forest ----------------
print("\nRunning Random Forest (demo)...")
rf = RandomForestClassifier(random_state=RANDOM_STATE)
param_grid = {'n_estimators':[100], 'max_depth':[None], 'min_samples_split':[2]}
grid = GridSearchCV(rf, param_grid, cv=cv, scoring='f1_macro', n_jobs=-1)
grid.fit(X_train.values, y_train)
print('RF best params:', grid.best_params_)
best = grid.best_estimator_
best.fit(X_train.values, y_train)
y_pred = best.predict(X_test.values)
acc = accuracy_score(y_test, y_pred); f1m = f1_score(y_test, y_pred, average='macro'); cm = confusion_matrix(y_test, y_pred)
print('RF demo -> acc:', acc, 'f1_macro:', f1m)
# Save RF confusion matrix
plot_cm(cm, sorted(map(str, np.unique(y_test))), os.path.join(DEMO_RESULTS_DIR, 'rf_confusion_matrix_demo.png'), 'Random Forest (demo)')
pd.DataFrame([{'model':'Random Forest (demo)', 'accuracy':acc, 'macro_f1':f1m}]).to_csv(os.path.join(DEMO_RESULTS_DIR, 'rf_results_demo.csv'), index=False)

# ---------------- SelectKBest ----------------
print("\nRunning SelectKBest (demo) and training SVM & LogisticRegression on reduced features...")
# Decide k
k = min(6, X_train.shape[1]) if X_train.shape[1] > 0 else 0
if k <= 0:
    print("SelectKBest skipped (no features).")
else:
    selector = SelectKBest(score_func=f_classif, k=k)
    X_train_k = selector.fit_transform(X_train, y_train)
    X_test_k  = selector.transform(X_test)
    # Save reduced feature CSVs to demo_data for transparency
    pd.DataFrame(X_train_k).to_csv(os.path.join(DEMO_DATA_DIR, 'X_train_selectk.csv'), index=False)
    pd.DataFrame(X_test_k).to_csv(os.path.join(DEMO_DATA_DIR, 'X_test_selectk.csv'), index=False)

    models_k = [
        ('SVM_kbest', SVC()),
        ('LogReg_kbest', LogisticRegression(multi_class='multinomial'))
    ]
    results_k = []
    for name, mdl in models_k:
        print(f"\nRunning {name} on SelectKBest features...")
        if isinstance(mdl, SVC):
            param_grid = {'C':[0.1,1,10], 'gamma':['scale'], 'kernel':['rbf']}
        else:
            param_grid = {'C':[0.1,1,10], 'max_iter':[200], 'solver':['lbfgs']}

        grid = GridSearchCV(mdl, param_grid, cv=cv, scoring='f1_macro', n_jobs=-1)
        grid.fit(X_train_k, y_train)
        print(f"{name} best params:", grid.best_params_)
        best = grid.best_estimator_
        best.fit(X_train_k, y_train)
        y_pred_k = best.predict(X_test_k)
        acc_k = accuracy_score(y_test, y_pred_k)
        f1m_k = f1_score(y_test, y_pred_k, average='macro')
        cm_k = confusion_matrix(y_test, y_pred_k)
        print(classification_report(y_test, y_pred_k))
        # save confusion matrix and results
        cm_name = os.path.join(DEMO_RESULTS_DIR, f'cm_kbest_{name}.png')
        plot_cm(cm_k, sorted(map(str, np.unique(y_test))), cm_name, f'{name} (SelectKBest demo)')
        results_k.append({'model': name, 'accuracy': acc_k, 'macro_f1': f1m_k})
        # individual results csv
        pd.DataFrame([{'model': name, 'accuracy': acc_k, 'macro_f1': f1m_k}]).to_csv(os.path.join(DEMO_RESULTS_DIR, f'kbest_results_{name}.csv'), index=False)

    # combined kbest results
    pd.DataFrame(results_k).to_csv(os.path.join(DEMO_RESULTS_DIR, 'kbest_results_demo.csv'), index=False)
    print("Saved SelectKBest demo results to", os.path.join(DEMO_RESULTS_DIR, 'kbest_results_demo.csv'))

# ---------------- Combine all demo result CSVs ----------------
print("\nCombining demo result CSVs...")
res_files = glob.glob(os.path.join(DEMO_RESULTS_DIR, '*_results_demo.csv')) + glob.glob(os.path.join(DEMO_RESULTS_DIR, 'kbest_results_demo.csv')) + glob.glob(os.path.join(DEMO_RESULTS_DIR, 'kbest_results_*.csv'))
# unique
res_files = sorted(set(res_files))
dfs = [pd.read_csv(p) for p in res_files if os.path.getsize(p) > 0]
if dfs:
    final = pd.concat(dfs, ignore_index=True)
    final.to_csv(os.path.join(DEMO_RESULTS_DIR, 'final_comparison_demo.csv'), index=False)
    print('Saved combined demo results to', os.path.join(DEMO_RESULTS_DIR, 'final_comparison_demo.csv'))
else:
    print('No demo result CSVs found to combine.')


Running PCA + SVM (demo)...
SVM best params: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
SVM demo -> acc: 0.8943089430894309 f1_macro: 0.8877833891658776

Running PCA + Logistic Regression (demo)...
LogReg best params: {'C': 10, 'max_iter': 200, 'solver': 'lbfgs'}




LogReg demo -> acc: 0.9105691056910569 f1_macro: 0.9069803359940466

Running Random Forest (demo)...
RF best params: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}
RF demo -> acc: 0.9512195121951219 f1_macro: 0.9485075848521226

Running SelectKBest (demo) and training SVM & LogisticRegression on reduced features...

Running SVM_kbest on SelectKBest features...
SVM_kbest best params: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
              precision    recall  f1-score   support

           0       0.89      1.00      0.94        16
           1       1.00      0.82      0.90        17
           2       1.00      1.00      1.00        20
           3       1.00      1.00      1.00        17
           4       1.00      1.00      1.00        19
           5       0.87      0.76      0.81        17
           6       0.80      0.94      0.86        17

    accuracy                           0.93       123
   macro avg       0.94      0.93      0.93       123
weighted 



              precision    recall  f1-score   support

           0       0.89      1.00      0.94        16
           1       1.00      0.82      0.90        17
           2       0.89      0.85      0.87        20
           3       0.89      0.94      0.91        17
           4       1.00      1.00      1.00        19
           5       0.87      0.76      0.81        17
           6       0.80      0.94      0.86        17

    accuracy                           0.90       123
   macro avg       0.91      0.90      0.90       123
weighted avg       0.91      0.90      0.90       123

Saved SelectKBest demo results to demo_results\kbest_results_demo.csv

Combining demo result CSVs...
Saved combined demo results to demo_results\final_comparison_demo.csv
