In [1]:
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import GroupShuffleSplit, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from joblib import dump

# Project directory layout; every path is built from ROOT, the parent of the
# current working directory.
ROOT = Path("..")
DATA_PROCESSED = ROOT.joinpath("data", "processed", "db1")
DATA_OUTPUTS = ROOT.joinpath("data", "outputs", "db1")
MODELS_DIR = ROOT.joinpath("models", "db1")
REPORTS_DIR = ROOT.joinpath("reports", "db1")

# Create the outputs, models and reports directories (including missing
# parents) if they do not exist yet. DATA_PROCESSED is not created here.
for p in (DATA_OUTPUTS, MODELS_DIR, REPORTS_DIR):
    p.mkdir(parents=True, exist_ok=True)

In [2]:
# Load the feature matrix and label vector from DATA_PROCESSED.
# Expected files from your previous notebook:
#   data/processed/db1/X_feat_db1.npy
#   data/processed/db1/y_db1.npy
X = np.load(DATA_PROCESSED / "X_feat_db1.npy")
y = np.load(DATA_PROCESSED / "y_db1.npy")

# Fail fast if features and labels are misaligned, instead of failing later
# inside the train/test split with a less direct error.
assert X.shape[0] == y.shape[0], (
    f"X has {X.shape[0]} rows but y has {y.shape[0]} labels"
)

# Compute the distinct label values once and reuse them for both the preview
# and the count, rather than calling np.unique on the full vector twice.
label_values = np.unique(y)

print("X shape:", X.shape)  # (n_samples, n_features)
print("y shape:", y.shape)  # (n_samples,)
print("classes:", label_values[:10], "… total:", len(label_values))

X shape: (1105889, 190)
y shape: (1105889,)
classes: [ 1  2  3  4  5  6  7  8  9 10] … total: 23


In [3]:
# Optional array loaded from DATA_PROCESSED; `subjects` is None when the file
# is absent.
SUBJECTS_PATH = DATA_PROCESSED / "subjects_db1.npy"
subjects = np.load(SUBJECTS_PATH) if SUBJECTS_PATH.exists() else None

if subjects is None:
    print("No subjects array found; using random sample split.")
else:
    print("subjects shape:", subjects.shape, "unique:", np.unique(subjects))

No subjects array found; using random sample split.


In [4]:
# Hold-out split. The fraction and seed are named so both branches below use
# the same values and the split is reproducible.
TEST_SIZE = 0.25
SPLIT_SEED = 0

if subjects is None:
    # No subject ids available: stratified random split over individual samples.
    Xtr, Xte, ytr, yte = train_test_split(
        X, y, test_size=TEST_SIZE, stratify=y, random_state=SPLIT_SEED
    )
else:
    # Subject ids are available: split by subject so that no subject contributes
    # samples to both the training and the test set.
    # NOTE: TEST_SIZE is then a fraction of subjects (groups), not of samples,
    # and class stratification is not applied in this branch.
    assert len(subjects) == len(X), "subjects must have one entry per row of X"
    splitter = GroupShuffleSplit(
        n_splits=1, test_size=TEST_SIZE, random_state=SPLIT_SEED
    )
    train_idx, test_idx = next(splitter.split(X, y, groups=subjects))
    Xtr, Xte = X[train_idx], X[test_idx]
    ytr, yte = y[train_idx], y[test_idx]

# Last expression: display the resulting shapes as the cell output.
Xtr.shape, Xte.shape, ytr.shape, yte.shape

((829416, 190), (276473, 190), (829416,), (276473,))