In [None]:
# --- 1) Configuration -------------------------------------------
# Location of the input CSV that the load cell passes to pd.read_csv.
# NOTE(review): the "/content" prefix is presumably the Colab working
# directory; adjust the path when running somewhere else.
CSV_PATH = "/content/expanded_fitness_data.csv"

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix
)


In [None]:
# --- 2) Load dataset --------------------------------------------
# Read the whole CSV into memory, report its (rows, columns) shape, and show
# the first three rows as a quick sanity check of the parsed columns.
df = pd.read_csv(CSV_PATH)

print("Shape:", df.shape)

preview = df.head(n=3)
display(preview)

Shape: (20000, 44)


Unnamed: 0,Age,Gender,Weight (kg),Height (m),Max_BPM,Avg_BPM,Resting_BPM,Session_Duration (hours),Calories_Burned,Workout_Type,...,Sets,Reps,Benefit,Burns Calories (per 30 min),Target Muscle Group,Equipment Needed,Difficulty Level,Body Part,Type of Muscle,Workout
0,34.91,Male,65.27,1.62,188.58,157.65,69.05,1.0,1080.9,Strength,...,4.99,20.91,Improves shoulder health and posture,342.58,"Shoulders, Triceps",Cable Machine,Advanced,Legs,Lats,Dumbbell flyes
1,23.37,Female,56.41,1.55,179.43,131.75,73.18,1.37,1809.91,HIIT,...,4.01,16.15,Strengthens lower abs,357.16,"Back, Core, Shoulders",Step or Box,Intermediate,Chest,Lats,Lateral raises
2,33.2,Female,58.98,1.67,175.04,123.95,54.96,0.91,802.26,Cardio,...,5.0,21.9,Builds chest strength,359.63,"Quadriceps, Glutes",Step or Box,Intermediate,Arms,Grip Strength,Standing calf raises


In [None]:
# --- 3) Choose the target column --------------------------------
# Change this if the label lives in a different column (it must be binary 0/1).
TARGET_COL = "is_healthy"

# Fail fast, before any splitting or fitting, when the label column is absent.
assert TARGET_COL in df.columns, (
    f"Target '{TARGET_COL}' tidak ada di dataset."
)


In [None]:
# --- 4) Train/Test split (stratified) ---------------------------
# The preview of `df` lists an "Unnamed: 0" column, which looks like a row
# index that was written into the CSV. A row counter carries no signal, and
# because KNN is distance-based it would still influence the neighbours once
# scaled, so any such column is kept out of the feature matrix. When no such
# column exists the list is empty and nothing extra is dropped.
index_like_cols = [col for col in df.columns if str(col).startswith("Unnamed:")]
if index_like_cols:
    print("Dropping index-like columns:", index_like_cols)

# Define features (X) and target (y)
X = df.drop(columns=[TARGET_COL, *index_like_cols])
y = df[TARGET_COL]

# Remove rows with missing target values: they cannot be used for training or
# scoring, and stratifying on a target that contains NaN is not meaningful.
mask = y.isna()
if mask.any():
    print(f"Removing {mask.sum()} rows with missing target values.")
    X = X[~mask]
    y = y[~mask]

# Hold out 20% for testing. `stratify=y` keeps the class ratio the same in both
# partitions and the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print("X_train:", X_train.shape, "| X_test:", X_test.shape)

Removing 1 rows with missing target values.
X_train: (14096, 43) | X_test: (3525, 43)


In [None]:
# --- 5) PREPROCESSING + ENCODING --------------------------------
# Columns are routed by dtype, as seen in the training split:
#   * numeric     -> median imputation, then standardisation
#   * non-numeric -> most-frequent imputation, then one-hot encoding
#     (categories unseen at fit time are ignored at transform time)
num_cols = list(X_train.select_dtypes(include="number").columns)
cat_cols = list(X_train.select_dtypes(exclude="number").columns)

num_tf = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

cat_tf = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Any column not listed in either group is discarded by remainder="drop".
preprocess = ColumnTransformer(
    [("num", num_tf, num_cols), ("cat", cat_tf, cat_cols)],
    remainder="drop",
)

In [None]:
# --- 6) KNN model (wrapped together with the preprocessing) -----
# Cast the labels to integers before fitting.
y_train = y_train.astype(int)

# 5 nearest neighbours, Minkowski distance with p=2, every neighbour's vote
# weighted equally.
knn = KNeighborsClassifier(weights="uniform", p=2, n_neighbors=5)

# Chaining the preprocessor and the classifier means the imputers, scaler and
# encoder are fitted on the training data passed to fit() below.
pipe = Pipeline([
    ("prep", preprocess),
    ("knn", knn),
])

# Train (left as the last expression so the fitted pipeline is displayed)
pipe.fit(X_train, y_train)

In [None]:
# --- 7) Predictions & METRICS -----------------------------------
# Cast the held-out labels to integers so they match the predicted labels.
y_test = y_test.astype(int)
y_pred = pipe.predict(X_test)

# Compute every metric first, then print them in a fixed order.
acc = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, zero_division=0)
# labels=[0, 1] pins the row/column order of the matrix to class 0, class 1.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

# Accuracy
print(f"\nAccuracy (test): {acc:.4f}")

# Per-class precision / recall / F1
print("\n=== Classification Report ===")
print(report)

# Confusion matrix (raw counts)
print("\nConfusion Matrix (array):\n", cm)


Accuracy (test): 0.9821

=== Classification Report ===
              precision    recall  f1-score   support

           0       1.00      0.98      0.99      3268
           1       0.82      0.96      0.89       257

    accuracy                           0.98      3525
   macro avg       0.91      0.97      0.94      3525
weighted avg       0.98      0.98      0.98      3525


Confusion Matrix (array):
 [[3214   54]
 [   9  248]]
