In [31]:
import pandas as pd

# Location of the raw dataset on disk.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact file exists.
raw_csv_path = r"C:\Users\aleaf\OneDrive\Desktop\Projects\obesity-levels-estimation-using-knn\data\raw\ObesityDataSet_raw_and_data_sinthetic.csv"

# Read the whole file into memory and preview the first rows.
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [32]:
# Name of the label column in `df`.
TARGET = "NObeyesdad"

# Labels, taken straight from the loaded frame.
y = df[TARGET]

# Phase 2 feature set: every column except the label, Weight included.
X_with_weight = df.drop(columns=[TARGET]).copy()

# Phase 1 feature set (already used): same columns with Weight removed as well.
X_no_weight = X_with_weight.drop(columns=["Weight"]).copy()

# Display the Phase 1 features as the cell output.
X_no_weight

Unnamed: 0,Gender,Age,Height,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS
0,Female,21.000000,1.620000,yes,no,2.0,3.0,Sometimes,no,2.000000,no,0.000000,1.000000,no,Public_Transportation
1,Female,21.000000,1.520000,yes,no,3.0,3.0,Sometimes,yes,3.000000,yes,3.000000,0.000000,Sometimes,Public_Transportation
2,Male,23.000000,1.800000,yes,no,2.0,3.0,Sometimes,no,2.000000,no,2.000000,1.000000,Frequently,Public_Transportation
3,Male,27.000000,1.800000,no,no,3.0,3.0,Sometimes,no,2.000000,no,2.000000,0.000000,Frequently,Walking
4,Male,22.000000,1.780000,no,no,2.0,1.0,Sometimes,no,2.000000,no,0.000000,0.000000,Sometimes,Public_Transportation
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,Female,20.976842,1.710730,yes,yes,3.0,3.0,Sometimes,no,1.728139,no,1.676269,0.906247,Sometimes,Public_Transportation
2107,Female,21.982942,1.748584,yes,yes,3.0,3.0,Sometimes,no,2.005130,no,1.341390,0.599270,Sometimes,Public_Transportation
2108,Female,22.524036,1.752206,yes,yes,3.0,3.0,Sometimes,no,2.054193,no,1.414209,0.646288,Sometimes,Public_Transportation
2109,Female,24.361936,1.739450,yes,yes,3.0,3.0,Sometimes,no,2.852339,no,1.139107,0.586035,Sometimes,Public_Transportation


In [33]:
# Column groups, split by how each group is meant to be encoded.

# Columns handled as plain numbers.
numeric_features = [
    "Age",
    "Height",
    "FCVC",
    "NCP",
    "CH2O",
    "FAF",
    "TUE",
]

# Two-valued categorical columns.
binary_features = [
    "Gender",
    "family_history_with_overweight",
    "FAVC",
    "SMOKE",
    "SCC",
]

# Categorical columns whose levels have an order.
ordinal_features = [
    "CAEC",
    "CALC",
]

# Categorical columns without an order.
nominal_features = [
    "MTRANS",
]

In [34]:
# Integer codes for the ordered categorical columns. Both columns share the
# same four labels, coded no=0 < Sometimes=1 < Frequently=2 < Always=3.
ordinal_mapping = {
    "CAEC": {"no": 0, "Sometimes": 1, "Frequently": 2, "Always": 3},
    "CALC": {"no": 0, "Sometimes": 1, "Frequently": 2, "Always": 3},
}

# Apply the same encoding to both feature frames.
for col, mapping in ordinal_mapping.items():
    for frame in (X_no_weight, X_with_weight):
        encoded = frame[col].map(mapping)

        # Series.map turns every value that is missing from the dict into NaN
        # without complaining. That also happens when this cell is re-run on
        # columns that already hold the integer codes, which would silently
        # wipe the column. Fail loudly instead; values that were already
        # missing in the input are left alone.
        unmapped = frame.loc[encoded.isna() & frame[col].notna(), col].unique()
        if len(unmapped) > 0:
            raise ValueError(
                f"Column {col!r} contains values with no ordinal code: "
                f"{list(unmapped)}. If the column is already encoded, rebuild "
                "the feature frames before running this cell again."
            )

        frame[col] = encoded

In [35]:
# Two-valued columns and the 0/1 code assigned to each of their labels.
binary_features = ["Gender", "family_history_with_overweight", "FAVC", "SMOKE", "SCC"]
binary_mapping = {"yes": 1, "no": 0, "Male": 1, "Female": 0}

# Apply the same encoding to both feature frames.
for col in binary_features:
    for frame in (X_no_weight, X_with_weight):
        encoded = frame[col].map(binary_mapping)

        # Series.map turns every value that is missing from the dict into NaN
        # without complaining. That also happens when this cell is re-run on
        # columns that already hold 0/1, which would silently wipe the column.
        # Fail loudly instead; values that were already missing in the input
        # are left alone.
        unmapped = frame.loc[encoded.isna() & frame[col].notna(), col].unique()
        if len(unmapped) > 0:
            raise ValueError(
                f"Column {col!r} contains values with no binary code: "
                f"{list(unmapped)}. If the column is already encoded, rebuild "
                "the feature frames before running this cell again."
            )

        frame[col] = encoded

In [36]:
# Expand MTRANS into one indicator column per category; no category is dropped.
X_no_weight = pd.get_dummies(X_no_weight, columns=["MTRANS"], drop_first=False)
X_with_weight = pd.get_dummies(X_with_weight, columns=["MTRANS"], drop_first=False)

# Column names that occur in both frames, in sorted order. Both frames are then
# re-selected with this shared ordering.
common_columns = sorted(set(X_no_weight.columns) & set(X_with_weight.columns))

X_no_weight = X_no_weight[common_columns]

# Weight is appended after the shared columns for the with-weight frame.
X_with_weight = X_with_weight[[*common_columns, "Weight"]]

In [37]:
from sklearn.preprocessing import StandardScaler

# Columns to standardise: the numeric columns plus the integer-coded ordinal
# columns. The with-weight variant additionally includes Weight.
ordinal_features = ["CAEC", "CALC"]
numeric_features_no_weight = ["Age", "Height", "FCVC", "NCP", "CH2O", "FAF", "TUE"]
numeric_features_with_weight = [*numeric_features_no_weight, "Weight"]

# One scaler per feature frame so each keeps its own fitted statistics.
# NOTE(review): both scalers are fit on every row of their frame; if a
# train/test split is made later, confirm this is intended.
scaler_no_weight = StandardScaler()
scaler_with_weight = StandardScaler()

scaled_cols_no_weight = numeric_features_no_weight + ordinal_features
scaled_values_no_weight = scaler_no_weight.fit_transform(X_no_weight[scaled_cols_no_weight])
X_no_weight[scaled_cols_no_weight] = scaled_values_no_weight

scaled_cols_with_weight = numeric_features_with_weight + ordinal_features
scaled_values_with_weight = scaler_with_weight.fit_transform(X_with_weight[scaled_cols_with_weight])
X_with_weight[scaled_cols_with_weight] = scaled_values_with_weight

In [38]:
from sklearn.metrics import pairwise_distances

# `X` was not assigned by any earlier cell, so this cell raised NameError on a
# fresh kernel. Bind it explicitly to one of the prepared feature frames.
# NOTE(review): X_no_weight is assumed here because it is the frame saved as
# the main X_features.csv below; switch to X_with_weight if the distances are
# meant for the Phase 2 (with Weight) features.
X = X_no_weight

# Full sample-by-sample distance matrices under two metrics.
euclidean_dist = pairwise_distances(X, metric="euclidean")
manhattan_dist = pairwise_distances(X, metric="manhattan")

In [39]:
# Each processed object paired with the CSV file it is written to.
# NOTE(review): the destinations are absolute, machine-specific paths.
exports = [
    (
        X_no_weight,
        r"C:\Users\aleaf\OneDrive\Desktop\Projects\obesity-levels-estimation-using-knn\data\processed\X_features.csv",
    ),
    (
        X_with_weight,
        r"C:\Users\aleaf\OneDrive\Desktop\Projects\obesity-levels-estimation-using-knn\data\processed\X_features_with_weight.csv",
    ),
    (
        y,
        r"C:\Users\aleaf\OneDrive\Desktop\Projects\obesity-levels-estimation-using-knn\data\processed\y_target.csv",
    ),
]

# Write every object without its index, in the order listed above.
for frame, destination in exports:
    frame.to_csv(destination, index=False)
