# Supervised Learning notebook

This notebook uses supervised learning algorithms to draw inferences from the given obesity dataset.

The goal is to determine whether the feature variables can reliably predict the correct obesity classification.

#### Initial import and data check

In [1]:
# initial imports
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score
)

# Set random seed
# Seeds NumPy's legacy global random state so that anything drawing from
# np.random gives the same results on every run.
np.random.seed(13)   # unlucky for some

In [3]:
# import obesity data
# The path is relative to the notebook's working directory.
# NOTE(review): the "sinthetic" spelling presumably matches the file name on
# disk -- confirm before "correcting" it.
obesity_df = pd.read_csv("assignment_dataset/ObesityDataSet_raw_and_data_sinthetic.csv")

#### Redo ordinal targets and X dataset

In [11]:
# Ordinal target mapping for 7 obesity levels (0 = lowest weight category,
# 6 = highest), so the integer codes preserve the order of the classes.
obesity_map = {
    'Insufficient_Weight': 0,
    'Normal_Weight': 1,
    'Overweight_Level_I': 2,
    'Overweight_Level_II': 3,
    'Obesity_Type_I': 4,
    'Obesity_Type_II': 5,
    'Obesity_Type_III': 6
}

# Create encoded target
y = obesity_df['NObeyesdad'].map(obesity_map)

# Series.map turns any label that is not a key of obesity_map into NaN without
# warning. Fail here, with the offending labels listed, rather than passing a
# target containing NaN on to the split and the models.
unmapped_labels = obesity_df.loc[y.isna(), 'NObeyesdad'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"NObeyesdad contains labels missing from obesity_map: {list(unmapped_labels)}"
    )

# Reverse mapping (code -> label) for later visualisation. Swap k and v pairs.
reverse_map = {v: k for k, v in obesity_map.items()}

In [12]:
# Feature matrix: every column of the raw frame except the target label
X = obesity_df.drop(columns=['NObeyesdad'])

#### Stratified train/test split

In [20]:
# Stratified train/test split (75/25, because test_size=0.25)
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    random_state=13,    # fixed seed so the split is reproducible
    stratify=y  # keeps the class proportions of y the same in train and test
)

#### Feature encoding

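Rather than encoding each feature group in its own cell, the encoding is wrapped in a single function so that exactly the same steps are applied to the train and test sets.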

In [21]:
# Column groups, one list per kind of feature. The three categorical lists
# are read by encode_features.

# already numeric
numerical_features = [
    'Age', 'Height', 'Weight',
    'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE',
]

# yes/no columns
binary_features = [
    'family_history_with_overweight',
    'FAVC',
    'SMOKE',
    'SCC',
]

# ordered frequency levels
ordinal_features = [
    'CAEC',
    'CALC',
]

# unordered categories (one-hot encoded)
nominal_features = [
    'Gender',
    'MTRANS',
]

In [22]:
# function to encode features
def encode_features(df: pd.DataFrame) -> pd.DataFrame:
    """Encode the categorical feature columns of ``df`` as numeric values.

    The columns to encode are read from the module-level lists
    ``binary_features``, ``ordinal_features`` and ``nominal_features``:

    * binary columns become 1 where the value is exactly ``'yes'`` and 0
      for anything else;
    * ordinal columns are mapped ``no`` < ``Sometimes`` < ``Frequently`` <
      ``Always`` onto 0-3;
    * nominal columns are one-hot encoded with the first level dropped.

    All other columns are passed through unchanged, and the input frame is
    not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame containing every column named in the three lists.

    Returns
    -------
    pandas.DataFrame
        Encoded copy of ``df``.

    Raises
    ------
    ValueError
        If an ordinal column holds a non-missing label that is not in the
        mapping. ``Series.map`` would otherwise turn it into NaN silently.

    Notes
    -----
    The one-hot columns are derived from the categories present in ``df``,
    so frames encoded separately (for example train and test) can end up
    with different columns when a category is missing from one of them.
    """
    df_enc = df.copy()

    # binary = yes/no -> 1/0 (anything that is not exactly 'yes' becomes 0)
    for col in binary_features:
        df_enc[col] = (df_enc[col] == 'yes').astype(int)

    # ordinal = map to 0/1/2/3
    ordinal_mapping = {'no': 0, 'Sometimes': 1, 'Frequently': 2, 'Always': 3}
    for col in ordinal_features:
        mapped = df_enc[col].map(ordinal_mapping)

        # A NaN result from a non-missing input means the label was not a key
        # of ordinal_mapping (e.g. a typo or different capitalisation).
        unmapped = df_enc.loc[mapped.isna() & df_enc[col].notna(), col].unique()
        if len(unmapped) > 0:
            raise ValueError(
                f"Column {col!r} contains labels missing from the ordinal "
                f"mapping: {list(unmapped)}"
            )

        df_enc[col] = mapped

    # nominal = one-hot encode (drop_first=True)
    df_enc = pd.get_dummies(df_enc, columns=nominal_features, drop_first=True)

    return df_enc

In [23]:
# Apply encoding to train and test
X_train_enc = encode_features(X_train)

# The one-hot columns come from whichever categories are present in each split,
# so a rare category missing from one split would give train and test different
# columns. Align the test frame to the training columns: same names, same
# order, and any dummy column absent from the test split filled with 0.
# When the columns already match this leaves the values unchanged.
X_test_enc = encode_features(X_test).reindex(columns=X_train_enc.columns, fill_value=0)

In [26]:
# Inspect the encoded training features (pandas truncates the displayed rows)
X_train_enc

Unnamed: 0,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,Gender_Male,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
1063,45.000000,1.675953,79.668320,1,1,2.598051,3.000000,1,0,1.000000,0,0.000000,0.000000,0,False,False,False,False,False
118,23.000000,1.620000,53.000000,1,1,2.000000,3.000000,1,0,2.000000,0,1.000000,1.000000,1,True,False,False,True,False
877,18.000000,1.647971,68.818893,1,1,2.000000,1.411685,1,0,1.859089,0,0.000000,1.306000,0,False,False,False,True,False
480,18.000000,1.570000,50.000000,0,1,2.000000,3.000000,1,0,1.000000,0,0.000000,1.000000,1,False,False,False,True,False
447,19.000000,1.800000,87.000000,1,1,2.000000,4.000000,1,0,2.000000,0,2.000000,1.000000,1,True,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1633,30.475248,1.801368,121.094257,1,1,2.328469,3.000000,1,0,2.001208,0,0.800487,0.176678,1,True,False,False,False,False
541,19.717249,1.688426,49.660995,0,0,2.714447,3.000000,2,0,2.000000,0,1.903182,1.000000,1,False,False,False,True,False
1317,24.184891,1.768834,97.449743,1,1,2.000000,3.000000,1,0,2.973729,0,2.491642,1.365950,0,True,False,False,True,False
1035,17.971574,1.720379,85.000000,1,1,2.000000,3.000000,1,0,2.802498,0,1.000000,0.417580,1,True,False,False,True,False


In [27]:
# Inspect the encoded test features (pandas truncates the displayed rows)
X_test_enc

Unnamed: 0,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,Gender_Male,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
743,18.381382,1.722547,53.783977,1,1,2.000000,3.131032,1,0,2.072194,0,1.487987,2.000000,1,True,False,False,True,False
280,21.000000,1.750000,62.000000,0,1,3.000000,4.000000,2,1,2.000000,0,0.000000,0.000000,1,True,False,False,True,False
261,28.000000,1.700000,73.000000,1,0,2.000000,3.000000,2,0,2.000000,1,2.000000,0.000000,1,False,False,False,False,True
1983,20.908785,1.700996,126.490236,1,1,3.000000,3.000000,1,0,1.242832,0,0.530925,0.575969,1,False,False,False,True,False
85,23.000000,1.650000,58.500000,1,0,2.000000,3.000000,1,0,2.000000,0,0.000000,0.000000,0,True,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
256,18.000000,1.750000,85.000000,1,0,2.000000,3.000000,1,0,3.000000,0,1.000000,0.000000,1,True,False,False,True,False
1083,24.122589,1.856759,95.887056,1,1,1.116068,2.449067,1,0,2.000000,0,0.926350,1.971170,1,True,False,False,True,False
967,32.278869,1.646020,74.147443,1,1,2.885178,2.562895,1,0,1.017006,0,0.588673,0.916291,1,False,False,False,False,False
1560,25.300208,1.765258,114.330023,1,1,1.562804,3.000000,1,0,2.075493,0,1.553734,0.000436,1,True,False,False,True,False
