In [31]:
import numpy as np
import pandas as pd
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt


# Fetch dataset
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
obesity_level = fetch_ucirepo(id=544)

# data (as pandas dataframes) 

x = obesity_level.data.features
y = obesity_level.data.targets


features_df = pd.DataFrame(obesity_level['data']['features']) # This only has features
targets_df = pd.DataFrame(obesity_level['data']['targets']) # This only has targets


frames = [features_df, targets_df]
df = pd.concat(frames, axis=1)

#Drops duplicates
df.drop_duplicates(inplace=True)

#Drops NaN
df.dropna(axis=1, inplace=True)

print(df.shape)

df.nunique()


(2087, 17)


Gender                               2
Age                               1402
Height                            1574
Weight                            1525
family_history_with_overweight       2
FAVC                                 2
FCVC                               810
NCP                                635
CAEC                                 4
SMOKE                                2
CH2O                              1268
SCC                                  2
FAF                               1190
TUE                               1129
CALC                                 4
MTRANS                               5
NObeyesdad                           7
dtype: int64

In [32]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
pd.set_option('display.max_columns', None)

# print(df.dtypes)
# Identify categorical columns
categorical_cols = df.select_dtypes(include=['object']).columns
print(categorical_cols)

numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
print(numeric_cols)

Index(['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE',
       'SCC', 'CALC', 'MTRANS', 'NObeyesdad'],
      dtype='object')
Index(['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE'], dtype='object')


In [33]:
# Initialize LabelEncoder
le = LabelEncoder()

# Apply Label Encoding to each categorical column
for col in categorical_cols:
    df[col] = le.fit_transform(df[col])


print(df.head())

   Gender   Age  Height  Weight  family_history_with_overweight  FAVC  FCVC  \
0       0  21.0    1.62    64.0                               1     0   2.0   
1       0  21.0    1.52    56.0                               1     0   3.0   
2       1  23.0    1.80    77.0                               1     0   2.0   
3       1  27.0    1.80    87.0                               0     0   3.0   
4       1  22.0    1.78    89.8                               0     0   2.0   

   NCP  CAEC  SMOKE  CH2O  SCC  FAF  TUE  CALC  MTRANS  NObeyesdad  
0  3.0     2      0   2.0    0  0.0  1.0     3       3           1  
1  3.0     2      1   3.0    1  3.0  0.0     2       3           1  
2  3.0     2      0   2.0    0  2.0  1.0     1       3           1  
3  3.0     2      0   2.0    0  2.0  0.0     1       4           5  
4  1.0     2      0   2.0    0  0.0  0.0     2       3           6  


In [34]:
# Removing outliers (occurs after encoding but before scaling)
print(df.shape)
# Select only numeric columns
numeric_df = df.select_dtypes(include=[np.number])

# Calculate Z-scores for numeric columns
z_scores = np.abs((numeric_df - numeric_df.mean()) / numeric_df.std())

# Filter out outliers
df_cleaned = df[(z_scores < 3).all(axis=1)]

features_df = df.drop(columns='NObeyesdad')
targets_df = df[['NObeyesdad']]

print(df_cleaned.shape)
print(features_df.head())
print(targets_df.head())

(2087, 17)
(1891, 17)
   Gender   Age  Height  Weight  family_history_with_overweight  FAVC  FCVC  \
0       0  21.0    1.62    64.0                               1     0   2.0   
1       0  21.0    1.52    56.0                               1     0   3.0   
2       1  23.0    1.80    77.0                               1     0   2.0   
3       1  27.0    1.80    87.0                               0     0   3.0   
4       1  22.0    1.78    89.8                               0     0   2.0   

   NCP  CAEC  SMOKE  CH2O  SCC  FAF  TUE  CALC  MTRANS  
0  3.0     2      0   2.0    0  0.0  1.0     3       3  
1  3.0     2      1   3.0    1  3.0  0.0     2       3  
2  3.0     2      0   2.0    0  2.0  1.0     1       3  
3  3.0     2      0   2.0    0  2.0  0.0     1       4  
4  1.0     2      0   2.0    0  0.0  0.0     2       3  
   NObeyesdad
0           1
1           1
2           1
3           5
4           6


In [35]:
df.corr()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
Gender,1.0,0.050641,0.626748,0.163176,0.113492,0.06122,-0.271575,0.077863,0.074564,0.045501,0.095129,-0.102435,0.189471,0.022356,0.010574,-0.139044,0.014699
Age,0.050641,1.0,-0.031748,0.19816,0.200379,0.063895,0.013572,-0.055823,0.092097,0.091261,-0.044058,-0.117959,-0.148202,-0.302927,-0.045565,-0.601476,0.238308
Height,0.626748,-0.031748,1.0,0.457468,0.232258,0.180694,-0.040363,0.227806,0.058001,0.054326,0.220487,-0.137078,0.293584,0.041808,-0.135756,-0.068258,0.0387
Weight,0.163176,0.19816,0.457468,1.0,0.492969,0.274655,0.216574,0.092149,0.300271,0.024369,0.203823,-0.205409,-0.05649,-0.079351,-0.211351,0.009836,0.388802
family_history_with_overweight,0.113492,0.200379,0.232258,0.492969,1.0,0.214329,0.033199,0.028411,0.207738,0.014885,0.168627,-0.193947,-0.062937,0.002314,0.028403,-0.09273,0.330391
FAVC,0.06122,0.063895,0.180694,0.274655,0.214329,1.0,-0.025419,-0.006398,0.147921,-0.050713,0.002993,-0.191277,-0.111184,0.071505,-0.087661,-0.069422,0.041023
FCVC,-0.271575,0.013572,-0.040363,0.216574,0.033199,-0.025419,1.0,0.034885,-0.038565,0.013716,0.081332,0.071179,0.022003,-0.104128,-0.063132,0.069012,0.025728
NCP,0.077863,-0.055823,0.227806,0.092149,0.028411,-0.006398,0.034885,1.0,-0.072316,0.005009,0.075335,-0.020461,0.127816,0.015693,-0.082985,-0.043648,-0.088235
CAEC,0.074564,0.092097,0.058001,0.300271,0.207738,0.147921,-0.038565,-0.072316,1.0,-0.05496,0.114801,-0.109393,-0.034803,-0.036708,-0.039502,-0.058158,0.31451
SMOKE,0.045501,0.091261,0.054326,0.024369,0.014885,-0.050713,0.013716,0.005009,-0.05496,1.0,-0.031642,0.047384,0.010811,0.016491,-0.083181,-0.009995,-0.023341


In [36]:
# Standardize the numeric columns
scaler = StandardScaler()

df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

features_df = df.drop(columns=['NObeyesdad'])
targets_df = df['NObeyesdad']

print(df.head())

   Gender       Age    Height    Weight  family_history_with_overweight  FAVC  \
0       0 -0.526613 -0.887408 -0.872985                               1     0   
1       0 -0.526613 -1.960788 -1.178508                               1     0   
2       1 -0.212507  1.044677 -0.376509                               1     0   
3       1  0.415705  1.044677  0.005395                               0     0   
4       1 -0.369560  0.830001  0.112328                               0     0   

       FCVC       NCP  CAEC  SMOKE      CH2O  SCC       FAF       TUE  CALC  \
0 -0.788364  0.390906     2      0 -0.007810    0 -1.186977  0.554211     3   
1  1.082164  0.390906     2      1  1.636552    1  2.328908 -1.090505     2   
2 -0.788364  0.390906     2      0 -0.007810    0  1.156947  0.554211     1   
3  1.082164  0.390906     2      0 -0.007810    0  1.156947 -1.090505     1   
4 -0.788364 -2.225418     2      0 -0.007810    0 -1.186977 -1.090505     2   

   MTRANS  NObeyesdad  
0       3     

In [37]:
df.corr()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
Gender,1.0,0.050641,0.626748,0.163176,0.113492,0.06122,-0.271575,0.077863,0.074564,0.045501,0.095129,-0.102435,0.189471,0.022356,0.010574,-0.139044,0.014699
Age,0.050641,1.0,-0.031748,0.19816,0.200379,0.063895,0.013572,-0.055823,0.092097,0.091261,-0.044058,-0.117959,-0.148202,-0.302927,-0.045565,-0.601476,0.238308
Height,0.626748,-0.031748,1.0,0.457468,0.232258,0.180694,-0.040363,0.227806,0.058001,0.054326,0.220487,-0.137078,0.293584,0.041808,-0.135756,-0.068258,0.0387
Weight,0.163176,0.19816,0.457468,1.0,0.492969,0.274655,0.216574,0.092149,0.300271,0.024369,0.203823,-0.205409,-0.05649,-0.079351,-0.211351,0.009836,0.388802
family_history_with_overweight,0.113492,0.200379,0.232258,0.492969,1.0,0.214329,0.033199,0.028411,0.207738,0.014885,0.168627,-0.193947,-0.062937,0.002314,0.028403,-0.09273,0.330391
FAVC,0.06122,0.063895,0.180694,0.274655,0.214329,1.0,-0.025419,-0.006398,0.147921,-0.050713,0.002993,-0.191277,-0.111184,0.071505,-0.087661,-0.069422,0.041023
FCVC,-0.271575,0.013572,-0.040363,0.216574,0.033199,-0.025419,1.0,0.034885,-0.038565,0.013716,0.081332,0.071179,0.022003,-0.104128,-0.063132,0.069012,0.025728
NCP,0.077863,-0.055823,0.227806,0.092149,0.028411,-0.006398,0.034885,1.0,-0.072316,0.005009,0.075335,-0.020461,0.127816,0.015693,-0.082985,-0.043648,-0.088235
CAEC,0.074564,0.092097,0.058001,0.300271,0.207738,0.147921,-0.038565,-0.072316,1.0,-0.05496,0.114801,-0.109393,-0.034803,-0.036708,-0.039502,-0.058158,0.31451
SMOKE,0.045501,0.091261,0.054326,0.024369,0.014885,-0.050713,0.013716,0.005009,-0.05496,1.0,-0.031642,0.047384,0.010811,0.016491,-0.083181,-0.009995,-0.023341


In [38]:
# features_df.drop(columns=["Gender", "Height", "FAVC", "FCVC", "SMOKE", "MTRANS"], inplace=True)
features_df = features_df[['Age','Height','Weight','family_history_with_overweight','CAEC']]

df.nunique()

Gender                               2
Age                               1402
Height                            1574
Weight                            1525
family_history_with_overweight       2
FAVC                                 2
FCVC                               810
NCP                                635
CAEC                                 4
SMOKE                                2
CH2O                              1268
SCC                                  2
FAF                               1190
TUE                               1129
CALC                                 4
MTRANS                               5
NObeyesdad                           7
dtype: int64

In [39]:
# 90/10 train test split

print(features_df.shape)
print(targets_df.shape)
x_train, x_test, y_train, y_test = train_test_split(features_df, targets_df, test_size=0.1, random_state=10)

#Train Model. SGDRegressor uses Stochastic Gradient Descent method
model = SGDRegressor(max_iter=1000, tol=1e-3, learning_rate='adaptive', eta0=0.001, random_state=10)
model.fit(x_train, y_train)
model.coef_

(2087, 5)
(2087,)


array([ 0.25881899, -0.26481541,  0.56012401,  0.77540258,  0.84871292])

In [40]:
from sklearn.metrics import r2_score, mean_squared_error

y_pred_train = model.predict(x_train)
y_pred_test = model.predict(x_test)

r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

mse_train = mean_squared_error(y_train, y_pred_train)
mse_test = mean_squared_error(y_test, y_pred_test)

print(f'R-squared (Train): {r2_train}')
print(f'R-squared (Test): {r2_test}')

print(f'MSE (Train): {mse_train}')
print(f'MSE (Test): {mse_test}')

R-squared (Train): 0.24011901649834433
R-squared (Test): 0.32954922190013347
MSE (Train): 2.866313848574987
MSE (Test): 2.675141734736516
