In [54]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from scipy.stats import zscore

In [55]:
# Load the raw dataset from the CSV file that sits next to the notebook.
DATA_PATH = "ObesityDataSet_raw_and_data_sinthetic.csv"
df = pd.read_csv(DATA_PATH)

In [56]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [57]:
# Count the missing values in each column.
df.isna().sum()

Unnamed: 0,0
Gender,0
Age,0
Height,0
Weight,0
family_history_with_overweight,0
FAVC,0
FCVC,0
NCP,0
CAEC,0
SMOKE,0


In [58]:
# Print the column names, non-null counts and dtypes of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          2111 non-null   object 
 1   Age                             2111 non-null   int64  
 2   Height                          2111 non-null   float64
 3   Weight                          2111 non-null   float64
 4   family_history_with_overweight  2111 non-null   object 
 5   FAVC                            2111 non-null   object 
 6   FCVC                            2111 non-null   float64
 7   NCP                             2111 non-null   float64
 8   CAEC                            2111 non-null   object 
 9   SMOKE                           2111 non-null   object 
 10  CH2O                            2111 non-null   float64
 11  SCC                             2111 non-null   object 
 12  FAF                             21

In [59]:
# Print the frequency table of every column, one column at a time.
# The trailing "\n" argument reproduces the blank lines between tables.
for name, values in df.items():
    print(f"Column: {name}", values.value_counts(), "\n", sep="\n")

Column: Gender
Gender
Male      1068
Female    1043
Name: count, dtype: int64


Column: Age
Age
21    236
23    218
26    213
18    212
19    169
22    163
20    150
24     95
25     82
17     69
31     62
30     53
27     43
33     37
38     34
29     33
34     29
32     26
39     25
37     24
41     20
28     20
16     20
35     19
40     19
36      6
44      6
42      6
55      5
45      3
43      3
46      2
51      2
61      1
52      1
15      1
56      1
14      1
48      1
47      1
Name: count, dtype: int64


Column: Height
Height
1.70    125
1.75    122
1.62     96
1.76     96
1.65     88
1.60     77
1.72     76
1.63     75
1.77     71
1.71     68
1.64     66
1.74     66
1.67     66
1.79     65
1.78     64
1.61     62
1.68     61
1.80     59
1.66     58
1.69     54
1.82     50
1.73     43
1.84     40
1.56     39
1.85     39
1.81     39
1.83     35
1.55     32
1.57     30
1.59     29
1.58     27
1.53     27
1.87     22
1.86     21
1.54     20
1.52     19
1.50     17
1.91     1

**Encoding**

In [60]:
# Encode the two-valued categorical columns as 0/1 integers.
binarycols = ['Gender', 'family_history_with_overweight', 'FAVC', 'SMOKE', 'SCC']

mappingbiner = {'Male': 1, 'Female': 0,
                'yes': 1, 'no': 0}

for col in binarycols:
    # Re-running this cell must be a no-op: pushing an already-encoded
    # (numeric) column through the string dictionary would turn every
    # value into NaN.
    if pd.api.types.is_numeric_dtype(df[col]):
        continue
    encoded = df[col].map(mappingbiner)
    # Series.map() yields NaN for labels missing from the dictionary;
    # fail loudly instead of letting NaNs leak into later cells.
    unmapped = df.loc[encoded.isna() & df[col].notna(), col].unique()
    if len(unmapped) > 0:
        raise ValueError(f"Unmapped values in column {col!r}: {list(unmapped)}")
    df[col] = encoded

In [61]:
# Encode the ordered frequency columns as integers 0..3
# (no < Sometimes < Frequently < Always).
labelcols = ['CALC', 'CAEC']

mappinglabel = {'no': 0, 'Sometimes': 1, 'Frequently': 2, 'Always': 3}

for col in labelcols:
    # Skip columns that are already numeric so that re-running the cell
    # does not map the encoded integers to NaN.
    if pd.api.types.is_numeric_dtype(df[col]):
        continue
    encoded = df[col].map(mappinglabel)
    # Series.map() yields NaN for labels missing from the dictionary;
    # surface those immediately rather than silently creating NaNs.
    unmapped = df.loc[encoded.isna() & df[col].notna(), col].unique()
    if len(unmapped) > 0:
        raise ValueError(f"Unmapped values in column {col!r}: {list(unmapped)}")
    df[col] = encoded

In [62]:
# Integer-encode the target column in place. LabelEncoder numbers the
# classes in sorted order of their labels; the original names stay
# available through label_encoder.classes_.
# The dtype guard makes the cell safe to re-run: re-fitting on the
# already-encoded integers would replace the class names stored in
# label_encoder.classes_ with plain integers.
if not pd.api.types.is_numeric_dtype(df['NObeyesdad']):
    label_encoder = LabelEncoder()
    df['NObeyesdad'] = label_encoder.fit_transform(df['NObeyesdad'])

In [63]:
# One-hot encode MTRANS as integer indicator columns; drop_first=True
# omits the indicator for the first category level.
df = pd.get_dummies(
    df,
    columns=['MTRANS'],
    drop_first=True,
    dtype=int,
)

In [64]:
# Inspect the first five rows after encoding.
df.head(n=5)

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,NObeyesdad,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
0,0,21,1.62,64.0,1,0,2.0,3.0,1,0,2.0,0,0.0,1.0,0,1,0,0,1,0
1,0,21,1.52,56.0,1,0,3.0,3.0,1,1,3.0,1,3.0,0.0,1,1,0,0,1,0
2,1,23,1.8,77.0,1,0,2.0,3.0,1,0,2.0,0,2.0,1.0,2,1,0,0,1,0
3,1,27,1.8,87.0,0,0,3.0,3.0,1,0,2.0,0,2.0,0.0,2,5,0,0,0,1
4,1,22,1.78,89.8,0,0,2.0,1.0,1,0,2.0,0,0.0,0.0,1,6,0,0,1,0


In [65]:
# Numeric columns to screen for outliers.
numcols = ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']

# Absolute z-score of every value: |x - column mean| / column std
# (DataFrame.std uses the sample standard deviation, ddof=1).
numeric = df[numcols]
z_scores = numeric.sub(numeric.mean()).div(numeric.std()).abs()

# Number of values per column lying more than 3 standard deviations
# from the column mean.
outliers = z_scores.gt(3).sum()

In [66]:
# Display the per-column count of values with |z-score| > 3.
outliers

Unnamed: 0,0
Age,23
Height,0
Weight,1
FCVC,0
NCP,0
CH2O,0
FAF,0
TUE,0


In [69]:
# List the individual Age and Weight values whose |z-score| exceeds 3.
age_is_outlier = z_scores['Age'].gt(3)
weight_is_outlier = z_scores['Weight'].gt(3)

outliers_age = df['Age'][age_is_outlier]
outliers_weight = df['Weight'][weight_is_outlier]

print("Outlier Age:\n", outliers_age)
print("Outlier Weight:\n", outliers_weight)

Outlier Age:
 21      52
92      55
133     61
137     44
161     55
169     45
232     51
252     56
492     45
1013    55
1034    51
1063    45
1088    55
1101    46
1158    55
1179    48
1208    46
1215    44
1267    44
1286    47
1305    44
1386    44
1490    44
Name: Age, dtype: int64
Outlier Weight:
 344    173.0
Name: Weight, dtype: float64


In [78]:
# Replace every value flagged as an outlier (|z| > 3 according to the
# z_scores frame computed in an earlier cell) with its column's median.
# The medians are taken before any replacement happens.
column_medians = df[numcols].median()
for name in numcols:
    is_outlier = z_scores[name] > 3
    df[name] = np.where(is_outlier, column_medians[name], df[name])

In [79]:
# Standardize the numeric columns in place: learn the per-column
# statistics first, then apply the transformation. The fitted scaler is
# kept in `scaler`.
scaler = StandardScaler()
scaler.fit(df[numcols])
df[numcols] = scaler.transform(df[numcols])

In [80]:
# Inspect the first five rows after outlier replacement and scaling.
# NOTE(review): the saved output of this cell lists a NObeyesdad_encoded
# column that no cell visible here creates — presumably left over from a
# deleted or out-of-order cell; confirm with Restart Kernel -> Run All.
df.head(n=5)

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,...,SCC,FAF,TUE,CALC,NObeyesdad,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking,NObeyesdad_encoded
0,0,-0.525242,-0.87438,-0.863156,1,0,-0.78481,0.404102,1,0,...,0,-1.188028,0.562005,0,1,0,0,1,0,1
1,0,-0.525242,-1.94566,-1.169465,1,0,1.088307,0.404102,1,1,...,1,2.339676,-1.080619,1,1,0,0,1,0,1
2,1,-0.178584,1.053924,-0.365404,1,0,-0.78481,0.404102,1,0,...,0,1.163774,0.562005,2,1,0,0,1,0,1
3,1,0.514732,1.053924,0.017483,0,0,1.088307,0.404102,1,0,...,0,1.163774,-1.080619,2,5,0,0,0,1,5
4,1,-0.351913,0.839668,0.124691,0,0,-0.78481,-2.166941,1,0,...,0,-1.188028,-1.080619,1,6,0,0,1,0,6
