# Obesity Level Prediction Application

In [42]:
import pandas as pd    
import numpy as np   

In [43]:
data = pd.read_csv('ObesityDataSet.csv')

In [44]:
data.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [45]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          2111 non-null   object 
 1   Age                             2111 non-null   int64  
 2   Height                          2111 non-null   float64
 3   Weight                          2111 non-null   float64
 4   family_history_with_overweight  2111 non-null   object 
 5   FAVC                            2111 non-null   object 
 6   FCVC                            2111 non-null   float64
 7   NCP                             2111 non-null   float64
 8   CAEC                            2111 non-null   object 
 9   SMOKE                           2111 non-null   object 
 10  CH2O                            2111 non-null   float64
 11  SCC                             2111 non-null   object 
 12  FAF                             21

## Data Cleaning and Summary Statistics

In [46]:
data.describe()

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
count,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0
mean,24.315964,1.70162,86.586035,2.418986,2.685651,2.008053,1.010313,0.657861
std,6.357078,0.093368,26.191163,0.533996,0.778079,0.61295,0.850613,0.608926
min,14.0,1.45,39.0,1.0,1.0,1.0,0.0,0.0
25%,20.0,1.63,65.47,2.0,2.66,1.585,0.125,0.0
50%,23.0,1.7,83.0,2.39,3.0,2.0,1.0,0.625
75%,26.0,1.77,107.43,3.0,3.0,2.48,1.67,1.0
max,61.0,1.98,173.0,3.0,4.0,3.0,3.0,2.0


In [47]:
data['Gender'].value_counts()

Gender
Male      1068
Female    1043
Name: count, dtype: int64

## Feature Engineering

In [48]:
from feature_engine.encoding import OrdinalEncoder 

In [49]:
x = data.drop(columns=['NObeyesdad'],axis=1)
y = data['NObeyesdad']

In [50]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [51]:
x_train.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS
162,Female,21,1.63,60.0,yes,yes,3.0,3.0,Always,yes,2.0,no,2.0,0.0,Sometimes,Public_Transportation
2001,Female,21,1.75,133.62,yes,yes,3.0,3.0,Sometimes,no,2.89,no,1.48,0.78,Sometimes,Public_Transportation
1435,Female,23,1.66,82.6,yes,yes,1.2,1.36,Sometimes,no,2.77,no,0.13,1.659,Sometimes,Public_Transportation
649,Female,22,1.59,44.24,no,no,3.0,1.7,Frequently,no,2.55,no,1.1,0.0,no,Public_Transportation
1280,Male,26,1.81,106.04,yes,yes,3.0,3.0,Sometimes,no,2.86,no,1.81,0.68,Sometimes,Public_Transportation


In [52]:
oe1 = OrdinalEncoder(encoding_method='arbitrary', variables=['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'CALC', 'MTRANS', 'SCC'])

In [53]:
x_train = oe1.fit_transform(x_train)
x_test = oe1.transform(x_test)

In [54]:
x_train.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS
162,0,21,1.63,60.0,0,0,3.0,3.0,0,0,2.0,0,2.0,0.0,0,0
2001,0,21,1.75,133.62,0,0,3.0,3.0,1,1,2.89,0,1.48,0.78,0,0
1435,0,23,1.66,82.6,0,0,1.2,1.36,1,1,2.77,0,0.13,1.659,0,0
649,0,22,1.59,44.24,1,1,3.0,1.7,2,1,2.55,0,1.1,0.0,1,0
1280,1,26,1.81,106.04,0,0,3.0,3.0,1,1,2.86,0,1.81,0.68,0,0


In [55]:
oe1.encoder_dict_

{'Gender': {'Female': 0, 'Male': 1},
 'family_history_with_overweight': {'yes': 0, 'no': 1},
 'FAVC': {'yes': 0, 'no': 1},
 'CAEC': {'Always': 0, 'Sometimes': 1, 'Frequently': 2, 'no': 3},
 'SMOKE': {'yes': 0, 'no': 1},
 'CALC': {'Sometimes': 0, 'no': 1, 'Frequently': 2, 'Always': 3},
 'MTRANS': {'Public_Transportation': 0,
  'Automobile': 1,
  'Motorbike': 2,
  'Walking': 3,
  'Bike': 4},
 'SCC': {'no': 0, 'yes': 1}}

In [56]:
y_train = pd.DataFrame(y_train, columns=['NObeyesdad'])
y_test = pd.DataFrame(y_test, columns=['NObeyesdad'])

In [57]:
oe2 = OrdinalEncoder(encoding_method='arbitrary', variables=['NObeyesdad'])
y_train = oe2.fit_transform(y_train)
y_test = oe2.transform(y_test)

In [58]:
y_train.head()

Unnamed: 0,NObeyesdad
162,0
2001,1
1435,2
649,3
1280,2


In [59]:
oe2.encoder_dict_

{'NObeyesdad': {'Normal_Weight': 0,
  'Obesity_Type_III': 1,
  'Obesity_Type_I': 2,
  'Insufficient_Weight': 3,
  'Obesity_Type_II': 4,
  'Overweight_Level_II': 5,
  'Overweight_Level_I': 6}}

In [60]:
from sklearn.preprocessing import MinMaxScaler
mn = MinMaxScaler()
x_train_mn = mn.fit_transform(x_train)
x_test_mn = mn.transform(x_test)

In [61]:
x_train_mn

array([[0.        , 0.14893617, 0.33962264, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.14893617, 0.56603774, ..., 0.39      , 0.        ,
        0.        ],
       [0.        , 0.19148936, 0.39622642, ..., 0.8295    , 0.        ,
        0.        ],
       ...,
       [0.        , 0.19148936, 0.37735849, ..., 1.        , 0.33333333,
        0.        ],
       [0.        , 0.19148936, 0.33962264, ..., 0.3155    , 0.33333333,
        0.        ],
       [1.        , 0.12765957, 0.69811321, ..., 0.281     , 0.        ,
        0.        ]], shape=(1688, 16))

In [62]:
## Making scaled features a dataframe for feature selection
x_train = pd.DataFrame(x_train_mn, columns=x_train.columns)
x_test = pd.DataFrame(x_test_mn, columns=x_test.columns)


In [63]:
x_train.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS
0,0.0,0.148936,0.339623,0.156716,0.0,0.0,1.0,0.666667,0.0,0.0,0.5,0.0,0.666667,0.0,0.0,0.0
1,0.0,0.148936,0.566038,0.706119,0.0,0.0,1.0,0.666667,0.333333,1.0,0.945,0.0,0.493333,0.39,0.0,0.0
2,0.0,0.191489,0.396226,0.325373,0.0,0.0,0.1,0.12,0.333333,1.0,0.885,0.0,0.043333,0.8295,0.0,0.0
3,0.0,0.170213,0.264151,0.039104,1.0,1.0,1.0,0.233333,0.666667,1.0,0.775,0.0,0.366667,0.0,0.333333,0.0
4,1.0,0.255319,0.679245,0.500299,0.0,0.0,1.0,0.666667,0.333333,1.0,0.93,0.0,0.603333,0.34,0.0,0.0


In [64]:
x_train= np.round(x_train, 2)
x_test= np.round(x_test, 2)

In [65]:
x_train.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS
0,0.0,0.15,0.34,0.16,0.0,0.0,1.0,0.67,0.0,0.0,0.5,0.0,0.67,0.0,0.0,0.0
1,0.0,0.15,0.57,0.71,0.0,0.0,1.0,0.67,0.33,1.0,0.94,0.0,0.49,0.39,0.0,0.0
2,0.0,0.19,0.4,0.33,0.0,0.0,0.1,0.12,0.33,1.0,0.88,0.0,0.04,0.83,0.0,0.0
3,0.0,0.17,0.26,0.04,1.0,1.0,1.0,0.23,0.67,1.0,0.77,0.0,0.37,0.0,0.33,0.0
4,1.0,0.26,0.68,0.5,0.0,0.0,1.0,0.67,0.33,1.0,0.93,0.0,0.6,0.34,0.0,0.0


## Feature Selection

In [66]:
## Feature Selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
fs = SelectKBest(score_func=chi2, k=10)
x_train_fs = fs.fit_transform(x_train, y_train)
x_test_fs = fs.transform


In [67]:
print(x_train_fs.shape)  # check shape after feature selection
print(x_train.columns[fs.get_support()])  # check column names selected

(1688, 10)
Index(['Gender', 'Age', 'Weight', 'family_history_with_overweight', 'FAVC',
       'FCVC', 'SCC', 'FAF', 'CALC', 'MTRANS'],
      dtype='object')


In [69]:
dt = data[['Gender', 'Age', 'Weight', 'family_history_with_overweight', 'FAVC', 'FCVC', 'SCC', 'FAF', 'CALC', 'MTRANS']]

In [70]:
dt.head()

Unnamed: 0,Gender,Age,Weight,family_history_with_overweight,FAVC,FCVC,SCC,FAF,CALC,MTRANS
0,Female,21,64.0,yes,no,2.0,no,0.0,no,Public_Transportation
1,Female,21,56.0,yes,no,3.0,yes,3.0,Sometimes,Public_Transportation
2,Male,23,77.0,yes,no,2.0,no,2.0,Frequently,Public_Transportation
3,Male,27,87.0,no,no,3.0,no,2.0,Frequently,Walking
4,Male,22,89.8,no,no,2.0,no,0.0,Sometimes,Public_Transportation


In [71]:
dt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 10 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          2111 non-null   object 
 1   Age                             2111 non-null   int64  
 2   Weight                          2111 non-null   float64
 3   family_history_with_overweight  2111 non-null   object 
 4   FAVC                            2111 non-null   object 
 5   FCVC                            2111 non-null   float64
 6   SCC                             2111 non-null   object 
 7   FAF                             2111 non-null   float64
 8   CALC                            2111 non-null   object 
 9   MTRANS                          2111 non-null   object 
dtypes: float64(3), int64(1), object(6)
memory usage: 165.1+ KB


In [75]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1688 entries, 0 to 1687
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          1688 non-null   float64
 1   Age                             1688 non-null   float64
 2   Height                          1688 non-null   float64
 3   Weight                          1688 non-null   float64
 4   family_history_with_overweight  1688 non-null   float64
 5   FAVC                            1688 non-null   float64
 6   FCVC                            1688 non-null   float64
 7   NCP                             1688 non-null   float64
 8   CAEC                            1688 non-null   float64
 9   SMOKE                           1688 non-null   float64
 10  CH2O                            1688 non-null   float64
 11  SCC                             1688 non-null   float64
 12  FAF                             16

In [78]:
x_train,x_test,y_train,y_test = train_test_split(dt, y, test_size=0.2, random_state=42)

In [80]:
oe3 = OrdinalEncoder(encoding_method='arbitrary',variables=['Gender', 'family_history_with_overweight', 'FAVC', 'SCC', 'CALC', 'MTRANS'])
x_train = oe3.fit_transform(x_train)
x_test = oe3.transform(x_test)

In [81]:
x_train.head()

Unnamed: 0,Gender,Age,Weight,family_history_with_overweight,FAVC,FCVC,SCC,FAF,CALC,MTRANS
162,0,21,60.0,0,0,3.0,0,2.0,0,0
2001,0,21,133.62,0,0,3.0,0,1.48,0,0
1435,0,23,82.6,0,0,1.2,0,0.13,0,0
649,0,22,44.24,1,1,3.0,0,1.1,1,0
1280,1,26,106.04,0,0,3.0,0,1.81,0,0


In [83]:
from sklearn.preprocessing import MinMaxScaler
mn = MinMaxScaler()
x_train_mn = mn.fit_transform(x_train)
x_test_mn = mn.transform(x_test)

## Model Building

#### trying LogisticRegression

In [84]:
from sklearn.linear_model import LogisticRegression
model1 = LogisticRegression(max_iter=1000, random_state=42)
model1.fit(x_train_mn, y_train)
y_pred = model1.predict(x_test_mn)
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print("Logistic Regression Accuracy:", accuracy)

Logistic Regression Accuracy: 0.6973995271867612


#### trying SVM

In [30]:
from sklearn.svm import SVC
model_2 = SVC(kernel='rbf', random_state=42)
model_2.fit(x_train, y_train)
y_pred = model_2.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print("SVM Accuracy:", accuracy)

  y = column_or_1d(y, warn=True)


SVM Accuracy: 0.7399527186761229


#### trying KNeighbors

In [89]:
from sklearn.neighbors import KNeighborsClassifier
model_9 = KNeighborsClassifier(n_neighbors=5)
model_9.fit(x_train, y_train)
y_pred = model_9.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print("KNN Accuracy:", accuracy)

KNN Accuracy: 0.8699763593380615


#### trying neural network

In [33]:
from sklearn.neural_network import MLPClassifier
model_3 = MLPClassifier(max_iter=1000, random_state=42)
model_3.fit(x_train, y_train)
y_pred = model_3.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print("MLP Accuracy:", accuracy)

  y = column_or_1d(y, warn=True)


MLP Accuracy: 0.8557919621749409




#### trying RandomForest

In [90]:
from sklearn.ensemble import RandomForestClassifier
mod = RandomForestClassifier(n_estimators=200, max_depth= 12, random_state=42)
mod.fit(x_train_mn, y_train)
y_pred = mod.predict(x_test_mn)
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
accuracy_score(y_test, y_pred)

0.9125295508274232

#### Model Deployment

In [87]:
import joblib
import pickle

In [91]:
pickle.dump(mod, open('mod.pkl', 'wb')) ## pushing the model to disk
pickle.dump(mn, open('scaler.pkl', 'wb')) ## pushing the scaler to disk
pickle.dump(oe3, open('encoder.pkl', 'wb')) ## pushing the encoder to disk


In [None]:
pickle.dump(oe2, open('target_encoder.pkl', 'wb')) ## pushing the target encoder to disk