# Obesity prediction (new method)

In [3]:
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

Data exploration

In [2]:
# Load the obesity dataset.
# Prefer a copy of the CSV sitting next to the notebook so the notebook can run
# on any machine; fall back to the original absolute location when no local
# copy is present.
DATA_FILE = Path('ObesityDataSet_raw_and_data_sinthetic.csv')
FALLBACK_DATA_FILE = Path(r'C:\Users\Saidabrorkhon\ML_Lectures\ObesityDataSet_raw_and_data_sinthetic.csv')
df = pd.read_csv(DATA_FILE if DATA_FILE.exists() else FALLBACK_DATA_FILE)

In [3]:
# Preview the first two rows of the loaded frame.
df.head(2)

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight


In [4]:
# Print column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          2111 non-null   object 
 1   Age                             2111 non-null   int64  
 2   Height                          2111 non-null   float64
 3   Weight                          2111 non-null   float64
 4   family_history_with_overweight  2111 non-null   object 
 5   FAVC                            2111 non-null   object 
 6   FCVC                            2111 non-null   float64
 7   NCP                             2111 non-null   float64
 8   CAEC                            2111 non-null   object 
 9   SMOKE                           2111 non-null   object 
 10  CH2O                            2111 non-null   float64
 11  SCC                             2111 non-null   object 
 12  FAF                             21

In [5]:
# Count missing values per column.
df.isnull().sum()

Gender                            0
Age                               0
Height                            0
Weight                            0
family_history_with_overweight    0
FAVC                              0
FCVC                              0
NCP                               0
CAEC                              0
SMOKE                             0
CH2O                              0
SCC                               0
FAF                               0
TUE                               0
CALC                              0
MTRANS                            0
NObeyesdad                        0
dtype: int64

In [6]:
# (rows, columns) of the frame at this point.
df.shape

(2111, 17)

In [8]:
# Number of distinct values in the Gender column.
df['Gender'].nunique()

2

Data Preprocessing - Encoding

In [9]:
# Encode every object-typed column:
#   * at most 5 distinct values -> one-hot encode (first level dropped)
#   * more than 5 distinct values -> integer codes via LabelEncoder
# The object columns are collected before the loop because `df` is rebound
# by pd.get_dummies while iterating.
object_columns = [name for name in df.columns if df[name].dtype == 'object']

for name in object_columns:
    if df[name].nunique() > 5:
        le = LabelEncoder()
        df[name] = le.fit_transform(df[name])
        continue
    df = pd.get_dummies(df, columns=[name], dtype=int, drop_first=True)

In [14]:
# (rows, columns) of the frame at this point.
df.shape

(2111, 24)

In [8]:
# Print column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 24 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Age                                 2111 non-null   int64  
 1   Height                              2111 non-null   float64
 2   Weight                              2111 non-null   float64
 3   FCVC                                2111 non-null   float64
 4   NCP                                 2111 non-null   float64
 5   CH2O                                2111 non-null   float64
 6   FAF                                 2111 non-null   float64
 7   TUE                                 2111 non-null   float64
 8   NObeyesdad                          2111 non-null   int64  
 9   Gender_Male                         2111 non-null   int64  
 10  family_history_with_overweight_yes  2111 non-null   int64  
 11  FAVC_yes                            2111 no

In [15]:
# Preview the first five rows of the frame.
df.head()

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE,NObeyesdad,Gender_Male,...,CAEC_no,SMOKE_yes,SCC_yes,CALC_Frequently,CALC_Sometimes,CALC_no,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
0,21,1.62,-0.862558,2.0,3.0,2.0,0.0,1.0,1,0,...,0,0,0,0,0,1,0,0,1,0
1,21,1.52,-1.168077,3.0,3.0,3.0,3.0,0.0,1,0,...,0,1,1,0,1,0,0,0,1,0
2,23,1.8,-0.366089,2.0,3.0,2.0,2.0,1.0,1,1,...,0,0,0,1,0,0,0,0,1,0
3,27,1.8,0.015809,3.0,3.0,2.0,2.0,0.0,5,1,...,0,0,0,1,0,0,0,0,0,1
4,22,1.78,0.122741,2.0,1.0,2.0,0.0,0.0,6,1,...,0,0,0,0,1,0,0,0,1,0


Scaling

In [4]:
# Standardize the numeric feature columns.
# The target column 'NObeyesdad' holds integer class codes and is explicitly
# excluded: standardizing it would turn the labels into continuous values,
# which a classifier cannot be fitted on and which distorts error metrics.
# NOTE(review): the scaler is fitted on the whole frame before the
# train/test split, so held-out rows influence the scaling statistics.
scaler = StandardScaler()
num_col = (
    df.select_dtypes(include=['int64', 'float64'])
    .columns
    .drop('NObeyesdad', errors='ignore')
)
df[num_col] = scaler.fit_transform(df[num_col])

In [16]:
# Standardize the Weight column on its own with a fresh scaler.
# StandardScaler expects 2-D input, hence the reshape to a single column.
scaler = StandardScaler()
weight_2d = df['Weight'].values.reshape(-1, 1)
df['Weight'] = scaler.fit_transform(weight_2d)

In [17]:
# Preview the first five rows of the frame.
df.head()

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE,NObeyesdad,Gender_Male,...,CAEC_no,SMOKE_yes,SCC_yes,CALC_Frequently,CALC_Sometimes,CALC_no,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
0,21,1.62,-0.862558,2.0,3.0,2.0,0.0,1.0,1,0,...,0,0,0,0,0,1,0,0,1,0
1,21,1.52,-1.168077,3.0,3.0,3.0,3.0,0.0,1,0,...,0,1,1,0,1,0,0,0,1,0
2,23,1.8,-0.366089,2.0,3.0,2.0,2.0,1.0,1,1,...,0,0,0,1,0,0,0,0,1,0
3,27,1.8,0.015809,3.0,3.0,2.0,2.0,0.0,5,1,...,0,0,0,1,0,0,0,0,0,1
4,22,1.78,0.122741,2.0,1.0,2.0,0.0,0.0,6,1,...,0,0,0,0,1,0,0,0,1,0


Model training

In [19]:
# Target is the 'NObeyesdad' column; features are every remaining column.
y = df['NObeyesdad']
x = df.drop(columns=['NObeyesdad'])

In [6]:
# Hold out 20% of the rows, then halve that hold-out into a test part and a
# validation part. Both splits use the same fixed seed.
x_train, x_temp, y_train, y_temp = train_test_split(
    x, y, test_size=0.2, random_state=42
)
x_test, x_val, y_test, y_val = train_test_split(
    x_temp, y_temp, test_size=0.5, random_state=42
)

NameError: name 'x' is not defined

Model Selection - Decision Tree Regressor

In [14]:
# Fit a decision-tree regressor on the training split and report the mean
# squared error on the test split.
# random_state is fixed (matching the seed used for the data splits) because
# the tree permutes features at each split, so an unseeded tree can differ
# between runs when several splits are equally good.
# NOTE(review): the target is a set of class codes, so the numeric distance
# between codes that MSE measures may not be meaningful - confirm intent.
model = DecisionTreeRegressor(random_state=42)
dt_model = model.fit(x_train, y_train)
y_pred = dt_model.predict(x_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")


Mean Squared Error: 0.09207787725986016


In [23]:
# Number of distinct values in the target column.
df['NObeyesdad'].nunique()

7

In [16]:
# Coefficient of determination for the predictions currently held in y_pred.
r2 = r2_score(y_test, y_pred)

In [19]:
# Display the R^2 value computed in the previous cell.
r2

0.9004463147156337

Model Selection - Decision Tree Classifier

In [5]:
# Fit a decision-tree classifier on the training split.
# random_state is fixed (matching the seed used for the data splits) because
# the tree permutes features at each split, so an unseeded tree can differ
# between runs when several splits are equally good.
# `fit` returns the estimator itself, so `dt_model` and `model` refer to the
# same fitted object.
model = DecisionTreeClassifier(random_state=42)
dt_model = model.fit(x_train, y_train)

NameError: name 'x_train' is not defined

In [22]:
# Display the fitted estimator.
dt_model


In [23]:
# Predict labels for the test split with the current dt_model.
y_pred=dt_model.predict(x_test)

In [24]:
# Fraction of test rows whose predicted label equals the true label,
# printed as a percentage.
accuracy = accuracy_score(y_test, y_pred)
print('Modelning aniqligi', 100 * accuracy)

Modelning aniqligi 91.9431279620853


In [4]:
# 5-fold cross-validation of the decision-tree classifier on the full data.
# Scored with accuracy: the target is a set of class codes, so a squared-error
# metric (and its square root) would measure distance between arbitrary label
# numbers rather than classification quality. Accuracy also matches the
# metric reported on the held-out test split.
# cross_val_score clones the estimator for every fold, so the already fitted
# dt_model is not modified.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(dt_model, x, y, cv=kf, scoring='accuracy')

print('Cross-Validation-Scores:', cv_scores)
print("Mean Cross Validation Score:", np.mean(cv_scores))

NameError: name 'dt_model' is not defined