# Prediction of Obesity Risk

In [83]:
#%pip install matplotlib
#%pip install seaborn
#%pip install pandas
#%pip install scikit-learn
#%pip install xgboost

In [84]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import confusion_matrix
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from xgboost import XGBClassifier

In [85]:
# Input CSV locations, given as paths relative to the working directory.
path_train, path_test = 'train.csv', 'test.csv'

# Read each file into its own DataFrame.
data_train, data_test = (pd.read_csv(csv_path) for csv_path in (path_train, path_test))

In [86]:
# Dimensions of the training frame as a (rows, columns) tuple.
data_train.shape

(20758, 18)

In [87]:
# Dimensions of the test frame as a (rows, columns) tuple.
data_test.shape

(13840, 17)

In [88]:
# Column labels of the training frame.
data_train.columns

Index(['id', 'Gender', 'Age', 'Height', 'Weight',
       'family_history_with_overweight', 'FAVC', 'FCVC', 'NCP', 'CAEC',
       'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'CALC', 'MTRANS', 'NObeyesdad'],
      dtype='object')

In [89]:
# Preview the first rows of the raw training frame.
data_train.head()

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation,Overweight_Level_II
1,1,Female,18.0,1.56,57.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,1.0,1.0,no,Automobile,Normal_Weight
2,2,Female,18.0,1.71146,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,Insufficient_Weight
3,3,Female,20.952737,1.71073,131.274851,yes,yes,3.0,3.0,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III
4,4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II


In [90]:
# Summary statistics (count, mean, std, min, quartiles, max) of the training frame.
data_train.describe()

Unnamed: 0,id,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
count,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0
mean,10378.5,23.841804,1.700245,87.887768,2.445908,2.761332,2.029418,0.981747,0.616756
std,5992.46278,5.688072,0.087312,26.379443,0.533218,0.705375,0.608467,0.838302,0.602113
min,0.0,14.0,1.45,39.0,1.0,1.0,1.0,0.0,0.0
25%,5189.25,20.0,1.631856,66.0,2.0,3.0,1.792022,0.008013,0.0
50%,10378.5,22.815416,1.7,84.064875,2.393837,3.0,2.0,1.0,0.573887
75%,15567.75,26.0,1.762887,111.600553,3.0,3.0,2.549617,1.587406,1.0
max,20757.0,61.0,1.975663,165.057269,3.0,4.0,3.0,3.0,2.0


In [91]:
# Count missing values per column in the train set.
# `data_train` is already a DataFrame, and pd.DataFrame(existing_frame) does
# not copy by default; take an explicit copy so that `df` can be transformed
# without touching the raw `data_train` frame.
df = data_train.copy()
missing_val_train = df.isnull().sum()

# Report only the columns that actually contain missing values.
print("Missing values in data :")
print(missing_val_train[missing_val_train > 0])

Missing values in data :
Series([], dtype: int64)


In [92]:
# Count missing values per column in the test set.
# `data_test` is already a DataFrame, and pd.DataFrame(existing_frame) does
# not copy by default; take an explicit copy so that `df_test` can be
# transformed without touching the raw `data_test` frame.
df_test = data_test.copy()
missing_val_test = df_test.isnull().sum()

# Report only the columns that actually contain missing values.
print("Missing values in data :")
print(missing_val_test[missing_val_test > 0])

Missing values in data :
Series([], dtype: int64)


In [93]:
# Split the train-set columns by dtype: 64-bit numeric columns on one side,
# object/category columns on the other.
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = df.select_dtypes(include=['object', 'category']).columns

# Print both groups, one labelled line each.
for label, cols in (
    ("Numerical Columns:", numerical_cols),
    ("Categorical Columns:", categorical_cols),
):
    print(label, cols)

Numerical Columns: Index(['id', 'Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE'], dtype='object')
Categorical Columns: Index(['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE',
       'SCC', 'CALC', 'MTRANS', 'NObeyesdad'],
      dtype='object')


In [94]:
# List the distinct values of every categorical column in the train set.
for col_name, col_values in df[categorical_cols].items():
    print(f"Unique values in '{col_name}': {col_values.unique()}")

Unique values in 'Gender': ['Male' 'Female']
Unique values in 'family_history_with_overweight': ['yes' 'no']
Unique values in 'FAVC': ['yes' 'no']
Unique values in 'CAEC': ['Sometimes' 'Frequently' 'no' 'Always']
Unique values in 'SMOKE': ['no' 'yes']
Unique values in 'SCC': ['no' 'yes']
Unique values in 'CALC': ['Sometimes' 'no' 'Frequently']
Unique values in 'MTRANS': ['Public_Transportation' 'Automobile' 'Walking' 'Motorbike' 'Bike']
Unique values in 'NObeyesdad': ['Overweight_Level_II' 'Normal_Weight' 'Insufficient_Weight'
 'Obesity_Type_III' 'Obesity_Type_II' 'Overweight_Level_I'
 'Obesity_Type_I']


In [95]:
# Split the test-set columns by dtype: 64-bit numeric columns on one side,
# object/category columns on the other.
numerical_cols_test = df_test.select_dtypes(include=['int64', 'float64']).columns
categorical_cols_test = df_test.select_dtypes(include=['object', 'category']).columns

# Print both groups, one labelled line each.
for label, cols in (
    ("Numerical Columns:", numerical_cols_test),
    ("Categorical Columns:", categorical_cols_test),
):
    print(label, cols)

Numerical Columns: Index(['id', 'Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE'], dtype='object')
Categorical Columns: Index(['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE',
       'SCC', 'CALC', 'MTRANS'],
      dtype='object')


In [96]:
# List the distinct values of every categorical column in the test set.
for col_name, col_values in df_test[categorical_cols_test].items():
    print(f"Unique values in '{col_name}': {col_values.unique()}")

Unique values in 'Gender': ['Male' 'Female']
Unique values in 'family_history_with_overweight': ['yes' 'no']
Unique values in 'FAVC': ['yes' 'no']
Unique values in 'CAEC': ['Sometimes' 'Always' 'Frequently' 'no']
Unique values in 'SMOKE': ['no' 'yes']
Unique values in 'SCC': ['no' 'yes']
Unique values in 'CALC': ['Sometimes' 'no' 'Frequently' 'Always']
Unique values in 'MTRANS': ['Public_Transportation' 'Automobile' 'Walking' 'Bike' 'Motorbike']


In [97]:
# Encode the categorical columns of the train set as numbers.
# The encoded frame is rebuilt from the raw `data_train` on every run and
# nothing is modified in place, so this cell can be re-executed safely
# (re-running used to fail once 'Gender' had already been dropped).
yes_no_codes = {'yes': 1, 'no': 0}
frequency_codes = {'no': 0, 'Sometimes': 1, 'Frequently': 2, 'Always': 3}
train_category_codes = {
    'family_history_with_overweight': yes_no_codes,
    'FAVC': yes_no_codes,
    'SMOKE': yes_no_codes,
    'SCC': yes_no_codes,
    'CAEC': frequency_codes,
    'CALC': frequency_codes,
    'MTRANS': {'Public_Transportation': 0, 'Automobile': 1, 'Walking': 2, 'Bike': 3, 'Motorbike': 4},
    # Target classes, ordered from lowest to highest weight category.
    'NObeyesdad': {
        'Insufficient_Weight': 0,
        'Normal_Weight': 1,
        'Overweight_Level_I': 2,
        'Overweight_Level_II': 3,
        'Obesity_Type_I': 4,
        'Obesity_Type_II': 5,
        'Obesity_Type_III': 6,
    },
}

# Series.map yields numeric columns directly; swapping strings for ints with
# .replace depends on object-dtype downcasting, which pandas has deprecated.
df_encoded = data_train.assign(
    **{column: data_train[column].map(codes) for column, codes in train_category_codes.items()}
)

# map() turns any value without a code into NaN, so fail loudly rather than
# silently passing NaNs on to the model.
unmapped_counts = df_encoded[list(train_category_codes)].isna().sum()
if unmapped_counts.any():
    raise ValueError(
        f"Unmapped or missing category values in train set:\n{unmapped_counts[unmapped_counts > 0]}"
    )

# One-hot encode the gender feature.
gender_dummies = pd.get_dummies(df_encoded['Gender'], prefix='Gender')

# Swap the original 'Gender' column for its one-hot columns (appended last,
# which keeps the column order identical to the previous concat-then-drop).
df = pd.concat([df_encoded.drop(columns='Gender'), gender_dummies], axis=1)

In [98]:
# Encode the categorical columns of the test set as numbers, using the same
# codes as the train set (the test set has no 'NObeyesdad' target column).
# The encoded frame is rebuilt from the raw `data_test` on every run and
# nothing is modified in place, so this cell can be re-executed safely
# (re-running used to fail once 'Gender' had already been dropped).
yes_no_codes = {'yes': 1, 'no': 0}
frequency_codes = {'no': 0, 'Sometimes': 1, 'Frequently': 2, 'Always': 3}
test_category_codes = {
    'family_history_with_overweight': yes_no_codes,
    'FAVC': yes_no_codes,
    'SMOKE': yes_no_codes,
    'SCC': yes_no_codes,
    'CAEC': frequency_codes,
    'CALC': frequency_codes,
    'MTRANS': {'Public_Transportation': 0, 'Automobile': 1, 'Walking': 2, 'Bike': 3, 'Motorbike': 4},
}

# Series.map yields numeric columns directly; swapping strings for ints with
# .replace depends on object-dtype downcasting, which pandas has deprecated.
df_test_encoded = data_test.assign(
    **{column: data_test[column].map(codes) for column, codes in test_category_codes.items()}
)

# map() turns any value without a code into NaN, so fail loudly rather than
# silently passing NaNs on to the model.
unmapped_counts_test = df_test_encoded[list(test_category_codes)].isna().sum()
if unmapped_counts_test.any():
    raise ValueError(
        f"Unmapped or missing category values in test set:\n{unmapped_counts_test[unmapped_counts_test > 0]}"
    )

# One-hot encode the gender feature.
gender_dummies = pd.get_dummies(df_test_encoded['Gender'], prefix='Gender')

# Swap the original 'Gender' column for its one-hot columns (appended last,
# which keeps the column order identical to the previous concat-then-drop).
df_test = pd.concat([df_test_encoded.drop(columns='Gender'), gender_dummies], axis=1)

In [99]:
# Preview the encoded training frame.
df.head()

Unnamed: 0,id,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad,Gender_Female,Gender_Male
0,0,24.443011,1.699998,81.66995,1,1,2.0,2.983297,1,0,2.763573,0,0.0,0.976473,1,0,3,False,True
1,1,18.0,1.56,57.0,1,1,2.0,3.0,2,0,2.0,0,1.0,1.0,0,1,1,True,False
2,2,18.0,1.71146,50.165754,1,1,1.880534,1.411685,1,0,1.910378,0,0.866045,1.673584,0,0,0,True,False
3,3,20.952737,1.71073,131.274851,1,1,3.0,3.0,1,0,1.674061,0,1.467863,0.780199,1,0,6,True,False
4,4,31.641081,1.914186,93.798055,1,1,2.679664,1.971472,1,0,1.979848,0,1.967973,0.931721,1,0,3,False,True


In [100]:
# Preview the encoded test frame.
df_test.head()

Unnamed: 0,id,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,Gender_Female,Gender_Male
0,20758,26.899886,1.848294,120.644178,1,1,2.938616,3.0,1,0,2.825629,0,0.8554,0.0,1,0,False,True
1,20759,21.0,1.6,66.0,1,1,2.0,1.0,1,0,3.0,0,1.0,0.0,1,0,True,False
2,20760,26.0,1.643355,111.600553,1,1,3.0,3.0,1,0,2.621877,0,0.0,0.250502,1,0,True,False
3,20761,20.979254,1.553127,103.669116,1,1,2.0,2.977909,1,0,2.786417,0,0.094851,0.0,1,0,False,True
4,20762,26.0,1.627396,104.835346,1,1,3.0,3.0,1,0,2.653531,0,0.0,0.741069,1,0,True,False


In [101]:
# Define features and target.
# NOTE(review): X still includes the 'id' column, so the row identifier is fed
# to the model as a feature. Dropping it here would also require dropping it
# from the frame passed to predict() when building the submission -- confirm
# and change both places together.
X = df.drop('NObeyesdad', axis=1)
y = df['NObeyesdad']

# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the XGBoost classifier.
# `use_label_encoder` is no longer passed: XGBoost deprecated the option and
# newer releases ignore it, and the target is already encoded as integers 0..6.
# `random_state` is pinned so results stay repeatable if sampling is enabled.
xgb_model = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    colsample_bytree=1.0,
    subsample=1.0,
    min_child_weight=1,
    eval_metric='mlogloss',
    random_state=42,
)

# Train the model.
xgb_model.fit(X_train, y_train)

In [103]:
# Accuracy on the held-out split: share of rows whose predicted class equals
# the true label.
holdout_predictions = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, holdout_predictions)
accuracy

0.9082369942196532

In [102]:
# Predict labels for the test set and write them to submission.csv.
# Select the test columns by the training feature names so the frame handed to
# predict() always has exactly the features, in the same order, that the model
# was fitted on.
test_features = df_test[X_train.columns]

# Numeric class codes back to their category names (inverse of the target
# encoding applied to the train set).
class_names = {
    0: 'Insufficient_Weight',
    1: 'Normal_Weight',
    2: 'Overweight_Level_I',
    3: 'Overweight_Level_II',
    4: 'Obesity_Type_I',
    5: 'Obesity_Type_II',
    6: 'Obesity_Type_III',
}

# Build the submission frame in one step; predictions are indexed like
# df_test so they line up row-for-row with the ids.
predicted_codes = pd.Series(xgb_model.predict(test_features), index=df_test.index)
pred = pd.DataFrame({
    'id': df_test['id'].astype(int),
    'NObeyesdad': predicted_codes.map(class_names),
})
pred.to_csv('submission.csv', index=False)
pred

Unnamed: 0,id,NObeyesdad
0,20758,Obesity_Type_II
1,20759,Overweight_Level_I
2,20760,Obesity_Type_III
3,20761,Obesity_Type_I
4,20762,Obesity_Type_III
...,...,...
13835,34593,Overweight_Level_II
13836,34594,Overweight_Level_I
13837,34595,Insufficient_Weight
13838,34596,Normal_Weight
