In [36]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [37]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [38]:
train_df.head()

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation,Overweight_Level_II
1,1,Female,18.0,1.56,57.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,1.0,1.0,no,Automobile,Normal_Weight
2,2,Female,18.0,1.71146,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,Insufficient_Weight
3,3,Female,20.952737,1.71073,131.274851,yes,yes,3.0,3.0,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III
4,4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II


In [39]:
train_df.isna().sum()

id                                0
Gender                            0
Age                               0
Height                            0
Weight                            0
family_history_with_overweight    0
FAVC                              0
FCVC                              0
NCP                               0
CAEC                              0
SMOKE                             0
CH2O                              0
SCC                               0
FAF                               0
TUE                               0
CALC                              0
MTRANS                            0
NObeyesdad                        0
dtype: int64

In [40]:
test_df.isna().sum()

id                                0
Gender                            0
Age                               0
Height                            0
Weight                            0
family_history_with_overweight    0
FAVC                              0
FCVC                              0
NCP                               0
CAEC                              0
SMOKE                             0
CH2O                              0
SCC                               0
FAF                               0
TUE                               0
CALC                              0
MTRANS                            0
dtype: int64

In [41]:
train_df.dtypes

id                                  int64
Gender                             object
Age                               float64
Height                            float64
Weight                            float64
family_history_with_overweight     object
FAVC                               object
FCVC                              float64
NCP                               float64
CAEC                               object
SMOKE                              object
CH2O                              float64
SCC                                object
FAF                               float64
TUE                               float64
CALC                               object
MTRANS                             object
NObeyesdad                         object
dtype: object

In [42]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder


label_encoder = LabelEncoder()
binary_columns = ['Gender', 'family_history_with_overweight', 'FAVC', 'SMOKE', 'SCC']
for col in binary_columns:
    train_df[col] = label_encoder.fit_transform(train_df[col])
    test_df[col] = label_encoder.fit_transform(test_df[col])
nominal_columns = ['CAEC', 'CALC', 'MTRANS']
train_df = pd.get_dummies(train_df, columns=nominal_columns)
test_df = pd.get_dummies(test_df, columns=nominal_columns)

target_encoder = LabelEncoder()
train_df['NObeyesdad'] = target_encoder.fit_transform(train_df['NObeyesdad'])



In [43]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [44]:

Q1 = train_df['Weight'].quantile(0.25)
Q3 = train_df['Weight'].quantile(0.75)
IQR = Q3 - Q1

outliers = train_df[(train_df['Weight'] < (Q1 - 1.5 * IQR)) | (train_df['Weight'] > (Q3 + 1.5 * IQR))]

print(f"Number of outliers in 'Weight': {len(outliers)}")


Number of outliers in 'Weight': 0


In [45]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


X = train_df.drop('NObeyesdad', axis=1)
y = train_df['NObeyesdad']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


rf_clf = RandomForestClassifier(n_estimators=350,
                                max_depth=7,
                                min_samples_split=2,
                                max_leaf_nodes=8, 
                                random_state=42)
xgb_clf = XGBClassifier(use_label_encoder=False, 
                        learning_rate = 0.01,
                        n_estimators = 2000,
                        min_child_weight = 0.6,
                        subsample = 0.2,
                        eval_metric='mlogloss', random_state=42)

voting_clf = VotingClassifier(
    estimators=[('rf', rf_clf), ('xgb', xgb_clf)],
    voting='soft'  
)


voting_clf.fit(X_train, y_train)


predictions = voting_clf.predict(X_test)


accuracy = accuracy_score(y_test, predictions)

print(f"Ensemble Accuracy: {accuracy:.4f}")


Ensemble Accuracy: 0.9061


In [46]:
print(set(test_df.columns)-set(train_df.columns) , "\n\n",train_df.columns)

{'CALC_Always'} 

 Index(['id', 'Gender', 'Age', 'Height', 'Weight',
       'family_history_with_overweight', 'FAVC', 'FCVC', 'NCP', 'SMOKE',
       'CH2O', 'SCC', 'FAF', 'TUE', 'NObeyesdad', 'CAEC_Always',
       'CAEC_Frequently', 'CAEC_Sometimes', 'CAEC_no', 'CALC_Frequently',
       'CALC_Sometimes', 'CALC_no', 'MTRANS_Automobile', 'MTRANS_Bike',
       'MTRANS_Motorbike', 'MTRANS_Public_Transportation', 'MTRANS_Walking'],
      dtype='object')


In [47]:

for col in X_train.columns:
    if col not in test_df.columns:
        test_df[col] = 0  

test_df_aligned = test_df[X_train.columns]


predictions_test_df = voting_clf.predict(test_df_aligned)

predicted_labels = target_encoder.inverse_transform(predictions_test_df)

test_df['NObeyesdad'] = predicted_labels

final_output = test_df[['id', 'NObeyesdad']]


print(final_output.head())


      id          NObeyesdad
0  20758     Obesity_Type_II
1  20759  Overweight_Level_I
2  20760    Obesity_Type_III
3  20761      Obesity_Type_I
4  20762    Obesity_Type_III


In [48]:
final_output.to_csv("Comp1_qc7205.csv", index=False)