In [1]:
import numpy as np
import pandas as pd
import tqdm as tqdm
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_auc_score, classification_report, accuracy_score
import gc
from lightgbm import LGBMClassifier

In [2]:
train = pd.read_csv("/kaggle/input/playground-series-s4e2/train.csv")
train.head(5)

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation,Overweight_Level_II
1,1,Female,18.0,1.56,57.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,1.0,1.0,no,Automobile,Normal_Weight
2,2,Female,18.0,1.71146,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,Insufficient_Weight
3,3,Female,20.952737,1.71073,131.274851,yes,yes,3.0,3.0,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III
4,4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II


In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20758 entries, 0 to 20757
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              20758 non-null  int64  
 1   Gender                          20758 non-null  object 
 2   Age                             20758 non-null  float64
 3   Height                          20758 non-null  float64
 4   Weight                          20758 non-null  float64
 5   family_history_with_overweight  20758 non-null  object 
 6   FAVC                            20758 non-null  object 
 7   FCVC                            20758 non-null  float64
 8   NCP                             20758 non-null  float64
 9   CAEC                            20758 non-null  object 
 10  SMOKE                           20758 non-null  object 
 11  CH2O                            20758 non-null  float64
 12  SCC                             

In [4]:
old_data = pd.read_csv("/kaggle/input/obesity-or-cvd-risk-classifyregressorcluster/ObesityDataSet.csv")
old_data.head(5)

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [5]:
old_data['id'] = range(len(train)+1,len(train) + len(old_data)+1 )

In [6]:
old_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          2111 non-null   object 
 1   Age                             2111 non-null   float64
 2   Height                          2111 non-null   float64
 3   Weight                          2111 non-null   float64
 4   family_history_with_overweight  2111 non-null   object 
 5   FAVC                            2111 non-null   object 
 6   FCVC                            2111 non-null   float64
 7   NCP                             2111 non-null   float64
 8   CAEC                            2111 non-null   object 
 9   SMOKE                           2111 non-null   object 
 10  CH2O                            2111 non-null   float64
 11  SCC                             2111 non-null   object 
 12  FAF                             21

In [7]:
train = pd.concat([train, old_data]).drop_duplicates()
train.head()

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation,Overweight_Level_II
1,1,Female,18.0,1.56,57.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,1.0,1.0,no,Automobile,Normal_Weight
2,2,Female,18.0,1.71146,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,Insufficient_Weight
3,3,Female,20.952737,1.71073,131.274851,yes,yes,3.0,3.0,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III
4,4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II


In [8]:
def data_cleaning(df,train=False):
    scaler = StandardScaler()
    df['IsYoung'] = df['Age'].apply(lambda x: x < 25)
    df['IsAging'] = df['Age'].apply(lambda x: 25 <= x < 40)
    df['BMI'] = df['Weight'] / (df['Height'] / 100) ** 2
    numerical_attributes = [x for x in df.keys() if df.dtypes[x] == 'int64' or df.dtypes[x] == 'float64' ]
    if train:
        y = df['NObeyesdad']
        df = df.drop('NObeyesdad',axis=1)
    categorical_attributes = [x for x in df.keys() if df.dtypes[x] == 'object' ]
    df[numerical_attributes] = scaler.fit_transform(df[numerical_attributes])
#     for key in ['Gender','family_history_with_overweight','FAVC','IsYoung']:
#         _ = df[key][0]
#         df[key] = df[key].apply(lambda x: 1 if x == _ else 0)
#     for key in ['SMOKE','SCC','IsAging']:
#         _ = df[key][0]
#         df[key] = df[key].apply(lambda x: 0 if x == _ else 1)
#     for key in ['CAEC','CALC','MTRANS']:
#         df[key] = df[key].astype('category')
    df = pd.get_dummies(df, columns = categorical_attributes)
    df = df.drop(['id'], axis=1)
    if train:
#         labels = {'Insufficient_Weight' : 0, 'Normal_Weight':1, 'Obesity_Type_I':2, 'Obesity_Type_II':3,
#      'Obesity_Type_III':4, 'Overweight_Level_I':5, 'Overweight_Level_II':6}
#         y = [labels[x] for x in y]
        return df, y
    return df

In [9]:
X, y = data_cleaning(train,train=True)

In [10]:
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size = 0.3, random_state = 42)

In [11]:
del train, old_data
gc.collect()

30

In [12]:
params = {
    "objective": "multiclass",          # Objective function for the model
    "metric": "multi_logloss",          # Evaluation metric
    "verbosity": -1,                    # Verbosity level (-1 for silent)
    "boosting_type": "gbdt",            # Gradient boosting type
    "random_state": 42,       # Random state for reproducibility
    "num_class": 7,                     # Number of classes in the dataset
    'learning_rate': 0.03,  # Learning rate for gradient boosting
    'n_estimators': 500,                # Number of boosting iterations
    'lambda_l1': 0.0097,  # L1 regularization term
    'lambda_l2': 0.04,   # L2 regularization term
    'max_depth': 10,                    # Maximum depth of the trees
    'colsample_bytree': 0.41,  # Fraction of features to consider for each tree
    'subsample': 0.95,    # Fraction of samples to consider for each boosting iteration
    'min_child_samples': 26             # Minimum number of data needed in a leaf
}
lgbm_classifier = LGBMClassifier(**params)
lgbm_classifier.fit(X_train, y_train)
y_pred = lgbm_classifier.predict(X_test)

In [13]:
classification_report(y_test,lgbm_classifier.predict(X_test),output_dict=True)

{'Insufficient_Weight': {'precision': 0.9524375743162902,
  'recall': 0.9379391100702577,
  'f1-score': 0.9451327433628319,
  'support': 854},
 'Normal_Weight': {'precision': 0.8748796920115496,
  'recall': 0.8973346495557749,
  'f1-score': 0.8859649122807017,
  'support': 1013},
 'Obesity_Type_I': {'precision': 0.9261241970021413,
  'recall': 0.8710976837865055,
  'f1-score': 0.8977685521536067,
  'support': 993},
 'Obesity_Type_II': {'precision': 0.9615040286481648,
  'recall': 0.9790337283500455,
  'f1-score': 0.970189701897019,
  'support': 1097},
 'Obesity_Type_III': {'precision': 0.9984520123839009,
  'recall': 0.9961389961389961,
  'f1-score': 0.9972941631233089,
  'support': 1295},
 'Overweight_Level_I': {'precision': 0.7765293383270911,
  'recall': 0.8005148005148005,
  'f1-score': 0.788339670468948,
  'support': 777},
 'Overweight_Level_II': {'precision': 0.8303464755077659,
  'recall': 0.8353365384615384,
  'f1-score': 0.8328340323547033,
  'support': 832},
 'accuracy': 0.91

In [14]:
confusion_matrix(y_test,lgbm_classifier.predict(X_test))

array([[ 801,   49,    0,    0,    0,    3,    1],
       [  31,  909,    2,    0,    0,   63,    8],
       [   1,    2,  865,   33,    1,   32,   59],
       [   0,    0,   18, 1074,    1,    0,    4],
       [   1,    0,    2,    2, 1290,    0,    0],
       [   7,   65,   13,    0,    0,  622,   70],
       [   0,   14,   34,    8,    0,   81,  695]])

In [15]:
# xgb_clf = XGBClassifier(n_estimators=300, enable_categorical=True,objective = 'multi:softprob',
#                         colsample_bytree= 1.0, max_depth= 9, early_stopping_round=3, n_jobs=-1, gamma=0.05, verbose=3)

In [16]:
# xgb_clf.fit(X_train, y_train)

In [17]:
# classification_report(y_test,xgb_clf.predict(X_test),output_dict=True)

In [18]:
# xgb_clf.fit(X, y)

In [19]:
test = pd.read_csv("/kaggle/input/playground-series-s4e2/test.csv")
test = data_cleaning(test,train=False)

In [20]:
y_p = lgbm_classifier.predict(test)

In [21]:
# def delabel(y_p):
#     labels = {  0 : 'Insufficient_Weight', 1 : 'Normal_Weight', 2:'Obesity_Type_I', 3:'Obesity_Type_II',
#      4:'Obesity_Type_III', 5:'Overweight_Level_I', 6:'Overweight_Level_II'}
#     y_p = [labels[p] for p in y_p]
#     return y_p
# y_p = delabel(y_p)

In [22]:
submission = pd.DataFrame()
submission['Nobeyesdad'] = y_p
submission['id'] = range(20758,20758+len(y_p))
submission = submission.set_index('id')
submission

Unnamed: 0_level_0,Nobeyesdad
id,Unnamed: 1_level_1
20758,Obesity_Type_II
20759,Overweight_Level_I
20760,Obesity_Type_III
20761,Obesity_Type_I
20762,Obesity_Type_III
...,...
34593,Overweight_Level_II
34594,Overweight_Level_I
34595,Insufficient_Weight
34596,Insufficient_Weight


In [23]:
submission.to_csv("submission.csv",index='id')