In [1]:
import numpy as np
import pandas as pd
import tqdm as tqdm
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_auc_score, classification_report
import gc

In [2]:
train = pd.read_csv("/kaggle/input/playground-series-s4e2/train.csv")
train.head(5)

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation,Overweight_Level_II
1,1,Female,18.0,1.56,57.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,1.0,1.0,no,Automobile,Normal_Weight
2,2,Female,18.0,1.71146,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,Insufficient_Weight
3,3,Female,20.952737,1.71073,131.274851,yes,yes,3.0,3.0,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III
4,4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II


In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20758 entries, 0 to 20757
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              20758 non-null  int64  
 1   Gender                          20758 non-null  object 
 2   Age                             20758 non-null  float64
 3   Height                          20758 non-null  float64
 4   Weight                          20758 non-null  float64
 5   family_history_with_overweight  20758 non-null  object 
 6   FAVC                            20758 non-null  object 
 7   FCVC                            20758 non-null  float64
 8   NCP                             20758 non-null  float64
 9   CAEC                            20758 non-null  object 
 10  SMOKE                           20758 non-null  object 
 11  CH2O                            20758 non-null  float64
 12  SCC                             

In [4]:
def data_cleaning(df,train=False):
    numerical_attributes = [x for x in df.keys() if df.dtypes[x] == 'int64' or df.dtypes[x] == 'float64' ]
    scaler = StandardScaler()
    df[numerical_attributes] = scaler.fit_transform(df[numerical_attributes])
    for key in ['Gender','family_history_with_overweight','FAVC']:
        _ = df[key][0]
        df[key] = df[key].apply(lambda x: 1 if x == _ else 0)
    for key in ['SMOKE','SCC']:
        _ = df[key][0]
        df[key] = df[key].apply(lambda x: 0 if x == _ else 1)
    for key in ['CAEC','CALC','MTRANS']:
        df[key] = df[key].astype('category')
    df = df.drop(['id'], axis=1)
    #y = pd.get_dummies(df['NObeyesdad'],prefix='NObeyesdad',prefix_sep='_',dtype = 'int64')
    if train:
        labels = {'Insufficient_Weight' : 0, 'Normal_Weight':1, 'Obesity_Type_I':2, 'Obesity_Type_II':3,
     'Obesity_Type_III':4, 'Overweight_Level_I':5, 'Overweight_Level_II':6}
        y = [labels[x] for x in df['NObeyesdad']]
        df = df.drop(['NObeyesdad'], axis=1)
        return df, y
    return df

In [5]:
X, y = data_cleaning(train,train=True)

In [6]:
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size = 0.125, random_state = 42)

In [7]:
del train
gc.collect()

0

In [8]:
xgb_clf = XGBClassifier(n_estimators=350, enable_categorical=True, objective = 'multi:softmax',
                        colsample_bytree= 0.75, max_depth= 7, early_stopping_round=3, n_jobs=-1, gamma=0.1, verbose=3)

In [9]:
xgb_clf.fit(X_train, y_train)

Parameters: { "early_stopping_round", "verbose" } are not used.



In [10]:
classification_report(y_test,xgb_clf.predict(X_test),output_dict=True)

{'0': {'precision': 0.9585987261146497,
  'recall': 0.9261538461538461,
  'f1-score': 0.9420970266040688,
  'support': 325},
 '1': {'precision': 0.8777506112469438,
  'recall': 0.8930348258706468,
  'f1-score': 0.8853267570900123,
  'support': 402},
 '2': {'precision': 0.8885542168674698,
  'recall': 0.8753709198813057,
  'f1-score': 0.8819133034379671,
  'support': 337},
 '3': {'precision': 0.9779411764705882,
  'recall': 0.9684466019417476,
  'f1-score': 0.973170731707317,
  'support': 412},
 '4': {'precision': 0.9960079840319361,
  'recall': 0.998,
  'f1-score': 0.997002997002997,
  'support': 500},
 '5': {'precision': 0.7866666666666666,
  'recall': 0.7866666666666666,
  'f1-score': 0.7866666666666666,
  'support': 300},
 '6': {'precision': 0.8066465256797583,
  'recall': 0.8369905956112853,
  'f1-score': 0.8215384615384616,
  'support': 319},
 'accuracy': 0.9078998073217727,
 'macro avg': {'precision': 0.8988808438682876,
  'recall': 0.8978090651607855,
  'f1-score': 0.89824513486

In [11]:
xgb_clf.fit(X, y)

Parameters: { "early_stopping_round", "verbose" } are not used.



In [12]:
test = pd.read_csv("/kaggle/input/playground-series-s4e2/test.csv")
test = data_cleaning(test,train=False)

In [13]:
y_p = xgb_clf.predict(test)

In [14]:
def delabel(y_p):
    labels = {  0 : 'Insufficient_Weight', 1 : 'Normal_Weight', 2:'Obesity_Type_I', 3:'Obesity_Type_II',
     4:'Obesity_Type_III', 5:'Overweight_Level_I', 6:'Overweight_Level_II'}
    y_p = [labels[p] for p in y_p]
    return y_p
y_p = delabel(y_p)

In [15]:
sub = pd.read_csv("/kaggle/input/playground-series-s4e2/sample_submission.csv")
sub['NObeyesdad'] = y_p
sub.to_csv("submission.csv",index='id')