In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
import optuna
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from prettytable import PrettyTable

from sklearn.inspection import permutation_importance
from sklearn.model_selection import cross_val_score
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session



/kaggle/input/playground-series-s4e2/sample_submission.csv
/kaggle/input/playground-series-s4e2/train.csv
/kaggle/input/playground-series-s4e2/test.csv


In [2]:
train=pd.read_csv('../input/playground-series-s4e2/train.csv')
test=pd.read_csv('../input/playground-series-s4e2/test.csv')

* FAVC: Frequent consumption of high caloric food
* FCVC: Frequency of consumption of vegetables
* NCP: Number of main meals
* CAEC: Consumption of food between meals
* SMOKE: Smoker or not
* CH2O: Consumption of water daily
* SCC: Self-Care Commitment
* FAF: Physical activity frequency
* TUE: Time using technology devices
* CALC: Consumption of alcohol
* MTRANS: Mode of transport
* NObeyesdad: Different levels of obesity

In [3]:
# Features to yesno encode:
yesno_encode = ['family_history_with_overweight', 'SMOKE', 'SCC', 'FAVC']

# Features to One-Hot encode: 
oh_encode = ['MTRANS']

# NObeyesdad mapping
NObeyesdad_mapping = {
    'Insufficient_Weight': 0,
    'Normal_Weight': 1,
    'Overweight_Level_I': 2,
    'Overweight_Level_II': 3,
    'Obesity_Type_I': 4,
    'Obesity_Type_II': 5,
    'Obesity_Type_III': 6
}
for column in yesno_encode:
    train[column] = train[column].map({'no': 0, 'yes': 1})
    
train['Gender'] = train['Gender'].map({'Male': 0, 'Female': 1})

train = pd.get_dummies(train, columns=oh_encode, prefix=oh_encode[0])

train = train.drop(['id'], axis=1)

train['CAEC'] = train['CAEC'].map({'no': 0, 'Sometimes': 1, 'Frequently': 2, 'Always': 3})
train['CALC'] = train['CALC'].map({'no': 0, 'Sometimes': 1, 'Frequently': 2})
train['NObeyesdad'] = train['NObeyesdad'].map(NObeyesdad_mapping)
import pandas as pd

min_age = train['Age'].min()
max_age = train['Age'].max()

bins = [min_age-1, 20, 30, 40, max_age+1]

labels = [f'{min_age}-20', '21-30', '31-40', f'41-{max_age}']

# Use pd.cut to categorize 'Age' values with the corrected labels
train['Age_Group'] = pd.cut(train['Age'], bins=bins, labels=labels)

# Apply one-hot encoding to the 'Age_Group' column
train = pd.get_dummies(train, columns=['Age_Group'], prefix='Age')

dummy_columns = [f'Age_{label}' for label in labels]

# train[dummy_columns] = train[dummy_columns].astype(int)
min_weight = train['Weight'].min()
max_weight = train['Weight'].max()

weight_bins = np.linspace(min_weight, max_weight, num=5) 
weight_labels = ['Part 1', 'Part 2', 'Part 3', 'Part 4']
train['Weight_Part'] = pd.cut(train['Weight'], bins=weight_bins, labels=weight_labels, include_lowest=True)

train = pd.get_dummies(train, columns=['Weight_Part'], prefix='Weight')
std_dev_height = train['Height'].std()
mean_height = train['Height'].mean()
# Split height into categories based on standard deviations
bins = [train['Height'].min()] + [mean_height - 2 * std_dev_height] + [mean_height - std_dev_height] + \
       [mean_height + std_dev_height] + [mean_height + 2 * std_dev_height] + [train['Height'].max()]
labels = ['Part 1', 'Part 2', 'Part 3', 'Part 4', 'Part 5']
train['Height_Std_Dev'] = pd.cut(train['Height'], bins=bins, labels=labels, include_lowest=True)

# One-hot encode the Height_Std_Dev feature
train = pd.get_dummies(train, columns=['Height_Std_Dev'], prefix='Height_Std_Dev')
bin_edges = [1, 2, 3.1]

bin_labels = ['1 to lessthan_2', '2 to 3']
train['FCVC_Group'] = pd.cut(train['FCVC'], bins=bin_edges, labels=bin_labels, right=False)
train = pd.get_dummies(train, columns=['FCVC_Group'])
conditions = [
    (train['NCP'] < 2),
    (train['NCP'] >= 2) & (train['NCP'] < 3),
    (train['NCP'] >= 3) & (train['NCP'] <= 4),
]

train['NCP_Group'] = np.select(conditions, ['under_2', 'under_3', '3_to_4'], default='NaN')
train = pd.get_dummies(train, columns=['NCP_Group'])
bin_edges = [1, 1.5, 2, 2.5, 3.1]

bin_labels = ['1-1.5', 'above_1.5-2', 'above_2-2.5', 'above_2.5-3']

train['CH2O_Group'] = pd.cut(train['CH2O'], bins=bin_edges, labels=bin_labels, right=False)
train = pd.get_dummies(train, columns=['CH2O_Group'])
labels = ['0', 'above_0-below_1', '1-below_2', '2-3']

conditions = [
    (train['FAF'] == 0),
    (train['FAF'] > 0) & (train['FAF'] < 1),
    (train['FAF'] >= 1) & (train['FAF'] < 2),
    (train['FAF'] >= 2) & (train['FAF'] <= 3),
]

train['FAF_Group'] = np.select(conditions, labels, default='NaN')
train = pd.get_dummies(train, columns=['FAF_Group'])
bins = [0, 1, 2.1]  
labels = ['0-1', 'above_1-2']

train['TUE_Group'] = pd.cut(train['TUE'], bins=bins, labels=labels, right=False)

train = pd.get_dummies(train, columns=['TUE_Group'])

# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
train['BMI'] = train['Weight'] / (train['Height'] ** 2)

# X[X.columns] = scaler.fit_transform(X[X.columns])
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Assuming 'train' is your DataFrame and it has a column 'BMI' with the BMI values

# Define a function to map BMI values to categories
def categorize_bmi(bmi):
    if bmi < 18.5:
        return 'Insufficient_Weight'
    elif bmi < 25:
        return 'Normal_Weight'
    elif bmi < 27.2:
        return 'Overweight_Level_I'
    elif bmi < 30:
        return 'Overweight_Level_I'
    elif bmi < 35:
        return 'Obesity_Type_I'
    elif bmi < 40:
        return 'Obesity_Type_II'
    else:
        return 'Obesity_Type_III'

# Apply the function to create a new column with the categories
train['BMI_Category'] = train['BMI'].apply(categorize_bmi)

train = pd.get_dummies(train, columns=['BMI_Category'])
X = train.drop(['NObeyesdad'], axis=1)
y = train['NObeyesdad']

In [4]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20758 entries, 0 to 20757
Data columns (total 55 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Gender                            20758 non-null  int64  
 1   Age                               20758 non-null  float64
 2   Height                            20758 non-null  float64
 3   Weight                            20758 non-null  float64
 4   family_history_with_overweight    20758 non-null  int64  
 5   FAVC                              20758 non-null  int64  
 6   FCVC                              20758 non-null  float64
 7   NCP                               20758 non-null  float64
 8   CAEC                              20758 non-null  int64  
 9   SMOKE                             20758 non-null  int64  
 10  CH2O                              20758 non-null  float64
 11  SCC                               20758 non-null  int64  
 12  FAF 

In [5]:
def objective_xgboost(trial):
    param = {
    'tree_method': 'hist',  # Updated to use 'hist' as the tree method
    'device': 'cuda',
    "objective": "multiclass",         
    'n_estimators': trial.suggest_int('n_estimators', 200, 1000),
    'max_depth': trial.suggest_int('max_depth', 2, 10),
    'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.5, log=True),
    'subsample': trial.suggest_float('subsample', 0.5, 1.0),
    'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
    'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
    'gamma': trial.suggest_float('gamma', 0, 5),
#     'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
#     'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
    }
    
    clf = XGBClassifier(**param, random_state=42, use_label_encoder=False, eval_metric='logloss')
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    scores = cross_val_score(clf, X, y, cv=cv, scoring='accuracy')
    
    return np.mean(scores)

def objective_lightgbm(trial):
    param = {
    'device': 'gpu',  # Use GPU acceleration
    'gpu_platform_id': 0,
    'gpu_device_id': 0,
    "objective": "multiclass",         
    'num_leaves': trial.suggest_int('num_leaves', 100, 400),
    'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 200, 400),
    'max_depth': trial.suggest_int('max_depth', 2, 10),
    'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1.0),
    'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1.0),
    'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
    'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
    'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
    'learning_rate': trial.suggest_float('learning_rate', 1e-4, 0.1, log=True),
    'n_estimators': trial.suggest_int('n_estimators', 500, 1000)
    }
    
    clf = LGBMClassifier(**param, random_state=42, verbose=-1)
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    scores = cross_val_score(clf, X, y, cv=cv, scoring='accuracy')
    
    return np.mean(scores)

study_xgb = optuna.create_study(direction='maximize')
study_xgb.optimize(objective_xgboost, n_trials=15)  

study_lgb = optuna.create_study(direction='maximize')
study_lgb.optimize(objective_lightgbm, n_trials=15)
# After optimization
print("Best trial for XGBoost:")
best_trial_xgb = study_xgb.best_trial
print(f"  Value (Accuracy): {best_trial_xgb.value}")
print(f"  Params: ")
for key, value in best_trial_xgb.params.items():
    print(f"    {key}: {value}")

print("\nBest trial for LightGBM:")
best_trial_lgb = study_lgb.best_trial
print(f"  Value (Accuracy): {best_trial_lgb.value}")
print(f"  Params: ")
for key, value in best_trial_lgb.params.items():
    print(f"    {key}: {value}")

[I 2024-02-16 20:54:10,019] A new study created in memory with name: no-name-df8921b9-eee1-48d1-aa29-0fc71eb19675
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


[I 2024-02-16 20:54:25,141] Trial 0 finished with value: 0.8864533407630993 and parameters: {'n_estimators': 798, 'max_depth': 5, 'learning_rate': 0.024488866014490154, 'subsample': 0.7104526464856715, 'colsample_bytree': 0.940859629435955, 'min_child_weight': 126, 'gamma': 2.6351934785778464}. Best is trial 0 with value: 0.8864533407630993.
[I 2024-02-16 20:54:32,642] Trial 1 finished with value: 0.8821176698939365 and parameters: {'n_estimators': 509, 'max_depth': 4, 'learning_rate': 0.13282840533524243, 'subsample': 0.8361162519467047, 'colsample_bytree': 0.6280264320113457, 'min_child_weight': 185, 'gamma': 2.8807960942033892}. Best is trial 0 with value: 0.8864533407630993.
[I 2024-02-16 20:54:42,215] Trial 2 finishe

Best trial for XGBoost:
  Value (Accuracy): 0.9023026565869136
  Params: 
    n_estimators: 947
    max_depth: 2
    learning_rate: 0.4582370203850386
    subsample: 0.8588524812643338
    colsample_bytree: 0.7153960299369079
    min_child_weight: 20
    gamma: 0.3553545874848334

Best trial for LightGBM:
  Value (Accuracy): 0.907264650047717
  Params: 
    num_leaves: 167
    min_data_in_leaf: 326
    max_depth: 7
    feature_fraction: 0.5650109761829814
    bagging_fraction: 0.9998950329043939
    bagging_freq: 7
    lambda_l1: 6.926171330730737e-05
    lambda_l2: 0.0015839201547353037
    learning_rate: 0.031089974530760678
    n_estimators: 505


In [6]:
train=pd.read_csv('../input/playground-series-s4e2/train.csv')
test=pd.read_csv('../input/playground-series-s4e2/test.csv')

In [7]:
# Features to yesno encode:
yesno_encode = ['family_history_with_overweight', 'SMOKE', 'SCC', 'FAVC']

# Features to One-Hot encode: 
oh_encode = ['MTRANS']

# NObeyesdad mapping
NObeyesdad_mapping = {
    'Insufficient_Weight': 0,
    'Normal_Weight': 1,
    'Overweight_Level_I': 2,
    'Overweight_Level_II': 3,
    'Obesity_Type_I': 4,
    'Obesity_Type_II': 5,
    'Obesity_Type_III': 6
}
for column in yesno_encode:
    train[column] = train[column].map({'no': 0, 'yes': 1})
    
train['Gender'] = train['Gender'].map({'Male': 0, 'Female': 1})

train = pd.get_dummies(train, columns=oh_encode, prefix=oh_encode[0])

train = train.drop(['id'], axis=1)

train['CAEC'] = train['CAEC'].map({'no': 0, 'Sometimes': 1, 'Frequently': 2, 'Always': 3})
train['CALC'] = train['CALC'].map({'no': 0, 'Sometimes': 1, 'Frequently': 2})
train['NObeyesdad'] = train['NObeyesdad'].map(NObeyesdad_mapping)
import pandas as pd

min_age = train['Age'].min()
max_age = train['Age'].max()

bins = [min_age-1, 20, 30, 40, max_age+1]

labels = [f'{min_age}-20', '21-30', '31-40', f'41-{max_age}']

# Use pd.cut to categorize 'Age' values with the corrected labels
train['Age_Group'] = pd.cut(train['Age'], bins=bins, labels=labels)

# Apply one-hot encoding to the 'Age_Group' column
train = pd.get_dummies(train, columns=['Age_Group'], prefix='Age')

dummy_columns = [f'Age_{label}' for label in labels]

# train[dummy_columns] = train[dummy_columns].astype(int)
min_weight = train['Weight'].min()
max_weight = train['Weight'].max()

weight_bins = np.linspace(min_weight, max_weight, num=5) 
weight_labels = ['Part 1', 'Part 2', 'Part 3', 'Part 4']
train['Weight_Part'] = pd.cut(train['Weight'], bins=weight_bins, labels=weight_labels, include_lowest=True)

train = pd.get_dummies(train, columns=['Weight_Part'], prefix='Weight')
std_dev_height = train['Height'].std()
mean_height = train['Height'].mean()
# Split height into categories based on standard deviations
bins = [train['Height'].min()] + [mean_height - 2 * std_dev_height] + [mean_height - std_dev_height] + \
       [mean_height + std_dev_height] + [mean_height + 2 * std_dev_height] + [train['Height'].max()]
labels = ['Part 1', 'Part 2', 'Part 3', 'Part 4', 'Part 5']
train['Height_Std_Dev'] = pd.cut(train['Height'], bins=bins, labels=labels, include_lowest=True)

# One-hot encode the Height_Std_Dev feature
train = pd.get_dummies(train, columns=['Height_Std_Dev'], prefix='Height_Std_Dev')
bin_edges = [1, 2, 3.1]

bin_labels = ['1 to lessthan_2', '2 to 3']
train['FCVC_Group'] = pd.cut(train['FCVC'], bins=bin_edges, labels=bin_labels, right=False)
train = pd.get_dummies(train, columns=['FCVC_Group'])
conditions = [
    (train['NCP'] < 2),
    (train['NCP'] >= 2) & (train['NCP'] < 3),
    (train['NCP'] >= 3) & (train['NCP'] <= 4),
]

train['NCP_Group'] = np.select(conditions, ['under_2', 'under_3', '3_to_4'], default='NaN')
train = pd.get_dummies(train, columns=['NCP_Group'])
bin_edges = [1, 1.5, 2, 2.5, 3.1]

bin_labels = ['1-1.5', 'above_1.5-2', 'above_2-2.5', 'above_2.5-3']

train['CH2O_Group'] = pd.cut(train['CH2O'], bins=bin_edges, labels=bin_labels, right=False)
train = pd.get_dummies(train, columns=['CH2O_Group'])
labels = ['0', 'above_0-below_1', '1-below_2', '2-3']

conditions = [
    (train['FAF'] == 0),
    (train['FAF'] > 0) & (train['FAF'] < 1),
    (train['FAF'] >= 1) & (train['FAF'] < 2),
    (train['FAF'] >= 2) & (train['FAF'] <= 3),
]

train['FAF_Group'] = np.select(conditions, labels, default='NaN')
train = pd.get_dummies(train, columns=['FAF_Group'])
bins = [0, 1, 2.1]  
labels = ['0-1', 'above_1-2']

train['TUE_Group'] = pd.cut(train['TUE'], bins=bins, labels=labels, right=False)

train = pd.get_dummies(train, columns=['TUE_Group'])

# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
train['BMI'] = train['Weight'] / (train['Height'] ** 2)

X = train.drop(['NObeyesdad'], axis=1)
y = train['NObeyesdad']

In [8]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20758 entries, 0 to 20757
Data columns (total 49 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          20758 non-null  int64  
 1   Age                             20758 non-null  float64
 2   Height                          20758 non-null  float64
 3   Weight                          20758 non-null  float64
 4   family_history_with_overweight  20758 non-null  int64  
 5   FAVC                            20758 non-null  int64  
 6   FCVC                            20758 non-null  float64
 7   NCP                             20758 non-null  float64
 8   CAEC                            20758 non-null  int64  
 9   SMOKE                           20758 non-null  int64  
 10  CH2O                            20758 non-null  float64
 11  SCC                             20758 non-null  int64  
 12  FAF                             

In [9]:
def objective_xgboost(trial):
    param = {
    'tree_method': 'hist',  # Updated to use 'hist' as the tree method
    'device': 'cuda',
    "objective": "multiclass",         
    'n_estimators': trial.suggest_int('n_estimators', 200, 1000),
    'max_depth': trial.suggest_int('max_depth', 2, 10),
    'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.5, log=True),
    'subsample': trial.suggest_float('subsample', 0.5, 1.0),
    'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
    'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
    'gamma': trial.suggest_float('gamma', 0, 5),
#     'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
#     'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
    }
    
    clf = XGBClassifier(**param, random_state=42, use_label_encoder=False, eval_metric='logloss')
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    scores = cross_val_score(clf, X, y, cv=cv, scoring='accuracy')
    
    return np.mean(scores)

def objective_lightgbm(trial):
    param = {
    'device': 'gpu',  # Use GPU acceleration
    'gpu_platform_id': 0,
    'gpu_device_id': 0,
    "objective": "multiclass",         
    'num_leaves': trial.suggest_int('num_leaves', 100, 400),
    'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 200, 400),
    'max_depth': trial.suggest_int('max_depth', 2, 10),
    'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1.0),
    'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1.0),
    'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
    'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
    'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
    'learning_rate': trial.suggest_float('learning_rate', 1e-4, 0.1, log=True),
    'n_estimators': trial.suggest_int('n_estimators', 500, 1000)
    }
    
    clf = LGBMClassifier(**param, random_state=42, verbose=-1)
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    scores = cross_val_score(clf, X, y, cv=cv, scoring='accuracy')
    
    return np.mean(scores)

study_xgb = optuna.create_study(direction='maximize')
study_xgb.optimize(objective_xgboost, n_trials=15)  

study_lgb = optuna.create_study(direction='maximize')
study_lgb.optimize(objective_lightgbm, n_trials=15)
# After optimization
print("Best trial for XGBoost:")
best_trial_xgb = study_xgb.best_trial
print(f"  Value (Accuracy): {best_trial_xgb.value}")
print(f"  Params: ")
for key, value in best_trial_xgb.params.items():
    print(f"    {key}: {value}")

print("\nBest trial for LightGBM:")
best_trial_lgb = study_lgb.best_trial
print(f"  Value (Accuracy): {best_trial_lgb.value}")
print(f"  Params: ")
for key, value in best_trial_lgb.params.items():
    print(f"    {key}: {value}")

[I 2024-02-16 21:07:00,141] A new study created in memory with name: no-name-4cd9a15f-98a4-4da4-af67-559833f8a6d6
[I 2024-02-16 21:07:04,993] Trial 0 finished with value: 0.8884284387939606 and parameters: {'n_estimators': 320, 'max_depth': 8, 'learning_rate': 0.2621337681731364, 'subsample': 0.5694559920759198, 'colsample_bytree': 0.5981622200443464, 'min_child_weight': 108, 'gamma': 3.9438807686464212}. Best is trial 0 with value: 0.8884284387939606.
[I 2024-02-16 21:07:12,369] Trial 1 finished with value: 0.9033623659516213 and parameters: {'n_estimators': 480, 'max_depth': 8, 'learning_rate': 0.16505119144678818, 'subsample': 0.762827768221431, 'colsample_bytree': 0.6195949690645544, 'min_child_weight': 12, 'gamma': 3.1546877091920758}. Best is trial 1 with value: 0.9033623659516213.
[I 2024-02-16 21:07:23,281] Trial 2 finished with value: 0.8881393936052215 and parameters: {'n_estimators': 824, 'max_depth': 9, 'learning_rate': 0.3754821180918954, 'subsample': 0.697085093076591, 'c

Best trial for XGBoost:
  Value (Accuracy): 0.904470398035512
  Params: 
    n_estimators: 701
    max_depth: 5
    learning_rate: 0.1494436627030372
    subsample: 0.9652472672579715
    colsample_bytree: 0.7497711520812321
    min_child_weight: 19
    gamma: 1.188606328832516

Best trial for LightGBM:
  Value (Accuracy): 0.9071682761244135
  Params: 
    num_leaves: 186
    min_data_in_leaf: 382
    max_depth: 10
    feature_fraction: 0.6045483452010101
    bagging_fraction: 0.5229943685343548
    bagging_freq: 4
    lambda_l1: 0.011192190982014624
    lambda_l2: 1.2467559826220523e-08
    learning_rate: 0.017958218859871474
    n_estimators: 962


In [10]:
train=pd.read_csv('../input/playground-series-s4e2/train.csv')
test=pd.read_csv('../input/playground-series-s4e2/test.csv')

In [11]:
# Features to yesno encode:
yesno_encode = ['family_history_with_overweight', 'SMOKE', 'SCC', 'FAVC']

# Features to One-Hot encode: 
oh_encode = ['MTRANS']

# NObeyesdad mapping
NObeyesdad_mapping = {
    'Insufficient_Weight': 0,
    'Normal_Weight': 1,
    'Overweight_Level_I': 2,
    'Overweight_Level_II': 3,
    'Obesity_Type_I': 4,
    'Obesity_Type_II': 5,
    'Obesity_Type_III': 6
}
for column in yesno_encode:
    train[column] = train[column].map({'no': 0, 'yes': 1})
    
train['Gender'] = train['Gender'].map({'Male': 0, 'Female': 1})

train = pd.get_dummies(train, columns=oh_encode, prefix=oh_encode[0])

train = train.drop(['id'], axis=1)

train['CAEC'] = train['CAEC'].map({'no': 0, 'Sometimes': 1, 'Frequently': 2, 'Always': 3})
train['CALC'] = train['CALC'].map({'no': 0, 'Sometimes': 1, 'Frequently': 2})
train['NObeyesdad'] = train['NObeyesdad'].map(NObeyesdad_mapping)
import pandas as pd

min_age = train['Age'].min()
max_age = train['Age'].max()

bins = [min_age-1, 20, 30, 40, max_age+1]

labels = [f'{min_age}-20', '21-30', '31-40', f'41-{max_age}']

# Use pd.cut to categorize 'Age' values with the corrected labels
train['Age_Group'] = pd.cut(train['Age'], bins=bins, labels=labels)

# Apply one-hot encoding to the 'Age_Group' column
train = pd.get_dummies(train, columns=['Age_Group'], prefix='Age')

dummy_columns = [f'Age_{label}' for label in labels]

# train[dummy_columns] = train[dummy_columns].astype(int)
min_weight = train['Weight'].min()
max_weight = train['Weight'].max()

weight_bins = np.linspace(min_weight, max_weight, num=5) 
weight_labels = ['Part 1', 'Part 2', 'Part 3', 'Part 4']
train['Weight_Part'] = pd.cut(train['Weight'], bins=weight_bins, labels=weight_labels, include_lowest=True)

train = pd.get_dummies(train, columns=['Weight_Part'], prefix='Weight')
std_dev_height = train['Height'].std()
mean_height = train['Height'].mean()
# Split height into categories based on standard deviations
bins = [train['Height'].min()] + [mean_height - 2 * std_dev_height] + [mean_height - std_dev_height] + \
       [mean_height + std_dev_height] + [mean_height + 2 * std_dev_height] + [train['Height'].max()]
labels = ['Part 1', 'Part 2', 'Part 3', 'Part 4', 'Part 5']
train['Height_Std_Dev'] = pd.cut(train['Height'], bins=bins, labels=labels, include_lowest=True)

# One-hot encode the Height_Std_Dev feature
train = pd.get_dummies(train, columns=['Height_Std_Dev'], prefix='Height_Std_Dev')
bin_edges = [1, 2, 3.1]

bin_labels = ['1 to lessthan_2', '2 to 3']
train['FCVC_Group'] = pd.cut(train['FCVC'], bins=bin_edges, labels=bin_labels, right=False)
train = pd.get_dummies(train, columns=['FCVC_Group'])
conditions = [
    (train['NCP'] < 2),
    (train['NCP'] >= 2) & (train['NCP'] < 3),
    (train['NCP'] >= 3) & (train['NCP'] <= 4),
]

train['NCP_Group'] = np.select(conditions, ['under_2', 'under_3', '3_to_4'], default='NaN')
train = pd.get_dummies(train, columns=['NCP_Group'])
bin_edges = [1, 1.5, 2, 2.5, 3.1]

bin_labels = ['1-1.5', 'above_1.5-2', 'above_2-2.5', 'above_2.5-3']

train['CH2O_Group'] = pd.cut(train['CH2O'], bins=bin_edges, labels=bin_labels, right=False)
train = pd.get_dummies(train, columns=['CH2O_Group'])
labels = ['0', 'above_0-below_1', '1-below_2', '2-3']

conditions = [
    (train['FAF'] == 0),
    (train['FAF'] > 0) & (train['FAF'] < 1),
    (train['FAF'] >= 1) & (train['FAF'] < 2),
    (train['FAF'] >= 2) & (train['FAF'] <= 3),
]

train['FAF_Group'] = np.select(conditions, labels, default='NaN')
train = pd.get_dummies(train, columns=['FAF_Group'])
bins = [0, 1, 2.1]  
labels = ['0-1', 'above_1-2']

train['TUE_Group'] = pd.cut(train['TUE'], bins=bins, labels=labels, right=False)

train = pd.get_dummies(train, columns=['TUE_Group'])

# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()

X = train.drop(['NObeyesdad'], axis=1)
y = train['NObeyesdad']

In [12]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20758 entries, 0 to 20757
Data columns (total 48 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          20758 non-null  int64  
 1   Age                             20758 non-null  float64
 2   Height                          20758 non-null  float64
 3   Weight                          20758 non-null  float64
 4   family_history_with_overweight  20758 non-null  int64  
 5   FAVC                            20758 non-null  int64  
 6   FCVC                            20758 non-null  float64
 7   NCP                             20758 non-null  float64
 8   CAEC                            20758 non-null  int64  
 9   SMOKE                           20758 non-null  int64  
 10  CH2O                            20758 non-null  float64
 11  SCC                             20758 non-null  int64  
 12  FAF                             

In [13]:
def objective_xgboost(trial):
    param = {
    'tree_method': 'hist',  # Updated to use 'hist' as the tree method
    'device': 'cuda',
    "objective": "multiclass",         
    'n_estimators': trial.suggest_int('n_estimators', 200, 1000),
    'max_depth': trial.suggest_int('max_depth', 2, 10),
    'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.5, log=True),
    'subsample': trial.suggest_float('subsample', 0.5, 1.0),
    'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
    'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
    'gamma': trial.suggest_float('gamma', 0, 5),
#     'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
#     'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
    }
    
    clf = XGBClassifier(**param, random_state=42, use_label_encoder=False, eval_metric='logloss')
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    scores = cross_val_score(clf, X, y, cv=cv, scoring='accuracy')
    
    return np.mean(scores)

def objective_lightgbm(trial):
    param = {
    'device': 'gpu',  # Use GPU acceleration
    'gpu_platform_id': 0,
    'gpu_device_id': 0,
    "objective": "multiclass",         
    'num_leaves': trial.suggest_int('num_leaves', 100, 400),
    'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 200, 400),
    'max_depth': trial.suggest_int('max_depth', 2, 10),
    'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1.0),
    'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1.0),
    'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
    'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
    'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
    'learning_rate': trial.suggest_float('learning_rate', 1e-4, 0.1, log=True),
    'n_estimators': trial.suggest_int('n_estimators', 500, 1000)
    }
    
    clf = LGBMClassifier(**param, random_state=42, verbose=-1)
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    scores = cross_val_score(clf, X, y, cv=cv, scoring='accuracy')
    
    return np.mean(scores)

study_xgb = optuna.create_study(direction='maximize')
study_xgb.optimize(objective_xgboost, n_trials=15)  

study_lgb = optuna.create_study(direction='maximize')
study_lgb.optimize(objective_lightgbm, n_trials=15)
# After optimization
print("Best trial for XGBoost:")
best_trial_xgb = study_xgb.best_trial
print(f"  Value (Accuracy): {best_trial_xgb.value}")
print(f"  Params: ")
for key, value in best_trial_xgb.params.items():
    print(f"    {key}: {value}")

print("\nBest trial for LightGBM:")
best_trial_lgb = study_lgb.best_trial
print(f"  Value (Accuracy): {best_trial_lgb.value}")
print(f"  Params: ")
for key, value in best_trial_lgb.params.items():
    print(f"    {key}: {value}")

[I 2024-02-16 21:21:32,803] A new study created in memory with name: no-name-fc9365f5-3189-4b1a-aca4-66235edc3fea
[I 2024-02-16 21:21:44,003] Trial 0 finished with value: 0.8984004838816128 and parameters: {'n_estimators': 708, 'max_depth': 6, 'learning_rate': 0.0661101219283377, 'subsample': 0.6377496064570045, 'colsample_bytree': 0.6266119715486889, 'min_child_weight': 63, 'gamma': 2.681988256525682}. Best is trial 0 with value: 0.8984004838816128.
[I 2024-02-16 21:21:59,549] Trial 1 finished with value: 0.8792754571826316 and parameters: {'n_estimators': 993, 'max_depth': 2, 'learning_rate': 0.20531526245061382, 'subsample': 0.894021623549951, 'colsample_bytree': 0.63370872732861, 'min_child_weight': 262, 'gamma': 0.000967711721840403}. Best is trial 0 with value: 0.8984004838816128.
[I 2024-02-16 21:22:07,476] Trial 2 finished with value: 0.8639079970514857 and parameters: {'n_estimators': 499, 'max_depth': 3, 'learning_rate': 0.46312507788499296, 'subsample': 0.7295884485391909, '

Best trial for XGBoost:
  Value (Accuracy): 0.9073126942899129
  Params: 
    n_estimators: 714
    max_depth: 10
    learning_rate: 0.02375022730516699
    subsample: 0.5040787235960891
    colsample_bytree: 0.5018650877369993
    min_child_weight: 1
    gamma: 1.6773255256851183

Best trial for LightGBM:
  Value (Accuracy): 0.90745727257968
  Params: 
    num_leaves: 336
    min_data_in_leaf: 334
    max_depth: 10
    feature_fraction: 0.7139243917899806
    bagging_fraction: 0.8435783424786866
    bagging_freq: 2
    lambda_l1: 1.502060499957881e-06
    lambda_l2: 0.07274819359575113
    learning_rate: 0.025162153450037677
    n_estimators: 650
