In [1]:
from sklearn.model_selection import train_test_split

import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import roc_auc_score, precision_score, recall_score, roc_curve, accuracy_score, f1_score, auc, classification_report, log_loss
from scipy.stats import ks_2samp

from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder

import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

In [2]:
# Read the training set, the test set, and the submission template in one pass.
train, test, submission = map(
    pd.read_csv,
    ("data/train.csv", "data/test.csv", "data/sample_submission.csv"),
)

In [3]:
# Remove the row identifier so it is not used as a feature.
# Rebinding (instead of inplace=True) together with errors="ignore" keeps this
# cell idempotent: re-running it no longer raises KeyError once "id" is gone.
train = train.drop(columns="id", errors="ignore")
test = test.drop(columns="id", errors="ignore")

In [4]:
# Derived feature: BMI = Weight / Height^2, added to both frames.
for frame in (train, test):
    frame['BMI'] = frame['Weight'] / (frame['Height'] ** 2)

In [5]:
# Integer code assigned to each target class label.
nobeyesdad_mapping = {'Insufficient_Weight': 0, 
                      'Normal_Weight': 1, 
                      'Overweight_Level_I': 2, 
                      'Overweight_Level_II': 3, 
                      'Obesity_Type_I': 4, 
                      'Obesity_Type_II': 5, 
                      'Obesity_Type_III': 6}

# Only encode while the target still holds string labels. Re-running this cell
# on an already-encoded column would otherwise map every value to NaN.
if not pd.api.types.is_numeric_dtype(train['NObeyesdad']):
    # Fail loudly on labels missing from the mapping: .map() would turn them into
    # NaN, and rows with a missing target are later treated as test rows.
    unknown_labels = set(train['NObeyesdad'].dropna().unique()) - set(nobeyesdad_mapping)
    if unknown_labels:
        raise ValueError(f"Unmapped NObeyesdad labels: {sorted(unknown_labels)}")
    train['NObeyesdad'] = train['NObeyesdad'].map(nobeyesdad_mapping)

In [6]:
# Stack train on top of test so both go through the same encoding step.
# Test rows have no NObeyesdad value, so that column is NaN for them.
# NOTE(review): the name `all` shadows Python's builtin all(); later cells rely on it.
all = pd.concat((train, test), sort=False, ignore_index=True)
all.head(1)

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad,BMI
0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation,3.0,28.259565


In [7]:
# Columns stored with object dtype are the ones that need encoding.
is_object_column = all.dtypes == "object"
categories = all.columns[is_object_column]
print(categories)

Index(['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE',
       'SCC', 'CALC', 'MTRANS'],
      dtype='object')


In [8]:
# Replace each object-typed column with integer codes from a LabelEncoder
# fitted on the combined train+test values of that column.
for column in categories:
    print(column)
    if all[column].dtypes != "object":
        continue
    encoder = LabelEncoder().fit(all[column])
    all[column] = encoder.transform(all[column])

Gender
family_history_with_overweight
FAVC
CAEC
SMOKE
SCC
CALC
MTRANS


In [9]:
# Rows with a target value are training rows; rows with a missing target are test rows.
has_target = ~all["NObeyesdad"].isnull()

train_X = all[has_target].drop("NObeyesdad", axis=1).reset_index(drop=True)
train_Y = train["NObeyesdad"]
test_X = all[~has_target].drop("NObeyesdad", axis=1).reset_index(drop=True)

In [10]:
# 80/20 shuffled hold-out split with a fixed seed.
X_tr, X_val, y_tr, y_val = train_test_split(
    train_X,
    train_Y,
    test_size=0.2,
    shuffle=True,
    random_state=1234,
)

In [11]:
# Shapes of the four hold-out pieces, in the order X_tr, X_val, y_tr, y_val.
tuple(part.shape for part in (X_tr, X_val, y_tr, y_val))

((16606, 17), (4152, 17), (16606,), (4152,))

In [12]:
# NOTE(review): optuna is referenced only by the commented-out tuning code in this
# notebook; consider moving this import into the top import cell.
import optuna
# Flag that is not read by any cell visible here.
ran_optuna = False

In [13]:
# def objective(trial):
#     params = {
#         'objective': 'multiclass',
#         'metric': 'multi_logloss',
#         'verbosity': -1,
#         'boosting_type': 'gbdt',
#         'random_state': 42,
#         'num_class': 7,
#         'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.05),
#         'n_estimators': trial.suggest_int('n_estimators', 300, 700),
#         'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
#         'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-1, 10.0),
#         'max_depth': trial.suggest_int('max_depth', 5, 20),
#         'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 0.9),
#         'subsample': trial.suggest_float('subsample', 0.5, 1.0),
#         'min_child_samples': trial.suggest_int('min_child_samples', 10, 100)
#     }

#     model = lgb.LGBMClassifier(**params)
#     model.fit(X_tr, y_tr)

#     y_pred = model.predict_proba(X_val)

#     logloss = log_loss(y_val, y_pred)

#     return logloss

# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=300)

# print('Best trial:')
# trial = study.best_trial

# print('Logloss: {}'.format(trial.value))
# print("Best hyperparameters: {}".format(trial.params))

In [14]:
# LightGBM settings for the 7-class model. The long decimal values look like the
# output of a hyperparameter search.
# NOTE(review): when passed to lgb.train, `n_estimators` acts as the number of
# boosting rounds and takes precedence over a `num_boost_round` argument — confirm.
# NOTE(review): `subsample` presumably has no effect unless a bagging frequency
# (`subsample_freq` / `bagging_freq`) is also set — confirm against the LightGBM docs.
lgb_params = dict(
    # task definition
    objective='multiclass',
    metric='multi_logloss',
    verbosity=-1,
    boosting_type='gbdt',
    random_state=42,
    num_class=7,
    # boosting schedule
    learning_rate=0.017284368958192655,
    n_estimators=665,
    # regularisation
    lambda_l1=0.39888376039448153,
    lambda_l2=0.33194896131671575,
    # tree shape and sampling
    max_depth=17,
    colsample_bytree=0.301610192089745,
    subsample=0.5443893888432019,
    min_child_samples=88,
)

In [15]:
# 5-fold stratified cross-validation: train one LightGBM booster per fold,
# report its validation accuracy, and keep every booster in `models`.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1234)

models = []
for train_index, val_index in kf.split(train_X, train_Y):  # train_Y is passed so the folds are stratified by class
    X_train = train_X.iloc[train_index]
    X_valid = train_X.iloc[val_index]
    y_train = train_Y.iloc[train_index]
    y_valid = train_Y.iloc[val_index]

    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

    # Build a fresh early-stopping callback for every fold. The callback tracks
    # the best score seen so far, and sharing one instance across folds can leak
    # that state from one fold into the next on some LightGBM versions.
    early_stopping_callback = lgb.early_stopping(stopping_rounds=10, verbose=False)

    # No num_boost_round argument: the number of rounds comes from `n_estimators`
    # in lgb_params, which LightGBM uses in place of that argument anyway.
    model_lgb = lgb.train(lgb_params,
                          lgb_train,
                          valid_sets=[lgb_eval],
                          callbacks=[early_stopping_callback])

    # With the multiclass objective, predict() returns one probability column per
    # class, so the predicted label is the column with the highest probability.
    y_pred = model_lgb.predict(X_valid, num_iteration=model_lgb.best_iteration)
    y_pred_labels = np.argmax(y_pred, axis=1)

    accuracy = accuracy_score(y_valid, y_pred_labels)
    print(f'Accuracy: {accuracy}')
    models.append(model_lgb)

Accuracy: 0.9104046242774566
Accuracy: 0.9159441233140655
Accuracy: 0.9053468208092486
Accuracy: 0.9074921705613105
Accuracy: 0.9115875692604192


In [16]:
# Class-probability predictions on the test set, one array per fold model.
preds = [fold_model.predict(test_X) for fold_model in models]

In [17]:
# Average the class probabilities of all fold models instead of using only the
# booster from the last fold, then take the most probable class for each row.
y_pred = np.mean(preds, axis=0)
predicted_classes = np.argmax(y_pred, axis=1)
predicted_classes

array([5, 2, 6, ..., 0, 1, 5], dtype=int64)

In [18]:
# Reverse lookup (integer code -> class name) to turn predictions back into labels.
inverse_nobeyesdad_mapping = dict(
    zip(nobeyesdad_mapping.values(), nobeyesdad_mapping.keys())
)
predicted_labels = [inverse_nobeyesdad_mapping[class_id] for class_id in predicted_classes]

In [19]:
# Put the predicted class names into the submission template and check its size.
submission = submission.assign(NObeyesdad=predicted_labels)
submission.shape

(13840, 2)

In [20]:
# Write the submission to disk without the DataFrame index column.
submission.to_csv("submission_lgb_2.csv", index=False)