In [2]:
import pandas as pd
import numpy as np
import re
import lightgbm as lgb
import keras
import optuna

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, minmax_scale
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score
from path import Path

In [3]:
class LGBConfig:
    """Settings for the LightGBM cross-validation and Optuna tuning cells."""

    # Directory holding train.csv, test.csv and sample_submission.csv.
    input_path = Path('data')

    # When True the tuning cell runs an Optuna search; otherwise `params` is used.
    optuna_lgb = True
    n_trials = 200

    # Boosting budget and early-stopping patience handed to every classifier.
    n_estimators = 1500
    early_stopping_round = 150

    # Fold count and shuffle seed for the cross-validation splitters.
    cv_folds = 5
    random_state = 55

    # Fixed hyper-parameters used when `optuna_lgb` is False.
    params = dict(
        objective='binary',
        boosting_type='gbdt',
        learning_rate=0.07,
        num_leaves=19,
        max_bin=33,
        min_child_samples=60,
        colsample_bytree=0.447,
        subsample_freq=0,
        subsample=0.6325,
        reg_alpha=0.002,
        reg_lambda=2.22,
        verbosity=0,
        random_state=0,
    )


lgb_config = LGBConfig()

In [4]:
# Read the three CSV files from the configured input directory.
train, test, submission = (
    pd.read_csv(lgb_config.input_path / csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [5]:
# Columns that get label-encoded during preprocessing.
cat_features = [
    'HomePlanet',
    'CryoSleep',
    'VIP',
    'Destination',
    'Cabin_deck',
    'Cabin_side',
]

# Summary statistics for every column, regardless of dtype.
train.describe(include='all')

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
count,8693,8492,8476,8494,8511,8514.0,8490,8512.0,8510.0,8485.0,8510.0,8505.0,8493,8693
unique,8693,3,2,6560,3,,2,,,,,,8473,2
top,0001_01,Earth,False,G/734/S,TRAPPIST-1e,,False,,,,,,Gollux Reedall,True
freq,1,4602,5439,8,5915,,8291,,,,,,2,4378
mean,,,,,,28.82793,,224.687617,458.077203,173.729169,311.138778,304.854791,,
std,,,,,,14.489021,,666.717663,1611.48924,604.696458,1136.705535,1145.717189,,
min,,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,,
25%,,,,,,19.0,,0.0,0.0,0.0,0.0,0.0,,
50%,,,,,,27.0,,0.0,0.0,0.0,0.0,0.0,,
75%,,,,,,38.0,,47.0,76.0,27.0,59.0,46.0,,


In [6]:
from sklearn.preprocessing import LabelEncoder


def data_preprocessing(train, encoders=None):
    """Return a preprocessed copy of a raw passenger frame.

    Steps:
      1. Fill missing ``Cabin`` values with the placeholder ``'/9999/ '`` and
         split the column on ``'/'`` into ``Cabin_deck``, ``Cabin_num`` (cast
         to int) and ``Cabin_side``.
      2. Drop ``PassengerId``, ``Name`` and ``Cabin``.
      3. Label-encode every column listed in the module-level ``cat_features``.

    Parameters
    ----------
    train : pandas.DataFrame
        Raw frame containing ``Cabin``, ``PassengerId``, ``Name`` and the
        non-derived columns of ``cat_features``. It is not modified.
    encoders : dict or None, optional
        Mapping of column name -> fitted ``LabelEncoder``. A column found in
        the mapping is transformed with the stored encoder; a column that is
        missing gets a new encoder fitted on this frame, which is then stored
        in the mapping. Passing the same dict for the training frame first
        and the test frame second gives both frames identical integer codes.
        With ``None`` (the default) every column is fitted on the frame it is
        given, as before.

    Returns
    -------
    pandas.DataFrame
        The transformed copy.

    Raises
    ------
    ValueError
        Raised by ``LabelEncoder.transform`` when a stored encoder meets a
        value it was not fitted on.
    """
    # Work on a copy so the caller's frame is left untouched.
    frame = train.copy()

    # Assign the filled column back instead of calling fillna(inplace=True) on
    # a column selection: that chained form is not guaranteed to update the
    # parent frame (it does not under pandas Copy-on-Write).
    cabin_parts = frame['Cabin'].fillna('/9999/ ').str.split('/')
    frame['Cabin_deck'] = cabin_parts.str[0]
    frame['Cabin_num'] = cabin_parts.str[1].astype(int)
    frame['Cabin_side'] = cabin_parts.str[2]

    frame = frame.drop(['PassengerId', 'Name', 'Cabin'], axis=1)

    for cat_feature in cat_features:
        column_encoder = None if encoders is None else encoders.get(cat_feature)
        if column_encoder is None:
            # One encoder per column; a single shared instance would be
            # overwritten by every subsequent fit.
            column_encoder = LabelEncoder().fit(frame[cat_feature])
            if encoders is not None:
                encoders[cat_feature] = column_encoder
        frame[cat_feature] = column_encoder.transform(frame[cat_feature])

    return frame


# Fit the encoders on the training frame and reuse them for the test frame so
# that a given category maps to the same integer in both.
label_encoders = {}
train = data_preprocessing(train, label_encoders)
test = data_preprocessing(test, label_encoders)

In [7]:
x = train.drop('Transported', axis=1)
y = train['Transported'].astype(int)

In [8]:
# wrap the metric for lgb
def roc_auc_lgb(y_true, y_pred):
    """Custom evaluation metric reporting ROC AUC.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted scores, forwarded unchanged to ``roc_auc_score``.

    Returns
    -------
    tuple
        ``('roc_auc', score, True)`` -- metric name, metric value and a flag
        saying that larger values are better.
    """
    return 'roc_auc', roc_auc_score(y_true, y_pred), True

def accuracy_lgb(y_true, y_pred):
    """Custom evaluation metric reporting accuracy at a 0.5 cut-off.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted scores; a score strictly greater than 0.5 counts as class 1,
        anything else as class 0.

    Returns
    -------
    tuple
        ``('accuracy', score, True)`` -- metric name, metric value and a flag
        saying that larger values are better.
    """
    # Threshold all scores at once rather than one element at a time.
    hard_labels = (np.asarray(y_pred) > 0.5).astype(int)
    return 'accuracy', accuracy_score(y_true, hard_labels), True

In [9]:
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score, accuracy_score

if lgb_config.optuna_lgb:

    # Settings that are not tuned. They are shared by every trial and by the
    # final parameter set so the search scores the same configuration that is
    # trained afterwards, with a fixed model seed.
    base_params = {'objective': 'binary',
                   'boosting_type': 'gbdt',
                   'verbosity': 0,
                   'random_state': lgb_config.random_state}

    def objective(trial):
        """Optuna objective: mean cross-validated ROC AUC for one trial.

        Samples a set of LightGBM hyper-parameters from ``trial``, fits one
        classifier per stratified fold of ``x``/``y`` with early stopping on
        the validation fold, and returns the mean of the per-fold ``roc_auc``
        values stored in ``best_score_``.
        """
        trial_params = {
            **base_params,
            'learning_rate': trial.suggest_float("learning_rate", 0.01, 1.0),
            'num_leaves': trial.suggest_int("num_leaves", 3, 255),
            'max_bin': trial.suggest_int("max_bin", 3, 50),
            'min_child_samples': trial.suggest_int("min_child_samples", 3, 200),
            'colsample_bytree': trial.suggest_float("colsample_bytree", 0.1, 1.0),
            'subsample_freq': trial.suggest_int("subsample_freq", 0, 10),
            'subsample': trial.suggest_float("subsample", 0.1, 1.0),
            # suggest_loguniform is deprecated; suggest_float(log=True) is the
            # replacement and samples from the same log-uniform range.
            'reg_alpha': trial.suggest_float("reg_alpha", 1e-9, 10.0, log=True),
            'reg_lambda': trial.suggest_float("reg_lambda", 1e-9, 10.0, log=True),
        }

        fold_scores = []
        skf = StratifiedKFold(n_splits=lgb_config.cv_folds, shuffle=True, random_state=lgb_config.random_state)

        for train_idx, valid_idx in skf.split(x, y):
            x_train, y_train = x.iloc[train_idx], y.iloc[train_idx]
            x_valid, y_valid = x.iloc[valid_idx], y.iloc[valid_idx]

            model = lgb.LGBMClassifier(**trial_params,
                                       n_estimators=lgb_config.n_estimators,
                                       early_stopping_round=lgb_config.early_stopping_round)

            callbacks = [lgb.early_stopping(stopping_rounds=lgb_config.early_stopping_round, verbose=False)]
            model.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], eval_metric=roc_auc_lgb, callbacks=callbacks)
            fold_scores.append(model.best_score_['valid_0']['roc_auc'])
        return np.mean(fold_scores)

    # Seed the sampler so the sequence of suggested parameters is repeatable.
    study = optuna.create_study(direction="maximize",
                                sampler=optuna.samplers.TPESampler(seed=lgb_config.random_state))
    study.optimize(objective, n_trials=lgb_config.n_trials, show_progress_bar=True)

    print("Best Score", study.best_value)
    print("Best parameters", study.best_params)

    # Final parameter set: fixed settings plus the best tuned values.
    params = {**base_params, **study.best_params}
else:
    params = lgb_config.params

[I 2023-12-06 22:12:06,532] A new study created in memory with name: no-name-fb9480e2-83a4-4496-8948-acfe4a434c94


  0%|          | 0/200 [00:00<?, ?it/s]

[I 2023-12-06 22:12:08,790] Trial 0 finished with value: 0.867168470550175 and parameters: {'learning_rate': 0.5219140003999628, 'num_leaves': 41, 'max_bin': 38, 'min_child_samples': 78, 'colsample_bytree': 0.9597553094366902, 'subsample_freq': 8, 'subsample': 0.11741470319284664, 'reg_alpha': 0.00039138925673684827, 'reg_lambda': 0.4071949399189786}. Best is trial 0 with value: 0.867168470550175.
[I 2023-12-06 22:12:10,705] Trial 1 finished with value: 0.8772782499193109 and parameters: {'learning_rate': 0.669632065364168, 'num_leaves': 138, 'max_bin': 44, 'min_child_samples': 156, 'colsample_bytree': 0.8587532727274455, 'subsample_freq': 2, 'subsample': 0.28833333812476136, 'reg_alpha': 5.892849774106706e-05, 'reg_lambda': 0.1499528293659541}. Best is trial 1 with value: 0.8772782499193109.


: 

In [63]:
# Averaged test-set probabilities, accumulated across folds.
preds = np.zeros(len(test))
# Out-of-fold predictions: each fold fills in the rows it held out for
# validation, so together the folds yield one prediction per training row.
oof = np.zeros(len(train))
# Best validation accuracy recorded for each fold.
metric_evaluations = []

skf = StratifiedKFold(n_splits=lgb_config.cv_folds, shuffle=True, random_state=lgb_config.random_state)

for idx, (train_idx, valid_idx) in enumerate(skf.split(x, y)):
    print(f"CV fold {idx}")
    x_train, y_train = x.iloc[train_idx], y.iloc[train_idx]
    x_valid, y_valid = x.iloc[valid_idx], y.iloc[valid_idx]

    model = lgb.LGBMClassifier(**params,
                               n_estimators=lgb_config.n_estimators,
                               early_stopping_round=lgb_config.early_stopping_round,
                               force_row_wise=True)

    callbacks = [lgb.early_stopping(stopping_rounds=lgb_config.early_stopping_round),
                 lgb.log_evaluation(period=100, show_stdv=False)]

    model.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], eval_metric=accuracy_lgb, callbacks=callbacks)
    metric_evaluations.append(model.best_score_['valid_0']['accuracy'])
    # Divide by the fold count of the splitter that actually drives this loop,
    # so the average cannot drift out of sync with the number of iterations.
    preds += model.predict_proba(test, num_iteration=model.best_iteration_)[:, 1] / skf.n_splits
    oof[valid_idx] = model.predict_proba(x_valid, num_iteration=model.best_iteration_)[:, 1]

CV fold 0
Training until validation scores don't improve for 150 rounds
[100]	valid_0's binary_logloss: 0.393229	valid_0's accuracy: 0.79931
[200]	valid_0's binary_logloss: 0.386574	valid_0's accuracy: 0.802185
[300]	valid_0's binary_logloss: 0.386737	valid_0's accuracy: 0.803335
CV fold 1
Training until validation scores don't improve for 150 rounds
[100]	valid_0's binary_logloss: 0.374759	valid_0's accuracy: 0.813111
[200]	valid_0's binary_logloss: 0.367313	valid_0's accuracy: 0.817136
[300]	valid_0's binary_logloss: 0.36779	valid_0's accuracy: 0.815411
CV fold 2
Training until validation scores don't improve for 150 rounds
[100]	valid_0's binary_logloss: 0.403553	valid_0's accuracy: 0.792409
[200]	valid_0's binary_logloss: 0.39853	valid_0's accuracy: 0.79931
[300]	valid_0's binary_logloss: 0.39756	valid_0's accuracy: 0.80391
[400]	valid_0's binary_logloss: 0.401185	valid_0's accuracy: 0.80046
Early stopping, best iteration is:
[271]	valid_0's binary_logloss: 0.397017	valid_0's accur

In [1]:
# Report the mean and standard deviation of the per-fold scores in `metric_evaluations`.
# NOTE(review): the recorded output for this cell is a NameError from a fresh
# kernel (execution count 1); the import and CV cells must be run first.
print(f"LightGBM Score: {np.mean(metric_evaluations):0.3f} ({np.std(metric_evaluations):0.3f})")

NameError: name 'np' is not defined

In [None]:
# Write the predictions out.
submission = pd.read_csv(lgb_config.input_path / 'sample_submission.csv')
# A row is labelled True when its averaged probability exceeds 0.55.
submission['Transported'] = preds > 0.55
submission.to_csv('lgb_submission.csv', index=False)