Seung-Hwan Oh, Seoul, Korea

Modified by Seung-Hwan Oh 2022.12.27

Do not copy without permission

In [94]:
import pandas as pd
import random
import os
import numpy as np
from sklearn import preprocessing

import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

In [95]:
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

In [96]:
def get_x_y(df):
    if 'class' in df.columns:
        df_x = df.drop(columns=['id', 'class'])
        df_y = df['class']
        return df_x, df_y
    else:
        df_x = df.drop(columns=['id'])
        return df_x

In [97]:
train_x, train_y = get_x_y(train)
test_x = get_x_y(test)

## Label-Encoding

In [98]:
class_le = preprocessing.LabelEncoder()
snp_le = preprocessing.LabelEncoder()
snp_col = [f'SNP_{str(x).zfill(2)}' for x in range(1,16)]

In [99]:
snp_data = []
for col in snp_col:
    snp_data += list(train_x[col].values)

In [100]:
train_y = class_le.fit_transform(train_y)
snp_le.fit(snp_data)

LabelEncoder()

In [101]:
for col in train_x.columns:
    if col in snp_col:
        train_x[col] = snp_le.transform(train_x[col])
        test_x[col] = snp_le.transform(test_x[col])

In [12]:
#x_train=train_x.iloc[:int(len(train_x)*0.8),:]
#x_val=train_x.iloc[int(len(train_x)*0.8):,:]
#y_train=train_y[:int(len(train_y)*0.8)]
#y_val=train_y[int(len(train_y)*0.8):]

# 1. LGB

In [25]:
from sklearn.model_selection import KFold
import lightgbm as lgb  
from sklearn.metrics import f1_score
def lgb_cv(num_leaves, learning_rate, n_estimators, subsample, colsample_bytree, reg_alpha, reg_lambda, x_data=None, y_data=None, n_splits=5, output='score'):
    score = 0
    kf = KFold(n_splits=n_splits)
    models = []
    for train_index, valid_index in kf.split(x_data):
        x_train, y_train = x_data.iloc[train_index], y_data[train_index]
        x_valid, y_valid = x_data.iloc[valid_index], y_data[valid_index]
        
        model = lgb.LGBMClassifier(
            num_leaves = int(num_leaves), 
            learning_rate = learning_rate, 
            n_estimators = int(n_estimators), 
            subsample = np.clip(subsample, 0, 1), 
            colsample_bytree = np.clip(colsample_bytree, 0, 1), 
            reg_alpha = reg_alpha, 
            reg_lambda = reg_lambda
        )
        
        model.fit(x_train, y_train)
        models.append(model)
        
        pred = model.predict(x_valid)
        
        score += f1_score(pred,y_valid, average='macro')/n_splits
    
    if output == 'score':
        return score
    if output == 'model':
        return models

In [None]:
from functools import partial  
from bayes_opt import BayesianOptimization

# 모델과 관련없는 변수 고정
func_fixed = partial(lgb_cv, x_data=train_x, y_data=train_y, n_splits=5, output='score') 
# 베이지안 최적화 범위 설정
lgbBO = BayesianOptimization(
    func_fixed, 
    {
        'num_leaves': (16, 1024),        # num_leaves,       범위(16~1024)
        'learning_rate': (0.0001, 0.1),  # learning_rate,    범위(0.0001~0.1)
        'n_estimators': (16, 1024),      # n_estimators,     범위(16~1024)
        'subsample': (0, 1),             # subsample,        범위(0~1)
        'colsample_bytree': (0, 1),      # colsample_bytree, 범위(0~1)
        'reg_alpha': (0, 10),            # reg_alpha,        범위(0~10)
        'reg_lambda': (0, 50),           # reg_lambda,       범위(0~50)
    }, 
    random_state=4321                    # 시드 고정
)
lgbBO.maximize(init_points=10, n_iter=30) # 처음 5회 랜덤 값으로 score 계산 후 30회 최적화

In [None]:
#  colsam... | learni... | n_esti... | num_le... | reg_alpha | reg_la... | subsample
#  0.2182    | 0.03022   | 877.3     | 28.61     | 0.06825   | 1.223     | 0.9198 

model = lgb.LGBMClassifier(
    num_leaves = int(28.61), 
    learning_rate = 0.03022, 
    n_estimators = int(877), 
    subsample = np.clip(0.9198, 0, 1), 
    colsample_bytree = np.clip(0.2182, 0, 1),  
    reg_alpha = 0.06825, 
    reg_lambda = 1.223
)

model.fit(train_x, train_y)

preds = model.predict(test_x)


# 2. XGB

In [28]:
from sklearn.model_selection import KFold
import xgboost as xgb 
from sklearn.metrics import f1_score
def xgb_cv(max_depth,learning_rate,n_estimators,reg_alpha, x_data=None, y_data=None, n_splits=5, output='score'):
    score = 0
    kf = KFold(n_splits=n_splits)
    models = []
    for train_index, valid_index in kf.split(x_data):
        x_train, y_train = x_data.iloc[train_index], y_data[train_index]
        x_valid, y_valid = x_data.iloc[valid_index], y_data[valid_index]
        
        model = xgb.XGBClassifier(max_depth=int(max_depth),
                                           learning_rate= learning_rate,
                                           n_estimators= int(n_estimators),
                                           reg_alpha = reg_alpha,
                                           nthread = -1,
                                           objective='binary:logistic',
                                         )
        
        model.fit(x_train, y_train)
        models.append(model)
        
        pred = model.predict(x_valid)
        
        score += f1_score(pred,y_valid, average='macro')/n_splits
    
    if output == 'score':
        return score
    if output == 'model':
        return models

In [32]:
from functools import partial  
from bayes_opt import BayesianOptimization


func_fixed = partial(xgb_cv, x_data=train_x, y_data=train_y, n_splits=5, output='score') 

lgbBO = BayesianOptimization(
    func_fixed, 
    {
    'max_depth': (5,100),
    'learning_rate': (0, 0.001),
    'n_estimators' : (1,500),
    'reg_alpha': (0,1)
    }, 
    random_state=4321                    # 시드 고정
)
lgbBO.maximize(init_points=10, n_iter=30) # 처음 10회 랜덤 score 계산 후 30회 

|   iter    |  target   | learni... | max_depth | n_esti... | reg_alpha |
-------------------------------------------------------------------------
| [0m1        [0m | [0m0.913    [0m | [0m7.08e-06 [0m | [0m82.43    [0m | [0m384.2    [0m | [0m0.2864   [0m |
| [0m2        [0m | [0m0.9128   [0m | [0m1.931e-05[0m | [0m98.0     [0m | [0m203.7    [0m | [0m0.7578   [0m |
| [95m3        [0m | [95m0.9208   [0m | [95m8.915e-06[0m | [95m34.44    [0m | [95m309.8    [0m | [95m0.4599   [0m |
| [0m4        [0m | [0m0.9128   [0m | [0m2.183e-05[0m | [0m68.03    [0m | [0m339.7    [0m | [0m0.9503   [0m |
| [0m5        [0m | [0m0.913    [0m | [0m2.813e-05[0m | [0m63.89    [0m | [0m192.3    [0m | [0m0.4004   [0m |
| [0m6        [0m | [0m0.9199   [0m | [0m9.427e-05[0m | [0m93.34    [0m | [0m474.2    [0m | [0m0.3755   [0m |
| [0m7        [0m | [0m0.9208   [0m | [0m3.423e-05[0m | [0m68.15    [0m | [0m22.12    [0m | [0m0.2322  

# 3.catBoost

In [77]:
from sklearn.model_selection import KFold
from catboost import Pool, cv, CatBoostClassifier

from sklearn.metrics import f1_score
def cat_cv(n_estimators, depth, learning_rate, max_bin,
              num_leaves, l2_leaf_reg, model_size_reg,  output='score'):
    score = 0
    kf = KFold(n_splits=5)
    models = []
    for train_index, valid_index in kf.split(train_x):
        x_train, y_train = train_x.iloc[train_index], train_y[train_index]
        x_valid, y_valid = train_x.iloc[valid_index], train_y[valid_index]
        
        model = CatBoostClassifier(
                            n_estimators = int(n_estimators),
                            learning_rate = learning_rate,
                            l2_leaf_reg = l2_leaf_reg,
                            max_depth = int(depth),
                            num_leaves = int(num_leaves),
                            random_state = 88,
                            grow_policy = "Lossguide",
                            #task_type="GPU",
                            max_bin = int(max_bin),  
                            model_size_reg = model_size_reg,
                            
                            )
        
        model.fit(x_train, y_train)
        models.append(model)
        
        pred = model.predict(x_valid)
        
        score += f1_score(pred,y_valid, average='macro')/5
    
    if output == 'score':
        return score
    if output == 'model':
        return models

In [78]:
from functools import partial  
from bayes_opt import BayesianOptimization

pbounds = {"n_estimators": (10,1000),
           "depth": (2,15),
           "learning_rate": (.001, 1),
           "num_leaves": (1,40),
           "max_bin":(1,300),
           "l2_leaf_reg":(0,10),
           "model_size_reg": (0,10)
}
optimizer = BayesianOptimization(
    f = cat_cv,
    pbounds = pbounds,
    verbose = 2,
    random_state = 888
)
optimizer.maximize(init_points=3, n_iter=30) 

|   iter    |  target   |   depth   | l2_lea... | learni... |  max_bin  | model_... | n_esti... | num_le... |
-------------------------------------------------------------------------------------------------------------
0:	learn: 0.7082600	total: 1.88ms	remaining: 122ms
1:	learn: 0.4969028	total: 3.87ms	remaining: 124ms
2:	learn: 0.3554457	total: 5.67ms	remaining: 119ms
3:	learn: 0.2831851	total: 7.79ms	remaining: 121ms
4:	learn: 0.2278593	total: 9.28ms	remaining: 113ms
5:	learn: 0.1870586	total: 10.5ms	remaining: 105ms
6:	learn: 0.1551446	total: 11.7ms	remaining: 98.9ms
7:	learn: 0.1355106	total: 13ms	remaining: 93.9ms
8:	learn: 0.1149228	total: 14ms	remaining: 88.9ms
9:	learn: 0.1035263	total: 15.1ms	remaining: 84.6ms
10:	learn: 0.0893953	total: 16.3ms	remaining: 81.4ms
11:	learn: 0.0813827	total: 17.2ms	remaining: 77.6ms
12:	learn: 0.0744992	total: 18.3ms	remaining: 74.7ms
13:	learn: 0.0692238	total: 19.2ms	remaining: 71.5ms
14:	learn: 0.0649146	total: 20.3ms	remaining: 69ms
15:	lea

In [79]:
optimizer.max

{'target': 0.9651203791990541,
 'params': {'depth': 14.125960987190359,
  'l2_leaf_reg': 3.34251629303533,
  'learning_rate': 0.10945208281572452,
  'max_bin': 291.9920003682243,
  'model_size_reg': 4.257909146774925,
  'n_estimators': 46.577536813828516,
  'num_leaves': 34.88455908498228}}

In [81]:
model = CatBoostClassifier(
                    n_estimators = int(46.577536813828516),
                    learning_rate = 0.10945208281572452,
                    l2_leaf_reg = 3.34251629303533,
                    max_depth = int(14.125960987190359),
                    num_leaves = int(34.88455908498228),
                    random_state = 88,
                    grow_policy = "Lossguide",
                    max_bin = int(291.9920003682243),  
                    model_size_reg = 4.257909146774925,
                    
                    )

model.fit(train_x, train_y)

preds = model.predict(test_x)


0:	learn: 1.0010772	total: 1.97ms	remaining: 88.8ms
1:	learn: 0.9219123	total: 4.11ms	remaining: 90.4ms
2:	learn: 0.8521836	total: 5.92ms	remaining: 84.9ms
3:	learn: 0.7882303	total: 7.54ms	remaining: 79.1ms
4:	learn: 0.7411181	total: 8.97ms	remaining: 73.5ms
5:	learn: 0.6959303	total: 9.95ms	remaining: 66.4ms
6:	learn: 0.6480931	total: 11.1ms	remaining: 62ms
7:	learn: 0.6058874	total: 12.4ms	remaining: 58.9ms
8:	learn: 0.5672329	total: 13.6ms	remaining: 55.7ms
9:	learn: 0.5337363	total: 14.6ms	remaining: 52.7ms
10:	learn: 0.5075843	total: 15.8ms	remaining: 50.2ms
11:	learn: 0.4802351	total: 16.8ms	remaining: 47.5ms
12:	learn: 0.4569825	total: 17.9ms	remaining: 45.4ms
13:	learn: 0.4287341	total: 19ms	remaining: 43.4ms
14:	learn: 0.4093922	total: 20ms	remaining: 41.4ms
15:	learn: 0.3866366	total: 21.1ms	remaining: 39.5ms
16:	learn: 0.3666579	total: 22.1ms	remaining: 37.7ms
17:	learn: 0.3532794	total: 23.1ms	remaining: 35.9ms
18:	learn: 0.3345933	total: 24.2ms	remaining: 34.4ms
19:	learn

# 4. Stacking

In [102]:
from sklearn.ensemble import  StackingClassifier
from sklearn.naive_bayes import GaussianNB
import lightgbm as lgb  
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score

def cat_cv(xg_max_depth,xg_learning_rate,xg_n_estimators,xg_reg_alpha,
           lgb_num_leaves, lgb_learning_rate, lgb_n_estimators, lgb_subsample, lgb_colsample_bytree, lgb_reg_alpha, lgb_reg_lambda,
           cat_n_estimators, cat_depth, cat_learning_rate, cat_max_bin, cat_num_leaves, cat_l2_leaf_reg, cat_model_size_reg, 
                                                                                output='score'):
    score = 0
    kf = KFold(n_splits=5)
    models = []
    for train_index, valid_index in kf.split(train_x):
        x_train, y_train = train_x.iloc[train_index], train_y[train_index]
        x_valid, y_valid = train_x.iloc[valid_index], train_y[valid_index]
        
        
        base_models = [('rf_1', xgb.XGBClassifier(max_depth=int(xg_max_depth),
                                           learning_rate= xg_learning_rate,
                                           n_estimators= int(xg_n_estimators),
                                           reg_alpha = xg_reg_alpha,
                                           nthread = -1,
                                           objective='binary:logistic',)),
                                           
                        ('rf_2', lgb.LGBMClassifier(
                                num_leaves = int(lgb_num_leaves), 
                                learning_rate = lgb_learning_rate, 
                                n_estimators = int(lgb_n_estimators), 
                                subsample = np.clip(lgb_subsample, 0, 1), 
                                colsample_bytree = np.clip(lgb_colsample_bytree, 0, 1), 
                                reg_alpha = lgb_reg_alpha, 
                                reg_lambda = lgb_reg_lambda
                         )) ]

        # stacking 설정
        model = StackingClassifier(estimators=base_models, final_estimator=CatBoostClassifier(n_estimators = int(cat_n_estimators),
                                                        learning_rate = cat_learning_rate,
                                                        l2_leaf_reg = cat_l2_leaf_reg,
                                                        max_depth = int(cat_depth),
                                                        num_leaves = int(cat_num_leaves),
                                                        random_state = 88,
                                                        grow_policy = "Lossguide",
                                                        max_bin = int(cat_max_bin),  
                                                        logging_level='Silent',
                                                        model_size_reg = cat_model_size_reg))
                                    
        model.fit(x_train, y_train)
        models.append(model)
        
        pred = model.predict(x_valid)
        
        score += f1_score(pred,y_valid, average='macro')/5
    
    if output == 'score':
        return score
    if output == 'model':
        return models



# 베이지안 최적화 범위 설정
pbounds = {"cat_n_estimators": (10,1000),
           "cat_depth": (2,15),
           "cat_learning_rate": (.001, 1),
           "cat_num_leaves": (1,40),
           "cat_max_bin":(1,300),
           "cat_l2_leaf_reg":(0,10),
           "cat_model_size_reg": (0,10),

           'lgb_num_leaves': (16, 1024),        
            'lgb_learning_rate': (0.0001, 0.1), 
            'lgb_n_estimators': (16, 1024),     
            'lgb_subsample': (0, 1),             
            'lgb_colsample_bytree': (0, 1),     
            'lgb_reg_alpha': (0, 10),            
            'lgb_reg_lambda': (0, 50),           

             'xg_max_depth': (5,100),
            'xg_learning_rate': (0, 0.001),
            'xg_n_estimators' : (1,500),
            'xg_reg_alpha': (0,1)
            
}
optimizer = BayesianOptimization(
    f = cat_cv,
    pbounds = pbounds,
    verbose = 2,
    random_state = 888
)
optimizer.maximize(init_points=2, n_iter=30) # 처음 5회 랜덤 값으로 score 계산 후 30회 최적화

|   iter    |  target   | cat_depth | cat_l2... | cat_le... | cat_ma... | cat_mo... | cat_n_... | cat_nu... | lgb_co... | lgb_le... | lgb_n_... | lgb_nu... | lgb_re... | lgb_re... | lgb_su... | xg_lea... | xg_max... | xg_n_e... | xg_reg... |
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
| [0m1        [0m | [0m0.9368   [0m | [0m13.17    [0m | [0m1.646    [0m | [0m0.484    [0m | [0m276.4    [0m | [0m4.286    [0m | [0m66.89    [0m | [0m37.08    [0m | [0m0.6576   [0m | [0m0.01338  [0m | [0m553.7    [0m | [0m922.7    [0m | [0m2.484    [0m | [0m1.509    [0m | [0m0.07245  [0m | [0m0.0008742[0m | [0m58.05    [0m | [0m458.1    [0m | [0m0.6335   [0m |


## Inference

## Submission

In [82]:
submit = pd.read_csv('./sample_submission.csv')

In [83]:
submit['class'] = class_le.inverse_transform(preds)

  return f(**kwargs)


In [84]:
submit.to_csv('./submit.csv', index=False)