In [1]:
import numpy as np
import pandas as pd
import os
import lightgbm
import optuna
from optuna import Trial, visualization
from optuna.samplers import TPESampler
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
# Directory holding train.csv / test.csv. Set the TPS_DATA_DIR environment
# variable to point the notebook at another location; the original local
# path is kept as the fallback so existing setups keep working.
path = os.environ.get("TPS_DATA_DIR", "E:/data/tabular-playground-series-may-2022/")

In [2]:
# Read the train and test splits from the data directory in one pass.
df_train, df_test = (
    pd.read_csv(os.path.join(path, file_name))
    for file_name in ("train.csv", "test.csv")
)
df_train.head(5)

Unnamed: 0,id,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,...,f_22,f_23,f_24,f_25,f_26,f_27,f_28,f_29,f_30,target
0,0,-1.373246,0.238887,-0.243376,0.567405,-0.647715,0.839326,0.113133,1,5,...,-2.540739,0.766952,-2.730628,-0.208177,1.363402,ABABDADBAB,67.609153,0,0,0
1,1,1.697021,-1.710322,-2.230332,-0.545661,1.113173,-1.552175,0.447825,1,3,...,2.278315,-0.633658,-1.217077,-3.782194,-0.058316,ACACCADCEB,377.096415,0,0,1
2,2,1.681726,0.616746,-1.027689,0.810492,-0.609086,0.113965,-0.70866,1,0,...,-1.385775,-0.520558,-0.009121,2.788536,-3.703488,AAAEABCKAD,-195.599702,0,2,1
3,3,-0.118172,-0.587835,-0.804638,2.086822,0.371005,-0.128831,-0.282575,3,2,...,0.572594,-1.653213,1.686035,-2.533098,-0.608601,BDBBAACBCB,210.826205,0,0,1
4,4,1.148481,-0.176567,-0.664871,-1.101343,0.467875,0.500117,0.407515,3,3,...,-3.912929,-1.430366,2.127649,-3.306784,4.371371,BDBCBBCHFE,-217.211798,0,1,1


In [3]:
# Separate the label from the features; the "id" column is dropped from
# both feature frames.
df_y_train = df_train["target"]
df_x_train_raw = df_train.drop(columns=["id", "target"])
df_x_test_raw = df_test.drop(columns="id")

def ordinizer(df):
    """Expand the string column ``f_27`` into one integer column per character.

    Each character is encoded as ``ord(char) - ord("A")`` (so "A" -> 0,
    "B" -> 1, ...). The new columns are named ``f_27_0``, ``f_27_1``, ... and
    are appended after the remaining columns of ``df``; ``f_27`` itself is
    removed. ``df`` is not modified.

    The number of generated columns is taken from the first row's string, as
    in the original implementation, so all strings are expected to share that
    length.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a string column named ``f_27``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index as ``df``.
    """
    base = ord("A")
    # Iterating the Series directly yields its values; this avoids
    # Series.iteritems(), which no longer exists in pandas 2.x.
    codes = [[ord(char) - base for char in text] for text in df["f_27"]]
    # An empty frame has no first row to measure; produce zero new columns.
    width = len(codes[0]) if codes else 0
    df_codes = pd.DataFrame(
        codes,
        columns=["f_27_" + str(pos) for pos in range(width)],
        # Reuse the caller's index so the column-wise concat below lines rows
        # up by label even when df does not carry a default RangeIndex.
        index=df.index,
    )
    return pd.concat([df.drop("f_27", axis=1), df_codes], axis=1)

# Apply the same character encoding of f_27 to the train and test features.
df_x_train, df_x_test = ordinizer(df_x_train_raw), ordinizer(df_x_test_raw)

In [7]:
# Per-column minimum and maximum of the training features; normalize() reads
# these module-level values for every frame it scales.
TAG_MIN, TAG_MAX = df_x_train.min(), df_x_train.max()

def normalize(df):
    """Min-max scale every column of ``df`` with the module-level statistics.

    For each column ``c`` the result is
    ``(df[c] - TAG_MIN[c]) / (TAG_MAX[c] - TAG_MIN[c])``. When the stored
    minimum and maximum are equal the division is skipped and the column is
    only shifted by the minimum.

    Returns a new frame; ``df`` itself is left untouched.
    """
    scaled = df.copy()
    for col in df.columns:
        low, high = TAG_MIN[col], TAG_MAX[col]
        shifted = df[col] - low
        # A zero-width range cannot be divided by; keep the shifted values.
        scaled[col] = shifted if low == high else shifted / (high - low)
    return scaled
# Scale both splits with the same TAG_MIN / TAG_MAX statistics.
df_x_train_norm, df_x_test_norm = (
    normalize(frame) for frame in (df_x_train, df_x_test)
)

### Define the objective function

In [8]:
def objectiveLGBM(trial: Trial, X, y, n_splits: int = 3):
    """Optuna objective: cross-validated error of a LightGBM classifier.

    Samples a hyper-parameter set from ``trial``, trains one
    ``LGBMClassifier`` per K-fold split and returns the mean of
    ``mean_absolute_error`` between the true and predicted labels on the
    held-out folds. Scoring on held-out rows (rather than on the rows used
    for fitting) keeps the search from simply rewarding the configuration
    that memorises the training data best.

    Parameters
    ----------
    trial : optuna.Trial
        Trial used to sample the hyper-parameters.
    X : array-like
        Feature matrix; converted with ``np.asarray`` before splitting.
    y : array-like
        Target labels, one per row of ``X``.
    n_splits : int, default 3
        Number of K-fold splits used for scoring.

    Returns
    -------
    float
        Mean held-out error across the folds (lower is better).
    """
    param = {
        'num_leaves': trial.suggest_int('num_leaves', 4, 64),
        'max_depth': trial.suggest_int('max_depth', 1, 16),
        'n_estimators': trial.suggest_int('n_estimators', 1, 300),
        'learning_rate': 0.01,
        # The keys must match LightGBM's parameter names exactly; a key with
        # stray whitespace is not recognised as the regularisation term.
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 1.0, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 1.0, log=True),
        # NOTE(review): LightGBM documents that bagging (`subsample`) is only
        # active when `subsample_freq` > 0, which is not set here -- confirm
        # whether row subsampling is actually intended.
        'subsample': trial.suggest_categorical('subsample', [0.6, 0.7, 0.8, 1.0]),
        'random_state': 6756,
        # Silence training logs via a model parameter instead of a fit()
        # keyword, which newer LightGBM releases no longer accept.
        'verbose': -1,
    }

    features = np.asarray(X)
    labels = np.asarray(y)
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=6756)

    fold_scores = []
    for train_idx, valid_idx in folds.split(features):
        # Build and train the model on the training part of the fold.
        model = lightgbm.LGBMClassifier(**param)
        model.fit(features[train_idx], labels[train_idx])

        # Evaluate the model on the held-out part of the fold.
        fold_pred = model.predict(features[valid_idx])
        fold_scores.append(mean_absolute_error(labels[valid_idx], fold_pred))

    return float(np.mean(fold_scores))

In [None]:
# Seed the sampler so repeated runs of this cell propose the same sequence of
# trials instead of a different search every time the kernel restarts.
study = optuna.create_study(direction='minimize', sampler=TPESampler(seed=6756))
study.optimize(
    lambda trial: objectiveLGBM(trial, df_x_train_norm, df_y_train),
    n_trials=100,
)
print('Best trial : score {}, \nparams {}'.format(study.best_trial.value, study.best_trial.params))

In [10]:
# best_trial.params only contains the values Optuna sampled. objectiveLGBM
# also pins learning_rate=0.01 and random_state=6756 outside the search
# space, so they are restored here; otherwise the final model would train
# with LightGBM's default learning rate rather than the one that was tuned.
# Parameter names are stripped so a name registered with stray whitespace
# still maps onto the real LightGBM keyword.
params = {
    "learning_rate": 0.01,
    "random_state": 6756,
    **{name.strip(): value for name, value in study.best_trial.params.items()},
}

In [11]:
# Refit on the full training set with the selected hyper-parameters and label
# the test set. Both calls receive DataFrames (previously fit got an ndarray
# and predict a DataFrame), and the test columns are put in the training
# column order so the two inputs have the same layout.
lgb = lightgbm.LGBMClassifier(**params)
lgb.fit(X=df_x_train_norm, y=df_y_train)
pred = lgb.predict(X=df_x_test_norm[df_x_train_norm.columns])
pred



array([1, 1, 0, ..., 0, 0, 0], dtype=int64)

In [None]:
# Build the submission column-wise. The previous version stacked the ids and
# the predictions as two very wide rows and then transposed the frame; a dict
# of columns gives the same two-column table directly.
df_sub = pd.DataFrame({"id": df_test["id"].to_numpy(), "target": pred})
df_sub.to_csv("submission.csv", index=False)