In [47]:
import pandas as pd
import random
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier

In [5]:
!pip install optuna

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting optuna
  Downloading optuna-3.0.5-py3-none-any.whl (348 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m348.5/348.5 KB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cmaes>=0.8.2
  Downloading cmaes-0.9.0-py3-none-any.whl (23 kB)
Collecting alembic>=1.5.0
  Downloading alembic-1.9.1-py3-none-any.whl (210 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m210.4/210.4 KB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting importlib-metadata<5.0.0
  Downloading importlib_metadata-4.13.0-py3-none-any.whl (23 kB)
Collecting colorlog
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting cliff
  Downloading cliff-4.1.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.0/81.0 KB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
Collecting Mako
  Downloading Mako-1.2.4-py3-none-any.

In [6]:
import numpy as np
import pandas as pd
import optuna
from lightgbm import LGBMClassifier
from optuna import Trial
from optuna.samplers import TPESampler
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

In [1]:
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [41]:
!unzip -qq "/content/gdrive/MyDrive/유전체 정보 품종 분류 AI 경진대회.zip"

In [42]:
train = pd.read_csv('/content/train.csv')
test = pd.read_csv('/content/test.csv')

In [43]:
def get_x_y(df):
    if 'class' in df.columns:
        df_x = df.drop(columns=['id', 'class','father','mother','gender'])
        df_y = df['class']
        return df_x, df_y
    else:
        df_x = df.drop(columns=['id','father','mother','gender'])
        return df_x

In [44]:
train_x, train_y = get_x_y(train)
test_x = get_x_y(test)

In [45]:

train_y

0      B
1      C
2      B
3      A
4      C
      ..
257    B
258    C
259    A
260    A
261    B
Name: class, Length: 262, dtype: object

In [48]:
class_le = preprocessing.LabelEncoder()
snp_le = preprocessing.LabelEncoder()
snp_col = [f'SNP_{str(x).zfill(2)}' for x in range(1,16)]

In [49]:
snp_data = []
for col in snp_col:
    snp_data += list(train_x[col].values)

In [50]:
train_y = class_le.fit_transform(train_y)
snp_le.fit(snp_data)

LabelEncoder()

In [51]:
for col in train_x.columns:
    if col in snp_col:
        train_x[col] = snp_le.transform(train_x[col])
        test_x[col] = snp_le.transform(test_x[col])

In [53]:
from sklearn.metrics import f1_score

In [56]:
def objective(trial: Trial) -> float:
    params_lgb = {
        "random_state": 42,
        "verbosity": -1,
        "learning_rate": 0.05,
        "n_estimators": 10000,
        "objective": "multiclass",
        "metric": "multi_logloss",
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 3e-5),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 9e-2),
        "max_depth": trial.suggest_int("max_depth", 1, 20),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.4, 1.0),
        "subsample": trial.suggest_float("subsample", 0.3, 1.0),
        "subsample_freq": trial.suggest_int("subsample_freq", 1, 10),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "max_bin": trial.suggest_int("max_bin", 200, 500),
    }
    
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2)
#학습모델 형성
    model = LGBMClassifier(**params_lgb)
#학습 진행
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        early_stopping_rounds=100,
        verbose=False,
    )

    lgb_pred = model.predict_proba(X_valid).argmax(axis=1)
# 해당 대회의 평가 지표가 Logloss이므로 이렇게 코드를 짬
# 만약 이게 회귀 대회고 평가 지표가 mse라면 
#score = mean_absolute_error(xgb_model.predict(X), y)
#이런식으로 짜니까 상황에 따라 평가지표 부분을 변경해줘야함.
    log_score = f1_score(y_valid, lgb_pred,average='micro')
    
    return log_score

In [57]:
sampler = TPESampler(seed=42)
study = optuna.create_study(
    study_name="lgbm_parameter_opt",
    direction="minimize",
    sampler=sampler,
)
# n_trials 지정해주지 않으면, 무한 반복
study.optimize(objective, n_trials=10)
print("Best Score:", study.best_value)
print("Best trial:", study.best_trial.params)

[32m[I 2023-01-04 10:00:17,388][0m A new study created in memory with name: lgbm_parameter_opt[0m
[32m[I 2023-01-04 10:00:36,681][0m Trial 0 finished with value: 0.7063492063492064 and parameters: {'reg_alpha': 1.12424581642324e-05, 'reg_lambda': 0.08556428806974939, 'max_depth': 15, 'num_leaves': 154, 'colsample_bytree': 0.4936111842654619, 'subsample': 0.40919616423534183, 'subsample_freq': 1, 'min_child_samples': 88, 'max_bin': 380}. Best is trial 0 with value: 0.7063492063492064.[0m
[32m[I 2023-01-04 10:00:52,942][0m Trial 1 finished with value: 0.7137188208616779 and parameters: {'reg_alpha': 2.1245096608103405e-05, 'reg_lambda': 0.0018526142807772773, 'max_depth': 20, 'num_leaves': 214, 'colsample_bytree': 0.5274034664069657, 'subsample': 0.42727747704497043, 'subsample_freq': 2, 'min_child_samples': 34, 'max_bin': 357}. Best is trial 0 with value: 0.7063492063492064.[0m
[32m[I 2023-01-04 10:01:07,243][0m Trial 2 finished with value: 0.7008692365835223 and parameters: 

Best Score: 0.6878306878306878
Best trial: {'reg_alpha': 1.5431890808024213e-05, 'reg_lambda': 0.05331731527343814, 'max_depth': 1, 'num_leaves': 156, 'colsample_bytree': 0.502314474212375, 'subsample': 0.3455361150896956, 'subsample_freq': 10, 'min_child_samples': 97, 'max_bin': 443}


In [58]:
sampler = TPESampler(seed=42)
study = optuna.create_study(
    study_name="lgbm_parameter_opt",
    direction="minimize",
    sampler=sampler,
)
# n_trials 지정해주지 않으면, 무한 반복
study.optimize(objective, n_trials=20)
print("Best Score:", study.best_value)
print("Best trial:", study.best_trial.params)

[32m[I 2023-01-04 10:02:59,737][0m A new study created in memory with name: lgbm_parameter_opt[0m
[32m[I 2023-01-04 10:03:15,492][0m Trial 0 finished with value: 0.7142857142857143 and parameters: {'reg_alpha': 1.12424581642324e-05, 'reg_lambda': 0.08556428806974939, 'max_depth': 15, 'num_leaves': 154, 'colsample_bytree': 0.4936111842654619, 'subsample': 0.40919616423534183, 'subsample_freq': 1, 'min_child_samples': 88, 'max_bin': 380}. Best is trial 0 with value: 0.7142857142857143.[0m
[32m[I 2023-01-04 10:03:27,790][0m Trial 1 finished with value: 0.7152305366591081 and parameters: {'reg_alpha': 2.1245096608103405e-05, 'reg_lambda': 0.0018526142807772773, 'max_depth': 20, 'num_leaves': 214, 'colsample_bytree': 0.5274034664069657, 'subsample': 0.42727747704497043, 'subsample_freq': 2, 'min_child_samples': 34, 'max_bin': 357}. Best is trial 0 with value: 0.7142857142857143.[0m
[32m[I 2023-01-04 10:03:42,740][0m Trial 2 finished with value: 0.7152305366591081 and parameters: 

Best Score: 0.685374149659864
Best trial: {'reg_alpha': 1.5431890808024213e-05, 'reg_lambda': 0.05331731527343814, 'max_depth': 1, 'num_leaves': 156, 'colsample_bytree': 0.502314474212375, 'subsample': 0.3455361150896956, 'subsample_freq': 10, 'min_child_samples': 97, 'max_bin': 443}


In [60]:
print(study.best_trial.params) 

{'reg_alpha': 1.5431890808024213e-05, 'reg_lambda': 0.05331731527343814, 'max_depth': 1, 'num_leaves': 156, 'colsample_bytree': 0.502314474212375, 'subsample': 0.3455361150896956, 'subsample_freq': 10, 'min_child_samples': 97, 'max_bin': 443}


In [61]:
from lightgbm import LGBMClassifier

lgbm_clf = LGBMClassifier(reg_alpha=1.5431890808024213e-05,reg_lambda=0.05331731527343814,max_depth=1,num_leaves=156,colsample_bytree=0.502314474212375,subsample=0.3455361150896956,subsample_freq=10,min_child_samples=97,max_bin=443)

lgbm_clf.fit(train_x, train_y)

LGBMClassifier(colsample_bytree=0.502314474212375, max_bin=443, max_depth=1,
               min_child_samples=97, num_leaves=156,
               reg_alpha=1.5431890808024213e-05, reg_lambda=0.05331731527343814,
               subsample=0.3455361150896956, subsample_freq=10)

In [64]:
train_x

Unnamed: 0,trait,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,SNP_06,SNP_07,SNP_08,SNP_09,SNP_10,SNP_11,SNP_12,SNP_13,SNP_14,SNP_15
0,2,5,1,0,4,2,0,0,5,0,5,1,0,0,0,0
1,2,1,1,2,0,0,1,0,4,0,1,0,4,5,0,0
2,2,5,5,0,4,3,5,0,4,4,1,0,0,0,0,0
3,1,0,5,0,4,0,5,5,0,5,1,5,5,5,0,5
4,2,5,5,3,0,3,0,0,0,0,5,0,0,1,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257,2,1,1,0,4,3,1,0,4,0,5,1,4,0,0,0
258,2,5,0,2,0,0,1,4,4,0,1,1,0,1,0,4
259,1,1,5,0,4,0,1,5,4,4,0,5,5,5,2,5
260,1,0,5,0,4,0,5,5,0,4,1,1,4,5,2,5


In [63]:
test_x

Unnamed: 0,trait,SNP_01,SNP_02,SNP_03,SNP_04,SNP_05,SNP_06,SNP_07,SNP_08,SNP_09,SNP_10,SNP_11,SNP_12,SNP_13,SNP_14,SNP_15
0,1,1,5,0,4,0,1,5,4,4,1,1,4,5,2,4
1,2,5,1,3,5,3,0,0,0,0,5,1,0,0,0,0
2,2,5,1,0,0,2,1,0,0,0,1,0,4,5,0,5
3,2,5,1,2,0,3,0,0,0,0,5,0,4,1,0,0
4,1,0,5,0,5,0,5,5,0,5,1,5,4,5,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170,2,1,5,3,0,2,1,0,5,0,5,5,0,0,0,4
171,2,5,0,0,0,2,1,0,0,0,1,0,0,1,0,4
172,2,5,0,0,0,2,1,0,0,0,5,1,0,1,0,5
173,2,1,5,2,4,3,5,0,4,0,5,1,0,0,0,0


In [66]:
preds = lgbm_clf.predict(test_x)
print('Done.')

Done.


In [73]:
submit = pd.read_csv('/content/sample_submission.csv')

In [75]:
preds

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [76]:
pred = pd.DataFrame(preds)

In [77]:
submit

Unnamed: 0,id,class
0,TEST_000,A
1,TEST_001,A
2,TEST_002,A
3,TEST_003,A
4,TEST_004,A
...,...,...
170,TEST_170,A
171,TEST_171,A
172,TEST_172,A
173,TEST_173,A


In [70]:
pred

Unnamed: 0,0
0,1
1,1
2,1
3,1
4,1
...,...
170,1
171,1
172,1
173,1


In [78]:
submit

Unnamed: 0,id,class
0,TEST_000,A
1,TEST_001,A
2,TEST_002,A
3,TEST_003,A
4,TEST_004,A
...,...,...
170,TEST_170,A
171,TEST_171,A
172,TEST_172,A
173,TEST_173,A


즉 평가지표에 따라서 direction하는 부분이 다릅니다. 예를들어 logloss는 최소화 시켜주는 방향으로 하이퍼파라미터튜닝을 해야 좋은 모델이므로 direction="minimize"로 설정해주면 됩니다. 반면 accuracy라든지 roc-auc 같은 경우는 최대화 시켜주는 방향으로 하이퍼파라미터튜닝을 해야 하므로 direction="maximize"로 설정해주는 것이 맞습니다. 다른 대회도 평가 지표를 잘 이해해서 코드를 활용하시면 될 것 같습니다.

In [20]:
optuna.visualization.plot_optimization_history(study)

In [16]:
# 파라미터들관의 관계
optuna.visualization.plot_parallel_coordinate(study)

In [17]:
# 각 파라미터들의 상관관계
optuna.visualization.plot_contour(
    study,
    params=[
        "max_depth",
        "num_leaves",
        "colsample_bytree",
        "subsample",
        "subsample_freq",
        "min_child_samples",
        "max_bin",
    ],
)

In [18]:
# 하이퍼파라미터 중요도
optuna.visualization.plot_param_importances(study)