In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, ParameterGrid, cross_validate, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
import optuna
from sklearn.metrics import make_scorer

In [3]:
!pip install optuna

Collecting optuna
  Downloading optuna-2.5.0-py3-none-any.whl (287 kB)
[K     |████████████████████████████████| 287 kB 5.1 MB/s eta 0:00:01
Collecting cmaes>=0.6.0
  Downloading cmaes-0.8.2-py3-none-any.whl (15 kB)
Collecting alembic
  Downloading alembic-1.5.5.tar.gz (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 19.6 MB/s eta 0:00:01
[?25hCollecting cliff
  Downloading cliff-3.7.0-py3-none-any.whl (80 kB)
[K     |████████████████████████████████| 80 kB 10.2 MB/s eta 0:00:01
Collecting colorlog
  Downloading colorlog-4.7.2-py2.py3-none-any.whl (10 kB)
Collecting Mako
  Downloading Mako-1.1.4.tar.gz (479 kB)
[K     |████████████████████████████████| 479 kB 28.1 MB/s eta 0:00:01
[?25hCollecting python-editor>=0.3
  Downloading python_editor-1.0.4-py3-none-any.whl (4.9 kB)
Collecting stevedore>=2.0.1
  Downloading stevedore-3.3.0-py3-none-any.whl (49 kB)
[K     |████████████████████████████████| 49 kB 8.6 MB/s  eta 0:00:01
[?25hCollecting pbr!=2.1.0,>=2.0.0
  Download

In [5]:
# Raw training and test tables, read from the notebook's working directory.
df_train = pd.read_csv(filepath_or_buffer='train.csv')
df_test = pd.read_csv(filepath_or_buffer='test.csv')

In [6]:
def Encode(df):
    """Return a label-encoded, standardized copy of ``df``.

    Every ``object``-dtype column is replaced by the integer codes of a
    ``LabelEncoder`` fitted on that column, and every ``float64`` column is
    replaced by the output of a ``StandardScaler`` fitted on that column.
    Columns of any other dtype are left as they are.

    The transformers are fitted on ``df`` itself. Two frames encoded by two
    separate calls are therefore not guaranteed to share the same category
    codes or the same scaling.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns as ``df``.
    """
    # Work on a copy so the caller's frame is never silently rewritten;
    # this keeps notebook cells idempotent when they are re-run.
    encoded = df.copy()

    label_encoder = LabelEncoder()
    for column in encoded.select_dtypes('object'):
        encoded[column] = label_encoder.fit_transform(encoded[column])

    scaler = StandardScaler()
    for column in encoded.select_dtypes('float64'):
        # StandardScaler expects a 2-D array: one column, one row per sample.
        values = encoded[column].values.reshape(-1, 1)
        # Flatten back to 1-D before assigning to a single column.
        encoded[column] = scaler.fit_transform(values).ravel()

    return encoded

In [12]:
# Hyperparameter tuning + cross-validation

# LightGBM settings held constant across all trials. They live at module level
# so the final model can be built from exactly the configuration that was
# cross-validated, not only from the tuned values.
FIXED_LGB_PARAMS = {
    'n_estimators': 500,
    'metric': 'l2',
    'max_depth': 5,
    'boosting_type': 'gbdt',
}


def _rmse(y_true, y_pred):
    """Root-mean-squared error, with arguments in the order make_scorer passes them."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


def objective(trial):
    """Optuna objective: mean 5-fold cross-validated RMSE of a LightGBM regressor.

    Reads the global ``df_train``, encodes its features with ``Encode`` and
    evaluates an ``LGBMRegressor`` built from ``FIXED_LGB_PARAMS`` plus the
    learning rate suggested by ``trial``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``learning_rate`` (log scale, 0.01 to 0.1).

    Returns
    -------
    float
        Mean of the per-fold test RMSE; the study minimizes this value.
    """
    X_train = Encode(df_train.drop('target', axis=1))
    y_train = df_train['target']

    trial_params = {
        **FIXED_LGB_PARAMS,
        # variable (tuned by Optuna)
        'learning_rate': trial.suggest_float("learning_rate", 1e-2, 1e-1, log=True),
        # Candidates for later tuning: num_leaves (int, 10-300),
        # reg_alpha and reg_lambda (log scale, 0.001-10).
    }

    clf = lgb.LGBMRegressor(**trial_params)
    kf = KFold(n_splits=5, random_state=0, shuffle=True)

    # The scorer is only used for reporting here, so the plain (positive)
    # RMSE is returned and averaged over the folds.
    scoring = {'RMSE': make_scorer(_rmse)}
    scores = cross_validate(clf, X=X_train, y=y_train, cv=kf, scoring=scoring)

    return scores['test_RMSE'].mean()


# Seed the sampler so that Restart & Run All proposes the same learning rates.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=0),
)
study.optimize(objective, n_trials=3)

# study.best_params contains only the tuned values (learning_rate). Merge them
# with the fixed settings so that a model built from lgb_params matches the
# configuration that achieved the best cross-validation score.
lgb_params = {**FIXED_LGB_PARAMS, **study.best_params}

# Show the results of the study
print('最終トライアル回数:{}'.format(len(study.trials)))
print('ベストトライアル:')
trial = study.best_trial
print('値:{}'.format(trial.value))
print('パラメータ:')
for key, value in trial.params.items():
    print('{}:{}'.format(key, value))

[32m[I 2021-02-28 14:17:39,255][0m A new study created in memory with name: no-name-d37a98a2-67a1-42dd-9c9c-fb1a40191577[0m
[32m[I 2021-02-28 14:18:19,626][0m Trial 0 finished with value: 0.8445927522492465 and parameters: {'learning_rate': 0.09145012564474751}. Best is trial 0 with value: 0.8445927522492465.[0m
[32m[I 2021-02-28 14:19:03,146][0m Trial 1 finished with value: 0.846292205269016 and parameters: {'learning_rate': 0.029826804435367643}. Best is trial 0 with value: 0.8445927522492465.[0m
[32m[I 2021-02-28 14:19:46,267][0m Trial 2 finished with value: 0.8470767875253757 and parameters: {'learning_rate': 0.025475819256792768}. Best is trial 0 with value: 0.8445927522492465.[0m


最終トライアル回数:3
ベストトライアル:
値:0.8445927522492465
パラメータ:
learning_rate:0.09145012564474751


In [15]:
# Final fit on the full training set and creation of the submission file.
features_train = df_train.drop('target', axis=1)
y_train = df_train['target']
assert list(features_train.columns) == list(df_test.columns), \
    'train and test must share the same feature columns'

# Encode train and test in ONE pass. Encode fits its label encoders and scalers
# on whatever frame it receives, so encoding the two frames separately could
# give the same category different integer codes (and the same value a
# different scale) at fit time and at predict time.
features_all = Encode(pd.concat([features_train, df_test], keys=['train', 'test']))
X_train = features_all.loc['train']
X_test = features_all.loc['test']

clf = lgb.LGBMRegressor(**lgb_params)
clf.fit(X_train, y_train)
pred_test = clf.predict(X_test)

df_sample = pd.read_csv('sample_submission.csv')
assert len(df_sample) == len(pred_test), \
    'sample submission and predictions must have the same number of rows'
df_sample['target'] = pred_test
df_sample.to_csv('submmit_lgb.csv', index=False)