# Simple LightGBM + Optuna integration

Optuna has a LightGBM integration for quick parameter tuning.

This notebook gives you a good starting point: a simple LGBM module and a baseline workspace built around the classifier.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree lazily and echo the full path of every file found.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing modules

In [None]:
# import lightgbm as lgb
import optuna.integration.lightgbm as lgb

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
import optuna

# Reading Data

In [None]:
# Load the competition's train and test CSV files from the Kaggle input directory.
# `df` feeds preprocessing/training; `df_test` is only used for the final predictions.
df = pd.read_csv('/kaggle/input/tabular-playground-series-jun-2021/train.csv')
df_test = pd.read_csv('/kaggle/input/tabular-playground-series-jun-2021/test.csv')

# Defining model as a function

This makes it much easier to tune hyperparameters or to run stacking / blending later on.

In [None]:
def exec_lgb_model(X_train, X_test, y_train, y_test, params):
    """Train a model with ``lgb_optuna_model`` and score it on the hold-out split.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Hold-out features and labels. They are passed to the trainer as its
        validation set and are also used to compute the returned score.
    params : dict
        LightGBM parameters, forwarded unchanged to ``lgb_optuna_model``.

    Returns
    -------
    tuple
        ``(score, model, params)`` where ``score`` is
        ``sklearn.metrics.log_loss(y_test, y_pred)``, ``model`` is the trained
        booster and ``params`` is the dict that was passed in.
    """
    # The trainer needs the hold-out split as well: it builds its validation
    # Dataset (used for early stopping) from X_test / y_test.
    model = lgb_optuna_model(X_train, X_test, y_train, y_test, params)
    # Predict with the iteration selected by early stopping.
    y_pred = model.predict(X_test, num_iteration=model.best_iteration)
    return log_loss(y_test, y_pred), model, params
def lgb_optuna_model(X_train, X_test, y_train, y_test, params,
                     num_boost_round=200, early_stopping_rounds=20):
    """Train a booster via ``lgb.train`` using the hold-out split for validation.

    In this notebook ``lgb`` is ``optuna.integration.lightgbm``, so ``lgb.train``
    is Optuna's wrapper around LightGBM training rather than plain
    ``lightgbm.train``.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Validation features and labels; wrapped in a Dataset that references
        the training Dataset and passed as ``valid_sets``.
    params : dict
        LightGBM parameters passed straight to ``lgb.train``.
    num_boost_round : int, default 200
        Forwarded to ``lgb.train`` as ``num_boost_round``.
    early_stopping_rounds : int, default 20
        Forwarded to ``lgb.train`` as ``early_stopping_rounds``.

    Returns
    -------
    The trained model object returned by ``lgb.train``.
    """
    # NOTE(review): newer LightGBM releases appear to expect early stopping to
    # be configured through callbacks instead of this keyword — confirm against
    # the installed LightGBM / Optuna versions before upgrading.
    lgb_train = lgb.Dataset(X_train, y_train)
    # `reference=lgb_train` ties the validation Dataset to the training one.
    lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
    model = lgb.train(params,
                      lgb_train,
                      valid_sets=lgb_eval,
                      num_boost_round=num_boost_round,
                      early_stopping_rounds=early_stopping_rounds)
    return model

# Defining preprocessing as a function

In [None]:
def preprocess_X(df, include_id = False):
    """Select the model's feature columns from a competition data frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns ``feature_1`` ... ``feature_74`` (and
        ``id`` when ``include_id`` is true).
    include_id : bool, default False
        When true, the ``id`` column is appended after the feature columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the selected columns; ``df`` itself is not modified.
    """
    # NOTE(review): the range starts at 1, so a `feature_0` column (if the
    # data has one) is not selected — confirm this is intended.
    feature_cols = ['feature_'+str(x) for x in range(1, 75)]
    # Take an explicit copy so that adding `id` below writes to our own frame
    # instead of a slice of the caller's `df` (avoids SettingWithCopyWarning).
    X = df[feature_cols].copy()
    if include_id:
        X['id'] = df['id']
    return X

def preprocess_data(df, drop_tgt = False):
    """Split a training frame into a feature matrix and zero-based integer labels.

    The ``target`` column is split on ``'_'``; the second piece is converted
    to ``int`` and shifted down by one to give the label.

    Parameters
    ----------
    df : pandas.DataFrame
        Training frame with a string ``target`` column and the feature columns
        read by ``preprocess_X``.
    drop_tgt : bool, default False
        When false, the two pieces of the split are stored on ``df`` as the
        helper columns ``_`` and ``tgt`` (a side effect on the caller's frame).
        When true, no helper columns are added and ``df`` is left untouched.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is the result of ``preprocess_X(df)`` and ``y``
        is an integer Series named ``tgt``.
    """
    target_parts = df['target'].str.split('_', expand=True)
    # Take the labels from the split result itself, so they are still
    # available when the helper columns are not kept on `df`
    # (reading df['tgt'] after dropping it would raise KeyError).
    y = target_parts[1].astype(int).rename('tgt') - 1
    if not drop_tgt:
        df[['_','tgt']] = target_parts
    X = preprocess_X(df)
    return X, y

In [None]:
# Build features and labels, then hold out 20% of the rows for validation.
# The fixed random_state keeps the split the same from run to run.
X, y = preprocess_data(df)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=114514,
)

In [None]:
# Parameter set handed to the trainer for the 9-class problem.
# NOTE(review): `num_iterations` and `early_stopping_round` look like they
# overlap with the `num_boost_round=200` / `early_stopping_rounds=20` that
# `lgb_optuna_model` passes to `lgb.train` — confirm which values take effect.
lgbm_params = {
    'objective': 'multiclass',
    'metric': 'multi_logloss',
    'num_class': 9,
    'feature_pre_filter': False,
    'lambda_l1': 9.970496731080852,
    'lambda_l2': 0.33605433486440883,
    'num_leaves': 9,
    'feature_fraction': 0.4,
    'bagging_fraction': 0.8180268235748993,
    'bagging_freq': 5,
    'min_child_samples': 20,
    'num_iterations': 100,
    'early_stopping_round': 10,
}
model = lgb_optuna_model(X_train, X_test, y_train, y_test, lgbm_params)

In [None]:
# Predict on the competition test set using the model's best iteration, then
# lay the output out as one column per class (Class_1 ... Class_9) plus `id`.
X_val = preprocess_X(df_test)
y_pred = model.predict(X_val, num_iteration=model.best_iteration)
prediction = pd.DataFrame(
    y_pred,
    columns=[f'Class_{k}' for k in range(1, 10)],
)
prediction['id'] = df_test['id']
prediction

In [None]:
# Save the predictions as the submission file; index=False keeps the
# DataFrame's row index out of the CSV.
prediction.to_csv('/kaggle/working/submission.csv', index=False)