# Step 1: Import helpful libraries

In [1]:
# Importing Libraries
import pandas as pd # Intermediate DS
import numpy as np # Scientific Operations

import optuna # For Hyper Parameter Tuning
from functools import partial
import multiprocessing

from sklearn import preprocessing # Preprocesing library for Encoding, etc.
from sklearn.model_selection import train_test_split # For splitting train and test data
from sklearn.metrics import mean_squared_error # For Calculating Mean Squared Error
import lightgbm as lgb # Light GBM Regressor

import warnings
warnings.filterwarnings('ignore')

# Step 2: Load the data

Next, we'll load the training and test data.  

In [1]:
# Load the training and test data; the first CSV column is used as the index
train = pd.read_csv("input/train.csv", index_col=0)
test = pd.read_csv("input/test.csv", index_col=0)

# Preview both frames. A notebook cell only renders its last bare expression,
# so a bare `train.head()` followed by `test.head()` would silently drop the
# train preview; display() (available by default in IPython kernels) shows both.
display(train.head(), test.head())

The next section separates the `target` column from the training data to create the target variable `y_train`; the remaining columns become the features `X_train`.

In [1]:
# Separate the prediction target from the feature columns of the training frame
y_train = train["target"]
X_train = train.drop(columns=["target"])

# Independent copy of the test features so later edits cannot touch `test`
X_test = test.copy()

# Show the first rows of the training features
X_train.head()

The next section identifies the categorical columns and encodes them with scikit-learn's `OrdinalEncoder`.

In [1]:
# Categorical features are the columns whose name contains "cat"
cat_cols = [name for name in train.columns if 'cat' in name]
print(cat_cols)

# Learn the category -> ordinal code mapping from the training data only
ordinal_encoder = preprocessing.OrdinalEncoder()
ordinal_encoder.fit(X_train[cat_cols])

# Encode into copies so the original feature frames are not overwritten
label_X_train = X_train.copy()
label_X_test = X_test.copy()
label_X_train[cat_cols] = ordinal_encoder.transform(label_X_train[cat_cols])
label_X_test[cat_cols] = ordinal_encoder.transform(label_X_test[cat_cols])

In [1]:
# Inspect the training features after the categorical columns were ordinal-encoded
label_X_train

In [1]:
# Inspect the test features after applying the encoder fitted on the training data
label_X_test

Use Optuna to perform hyperparameter tuning.

In [1]:
def objective(trial, X, y, random_state=42):
    """Optuna objective: train one LightGBM model and return its hold-out RMSE.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.
    X : feature table
        Features handed to ``train_test_split``; 20% of the rows are held out
        for validation.
    y : target values
        Targets aligned row-by-row with ``X``.
    random_state : int, default 42
        Seed for the train/validation split. With a fixed seed every trial is
        scored on the same hold-out rows, so trial values are comparable
        instead of also reflecting split-to-split noise.

    Returns
    -------
    float
        RMSE on the validation split, rounded to 5 decimals (minimised by the study).
    """
    train_x, valid_x, train_y, valid_y = train_test_split(
        X, y, test_size=0.2, random_state=random_state
    )
    dtrain = lgb.Dataset(train_x, label=train_y, free_raw_data=False)
    dvalid = lgb.Dataset(valid_x, label=valid_y, free_raw_data=False)

    param = {
        "metric": "rmse",
        "verbosity": -1,
        "max_depth": trial.suggest_int("max_depth", 5, 50),
        "n_estimators": trial.suggest_int("n_estimators", 1000, 50000, step=100),
        # Sampled once: the original dict listed the 'learning_rate' key twice.
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.01),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.01, 50),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.01, 50),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "num_leaves": trial.suggest_int("num_leaves", 10, 200),
        "max_bin": trial.suggest_int("max_bin", 30, 1000),
        # LightGBM documents `colsample_bytree` / `subsample` as aliases of
        # `feature_fraction` / `bagging_fraction`. Sampling both names of an
        # alias pair gave the search dimensions that cannot both take effect,
        # so only the primary names are tuned.
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "cat_smooth": trial.suggest_int("cat_smooth", 5, 100),
        "cat_l2": trial.suggest_float("cat_l2", 1e-3, 100, log=True),
        # Leave two cores free, but never request fewer than one thread
        # (cpu_count() - 2 is <= 0 on one- and two-core machines).
        "num_threads": max(1, multiprocessing.cpu_count() - 2),
        #'device': 'gpu',
        #'gpu_platform_id': 0,
        #'gpu_device_id': 0
    }

    # Early stopping is requested through the callback API; the
    # `early_stopping_rounds=` keyword of lgb.train() is gone in LightGBM 4.
    lightgbm_model = lgb.train(
        param,
        dtrain,
        valid_sets=(dtrain, dvalid),
        callbacks=[lgb.early_stopping(stopping_rounds=250)],
    )

    # RMSE as sqrt(MSE): avoids the `squared=False` flag, which newer
    # scikit-learn releases deprecate and then remove.
    train_score = np.round(
        np.sqrt(mean_squared_error(train_y, lightgbm_model.predict(train_x))), 5
    )
    test_score = np.round(
        np.sqrt(mean_squared_error(valid_y, lightgbm_model.predict(valid_x))), 5
    )

    print(f'TRAIN RMSE : {train_score} || TEST RMSE : {test_score}')

    return test_score

In [1]:
# Show how many CPUs this machine reports; the objective derives its num_threads from this count
print(multiprocessing.cpu_count())

In [1]:
# Bind the encoded training data so Optuna can call the objective with only a trial
optimize = partial(objective, X=label_X_train, y=y_train)

# Seed the sampler so a Restart & Run All starts from the same sampler state
# instead of a fresh random one on every run.
study_lgbm = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_lgbm.optimize(optimize, n_trials=100)


print(f"Number of finished trials: {len(study_lgbm.trials)}")

print("Best trial:")
trial = study_lgbm.best_trial

print(f"  Value: {trial.value}")

print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")