In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing Essential Libraries and Datasets

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import cudf
import cupy

In [None]:
def _read_csv_float32(csv_path):
    """Read a CSV with pandas and cast every float64 column to float32."""
    frame = pd.read_csv(csv_path)
    float64_cols = frame.select_dtypes(include='float64').columns
    return frame.astype(dict.fromkeys(float64_cols, np.float32))


path = '/kaggle/input/tabular-playground-series-feb-2021/'

# Train and test tables: float64 columns are stored as float32.
train = _read_csv_float32(path + 'train.csv')
test = _read_csv_float32(path + 'test.csv')

# The submission template is read without any dtype conversion.
sample_submission = pd.read_csv(path + 'sample_submission.csv')

# Data Preprocessing

In [None]:
# Show up to 30 columns when a frame is rendered, then preview the training rows.
pd.options.display.max_columns = 30
train.head()

In [None]:
# Show up to 30 columns when a frame is rendered, then preview the test rows.
pd.options.display.max_columns = 30
test.head()

In [None]:
# (rows, columns) of the train frame followed by the test frame.
tuple(frame.shape for frame in (train, test))

In [None]:
# Train and test have similar column data types, so inspecting either one is enough.
train.dtypes

In [None]:
# Summary statistics of the training frame, as produced by pandas `describe`.
train.describe()

In [None]:
# Distinct values found in the `cat0` column of the training frame.
train['cat0'].unique()

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode every non-numeric column of train and test.
# - The encoder is fitted on the training rows only; with handle_unknown='ignore'
#   a category that was not seen during fitting encodes to all zeros.
# - dtype=float32 keeps the indicator columns in the same precision as the
#   float columns, which this notebook stores as float32.
oh = OneHotEncoder(handle_unknown='ignore', dtype=np.float32)

# exclude='number' picks the non-numeric columns without listing specific
# integer/float widths (which differ between platforms).
cat_cols = train.select_dtypes(exclude='number').columns

oh.fit(train[cat_cols])

# Name each indicator column "<source column>_<category>" instead of the default
# 0..N-1 integer labels, so the resulting frames have string-only, readable
# column names. The order matches the column order of the encoder output.
ohe_names = [
    f"{col}_{level}"
    for col, levels in zip(cat_cols, oh.categories_)
    for level in levels
]

# Reuse the source frame's index so the column-wise concat below lines up row
# for row even if the frames do not carry a default RangeIndex.
ohenc = pd.DataFrame(
    oh.transform(train[cat_cols]).toarray(), columns=ohe_names, index=train.index
)
ohenctest = pd.DataFrame(
    oh.transform(test[cat_cols]).toarray(), columns=ohe_names, index=test.index
)

# Replace the raw categorical columns with their indicator columns.
train = pd.concat([train.drop(columns=cat_cols), ohenc], axis=1)
test = pd.concat([test.drop(columns=cat_cols), ohenctest], axis=1)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
#from cuml.preprocessing.TargetEncoder import TargetEncoder

#SMOOTH = 0.001
#SPLIT = 'interleaved'
#FOLDS = 5

#encoder = TargetEncoder(n_folds=FOLDS, smooth=SMOOTH, split_method=SPLIT)
#cat_cols = X_train.select_dtypes(exclude = ['int', 'float32']).columns


#for col in cat_cols:
#        X_train[col] = encoder.fit_transform(X_train[col], y_train)
#        test[col] = encoder.transform(test[col])

In [None]:
# Separate the label from the predictors; `id` is removed from both feature tables.
y_train = train['target']
X_train = train.drop(columns=['target', 'id'])
X_test = test.drop(columns='id')

In [None]:
# Preview the first rows of the training features.
X_train.head()

In [None]:
# Preview the first values of the training target.
y_train.head()

# Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a validation set; random_state=2 fixes the shuffle.
split_parts = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=2,
)
X_train, X_valid, y_train, y_valid = split_parts

In [None]:
def _downcast_float64(frame):
    """Return `frame` as a DataFrame with every float64 column cast to float32."""
    frame = pd.DataFrame(frame)
    float64_cols = frame.select_dtypes(include='float64').columns
    return frame.astype({c: np.float32 for c in float64_cols})


# Downcast all three feature matrices, not only the training one, so the frames
# passed to `predict` carry the same dtypes as the frame passed to `fit`.
X_train = _downcast_float64(X_train)
X_valid = _downcast_float64(X_valid)
X_test = _downcast_float64(X_test)

# Model Building

In [None]:
from xgboost import XGBRegressor
from cuml.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error

In [None]:
# XGBoost baseline: fit on the training split, then score on the validation split.
xgb = XGBRegressor(tree_method='gpu_hist')
xgb.fit(X_train, y_train)

pred_xgb = xgb.predict(X_valid)
rmse_xgb = mean_squared_error(pred_xgb, y_valid, squared=False)  # squared=False -> RMSE
print("score : ", rmse_xgb)

In [None]:
# Random Forest (cuML)
rf = RandomForestRegressor()
rf.fit(X_train, y_train)

# Score on the validation split, like the XGBoost cell does. The score that was
# previously commented out compared the *test* predictions with y_valid, which
# belong to different rows, so validation predictions are computed separately.
valid_pred_rf = rf.predict(X_valid)
print("score : ", mean_squared_error(y_valid, valid_pred_rf, squared = False))

# Predictions for the competition test set.
pred_rf = rf.predict(X_test)

In [None]:
# LGBM
# `tree_method='gpu_hist'` is an XGBoost option that LightGBM does not recognise.
# LightGBM selects the GPU with `device_type='gpu'`, the same setting the
# `objective` function in this notebook uses.
lgbm = LGBMRegressor(device_type='gpu', n_estimators=4000, learning_rate=0.01, max_depth=4)
lgbm.fit(X_train, y_train)

# Score on the validation split (the old commented-out line referenced an
# undefined `pred`).
valid_pred_lgbm = lgbm.predict(X_valid)
print("score : ", mean_squared_error(y_valid, valid_pred_lgbm, squared = False))

# Predictions for the competition test set.
pred_lgbm = lgbm.predict(X_test)

In [None]:
def objective(trial, X = X_train, y = y_train):
    """Optuna objective: hold-out RMSE of a LightGBM regressor for one trial.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.
    X, y
        Features and target to tune on. The defaults are the notebook-level
        ``X_train`` / ``y_train`` as they exist when this cell is executed
        (Python binds default arguments once, at definition time).

    Returns
    -------
    float
        RMSE on an internal 20% hold-out of ``(X, y)``. The split uses
        ``random_state=0``, so every trial is scored on the same rows.

    Notes
    -----
    * The L1/L2 penalties are sampled and passed as ``reg_alpha`` /
      ``reg_lambda``. In LightGBM a bare ``alpha`` is the Huber/quantile loss
      parameter, not L1 regularisation, so tuning ``alpha`` had no effect on
      this regression objective.
    * ``subsample_freq=1`` is set because LightGBM only applies ``subsample``
      (bagging) when the bagging frequency is non-zero.
    * The sampled names are valid ``LGBMRegressor`` keyword arguments, so
      ``study.best_trial.params`` can be passed to the estimator directly
      (add ``subsample_freq`` as well to reproduce the bagging).
    """
    # Local names are distinct from the notebook-level X_train / y_train.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X, y, test_size = 0.2, random_state = 0)

    params = {
        'device_type' : 'gpu',
        'gpu_use_dp' : 'true',
        'learning_rate' : trial.suggest_categorical("learning_rate",
            [0.008, 0.009, 0.01, 0.012, 0.014, 0.016 ,0.018, 0.02]),
        'n_estimators' : trial.suggest_int("n_estimators", 100, 1500),
        'max_depth' : trial.suggest_int("max_depth", 2, 20),
        'num_leaves' : trial.suggest_int("num_leaves", 2, 200),
        'random_state' : 0,
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log = True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log = True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'subsample_freq': 1,
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300)
    }

    model = LGBMRegressor(**params)
    model.fit(X_fit, y_fit, eval_set = [(X_val, y_val)], verbose = False)

    val_pred = model.predict(X_val)

    # squared=False makes mean_squared_error return the root of the MSE.
    return mean_squared_error(y_val, val_pred, squared = False)

In [None]:
import optuna

# TPESampler is the sampler create_study uses by default; it is passed explicitly
# here only to give it a fixed seed, so the search does not start from a
# different random state on every run.
study = optuna.create_study(
    direction = 'minimize',
    sampler = optuna.samplers.TPESampler(seed = 0),
)
study.optimize(objective, n_trials = 100)
print ('Total Finished Trials:', len(study.trials))
print('Best Trial:', study.best_trial.params)

In [None]:
# Hyper-parameters of the run recorded as "Best_trial No 2" at the end of this notebook.
# NOTE(review): LightGBM appears to treat `alpha` as a Huber/quantile-loss
# parameter rather than L1 regularisation (`reg_alpha`), and to ignore
# `subsample` unless `subsample_freq` > 0 — confirm before relying on these two.
Best_trial = {
    'learning_rate': 0.018,
    'n_estimators': 1450,
    'max_depth': 14,
    'num_leaves': 72,
    'lambda': 0.0024644531179561766,
    'alpha': 1.863601675701509,
    'colsample_bytree': 0.3,
    'subsample': 0.8,
    'min_child_weight': 276,
}

# Fit on the training split with those settings and predict the test set.
lgbm = LGBMRegressor(**Best_trial)
lgbm.fit(X_train, y_train)
pred_lgbm = lgbm.predict(X_test)

In [None]:
# Put the LightGBM test predictions into the submission template and save it as CSV.
submission_file = 'submission9_LGBM_Using_Optuna_BestTrial2.csv'
sample_submission = sample_submission.assign(target=pred_lgbm)
sample_submission.to_csv(submission_file, index=False)

In [None]:
#Best Trial No 1: {'learning_rate': 0.02, 'n_estimators': 1447, 'max_depth': 13, 'num_leaves': 20, 'lambda': 0.002497286679888304, 
#             'alpha': 0.0395223334673374, 'colsample_bytree': 0.3, 'subsample': 0.5, 'min_child_weight': 62}


# Best_trial No 2 = {'learning_rate': 0.018, 'n_estimators': 1450, 'max_depth': 14, 'num_leaves': 72, 'lambda': 0.0024644531179561766, 
#              'alpha': 1.863601675701509, 'colsample_bytree': 0.3, 'subsample': 0.8, 'min_child_weight': 276}