In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import lightgbm as lgb
import optuna.integration.lightgbm as oplgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the training set, the test set and the sample submission, in that order.
df_train, df_test, df_sample = map(
    pd.read_csv,
    (
        "/kaggle/input/tabular-playground-series-jan-2021/train.csv",
        "/kaggle/input/tabular-playground-series-jan-2021/test.csv",
        "/kaggle/input/tabular-playground-series-jan-2021/sample_submission.csv",
    ),
)

In [None]:
# Keep the row identifiers aside and remove them from the frames so that only
# feature columns (plus "target" in the training frame) remain.
#
# The drop mutates the frames in place, so each step is guarded: re-running
# this cell after "id" has already been removed is a no-op instead of a
# KeyError, and the previously captured train_id / test_id are left untouched.
if "id" in df_train.columns:
    train_id = df_train["id"]
    df_train.drop("id", axis=1, inplace=True)

if "id" in df_test.columns:
    test_id = df_test["id"]
    df_test.drop("id", axis=1, inplace=True)

In [None]:
# Every training column except the label is used as a model input.
feature_cols = [name for name in df_train.columns if not name == "target"]

In [None]:
# Label and inputs for training; the test frame is passed through unchanged.
train_y = df_train.target
train_x = df_train[feature_cols]
test_x = df_test

In [None]:
# Shuffled 10-way split; random_state is fixed so the split can be reproduced.
folds = KFold(
    n_splits=10,
    random_state=2021,
    shuffle=True,
)

In [None]:
class FoldsAverageLGBM:
    """Train one LightGBM model per cross-validation fold and average them.

    Attributes:
        folds: splitter whose ``split(X)`` yields ``(train_idx, valid_idx)`` pairs.
        models: boosters trained by the most recent ``fit`` call, one per fold.
        train_x, train_y: training data as NumPy arrays (set by ``fit``).
        oof_preds: float array of out-of-fold predictions, aligned with the
            rows passed to ``fit`` (set by ``fit``).
    """

    def __init__(self, folds):
        """Store the splitter that ``fit`` will iterate over.

        Args:
            folds: object exposing ``split(X)`` that yields pairs of index
                arrays ``(train_idx, valid_idx)``.
        """
        self.folds = folds
        self.models = []

    def fit(self, lgb_params, train_x, train_y):
        """Train one model per fold and record out-of-fold predictions.

        Args:
            lgb_params: parameter dict forwarded to ``lgb.train``.
            train_x: training features (DataFrame or array-like, rows = samples).
            train_y: training targets (Series or array-like), same length as
                ``train_x``.
        """
        # Start from a clean slate so that calling fit() again (e.g. re-running
        # the notebook cell) does not leave stale boosters in the ensemble.
        self.models = []

        self.train_x = np.asarray(train_x)
        self.train_y = np.asarray(train_y)

        # Always a float buffer: inheriting the target dtype would truncate
        # the predictions whenever the target column is integer-typed.
        oof_preds = np.zeros(len(self.train_y), dtype=np.float64)

        # Use the splitter given to the constructor, not a module-level global.
        for tr_idx, va_idx in tqdm(self.folds.split(train_x)):
            tr_x, va_x = self.train_x[tr_idx], self.train_x[va_idx]
            tr_y, va_y = self.train_y[tr_idx], self.train_y[va_idx]

            lgb_train_dataset = lgb.Dataset(tr_x, tr_y)
            lgb_valid_dataset = lgb.Dataset(va_x, va_y)
            # NOTE(review): `verbose_eval` is an lgb.train argument in the
            # LightGBM version this notebook was written for; newer releases
            # appear to have moved logging to callbacks -- confirm on upgrade.
            model = lgb.train(lgb_params, lgb_train_dataset, valid_sets=[lgb_valid_dataset], verbose_eval=100)
            self.models.append(model)

            # Each row is predicted by the one model that did not train on it.
            oof_preds[va_idx] = model.predict(va_x)

        self.oof_preds = oof_preds

    def predict(self, test_x):
        """Return the element-wise mean of every fold model's prediction.

        Args:
            test_x: features to predict on, in the same column order as the
                training features.

        Returns:
            Array with one averaged prediction per row of ``test_x``.

        Raises:
            RuntimeError: if no models have been trained yet.
        """
        if not self.models:
            raise RuntimeError("FoldsAverageLGBM.predict called before fit: no trained models")
        preds = [model.predict(test_x) for model in tqdm(self.models)]
        return np.mean(preds, axis=0)

In [None]:
# LightGBM parameters passed to the fold models, written as a single literal.
best_lgb_params = {
    # Reproducibility, objective and logging.
    'seed': 2021,
    'objective': 'regression',
    'metric': 'rmse',
    'verbosity': -1,
    'feature_pre_filter': False,
    # Regularisation and tree shape.
    'lambda_l1': 6.540486456085813,
    'lambda_l2': 0.01548480538099245,
    'num_leaves': 256,
    'feature_fraction': 0.52,
    'bagging_fraction': 0.6161835249194311,
    'bagging_freq': 7,
    'min_child_samples': 20,
    # Training schedule: small learning rate, many rounds, early stopping.
    "learning_rate": 0.001,
    "early_stopping_round": 1000,
    "num_iterations": 20000,
}

In [None]:
# Build the fold-averaging ensemble around the splitter defined above.
folds_average_lgbm = FoldsAverageLGBM(folds=folds)

In [None]:
# Train one model per fold; this also fills folds_average_lgbm.oof_preds.
folds_average_lgbm.fit(
    lgb_params=best_lgb_params,
    train_x=train_x,
    train_y=train_y,
)

In [None]:
# Out-of-fold RMSE: square root of the MSE between the true targets and the
# predictions each row received from the model that did not train on it.
np.sqrt(
    mean_squared_error(
        y_true=df_train.target,
        y_pred=folds_average_lgbm.oof_preds,
    )
)

In [None]:
# Average the fold models' predictions on the test features.
y_pred = folds_average_lgbm.predict(test_x=test_x)

In [None]:
# Copy the sample submission with its "target" column set to the averaged
# predictions, write it to disk, then preview the first rows.
sub = df_sample.assign(target=y_pred)
sub.to_csv("submission_lgbm_fold_10.csv", index=False)

sub.head()