# imports & variables

In [None]:
# basic
import numpy as np 
import pandas as pd
import scipy as sp
import sys,os,math,time,random,shutil

# split 
from sklearn.model_selection import train_test_split

# model
import lightgbm as lgb
import optuna 
import optuna.integration.lightgbm as lgbo

# oof
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from scipy import stats

# data path
# Locations of the competition CSV files, given relative to the working directory.
# TRAIN_PATH is read to build the features and the label.
TRAIN_PATH = "../input/tabular-playground-series-aug-2021/train.csv"
# TEST_PATH is read to build the rows that get predicted for the submission.
TEST_PATH = "../input/tabular-playground-series-aug-2021/test.csv"
# SAMPLE_SUBMISSION_PATH is loaded as the template whose TARGET column is overwritten.
SAMPLE_SUBMISSION_PATH = "../input/tabular-playground-series-aug-2021/sample_submission.csv"
# SUBMISSION_PATH is where the final submission CSV is written.
SUBMISSION_PATH = "submission.csv"

# main columns
# ID is dropped from both train and test before modelling (it is not used as a feature).
ID  ="id"
# TARGET is the label column in train and the column filled in the submission file.
TARGET = "loss"

# Seed shared by every randomised step in this notebook.
SEED = 2022


def seed_everything(seed=SEED):
    """Seed the random number sources used by this notebook.

    Seeds Python's ``random`` module and NumPy's global generator, and
    exports the same value through the ``PYTHONHASHSEED`` environment
    variable.

    Args:
        seed: Integer seed; defaults to the module-level ``SEED``.

    Returns:
        None.
    """
    random.seed(seed)
    # NOTE: the interpreter reads PYTHONHASHSEED at start-up, so this export
    # only influences processes launched after this point.
    os.environ.update(PYTHONHASHSEED=str(seed))
    np.random.seed(seed)


# Apply the default seed as soon as the cell runs.
seed_everything(SEED)

# model
# Number of splits handed to KFold for the cross-validated training loop.
NFOLD = 5
# Fraction of the training rows held out as validation data for the parameter search.
TEST_SIZE = 0.25
# Stored under the 'objective' key of the LightGBM parameter dict.
OBJECTIVE = 'mean_squared_error'
# Stored under the 'metric' key of the LightGBM parameter dict.
METRICS = 'rmse'
# Passed as num_boost_round to the Optuna-driven tuning call.
NBR = 10
# Passed as verbose_eval to lgb.train inside the fold loop.
VERBOSE_EVAL = 100

# build & predict 

In [None]:
# Read the full training table.
train = pd.read_csv(TRAIN_PATH)

# Features are every column except the row id and the label;
# the label is the TARGET column on its own.
X = train.drop(columns=[ID, TARGET])
y = train[TARGET]

# Hold out a TEST_SIZE share of the rows (seeded, hence reproducible) to act
# as the validation set during the parameter search.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=SEED,
)

# Hyper-parameter search through Optuna's LightGBM integration.
# Only the objective and the metric are fixed up front.
params = dict(objective=OBJECTIVE, metric=METRICS)

# Wrap the hold-out split in LightGBM datasets.
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_valid = lgb.Dataset(X_val, label=y_val)

# `lgbo.train` is called like `lgb.train`; each run is limited to NBR
# boosting rounds and scored on the validation dataset.
model = lgbo.train(
    params,
    lgb_train,
    valid_sets=[lgb_valid],
    verbose_eval=False,
    num_boost_round=NBR,
)

# Show the parameters carried by the returned booster.
print(model.params)

# K-fold training: fit one model per fold with the tuned parameters, report
# the validation error of each fold and collect one test prediction per fold.
test = pd.read_csv(TEST_PATH)
X_test = test.drop([ID],axis=1)

# Freeze the tuned parameters once. `model` is rebound inside the loop, so
# reading `model.params` there would take them from the previous fold's
# booster instead of from the tuning result.
best_params = dict(model.params)

kf = KFold(n_splits=NFOLD, random_state=SEED, shuffle=True)
pred_test_list = []
for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    print("Fold :", fold+1)

    # KFold yields positional indices, so select rows with `.iloc`; `.loc`
    # is label-based and only agrees while the index is the default 0..n-1.
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    D_train = lgb.Dataset(X_train, y_train)
    D_valid = lgb.Dataset(X_val, y_val)

    # NOTE(review): no num_boost_round is passed here, so the number of rounds
    # comes from the tuned params or LightGBM's default - confirm intended.
    # NOTE(review): `verbose_eval` is not accepted by newer LightGBM releases
    # (replaced by the `log_evaluation` callback) - confirm the pinned version.
    model = lgb.train(best_params, D_train, valid_sets=[D_valid], verbose_eval=VERBOSE_EVAL)
    pred_val = model.predict(X_val)

    # Keep the prediction as an array; no need to convert it to a Python list.
    pred_test = model.predict(X_test)
    pred_test_list.append(pred_test)

    # Compute the fold MSE once and print RMSE followed by MSE.
    mse = mean_squared_error(y_val, pred_val)
    print('#### fold #########', np.sqrt(mse), mse)

# make submission csv

In [None]:
# Combine the per-fold test predictions into one prediction per test row by
# averaging across folds (axis 0 indexes the folds).
# A statistical mode is not meaningful for continuous predictions: the fold
# values are almost surely all distinct, and SciPy then returns the smallest
# one. The shape of its result also differs between SciPy versions, which made
# the former `[0]` indexing unreliable.
final_test_pred = np.mean(np.asarray(pred_test_list, dtype=float), axis=0)

# Use the sample submission as a template and overwrite its TARGET column.
sub = pd.read_csv(SAMPLE_SUBMISSION_PATH)
sub[TARGET] = final_test_pred
sub.to_csv(SUBMISSION_PATH,index=False)
sub.head()