In [None]:
import numpy as np 
import pandas as pd 

In [None]:
# Read the competition's train and test tables, then report each table's shape.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-jan-2021/train.csv',
        '../input/tabular-playground-series-jan-2021/test.csv',
    )
)

for label, frame in (('Train: ', train_data), ('Test: ', test_data)):
    print(label, frame.shape)

In [None]:
# Feature matrices: every column except the row identifier (and, for the
# training table, the regression target, which is kept separately as `y`).
X_test = test_data.drop('id', axis=1)
X = train_data.drop(['target', 'id'], axis=1)
y = train_data['target']

## Part 1 - ordinary XGB ;)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 5% of the training rows as a validation set (seeded, so the split is
# repeatable). In this notebook the resulting splits are referenced only by the
# commented-out XGBoost cell.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.05,
    random_state=22,
)

## XGB regressor

In [None]:
# NOTE: disabled XGBoost baseline, kept for reference only -- nothing in this
# cell executes. If re-enabled it would fit on X_train/y_train, pass
# X_valid/y_valid as the evaluation set (early_stopping_rounds=10), and store the
# test-set predictions in `predictions`. The submission cell writes `preds`,
# not `predictions`, so re-enabling this cell alone does not change the output file.

# from xgboost import XGBRegressor

# xgb = XGBRegressor(random_state=22
#                   , n_estimators=3000
#                   , early_stopping_rounds=10
#                   , learning_rate=0.05
#                   , subsample=0.9
#                   , colsample_bytree=0.9
#                   , n_jobs=-1)

# xgb.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=1)

# predictions = xgb.predict(X_test)

## Part 2 - Cross-validation and LightGBM

In [None]:
# thanks to:
# https://www.kaggle.com/hamditarek/tabular-playground-series-xgboost-lightgbm

from tqdm import tqdm
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_log_error, mean_squared_error
from lightgbm import LGBMRegressor

# Hyper-parameters unpacked into LGBMRegressor(**params) for the tuned model.
# Keyword form is used so each setting sits on its own line; the resulting dict
# has the same keys, values and key order as a literal would.
params = dict(
    objective='regression',
    metric='rmse',
    verbosity=-1,
    boosting_type='gbdt',
    feature_pre_filter=False,
    learning_rate=0.007,
    num_leaves=102,
    min_child_samples=20,
    sub_feature=0.4,
    sub_row=1,
    subsample_freq=0,
    lambda_l1=4.6,
    lambda_l2=1.9,
)


# Cross-validation setup: number of folds and the (unshuffled) K-fold splitter.
N_FOLDS = 10
kf = KFold(n_splits=N_FOLDS)

# Buffers filled by the fold loop: out-of-fold predictions for the tuned and the
# default model (one slot per training row) and the accumulated test predictions.
oof, oof_vanilla = np.zeros(len(y)), np.zeros(len(y))
preds = np.zeros(len(X_test))

# Final training settings: replace the learning rate stored in `params` and add
# the 'num_iterations' entry.
params.update({'learning_rate': 0.005, 'num_iterations': 5000})

# LightGBM 4.0 removed the `early_stopping_rounds` and `verbose` keyword
# arguments of `fit`; early stopping is requested through this callback instead.
from lightgbm import early_stopping

# K-fold loop. For every split: fit the tuned and the default LightGBM regressor
# on the training part, store both models' predictions for the held-out part in
# the out-of-fold buffers, and add the tuned model's test predictions (divided by
# the fold count) to `preds`, which therefore ends up as the mean over folds.
# `kf.split` is a generator without a length, so tqdm is told the total explicitly.
for train_ind, test_ind in tqdm(kf.split(X), total=N_FOLDS):
    Xtrain = X.iloc[train_ind]
    Xval = X.iloc[test_ind]
    ytrain = y.iloc[train_ind]
    yval = y.iloc[test_ind]

    model = LGBMRegressor(**params)
    vanilla_model = LGBMRegressor()

    # Stop the tuned model once the held-out metric has not improved for 50
    # rounds; verbose=False keeps the callback silent, as `verbose = 0` did.
    # `eval_set` is passed in its documented form, a list of (X, y) pairs.
    model.fit(
        Xtrain,
        ytrain,
        eval_set=[(Xval, yval)],
        callbacks=[early_stopping(stopping_rounds=50, verbose=False)],
    )
    vanilla_model.fit(Xtrain, ytrain)

    oof[test_ind] = model.predict(Xval)
    oof_vanilla[test_ind] = vanilla_model.predict(Xval)

    preds += model.predict(X_test) / N_FOLDS
    
# Report the RMSE of the out-of-fold predictions for both models. The square
# root is taken explicitly because `mean_squared_error(..., squared=False)` was
# deprecated in scikit-learn 1.4 and removed in 1.6. The labels now name the
# quantity actually printed (an RMSE, not a mean squared error).
rmse_vanilla = np.sqrt(mean_squared_error(y, oof_vanilla))
rmse_tuned = np.sqrt(mean_squared_error(y, oof))
print(f'out-of-fold RMSE on training data (vanilla model): {np.round(rmse_vanilla, 5)}')
print(f'out-of-fold RMSE on training data (with tuning): {np.round(rmse_tuned, 5)}')

## Creating submission file

In [None]:
# Pair each test-row id with its fold-averaged prediction and write the table
# to the submission file (without the DataFrame index).
output = pd.DataFrame(
    {
        "id": test_data["id"],
        "target": preds,
    }
)
output.to_csv(path_or_buf='submission.csv', index=False)

In [None]:
# Completion marker: printed only when every preceding cell has run through.
print('Finish!')