### [Competition Link](https://www.kaggle.com/c/tabular-playground-series-aug-2021)

# Import Libraries

In [None]:
# Import Libraries
from pathlib import Path
import numpy as np 
import pandas as pd
from sklearn import linear_model
from sklearn import metrics
from sklearn import ensemble
from sklearn import model_selection
import xgboost as xgb
import lightgbm as lgb

# Read Data

In [None]:
# Directory the competition CSV files are read from.
path = Path('/kaggle/input/tabular-playground-series-aug-2021')

In [None]:
# Show the entries found directly inside the input directory.
list(path.iterdir())

In [None]:
# Load the training rows, the test rows and the sample submission.
train = pd.read_csv(path / 'train.csv')
test = pd.read_csv(path / 'test.csv')
sample = pd.read_csv(path / 'sample_submission.csv')

# Utility Code

In [None]:
def rmse(y, pred):
    """Return the root mean squared error of ``pred`` against ``y``, rounded to 6 decimals."""
    mean_sq_err = metrics.mean_squared_error(y, pred)
    return round(np.sqrt(mean_sq_err), 6)

In [None]:
def mfe(model, xtrain, ytrain, xval, yval):
    """Fit ``model`` on the training split and print train / validation RMSE.

    ``model`` must expose ``fit`` and ``predict``. Nothing is returned; the
    two scores are only printed.
    """
    model.fit(xtrain, ytrain)

    # Score the fitted model on the data it saw and on the held-out data.
    fitted_preds = model.predict(xtrain)
    holdout_preds = model.predict(xval)
    train_rmse = rmse(ytrain, fitted_preds)
    valid_rmse = rmse(yval, holdout_preds)

    print(f'RMSE Train: {train_rmse} RMSE Valid: {valid_rmse}')

In [None]:
def submit(model, data, fname=None, features=None):
    """Predict ``loss`` for ``data`` and write the result as a submission CSV.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    data
        DataFrame with an ``id`` column plus the feature columns.
    fname
        Destination path. Falls back to ``'submission.csv'`` when omitted,
        because ``to_csv(None)`` only returns the CSV text and writes nothing.
    features
        Columns passed to ``model.predict``. Defaults to the notebook-level
        ``feats`` list.

    Returns
    -------
    str
        A short confirmation message.
    """
    if fname is None:
        fname = 'submission.csv'
    if features is None:
        features = feats

    preds = model.predict(data.loc[:, features])
    df_preds = pd.DataFrame({'id': data['id'].values, 'loss': preds})
    df_preds.to_csv(fname, index=False)
    return 'Predictions exported to csv'

# EDA

In [None]:
# Peek at the first three training rows.
train.head(3)

In [None]:
# Print the column summary of the training frame.
train.info()

In [None]:
# True when at least one cell of the training frame is missing.
train.isna().any().any()

# Data Splitting

In [None]:
# Column groups used to slice the training frame before splitting.
key = ['id']
target = ['loss']
feats = [name for name in train.columns if name.startswith('f')]

# Feature matrix and flattened 1-D target vector.
X = train.loc[:, feats]
y = train.loc[:, target].to_numpy().flatten()
print(X.shape, y.shape)

In [None]:
# Hold out 25% of the rows for validation; the fixed seed keeps the
# shuffled split reproducible.
xtrain, xval, ytrain, yval = model_selection.train_test_split(
    X,
    y,
    test_size=0.25,
    shuffle=True,
    random_state=42,
)

In [None]:
# Shapes of the four pieces produced by the split, in the same order.
tuple(part.shape for part in (xtrain, xval, ytrain, yval))

In [None]:
# Complete dataset (no hold-out) to be used for training before submission.
x_all = train.loc[:, feats]
y_all = train.loc[:, target].to_numpy().flatten()

# Random Forest

In [None]:
# Random forest baseline: 40 trees, depth capped at 8, at least 5 samples
# per leaf, built on all cores with a fixed seed.
rf = ensemble.RandomForestRegressor(
    n_estimators=40,
    max_depth=8,
    min_samples_split=2,
    min_samples_leaf=5,
    # 'auto' is no longer accepted by current scikit-learn releases. For
    # regressors it meant "use every feature", which 1.0 states explicitly
    # and which older releases accept as well.
    max_features=1.0,
    max_leaf_nodes=None,
    max_samples=None,
    n_jobs=-1,
    random_state=42,
)

In [None]:
# %%time
# mfe(rf, xtrain, ytrain, xval, yval)

In [None]:
#submit(rf, test, fname='sub1.csv')

# XGBoost

## Native API

In [None]:
# Wrap each data set in xgboost's DMatrix container. Train and validation
# carry their labels; the test matrix has features only.
dtrain = xgb.DMatrix(data=xtrain, label=ytrain)
dval = xgb.DMatrix(data=xval, label=yval)
dtest = xgb.DMatrix(data=test.loc[:, feats])

In [None]:
# Booster parameters handed to xgb.train.
params = {
    # Step size and split / leaf constraints.
    'eta': 0.05,
    'gamma': 1,
    'max_depth': 6,
    'min_child_weight': 8,
    # Row and column sampling ratios.
    'subsample': 1,
    'colsample_bytree': 0.8,
    'colsample_bylevel': 0.8,
    'colsample_bynode': 1,
    # Regularisation weights.
    'lambda': 1,
    'alpha': 1,
    # Tree construction algorithm, training objective and reported metric.
    'tree_method': 'exact',
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    # Fixed seed for reproducibility.
    'seed': 42,
}

In [None]:
%%time
m = xgb.train(
    params, 
    dtrain, 
    num_boost_round = 1000, 
    evals = [(dtrain, 'train'), (dval, 'val')], # List of validation sets for which metrics will evaluated during training
    early_stopping_rounds = 30,
    verbose_eval = 50
)

In [None]:
# Disabled experiment kept as a string literal: the same training call as
# above, but with num_boost_round fixed at 345 instead of 1000.
'''
%%time
m2 = xgb.train(
    params, 
    dtrain, 
    num_boost_round = 345, 
    evals = [(dtrain, 'train'), (dval, 'val')], # List of validation sets for which metrics will evaluated during training
    early_stopping_rounds = 30,
    verbose_eval = 50
)
'''

In [None]:
# submit predictions
# xgb.train hands back the booster from the last round, not the best one, so
# limit prediction to the trees up to the best early-stopping round whenever
# that round was recorded on the booster.
best_round = getattr(m, 'best_iteration', None)
if best_round is None:
    preds = m.predict(dtest)
else:
    preds = m.predict(dtest, iteration_range=(0, best_round + 1))
df_preds = pd.DataFrame({'id': test.id.values, 'loss': preds})
df_preds.to_csv('xgb2.csv', index=False)

## Scikit Learn API

In [None]:
# Scikit-learn style XGBoost regressor: 100 trees of depth 6 at learning
# rate 0.1, no row or column subsampling, unit L1 / L2 regularisation.
m3 = xgb.XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    gamma=0,
    min_child_weight=7,
    max_delta_step=0,
    subsample=1,
    colsample_bytree=1,
    colsample_bylevel=1,
    reg_alpha=1,
    reg_lambda=1,
    objective='reg:squarederror',
    n_jobs=-1,
    random_state=42,
)

In [None]:
# Disabled fit call kept as a string literal: trains m3 while evaluating RMSE
# on the train and validation sets, with 10-round early stopping.
# NOTE(review): recent xgboost releases expect eval_metric and
# early_stopping_rounds on the XGBRegressor constructor rather than fit() --
# confirm against the installed version before re-enabling.
'''
m3.fit(
    xtrain, ytrain,
    eval_set = [(xtrain, ytrain), (xval, yval)],
    eval_metric = 'rmse',
    early_stopping_rounds = 10,
    verbose = 10)
'''

# LightGBM