In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import model_selection  # Stratified KFold
from sklearn import metrics  # for metrics on Regression Data

import xgboost as xgb

In [None]:
# Load the training CSV into a DataFrame and preview its first rows.
train = pd.read_csv("../input/tabular-playground-series-aug-2021/train.csv")
train.head()

In [None]:
# Summarise the training frame: column dtypes, non-null counts and memory use.
train.info()

In [None]:
# Load the test CSV into a DataFrame and preview its first rows.
test = pd.read_csv("../input/tabular-playground-series-aug-2021/test.csv")
test.head()

In [None]:
# Summarise the test frame: column dtypes, non-null counts and memory use.
test.info()

### The dataset has __no null values__ (every column in the `info()` summaries above reports a full non-null count)

## Create Folds

In [None]:
def create_folds(data, n_splits=5, target="loss", random_state=42):
    """
    Assign a stratified cross-validation fold id to every row of ``data``.

    The continuous target is cut into equal-width bins (bin count chosen with
    Sturges's rule) and those bins are used as the stratification labels, so
    each fold sees a similar target distribution.

    :param data: frame containing the ``target`` column; it is left unmodified
    :param n_splits: number of folds to create
    :param target: name of the continuous column to stratify on
    :param random_state: seed for the row shuffle; pass ``None`` to get a
        different shuffle on every call
    :return: a shuffled copy of ``data`` with an extra integer ``kfold``
        column holding, for each row, the fold in which it is validation data
    """
    # sample() returns a new frame, so the caller's frame never gains columns
    shuffled = data.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Calculate the number of bins using Sturges's law
    num_bins = int(np.floor(1 + np.log2(len(shuffled))))

    # Bin labels live in a local Series instead of a temporary "bins" column
    target_bins = pd.cut(shuffled[target], bins=num_bins, labels=False)

    kf = model_selection.StratifiedKFold(n_splits=n_splits)

    # Integer fold ids; -1 marks "not assigned" until the loop fills every row
    fold_ids = np.full(len(shuffled), -1, dtype=int)

    # note that, instead of targets, we stratify on the bins
    for fold, (_, val_idx) in enumerate(kf.split(X=shuffled, y=target_bins.values)):
        fold_ids[val_idx] = fold

    shuffled["kfold"] = fold_ids
    return shuffled

# Assign a fold id to every training row and save the result to disk;
# the evaluation cell further down reads this file back.
df = create_folds(train)
df.to_csv("kfold_train.csv", index=False)

## Models

In [None]:
def runLR(df, fold):
    """
    Train an XGBoost regressor on all folds except ``fold`` and report RMSE.

    Despite the name, no linear regression is involved: the model is
    ``xgb.XGBRegressor`` and the metric is root-mean-squared error, computed
    on both the training rows and the held-out fold.

    :param df: the training data frame; must contain the "loss", "kfold" and
        "id" columns, every other column is used as a feature
    :param fold: fold on which evaluation will be performed
    :return: tuple ``(rmse_train, rmse_valid)``; the same values are printed
    :raises ValueError: if ``fold`` selects no rows or leaves no training rows
    """
    # Columns that must never be fed to the model
    non_feature_cols = ["loss", "kfold", "id"]

    # Training and Validation data frames
    in_fold = df["kfold"] == fold
    df_train = df[~in_fold].reset_index(drop=True)
    df_valid = df[in_fold].reset_index(drop=True)

    # Fail before the fit instead of after it when the split is degenerate
    if df_train.empty or df_valid.empty:
        raise ValueError(
            f"fold {fold!r} gives {len(df_train)} training and {len(df_valid)} validation rows"
        )

    # Training data
    x_train = df_train.drop(non_feature_cols, axis=1).values
    y_train = df_train["loss"].values

    # Validation data
    x_valid = df_valid.drop(non_feature_cols, axis=1).values
    y_valid = df_valid["loss"].values

    model = xgb.XGBRegressor(n_estimators=300, max_depth=5)
    model.fit(x_train, y_train)

    # RMSE is the square root of the mean squared error
    rmse_train = metrics.mean_squared_error(y_train, model.predict(x_train)) ** 0.5
    rmse_valid = metrics.mean_squared_error(y_valid, model.predict(x_valid)) ** 0.5

    print(f"RMSE Score (Fold : {fold}) :: Train : {rmse_train} and Valid : {rmse_valid}")

    return rmse_train, rmse_valid

In [None]:
# Reload the fold-annotated training data written by the fold-creation cell
# and evaluate each fold in turn; range(5) mirrors the n_splits=5 used there.
data = pd.read_csv("./kfold_train.csv")
for f_ in range(5):
    runLR(data, f_)

## Training

In [None]:
# Final fit on every labelled row. The hyper-parameters are the same ones
# used for the per-fold evaluation in runLR.
Y = data.loc[:, "loss"].to_numpy()
X = data.drop(columns=["loss", "kfold", "id"]).to_numpy()

model = xgb.XGBRegressor(max_depth=5, n_estimators=300)
model.fit(X, Y)

In [None]:
# Build the test feature matrix: everything except the row identifier.
# NOTE(review): features are matched to the training matrix by position only;
# the shape comparison below checks the column count, not the column order.
XTest = test.drop(columns=["id"]).to_numpy()
(XTest.shape, X.shape)

In [None]:
# Predict the target for every test row with the model fitted on the full training data.
YPred = model.predict(XTest)

## Submission

In [None]:
# Load the sample submission file; its layout is reused for the final output.
sub = pd.read_csv("../input/tabular-playground-series-aug-2021/sample_submission.csv")
sub.head()

In [None]:
# Write the predictions into the "loss" column and save the submission file.
# NOTE(review): the assignment is positional, so it assumes the sample
# submission rows are in the same order as test.csv — confirm before submitting.
sub["loss"] = YPred
sub.to_csv("submission.csv", index=False)

In [None]:
# Preview the first rows of the submission frame that was just written.
sub.head()