This notebook is copied from the [Tabular Playground Series - Feb 2021 competition](https://www.kaggle.com/c/tabular-playground-series-feb-2021).

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

import math

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

from xgboost import XGBRegressor
from xgboost import XGBRFRegressor

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from sklearn.model_selection import GridSearchCV
        
# Directory holding the competition CSV files; every read below is relative to it.
input_path = Path('/kaggle/input/tabular-playground-series-feb-2021/')

# Read in the data files

In [None]:
# Load the training set, using the `id` column as the row index,
# and preview the first rows.
train = pd.read_csv(input_path.joinpath('train.csv'), index_col='id')
display(train.head())

In [None]:
# Summary statistics of the training set.
display(train.describe())
# `DataFrame.info()` prints its report itself and returns None, so it is
# called directly; wrapping it in display() also rendered a stray "None".
train.info()

In [None]:
sns.set(rc={'figure.figsize':(30,20)})
# Correlation is only defined for numeric columns. Recent pandas versions raise
# on string columns instead of silently dropping them, so select the numeric
# columns explicitly before computing the matrix.
numeric_corr = train.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True, annot_kws={'size':12})

In [None]:
# Load the test set, using the `id` column as the row index,
# and preview the first rows.
test = pd.read_csv(input_path.joinpath('test.csv'), index_col='id')
display(test.head())

In [None]:
# Summary statistics of the test set.
display(test.describe())
# `DataFrame.info()` prints its report itself and returns None, so it is
# called directly; wrapping it in display() also rendered a stray "None".
test.info()

# One Hot Encoding Training Data
`cat6` is not encoded with `get_dummies` because some of its categories are missing from the training dataset. It is encoded later with the `LabelEncoder` instead.

In [None]:
# Categorical columns to one-hot encode ('cat6' is handled separately).
dummy_columns = ['cat0', 'cat1','cat2','cat3','cat4','cat5','cat7','cat8','cat9']
# drop_first=True drops one indicator per column to avoid redundant features.
train_df = pd.get_dummies(train, columns=dummy_columns, drop_first=True)
display(train_df.head())
# info() prints its own report and returns None, so it is not passed to display().
train_df.info()

In [None]:
# One-hot encode the same columns in the test set as in the training set.
test_df = pd.get_dummies(test, columns=dummy_columns, drop_first=True)
display(test_df.head())
# info() prints its own report and returns None, so it is not passed to display().
test_df.info()

In [None]:
# Load the sample submission; it supplies the `id` index that the
# predictions are written against.
submission = pd.read_csv(input_path.joinpath('sample_submission.csv'), index_col='id')
display(submission.head())

# Check if there are missing columns

In [None]:
# Compare the encoded feature columns in both directions. A column present in
# only one frame would make prediction on the test set fail, so columns that
# exist only in the test frame matter as much as the ones only in train.
# 'target' is excluded because it is the label and is expected to be train-only.
train_columns = set(train_df.columns) - {'target'}
test_columns = set(test_df.columns)
{
    'only_in_train': sorted(train_columns - test_columns),
    'only_in_test': sorted(test_columns - train_columns),
}

# Encode the categorical variables with missing categories using the LabelEncoder

In [None]:
categorical_columns = ['cat6']
#categorical_columns = train_df.columns
for c in categorical_columns:
    # Encode every column that is still non-numeric. Checking for "not numeric"
    # rather than dtype == 'object' also covers pandas' dedicated string dtype,
    # and keeps the cell safe to re-run (an already-encoded column is skipped).
    if not pd.api.types.is_numeric_dtype(train_df[c]):
        lbl = LabelEncoder()
        # Fit on train and test together so that categories occurring in only
        # one of the two sets still receive a code.
        lbl.fit(list(train_df[c].values) + list(test_df[c].values))
        train_df[c] = lbl.transform(train_df[c].values)
        test_df[c] = lbl.transform(test_df[c].values)

display(train_df.head())

# Pull out the target, and make a validation split

In [None]:
# Separate the label from the features. `pop` removes the column from
# `train_df` in place, so the guard keeps this cell safe to re-run: on a second
# run the column is already gone and the existing `target` is kept.
if 'target' in train_df.columns:
    target = train_df.pop('target')

## Split the data

In [None]:
# Keep 70% of the rows for training and hold out the rest for validation.
# The fixed random_state makes the split, and every score computed on it,
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    train_df, target, train_size=0.70, random_state=42
)

In [None]:
def plot_results(name, y, yhat, num_to_plot=10000, lims=(0,12), figsize=(6,6)):
    """Scatter predictions against true values and show the RMSE in the title.

    Parameters
    ----------
    name : str
        Label placed in front of the score in the plot title.
    y : array-like
        True target values.
    yhat : array-like
        Predicted values, in the same order as ``y``.
    num_to_plot : int, default 10000
        Only the first ``num_to_plot`` points are drawn; the score is still
        computed on all points.
    lims : tuple, default (0, 12)
        Limits applied to both axes and end points of the identity line.
    figsize : tuple, default (6, 6)
        Size of the matplotlib figure.
    """
    plt.figure(figsize=figsize)
    # RMSE is computed as sqrt(MSE): the `squared=False` shortcut of
    # mean_squared_error is not accepted by recent scikit-learn releases.
    score = math.sqrt(mean_squared_error(y, yhat))
    plt.scatter(y[:num_to_plot], yhat[:num_to_plot])
    # Identity line: perfect predictions would fall exactly on it.
    plt.plot(lims, lims)
    plt.ylim(lims)
    plt.xlim(lims)
    plt.xlabel('Actual')
    plt.ylabel('Predicted')
    plt.title(f'{name}: {score:0.5f}', fontsize=18)
    plt.show()

In [None]:
# Disabled hyperparameter grid search over n_estimators, max_depth and
# learning_rate, kept for reference only; nothing in this cell is executed.
#param_grid = {
#    'n_estimators':[500, 700, 1000],
#    'max_depth':[3,4,5],
#    'learning_rate':[0.1, 0.01, 0.05]
#}
#model = GridSearchCV(XGBRegressor(), param_grid, cv=7)
#model.fit(X_train, y_train, early_stopping_rounds=5, eval_set=[(X_test, y_test)], verbose=False)
#display(model.best_params_)

In [None]:
#model = XGBRegressor(n_estimators=1000, objective ='reg:squarederror', colsample_bytree = 0.5, subsample = 0.9, max_depth = 5, gamma=0.25, reg_lambda = 10, scale_pos_weight=3, alpha = 10, learning_rate=0.1)
#model = XGBRegressor(n_estimators=1000, objective ='reg:squarederror', learning_rate=0.01, max_depth = 3)#0.85697
#model = XGBRegressor(n_estimators=1000, objective ='reg:squarederror', learning_rate=0.1, max_depth = 3)#0.84937
#model = XGBRegressor(n_estimators=1000, objective ='reg:squarederror', learning_rate=0.7, max_depth = 3)#0.85186
#model = XGBRegressor(n_estimators=1000, objective ='reg:squarederror', learning_rate=0.3, max_depth = 5)#0.84638

#model = XGBRegressor(n_estimators=1000, objective ='reg:squarederror', learning_rate=0.3, max_depth = 3)#0.84478
#model = XGBRegressor(n_estimators=1000, objective ='reg:squarederror', learning_rate=0.3, max_depth = 3, colsample_bytree = 0.5) #0.84509
#model = XGBRegressor(n_estimators=1000, objective ='reg:squarederror', learning_rate=0.3, max_depth = 3, colsample_bytree = 0.3)#0.84427
#model = XGBRegressor(n_estimators=1000, objective ='reg:squarederror', learning_rate=0.3, max_depth = 3, colsample_bytree = 0.3, gamma=0.5)#0.84475
#model = XGBRegressor(n_estimators=1000, objective ='reg:squarederror', learning_rate=0.3, max_depth = 3, colsample_bytree = 0.3, gamma=1)#0.84452
#model = XGBRegressor(n_estimators=1000, objective ='reg:squarederror', learning_rate=0.3, max_depth = 3, colsample_bytree = 0.3, gamma=10)#0.84604
#model = XGBRegressor(n_estimators=1000, objective ='reg:squarederror', learning_rate=0.3, max_depth = 3, colsample_bytree = 0.3, gamma=0.25)#0.84427

#model = XGBRegressor(n_estimators=1000, objective ='reg:squarederror', learning_rate=0.3, max_depth = 3, colsample_bytree = 0.3, subsample=0.9)#0.84473
#model = XGBRegressor(n_estimators=1000, objective ='reg:squarederror', learning_rate=0.3, max_depth = 3, colsample_bytree = 0.3, subsample=0.5)#0.84629
#model = XGBRegressor(n_estimators=1000, objective ='reg:squarederror', learning_rate=0.3, max_depth = 3, colsample_bytree = 0.3, subsample=0.25)#0.84737
#model = XGBRegressor(n_estimators=1000, objective ='reg:squarederror', learning_rate=0.3, max_depth = 3, colsample_bytree = 0.3, subsample=0.75)#0.84530

#model = XGBRegressor(n_estimators=1000, objective ='reg:squarederror', learning_rate=0.3, max_depth = 3, colsample_bytree = 0.3, alpha=5) #0.84422
#model = XGBRegressor(n_estimators=1000, objective ='reg:squarederror', learning_rate=0.3, max_depth = 3, colsample_bytree = 0.3, alpha=10)#0.84406
#model = XGBRegressor(n_estimators=1000, objective ='reg:squarederror', learning_rate=0.3, max_depth = 3, colsample_bytree = 0.3, alpha=30)#0.84327
#model = XGBRegressor(n_estimators=1000, objective ='reg:squarederror', learning_rate=0.3, max_depth = 3, colsample_bytree = 0.3, alpha=60)#0.84329

#model = XGBRegressor(n_estimators=1000, objective ='reg:squarederror', learning_rate=0.3, max_depth = 3, colsample_bytree = 0.3, alpha=30, gamma=30)#0.85107
#model = XGBRegressor(n_estimators=1000, objective ='reg:squarederror', learning_rate=0.3, max_depth = 3, colsample_bytree = 0.3, alpha=30, gamma=0.5)#0.84357

#model = XGBRegressor(n_estimators=1000, objective ='reg:squarederror', learning_rate=0.3, max_depth = 10, colsample_bytree = 0.3, alpha=30)#0.84614
#model = XGBRegressor(n_estimators=1000, objective ='reg:squarederror', learning_rate=0.3, max_depth = 6, colsample_bytree = 0.3, alpha=30)#0.84417

#model = XGBRegressor(n_estimators=1000, objective ='reg:squaredlogerror', learning_rate=0.3, max_depth = 3, colsample_bytree = 0.3, alpha=30)#0.86158
#model = XGBRegressor(n_estimators=1000, objective ='reg:pseudohubererror', learning_rate=0.3, max_depth = 3, colsample_bytree = 0.3, alpha=30)#85.78727

#model = XGBRegressor(n_estimators=1000, objective ='reg:squarederror', learning_rate=0.3, max_depth = 3, colsample_bytree = 0.3, alpha=30, reg_lambda=30) #0.84375
#model = XGBRegressor(n_estimators=1000, objective ='reg:squarederror', learning_rate=0.3, max_depth = 3, colsample_bytree = 0.3, alpha=30, reg_lambda=10) #0.84408
#model = XGBRegressor(n_estimators=1000, objective ='reg:squarederror', learning_rate=0.3, max_depth = 3, colsample_bytree = 0.3, alpha=30, reg_lambda=60) #0.84357
#model = XGBRegressor(n_estimators=1000, objective ='reg:squarederror', learning_rate=0.3, max_depth = 3, colsample_bytree = 0.3, alpha=30, reg_lambda=100) #0.84333
#model = XGBRegressor(n_estimators=1000, objective ='reg:squarederror', learning_rate=0.3, max_depth = 3, colsample_bytree = 0.3, alpha=30, reg_lambda=1000) #0.84296

#model = XGBRegressor(n_estimators=1000, objective ='reg:squarederror', learning_rate=0.3, max_depth = 6, colsample_bytree = 0.3, alpha=30, reg_lambda=1000) #0.84340
#model = XGBRegressor(n_estimators=1000, objective ='reg:squarederror', learning_rate=0.1, max_depth = 3, colsample_bytree = 0.3, alpha=30, reg_lambda=1000) #0.84294
#model = XGBRegressor(n_estimators=1000, objective ='reg:squarederror', learning_rate=0.25, max_depth = 3, colsample_bytree = 0.3, alpha=30, reg_lambda=1000) #0.84290

#model = XGBRegressor(n_estimators=1000, objective ='reg:squarederror', learning_rate=0.25, max_depth = 3, colsample_bytree = 0.3, alpha=30, reg_lambda=1000, scale_pos_weight=10) #0.84290
#model = XGBRegressor(n_estimators=1000, objective ='reg:squarederror', learning_rate=0.25, max_depth = 3, colsample_bytree = 0.3, alpha=30, reg_lambda=1000, scale_pos_weight=100) #0.84290
#model = XGBRegressor(n_estimators=1000, objective ='reg:squarederror', learning_rate=0.25, max_depth = 3, colsample_bytree = 0.3, alpha=30, reg_lambda=1000, subsample=0.5) #0.84369

# Validation fit with early stopping on the hold-out split.
# Early stopping is configured on the estimator itself: recent XGBoost releases
# no longer accept `early_stopping_rounds` as an argument of `fit()`.
model = XGBRegressor(
    n_estimators=1000,
    objective='reg:squarederror',
    learning_rate=0.25,
    max_depth=3,
    colsample_bytree=0.3,
    alpha=30,
    reg_lambda=1000,
    subsample=0.25,
    early_stopping_rounds=5,
)  #0.84472

# Training stops once the score on the hold-out split has not improved for
# 5 consecutive rounds.
model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)

y_pred = model.predict(X_test)
plot_results('XGBRegressor', y_test, y_pred)

In [None]:
#model = XGBRegressor(n_estimators=1000, objective ='reg:squarederror', colsample_bytree = 0.3, max_depth = 5, alpha = 10, learning_rate=0.1)

# Refit on the full training data for the submission.
# The hold-out rows (X_test) are part of train_df, so using them as an early
# stopping eval_set here would score the model on rows it is trained on and the
# stopping signal would be meaningless. Instead, reuse the number of boosting
# rounds that early stopping selected during the validation fit above.
best_n_estimators = model.best_iteration + 1  # best_iteration is zero-based

# A separate name keeps the validation model intact, so this cell can be re-run.
# NOTE(review): these hyperparameters keep subsample=0.25; the experiment log
# above records 0.84290 for the same settings without subsample - confirm which
# configuration is intended for the submission.
final_model = XGBRegressor(
    n_estimators=best_n_estimators,
    objective='reg:squarederror',
    learning_rate=0.25,
    max_depth=3,
    colsample_bytree=0.3,
    alpha=30,
    reg_lambda=1000,
    subsample=0.25,
)
final_model.fit(train_df, target)
#model.fit(train_df, target)
submission['target'] = final_model.predict(test_df)
submission.to_csv('xgb.csv')