# **Preparing the environment**

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path

import os
# Print the full path of every file available under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))
        
import matplotlib.pyplot as plt
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder

from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor


        
# Directory under /kaggle/input from which the competition CSV files
# (train, test, sample submission) are read in the cells below.
input_path = Path('/kaggle/input/tabular-playground-series-feb-2021/')

# Read in the data files

In [None]:
# Training data, indexed by the `id` column.
train_csv = input_path / 'train.csv'
train = pd.read_csv(train_csv, index_col='id')
display(train.head())

In [None]:
# Competition test data, indexed by the `id` column.
test_csv = input_path / 'test.csv'
test = pd.read_csv(test_csv, index_col='id')
display(test.head())

In [None]:
# Submission template, indexed by the `id` column; its `target` column is
# overwritten with model predictions at the end of the notebook.
submission_csv = input_path / 'sample_submission.csv'
submission = pd.read_csv(submission_csv, index_col='id')
display(submission.head())

## Pull out the target, and make a validation split

In [None]:
# `pop` removes the target column from `train` in place, so `train` holds
# features only from here on.
target = train.pop('target')

# Hold out 35% of the rows for validation. The seed is fixed so that the scores
# printed below (and the parameter ranges hand-picked from them) are
# reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    train, target, train_size=0.65, random_state=42)

## Encoding the categoricals

As we are not sure whether the categorical variables are ordinal or not, the safer approach will be to use one-hot encoding.

In [None]:
# Every string-typed (pandas `object` dtype) column is treated as categorical.
obj_columns = [col for col in X_train.columns if X_train[col].dtype == 'object']

# scikit-learn renamed `sparse` to `sparse_output` and later removed the old
# name, so try the new spelling first and fall back for older releases.
try:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Fit on the training split only; categories that appear only in the
# validation or test data are encoded as all zeros (handle_unknown='ignore').
ohe.fit(X_train[obj_columns])

# Give the one-hot columns string names ("<column>_<category>"). With the
# default integer labels the combined frame would mix int and str column
# names, which recent scikit-learn versions reject when fitting.
dummy_columns = [
    f'{col}_{category}'
    for col, categories in zip(obj_columns, ohe.categories_)
    for category in categories
]


def one_hot_encode(frame):
    """Return `frame` with its categorical columns replaced by one-hot columns.

    The numeric columns are kept as they are and the row index is preserved,
    so the result lines up with the original targets / submission ids.
    """
    dummies = pd.DataFrame(
        ohe.transform(frame[obj_columns]),
        index=frame.index,
        columns=dummy_columns,
    )
    numeric = frame.drop(obj_columns, axis=1)
    return pd.concat([numeric, dummies], axis=1)


X_train = one_hot_encode(X_train)
X_test = one_hot_encode(X_test)
test = one_hot_encode(test)

# How well can we do with a completely naive model?

We'll want any of our models to do (hopefully much!) better than this.

In [None]:
# Let's get a benchmark score
# The dummy model ignores the features and always predicts the median of the
# training targets.
model_dummy = DummyRegressor(strategy='median')
model_dummy.fit(X_train, y_train)
y_dummy = model_dummy.predict(X_test)
# RMSE computed as sqrt(MSE): the `squared=False` shortcut has been removed
# from recent scikit-learn releases, while this form works on every version.
score_dummy = np.sqrt(mean_squared_error(y_test, y_dummy))
print(f'{score_dummy:0.5f}')

# Simple Linear Regression

A simple linear regression doesn't do better than our dummy regressor! (Although simple categorical encoding really doesn't make sense for this approach!)

In [None]:
# Simple Linear Regression
model_simple_linear = LinearRegression(fit_intercept=True) # data is not centered, we need an intercept!
model_simple_linear.fit(X_train, y_train)
y_simple_linear = model_simple_linear.predict(X_test)
# RMSE computed as sqrt(MSE): the `squared=False` shortcut has been removed
# from recent scikit-learn releases, while this form works on every version.
score_simple_linear = np.sqrt(mean_squared_error(y_test, y_simple_linear))
print(f'{score_simple_linear:0.5f}')

# This seems slow and repetitive. Can we automate it a bit?

In [None]:
def plot_results(name, y, yhat, num_to_plot=10000, lims=(0,12), figsize=(6,6)):
    """Scatter predicted against true values and show the RMSE in the title.

    Args:
        name: Label used as the prefix of the plot title.
        y: True target values.
        yhat: Predicted values, in the same order as ``y``.
        num_to_plot: Only the first ``num_to_plot`` points are drawn; the
            score in the title is still computed on all of them.
        lims: ``(low, high)`` limits applied to both axes; also the end
            points of the diagonal "perfect prediction" line.
        figsize: Figure size handed to ``plt.figure``.
    """
    plt.figure(figsize=figsize)
    # RMSE as sqrt(MSE): the `squared=False` shortcut has been removed from
    # recent scikit-learn releases, while this form works on every version.
    score = np.sqrt(mean_squared_error(y, yhat))
    plt.scatter(y[:num_to_plot], yhat[:num_to_plot])
    # Diagonal reference: points on this line are predicted exactly.
    plt.plot(lims, lims)
    plt.ylim(lims)
    plt.xlim(lims)
    plt.title(f'{name}: {score:0.5f}', fontsize=18)
    plt.show()

In [None]:
# Display names, in the same order as the estimators in `models`.
model_names = ["Dummy Median", "Lasso", "Random Forest", "XGBoost"]

models = [
    DummyRegressor(strategy='median'),
    Lasso(fit_intercept=True),
    RandomForestRegressor(n_estimators=50, n_jobs=-1),
    # "reg:squarederror" is the current name of the objective formerly called
    # "reg:linear" (deprecated); it also matches the later cells.
    xgb.XGBRegressor(objective = "reg:squarederror", learning_rate = 0.1, max_depth = 10, n_estimators = 50)]

# Fit each candidate on the training split and plot its validation predictions.
for name, model in zip(model_names, models):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    plot_results(name, y_test, y_pred)

# It looks like XGBoost performs the best (surprise surprise:)). Now it is time to tweak some of the parameters to further decrease the error and test the algorithm on a cross-validated data set.

First, I try to assess suitable parameter ranges for the later grid search by tweaking the parameters one at a time.

In [None]:
# Default-parameter regressor; it is the base estimator handed to GridSearchCV
# in the grid-search cell.
model = xgb.XGBRegressor()


def holdout_rmse(**xgb_params):
    """Fit an XGBRegressor with `xgb_params` on the training split and return
    its RMSE on the validation split."""
    candidate = xgb.XGBRegressor(objective='reg:squarederror', **xgb_params)
    candidate.fit(X_train, y_train)
    # RMSE as sqrt(MSE): the `squared=False` shortcut has been removed from
    # recent scikit-learn releases, while this form works on every version.
    return np.sqrt(mean_squared_error(y_test, candidate.predict(X_test)))


learning_rates = [0.003, 0.01, 0.03, 0.1, 0.3]

for rate in learning_rates:
    score = holdout_rmse(learning_rate=rate)
    print(f"Learning rate: {rate}, score: {score}")

# Based on the results I can reduce the learning rates span to [0.03, 0.1, 0.3]

ns_estimators = [30, 50, 100, 300, 1000]

for no in ns_estimators:
    score = holdout_rmse(n_estimators=no)
    print(f"Number of estimators: {no}, score: {score}")

# Based on the results I will reduce the number of estimators span to [30, 50, 100]

Using the parameter ranges I found suitable for further analysis, I will run a grid search for the optimal combination of parameters to train the model with.

In [None]:
# Search grid: only learning_rate, max_depth and n_estimators vary
# (3 x 3 x 3 = 27 combinations); the other entries are fixed single values.
parameters = {'objective':['reg:squarederror'],
              'learning_rate': [0.03, 0.1, 0.3],
              'max_depth': [5, 6, 7],
              'min_child_weight': [3],
              # `silent` was removed from XGBoost; `verbosity=0` is the
              # current way to ask for no training messages.
              'verbosity': [0],
              'subsample': [0.8],
              'colsample_bytree': [0.8],
              'n_estimators': [30, 50, 100]}

# Candidates are ranked by (negated) RMSE; refit=True retrains the best
# combination on all of X_train so `xgboost.predict` can be used directly.
xgboost = GridSearchCV(model, parameters, n_jobs=3, scoring='neg_root_mean_squared_error', 
                       verbose=2, refit=True)

xgboost.fit(X_train, y_train)

# Now I can fit the XGBoost with optimal parameters to predict values and verify its accuracy

In [None]:
# Predict the validation split with the refitted best estimator from the grid search.
y_pred_final = xgboost.predict(X_test)
# RMSE as sqrt(MSE): the `squared=False` shortcut has been removed from recent
# scikit-learn releases, while this form works on every version.
score_final = np.sqrt(mean_squared_error(y_test, y_pred_final))

# `best_parameters` was never defined before being printed (NameError); the
# winning combination is exposed by GridSearchCV as `best_params_`.
best_parameters = xgboost.best_params_

print(f"The optimal parameters are: {best_parameters}. \nThey yield following RMSE value: {score_final}.")

plot_results("Final XGBoost", y_test, y_pred_final)

## As you can see, I have not gained much improvement by grid-searching for optimal parameters for my XGBoost algorithm. Having considered differences in running time between default XGBoost and XGBoost with grid-searched parameters and their corresponding RMSE values, use of grid-search in this case is arguable.

In [None]:
# Predict the competition test set with the grid-searched XGBoost model and
# write the submission file.
# NOTE(review): the output file is called 'random_forest.csv' although the
# predictions come from XGBoost; the name is kept unchanged here.
test_predictions = xgboost.predict(test)
submission['target'] = test_predictions
submission.to_csv('random_forest.csv')