In this notebook, you will learn how to make your first submission to the [Tabular Playground Series - Feb 2021 competition](https://www.kaggle.com/c/tabular-playground-series-feb-2021).

# Make the most of this notebook!

You can use the "Copy and Edit" button in the upper right of the page to create your own copy of this notebook and experiment with different models. You can run it as is and then see if you can make improvements.

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path

import os
# Show every file available under the Kaggle input directory, one full path per line.
for folder, _, files_in_folder in os.walk('/kaggle/input'):
    for fname in files_in_folder:
        print(os.path.join(folder, fname))
        
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
        
# Directory the train, test and sample-submission CSV files are read from.
input_path = Path('/kaggle/input/tabular-playground-series-feb-2021/')

# Read in the data files

In [None]:
# Training data, indexed by the `id` column; preview the first rows.
train_csv = input_path / 'train.csv'
train = pd.read_csv(train_csv, index_col='id')
display(train.head())

In [None]:
# Competition test data, indexed by the `id` column; preview the first rows.
test_csv = input_path / 'test.csv'
test = pd.read_csv(test_csv, index_col='id')
display(test.head())

In [None]:
# Submission template, indexed by the `id` column; preview the first rows.
sample_submission_csv = input_path / 'sample_submission.csv'
submission = pd.read_csv(sample_submission_csv, index_col='id')
display(submission.head())

## We need to encode the categoricals.

There are different strategies to accomplish this, and different approaches will have different performance when using different algorithms. For this starter notebook, we'll use simple encoding.

In [None]:
# Label-encode every object-typed column of `train`, applying the same mapping
# to `test`. Each encoder is fitted on the train and test values together so a
# category that appears in only one of the two files still receives a code.
object_columns = [col for col in train.columns if train[col].dtype == 'object']
for col in object_columns:
    encoder = LabelEncoder()
    encoder.fit([*train[col].values, *test[col].values])
    train[col] = encoder.transform(train[col].values)
    test[col] = encoder.transform(test[col].values)

display(train.head())

## Pull out the target, and make a validation split

In [None]:
# Separate the label from the features; `pop` removes the column from `train` in place.
target = train.pop('target')
# Keep 60% of the rows for fitting and hold out the rest for validation.
# Despite their names, X_test / y_test are this hold-out split, not the
# competition test file loaded into `test`.
# random_state is fixed (matching the random_state=0 used by the models later in
# the notebook) so the split, and every score measured on it, is reproducible
# from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    train, target, train_size=0.60, random_state=0)

# How well can we do with a completely naive model?

We'll want any of our models to do (hopefully much!) better than this.

In [None]:
# Let's get a benchmark score
# The dummy model ignores the features and always predicts the median of y_train.
model_dummy = DummyRegressor(strategy='median')
model_dummy.fit(X_train, y_train)
y_dummy = model_dummy.predict(X_test)
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# while this form works on every version.
score_dummy = np.sqrt(mean_squared_error(y_test, y_dummy))
print(f'{score_dummy:0.5f}')

# Simple Linear Regression

A simple linear regression doesn't do better than our dummy regressor! (Although simple categorical encoding really doesn't make sense for this approach!)

In [None]:
# Simple Linear Regression
# NOTE(review): fit_intercept=False forces the fit through the origin.
# scikit-learn describes that setting as expecting centered data, whereas the
# inline comment says the data is NOT centered - consider fit_intercept=True.
# The setting is left unchanged here so the reported score stays comparable.
model_simple_linear = LinearRegression(fit_intercept=False) # data is not centered, don't fit intercept
model_simple_linear.fit(X_train, y_train)
y_simple_linear = model_simple_linear.predict(X_test)
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
score_simple_linear = np.sqrt(mean_squared_error(y_test, y_simple_linear))
print(f'{score_simple_linear:0.5f}')

# This seems slow and repetitive. Can we automate it a bit?

In [None]:
def plot_results(name, y, yhat, num_to_plot=10000, lims=(0,12), figsize=(6,6)):
    """Scatter predicted against true values and show the RMSE in the title.

    Args:
        name: Label placed in front of the score in the plot title.
        y: True target values.
        yhat: Predicted values, aligned with ``y``.
        num_to_plot: Only the first ``num_to_plot`` points are drawn; the
            score is still computed on all of them.
        lims: (low, high) pair used for both axes and for the diagonal
            reference line on which a perfect prediction would fall.
        figsize: Figure size passed to ``plt.figure``.
    """
    # RMSE is computed as sqrt(MSE): the `squared=False` argument of
    # mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
    # Computing it before opening the figure avoids leaving an empty figure
    # behind if scoring fails.
    score = np.sqrt(mean_squared_error(y, yhat))
    plt.figure(figsize=figsize)
    plt.scatter(y[:num_to_plot], yhat[:num_to_plot])
    plt.plot(lims, lims)
    plt.ylim(lims)
    plt.xlim(lims)
    plt.title(f'{name}: {score:0.5f}', fontsize=18)
    plt.show()

In [None]:
#help(RandomForestRegressor)

In [None]:
# Each candidate estimator paired with the label shown in its plot title.
labelled_models = [
    ("Dummy Median", DummyRegressor(strategy='median')),
    ("Linear", LinearRegression(fit_intercept=False)),
    ("Lasso", Lasso(fit_intercept=False)),
    ("Random Forest", RandomForestRegressor(n_estimators=1000, n_jobs=-1,
                                            max_depth=2)),
]
model_names = [label for label, _ in labelled_models]
models = [estimator for _, estimator in labelled_models]

# Fit every candidate on the training split, then plot its predictions for the
# hold-out split against the true values.
for name, model in labelled_models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    plot_results(name, y_test, y_pred)

# It looks like RandomForest did the best. Let's train it on all the data and make a submission!

Hyperparameter Study

In [None]:
from sklearn.metrics import f1_score
from skopt import gp_minimize
from skopt.space import Real, Integer
from sklearn.datasets import make_classification
from functools import partial
from xgboost import XGBRegressor
import math
import plotly.express as px

In [None]:
#### A function to calculate errors ####
def root_mean_squared_log_error(y_valid, y_preds):
    """Return the root mean squared error between log(y_valid) and log(y_preds).

    The plain natural logarithm is applied (not log1p), so every value in both
    inputs must be strictly positive.

    Args:
        y_valid: One-dimensional sequence of true target values.
        y_preds: One-dimensional sequence of predictions, the same length as
            ``y_valid``.

    Returns:
        The score as a float; lower is better.

    Raises:
        ValueError: If the lengths differ, the inputs are empty, or any value
            is non-finite or not strictly positive.
    """
    # A length mismatch used to come back as the string 'error_mismatch', which
    # callers would then store and minimise as if it were a score. Fail loudly.
    if len(y_preds) != len(y_valid):
        raise ValueError(
            f"length mismatch: {len(y_valid)} true values "
            f"vs {len(y_preds)} predictions"
        )

    # float64 keeps the arithmetic independent of the input dtype
    # (predictions may arrive as float32).
    actual = np.asarray(y_valid, dtype=float)
    predicted = np.asarray(y_preds, dtype=float)
    if actual.size == 0:
        raise ValueError("cannot score empty inputs")

    # np.log would quietly produce -inf/nan for values outside the domain of
    # the logarithm; reject them explicitly instead.
    for label, values in (("y_valid", actual), ("y_preds", predicted)):
        if not np.all(np.isfinite(values) & (values > 0)):
            raise ValueError(
                f"{label} must contain only finite, strictly positive values"
            )

    log_gap = np.log(actual) - np.log(predicted)
    return float(np.sqrt(np.mean(np.square(log_gap))))
#########################################

In [None]:
# Search space handed to the optimiser. The objective function matches sampled
# values to hyperparameter names by POSITION (through curr_model_hyper_params),
# so the order of the active dimensions below must stay in step with that list.
# The commented-out entries are further dimensions that are currently disabled.
space = [
#    Real(0.9, 1.0, name="colsample_bylevel"),
#    Real(0.4, 0.6, name="colsample_bytree"),
#    Real(0.0, 0.0001, name="gamma"),
    Real(0.001, 0.03, name="learning_rate"),
#    Real(0., 0.1, name="max_delta_step"),
    # NOTE(review): the lower bound is 0; the XGBoost parameter docs describe
    # max_depth=0 as "no limit on depth" - confirm that value is meant to be sampled.
    Integer(0, 5, name="max_depth"), ## Influencer parameter
#    Real(0, 0.1, name="min_child_weight"),
    Integer(1200, 1400, name="n_estimators"),
#    Real(0., 0.001, name="reg_alpha"),
#    Real(0.1, 1, name="reg_lambda"),
#    Real(0.79, 0.8, name="subsample"),
#    Real(0.4, 0.5, name="base_score"),
]

#########
#########
# function to fit the model and return the performance of the model
def return_model_assessment(args, X_train, y_train, X_test, X_valid):
    """Fit one XGBRegressor for a sampled hyperparameter point and score it.

    Args:
        args: Sampled hyperparameter values, matched by position to the names
            in the module-level ``curr_model_hyper_params`` list.
        X_train: Features the model is fitted on.
        y_train: Targets the model is fitted on.
        X_test: Features the fitted model predicts on for scoring.
        X_valid: Accepted but not used by this function.

    Returns:
        ``root_mean_squared_log_error`` of the predictions for ``X_test``
        measured against the module-level ``y_test``.

    Side effects:
        Appends the fitted model to the module-level ``models`` list and the
        score to ``train_scores`` - one entry each per successful call.
    """
    # NOTE(review): the true values come from the global y_test, not from a
    # parameter, and the name train_scores is historical - the score is
    # computed on X_test, not on the training data.
    overrides = {
        param_name: args[position]
        for position, param_name in enumerate(curr_model_hyper_params)
    }

    # Fixed settings; any of them may be replaced by the sampled overrides below.
    base_settings = dict(
        random_state=0,
        n_jobs=0,
        num_parallel_tree=1,
        colsample_bytree=0.6,
        max_depth=2,
        learning_rate=0.02,
        max_delta_step=0,
        subsample=0.8,
        verbosity=None,
    )
    model = XGBRegressor(**base_settings)
    model.set_params(**overrides)

    fitted_model = model.fit(X_train, y_train)
    models.append(fitted_model)

    holdout_predictions = model.predict(X_test)
    holdout_score = root_mean_squared_log_error(y_test, holdout_predictions)
    train_scores.append(holdout_score)
    return holdout_score

In [None]:
# Accumulators filled by return_model_assessment, one entry per optimiser call.
# `models` is rebound here, replacing the list of scikit-learn estimators that
# was built for the model comparison earlier in the notebook.
models, train_scores, test_scores = [], [], []

# Names of the hyperparameters being tuned, in the same order as the active
# dimensions of `space` (values are matched to names by position).
curr_model_hyper_params = ['learning_rate', 'max_depth', 'n_estimators']

# Bind the data arguments so the optimiser only supplies the sampled values.
objective_function = partial(
    return_model_assessment,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    X_valid=X_test,
)

# running the algorithm
n_calls = 20  # number of times you want to train your model
# NOTE(review): n_random_starts=n_calls-1 makes all but one evaluation a random
# draw, so this is close to a plain random search - confirm that is intended.
# Newer scikit-optimize releases also document n_random_starts as deprecated in
# favour of n_initial_points.
results = gp_minimize(
    func=objective_function,
    dimensions=space,
    base_estimator=None,
    n_calls=n_calls,
    n_random_starts=n_calls - 1,
    random_state=0,
)

In [None]:
# One row per recorded score. The row count follows len(train_scores) rather
# than n_calls, so the cell still works when the two differ (for example after
# an interrupted or repeated search).
# Labels: the values are produced by root_mean_squared_log_error on the
# hold-out split, so they are shown as "RMSLE" / "validation_score" instead of
# the previous "MSLE" / "train_score".
metrics = pd.DataFrame({
    "RMSLE": train_scores,
    "dataset": "validation_score",
    "Iteration Number": list(range(1, len(train_scores) + 1)),
})
fig = px.line(metrics, x="Iteration Number", y="RMSLE", color="dataset")
fig.show()

In [None]:
# Position of the lowest recorded score; if several calls tie, the earliest wins.
IndexMin = min(range(len(train_scores)), key=train_scores.__getitem__)
# The fitted model stored at that same position.
bestModel = models[IndexMin]

We write the file for submission here

In [None]:
# Predict the competition test set with the selected model and write the
# submission file (the `id` index is written as the first CSV column).
# NOTE(review): the file name says "random_forest", but bestModel is taken from
# the XGBRegressor hyperparameter search - consider renaming the output file.
test_predictions = bestModel.predict(test)
submission['target'] = test_predictions
submission.to_csv('random_forest.csv')

## Now you should save your Notebook (blue button in the upper right), and then when that's complete go to the notebook viewer and make a submission to the competition. :-)

## There's lots of room for improvement. What things can you try to get a better score?