# Contents:
* 1. See data files
* 2. Make a Validation Split
* 3. Decide Best Model
* 4. Tuning Gradient Boosting Model
* 5. Tuning Random Forest Model
* 6. Comparison of Tuned and Default Models
* 7. Predict and Input Data based on Tuned-Gradient Boosting Model

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))
     
# import necessary modules
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error # import MSE to calculate RMSE use sqrt() or set squared=False
        
# Directory from which train.csv, test.csv and sample_submission.csv are read below.
input_path = Path('/kaggle/input/tabular-playground-series-jan-2021/')

# 1. See data files
Check the contents of train.csv, test.csv, and sample_submission.csv.

In [None]:
# Load train.csv with the 'id' column as the index and show its first rows.
train = pd.read_csv(input_path.joinpath('train.csv'), index_col='id')
train.head()

In [None]:
# Load test.csv with the 'id' column as the index and show its first rows.
test = pd.read_csv(input_path.joinpath('test.csv'), index_col='id')
test.head()

In [None]:
# Load sample_submission.csv with the 'id' column as the index and show its first rows.
submission = pd.read_csv(input_path.joinpath('sample_submission.csv'), index_col='id')
submission.head()

# 2. Make a Validation Split
Split the data in train.csv into a training set and a validation (test) set.

In [None]:
# Read train.csv again and separate the label: pop() removes the 'target'
# column from the DataFrame and returns it as a Series.
train = pd.read_csv(input_path.joinpath('train.csv'), index_col='id')
target = train.pop('target')

# Hold out 20% of train.csv as a validation set (fixed seed for repeatability).
x_train, x_test, y_train, y_test = train_test_split(
    train, target, test_size=0.2, random_state=0
)

# Print the training features and labels to confirm the split worked.
print(x_train, end='\n\n')
print(y_train)

# 3. Decide Best Model
Machine learning list:
1. Dummy regression
2. Lasso regression
3. Ridge regression
4. ElasticNet regression
5. Decision Tree regression
6. AdaBoost regression
7. GradientBoosting regression
8. Random Forest regression

In [None]:
# import machine learning API
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import Lasso, Ridge, ElasticNet 
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor
# add 2 methods, version 4 onwards
from sklearn.ensemble import BaggingRegressor
from sklearn.neural_network import MLPRegressor

def plot_models(name, y, y_model, lims=(0,12), figsize=(8,6)):
    """Scatter predicted values against observed values for one model.

    The plot title shows ``name`` followed by the RMSE between ``y`` and
    ``y_model``, formatted to five decimal places.

    Args:
        name: Label used in the plot title.
        y: Observed target values (plotted on the x-axis).
        y_model: Predicted target values (plotted on the y-axis).
        lims: (low, high) limits applied to both axes; also the end points
            of the diagonal reference line.
        figsize: Figure size passed to ``plt.figure``.
    """
    plt.figure(figsize=figsize)
    rmse = mean_squared_error(y, y_model, squared=False)
    title = f'{name}: {rmse:.5f}'

    plt.scatter(y, y_model)
    # Diagonal y = x: a perfect prediction would fall on this line.
    plt.plot(lims, lims, color='black', linestyle='solid', linewidth=1.0)
    plt.xlim(lims)
    plt.ylim(lims)
    plt.title(title, fontsize=18)
    plt.show()
    
# Candidate regressors, each paired with the label shown in its plot title.
# Only random_state / n_jobs are set; every other hyperparameter is the default.
candidates = [
    ('DummyRegressor', DummyRegressor()),
    ('Lasso', Lasso()),
    ('Ridge', Ridge()),
    ('ElasticNet', ElasticNet()),
    ('DecisionTreeRegressor', DecisionTreeRegressor(random_state=0)),
    ('AdaBoost', AdaBoostRegressor(random_state=0)),
    ('GradientBoostingRegressor', GradientBoostingRegressor(random_state=0)),
    ('RandomForest', RandomForestRegressor(n_jobs=-1, random_state=0)),
    ('Bagging', BaggingRegressor(n_jobs=-1, random_state=0)),
    ('MLPRegressor', MLPRegressor(random_state=0)),
]
model_names = [label for label, _estimator in candidates]
models = [estimator for _label, estimator in candidates]

# Fit each candidate on the training split and plot its predictions for the
# held-out split; the RMSE appears in each plot title.
for name, model in candidates:
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    plot_models(name, y_test, y_pred)
    

The results are as follows:

1. Dummy regression: 0.73239
2. Lasso regression: 0.73239
3. Ridge regression: 0.72552
4. ElasticNet regression: 0.73239
5. Decision Tree regression: 1.01322
6. AdaBoost regression: 0.72655
7. GradientBoosting regression: 0.71100
8. Random Forest regression: 0.70831
9. Bagging regression: 0.74023
10. MLP Regression: 0.71572

A lower RMSE means a smaller difference between the observed and predicted values, i.e. a better model, so Random Forest regression appears to be the best machine learning method here. However, it takes a long time to fit the model and predict the values. So let's tune the GradientBoosting model first, because its RMSE is only slightly different from that of Random Forest. Moreover, since some parameters are shared with Random Forest, the values found for them can also be used when tuning the Random Forest parameters.

# 4. Tuning Gradient Boosting Model

> # 1. Define parameter "loss" and "max_features"

In [None]:
# Validation RMSE of Gradient Boosting for each of the four loss settings
# below, with every other hyperparameter left at its default.
loss_list = ["ls", "lad", "huber", "quantile"]
RMSE = []
for loss in loss_list:
    model = GradientBoostingRegressor(loss=loss, random_state=0).fit(x_train, y_train)
    RMSE.append(mean_squared_error(y_test, model.predict(x_test), squared=False))

# One point per loss setting; the raw numbers are printed after the plot.
plt.plot(loss_list, RMSE, marker="o")
plt.title('loss_list vs RMSE value', fontsize=12)
plt.xlabel('loss_list')
plt.ylabel('RMSE value')
plt.grid()
plt.show()
print(RMSE)

Best parameter is loss="huber".

In [None]:
# Validation RMSE of Gradient Boosting (loss="huber") for each max_features setting.
max_features_list = ["auto", "sqrt", "log2"]
RMSE = []
for max_features in max_features_list:
    model = GradientBoostingRegressor(
        loss="huber", max_features=max_features, random_state=0
    ).fit(x_train, y_train)
    RMSE.append(mean_squared_error(y_test, model.predict(x_test), squared=False))

# One point per max_features setting; the raw numbers are printed after the plot.
plt.plot(max_features_list, RMSE, marker="o")
plt.title('max_features vs RMSE value', fontsize=12)
plt.xlabel('max_features_list')
plt.ylabel('RMSE value')
plt.grid()
plt.show()
print(RMSE)

Best parameter is max_features="auto".

> # 2. Define parameter "max_depth"
On condition that loss="huber" and max_features="auto"

In [None]:
# Coarse search: validation RMSE of Gradient Boosting for max_depth = 3, 5, 7, 9, 11
# with loss="huber" and max_features="auto" held fixed.
max_depth_list = list(range(3, 12, 2))
RMSE = []
for max_depth in max_depth_list:
    model = GradientBoostingRegressor(
        loss="huber", max_depth=max_depth, max_features="auto", random_state=0
    ).fit(x_train, y_train)
    RMSE.append(mean_squared_error(y_test, model.predict(x_test), squared=False))

# One point per depth; the raw numbers are printed after the plot.
plt.plot(max_depth_list, RMSE, marker="o")
plt.title('max_depth vs RMSE value', fontsize=12)
plt.xlabel('max_depth')
plt.ylabel('RMSE value')
plt.grid()
plt.show()
print(RMSE)

It looks like RMSE is minimized where 7<=max_depth<=9, so let's search minimum value by running code again as 7<=max_depth<=9.

In [None]:
# Fine search: validation RMSE of Gradient Boosting for max_depth = 7, 8, 9
# with loss="huber" and max_features="auto" held fixed.
max_depth_list = list(range(7, 10))
RMSE = []
for max_depth in max_depth_list:
    model = GradientBoostingRegressor(
        loss="huber", max_depth=max_depth, max_features="auto", random_state=0
    ).fit(x_train, y_train)
    RMSE.append(mean_squared_error(y_test, model.predict(x_test), squared=False))

# One point per depth; the raw numbers are printed after the plot.
plt.plot(max_depth_list, RMSE, marker="o")
plt.title('max_depth vs RMSE value', fontsize=12)
plt.xlabel('max_depth')
plt.ylabel('RMSE value')
plt.grid()
plt.show()
print(RMSE)

The graph says that **max_depth=8** is best parameter. When max_depth=8 is set, RMSE is minimized.

> # 3. Define parameter "n_estimators"
On condition that loss="huber", max_features="auto" and max_depth=8

In [None]:
# Coarse search: validation RMSE of Gradient Boosting for n_estimators = 95, 100, ..., 120
# with loss="huber", max_features="auto" and max_depth=8 held fixed.
n_estimators_list = list(range(95, 121, 5))
RMSE = []
for n_estimators in n_estimators_list:
    model = GradientBoostingRegressor(
        loss="huber", max_depth=8, n_estimators=n_estimators,
        max_features="auto", random_state=0,
    ).fit(x_train, y_train)
    RMSE.append(mean_squared_error(y_test, model.predict(x_test), squared=False))

# One point per ensemble size; the raw numbers are printed after the plot.
plt.plot(n_estimators_list, RMSE, marker="o")
plt.title('n_estimators vs RMSE value', fontsize=12)
plt.xlabel('n_estimators')
plt.ylabel('RMSE value')
plt.grid()
plt.show()
print(RMSE)

It looks like RMSE is minimized where 105<=n_estimators<=115, so let's search for the minimum by running the code again over 105<=n_estimators<=115.

In [None]:
# Fine search: validation RMSE of Gradient Boosting for every n_estimators in 105..115
# with loss="huber", max_features="auto" and max_depth=8 held fixed.
n_estimators_list = list(range(105, 116))
RMSE = []
for n_estimators in n_estimators_list:
    model = GradientBoostingRegressor(
        loss="huber", max_depth=8, n_estimators=n_estimators,
        max_features="auto", random_state=0,
    ).fit(x_train, y_train)
    RMSE.append(mean_squared_error(y_test, model.predict(x_test), squared=False))

# One point per ensemble size; the raw numbers are printed after the plot.
plt.plot(n_estimators_list, RMSE, marker="o")
plt.title('n_estimators vs RMSE value', fontsize=12)
plt.xlabel('n_estimators')
plt.ylabel('RMSE value')
plt.grid()
plt.show()
print(RMSE)

The graph says that **n_estimators=111** is the best parameter. When n_estimators=111 is set, RMSE is minimized.

**Conclusion: the best parameters to minimize RMSE are loss="huber", max_features="auto", max_depth=8, n_estimators=111**

# 5. Tuning Random Forest Model

> # 1. Define parameter "max_features"

In [None]:
# Validation RMSE of Random Forest for each max_features setting,
# with every other hyperparameter left at its default.
max_features_list = ["auto", "sqrt", "log2"]
RMSE = []
for max_features in max_features_list:
    model = RandomForestRegressor(
        max_features=max_features, random_state=0, n_jobs=-1
    ).fit(x_train, y_train)
    RMSE.append(mean_squared_error(y_test, model.predict(x_test), squared=False))

# One point per max_features setting; the raw numbers are printed after the plot.
plt.plot(max_features_list, RMSE, marker="o")
plt.title('max_features vs RMSE value', fontsize=12)
plt.xlabel('max_features')
plt.ylabel('RMSE value')
plt.grid()
plt.show()
print(RMSE)

Best parameter is either "sqrt" or "log2". Let's use max_features="sqrt".

> # 2. Define parameter "max_depth"
On condition that max_features="sqrt"

In [None]:
# Coarse search: validation RMSE of Random Forest for max_depth = 5, 10, ..., 35
# with max_features="sqrt" held fixed.
max_depth_list = list(range(5, 36, 5))
RMSE = []
for max_depth in max_depth_list:
    model = RandomForestRegressor(
        max_features="sqrt", max_depth=max_depth, random_state=0, n_jobs=-1
    ).fit(x_train, y_train)
    RMSE.append(mean_squared_error(y_test, model.predict(x_test), squared=False))

# One point per depth; the raw numbers are printed after the plot.
plt.plot(max_depth_list, RMSE, marker="o")
plt.title('max_depth vs RMSE value', fontsize=12)
plt.xlabel('max_depth')
plt.ylabel('RMSE value')
plt.grid()
plt.show()
print(RMSE)

In [None]:
# Fine search: validation RMSE of Random Forest for every max_depth in 15..25
# with max_features="sqrt" held fixed.
max_depth_list = list(range(15, 26))
RMSE = []
for max_depth in max_depth_list:
    model = RandomForestRegressor(
        max_features="sqrt", max_depth=max_depth, random_state=0, n_jobs=-1
    ).fit(x_train, y_train)
    RMSE.append(mean_squared_error(y_test, model.predict(x_test), squared=False))

# One point per depth; the raw numbers are printed after the plot.
plt.plot(max_depth_list, RMSE, marker="o")
plt.title('max_depth vs RMSE value', fontsize=12)
plt.xlabel('max_depth')
plt.ylabel('RMSE value')
plt.grid()
plt.show()
print(RMSE)

The graph says that **max_depth=22** is best parameter. When max_depth=22 is set, RMSE is minimized.

> # 3. Define parameter "n_estimators"
On condition that max_features="sqrt" and max_depth=22

In [None]:
# Coarse search: validation RMSE of Random Forest for n_estimators = 150, 160, ..., 250
# with max_features="sqrt" and max_depth=22 held fixed.
n_estimators_list = list(range(150, 251, 10))
RMSE = []
for n_estimators in n_estimators_list:
    model = RandomForestRegressor(
        max_features="sqrt", max_depth=22, n_estimators=n_estimators,
        random_state=0, n_jobs=-1,
    ).fit(x_train, y_train)
    RMSE.append(mean_squared_error(y_test, model.predict(x_test), squared=False))

# One point per forest size; the raw numbers are printed after the plot.
plt.plot(n_estimators_list, RMSE, marker="o")
plt.title('n_estimators vs RMSE value', fontsize=12)
plt.xlabel('n_estimators')
plt.ylabel('RMSE value')
plt.grid()
plt.show()
print(RMSE)

It looks like RMSE is minimized where 110<=n_estimators<=113, so let's search for the minimum by running the code again over 110<=n_estimators<=113.

In [None]:
# Fine search: validation RMSE of Random Forest for every n_estimators in 125..135
# with max_features="sqrt" and max_depth=22 held fixed.
# NOTE(review): the surrounding text reports n_estimators=112 as the best value,
# which lies outside this 125-135 range - confirm which range was intended.
n_estimators_list = list(range(125, 136))
RMSE = []
for n_estimators in n_estimators_list:
    model = RandomForestRegressor(
        max_features="sqrt", max_depth=22, n_estimators=n_estimators,
        random_state=0, n_jobs=-1,
    ).fit(x_train, y_train)
    RMSE.append(mean_squared_error(y_test, model.predict(x_test), squared=False))

# One point per forest size.
plt.plot(n_estimators_list, RMSE, marker="o")
plt.title('n_estimators vs RMSE value', fontsize=12)
plt.xlabel('n_estimators')
plt.ylabel('RMSE value')
plt.grid()
plt.show()

The graph says that n_estimators=112 is the best parameter. When n_estimators=112 is set, RMSE is minimized.

**Conclusion: the best parameters to minimize RMSE are max_features="sqrt", max_depth=22, n_estimators=112**

> # 4. Define parameter "min_samples_split"

In [None]:
# Validation RMSE of Random Forest for each min_samples_split value, with the
# previously chosen settings held fixed: max_features="sqrt", max_depth=22,
# n_estimators=112.
RMSE = []
min_samples_split_list = [2, 3, 4, 5, 6, 7]
for min_samples_split in min_samples_split_list:
    # n_estimators is pinned to the tuned value rather than read from the loop
    # variable left behind by the previous cell (which ends at 135).
    model = RandomForestRegressor(max_features="sqrt", max_depth=22, n_estimators=112,
                                  min_samples_split=min_samples_split, random_state=0, n_jobs=-1)
    model.fit(x_train, y_train)
    y_predict = model.predict(x_test)
    score = mean_squared_error(y_test, y_predict, squared=False)
    RMSE.append(score)

# Plot against the list that was actually swept: RMSE has six entries, so using
# the three-element max_features_list on the x-axis raises a shape-mismatch error.
plt.plot(min_samples_split_list, RMSE, marker="o")
plt.xlabel('min_samples_split')
plt.ylabel('RMSE value')
plt.title('min_samples_split vs RMSE value', fontsize=12)
plt.grid()
plt.show()
print(RMSE)

# 6. Comparison of Tuned and Default Models

In [None]:
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

# x-axis labels for the comparison plot; each model list below is ordered the same way.
legend_list = ["Default", "Tuned"]

GB_models = [
    GradientBoostingRegressor(random_state=0),
    GradientBoostingRegressor(loss="huber", max_features="auto", max_depth=8,
                              n_estimators=111, random_state=0),
]
RF_models = [
    RandomForestRegressor(n_jobs=-1, random_state=0),
    RandomForestRegressor(max_features="sqrt", max_depth=22, n_estimators=112,
                          min_samples_split=3, random_state=0, n_jobs=-1),
]


def _validation_rmse(estimator):
    """Fit ``estimator`` on the training split and return its RMSE on the held-out split."""
    estimator.fit(x_train, y_train)
    return mean_squared_error(y_test, estimator.predict(x_test), squared=False)


# Gradient Boosting models are fitted first, then Random Forest.
GB_RMSE = [_validation_rmse(estimator) for estimator in GB_models]
RF_RMSE = [_validation_rmse(estimator) for estimator in RF_models]

gb_default, gb_tuned = GB_RMSE
rf_default, rf_tuned = RF_RMSE
print(f"GB Default model score: {gb_default}")
print(f"GB Tuned-model score: {gb_tuned}")
print()
print(f"RF Default model score: {rf_default}")
print(f"RF Tuned-model score: {rf_tuned}")

# One line per model family, going from the default to the tuned configuration.
plt.figure(figsize=(8,6))
plt.plot(legend_list, GB_RMSE, label="GB:Gradient Boosting", marker="o")
plt.plot(legend_list, RF_RMSE, label="RF:Random Forest", marker="^")
plt.title("RMSE Visualization", fontsize=16)
plt.xlabel('name')
plt.ylabel('RMSE value')
plt.legend()
plt.show()

Use the Tuned-Gradient Boosting regressor to predict the values.

# 7. Predict and Input Data based on Tuned-Gradient Boosting Model

In [None]:
# Fit the tuned Gradient Boosting model on the training split and report its
# RMSE on the held-out split.
final_model = GradientBoostingRegressor(
    loss="huber", max_features="auto", max_depth=8, n_estimators=111, random_state=0
).fit(x_train, y_train)
y_pred = final_model.predict(x_test)
final_score = mean_squared_error(y_test, y_pred, squared=False)
print(f"GB Tuned-model score: {final_score}")

# Predict the competition test set and write the result as the submission file.
submission['target'] = final_model.predict(test)
submission.to_csv('submission.csv')

# Read the written file back and show its first rows.
df = pd.read_csv('submission.csv')
df.head()