**This notebook was written in Google Colab**

> **Note:** Some cells are commented out because they only run inside the original Google Colab project (they depend on its Google Drive paths).

# Tabular Playground Series - August 2021

This is the August challenge from the Tabular Playground Series monthly machine learning challenges on Kaggle.
It is a beginner-friendly challenge intended for building machine learning skills.

## 1. Problem Definition

> Predict the `loss` target for the test data and achieve the lowest possible error (RMSE) on the challenge leaderboard.

## 2. Data

The data I'll be using can be found on Kaggle: https://www.kaggle.com/c/tabular-playground-series-aug-2021/data

## 3. Evaluation

> Submissions are scored on the root mean squared error.

## 4. Features

The dataset we're using...
* is structured.
* is pre-split into training and test datasets.
* contains 100 columns of features.
* contains 1 label column named `loss`.

Let's import the libraries we will be using and then check out the data.

### Importing libraries

In [None]:
import json
import pickle as pkl

from itertools import product

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error

from xgboost import XGBRegressor

### Importing the data

In [None]:
#!unzip "drive/MyDrive/TPS August/tabular-playground-series-aug-2021.zip" -d "drive/MyDrive/TPS August/data"

In [None]:
# Colab-only path (Google Drive mount), kept for reference; the active line reads the copy under ../input.
#train_df = pd.read_csv("/content/drive/MyDrive/TPS August/data/train.csv")
train_df = pd.read_csv("../input/tabular-playground-series-aug-2021/train.csv")
train_df.head()

In [None]:
# Drop `id` so it does not end up in the feature matrix built from `train_df` later on.
# Re-assigning (instead of `inplace=True`) with `errors="ignore"` keeps this cell safe to
# re-run: a second run no longer raises KeyError because `id` is already gone.
train_df = train_df.drop(columns="id", errors="ignore")
train_df.head(2)

In [None]:
# Colab-only path (Google Drive mount), kept for reference; the active line reads the copy under ../input.
#test_df = pd.read_csv("/content/drive/MyDrive/TPS August/data/test.csv")
test_df = pd.read_csv("../input/tabular-playground-series-aug-2021/test.csv")
test_df.head()

In [None]:
# Move `id` from a column to the index: it stays attached to each row but is no longer
# passed to the model as a feature.
# The guard keeps the cell safe to re-run once `id` has already become the index.
if "id" in test_df.columns:
  test_df = test_df.set_index("id")
test_df.head(2)

In [None]:
# Colab-only path (Google Drive mount), kept for reference; the active line reads the copy under ../input.
#submit_df = pd.read_csv("/content/drive/MyDrive/TPS August/data/sample_submission.csv")
submit_df = pd.read_csv("../input/tabular-playground-series-aug-2021/sample_submission.csv")
submit_df.head()

### Splitting the data

In [None]:
# Feature matrix: every column except the `loss` target; target vector: `loss` itself.
X = np.array(train_df.drop(columns="loss"))
y = np.array(train_df["loss"])

# Hold out 5% of the rows for validation; the fixed random_state keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
  X, y, test_size=0.05, random_state=3
)
len(X_train), len(X_val)

## 5. Baseline hyperparameter modeling

### Hyperparameter search

In [None]:
def score_model(model, X, y):
  """Return the root mean squared error (RMSE) of `model`'s predictions on `X`.

  Args:
    model: fitted estimator exposing `predict(X)`.
    X: feature matrix passed to `model.predict`.
    y: true target values, compared against the predictions.

  Returns:
    The RMSE between `y` and `model.predict(X)`.
  """
  preds = model.predict(X)
  # `mean_squared_error(..., squared=False)` was deprecated in scikit-learn 1.4 and removed
  # in 1.6. Taking the square root of the MSE gives the same value for a single-target `y`
  # (the only case used in this notebook) and works on old and new releases alike.
  return np.sqrt(mean_squared_error(y, preds))

In [None]:
def save_pkl_data(data, filepath="/content/drive/MyDrive/TPS August/logs/model-stats.p"):
  """Pickle `data` to `filepath`, overwriting any existing file.

  Args:
    data: any picklable object.
    filepath: destination path; opened in binary write mode.
  """
  # The context manager closes the handle even if pickling fails, so the data is flushed
  # to disk deterministically instead of whenever the file object is garbage-collected.
  with open(filepath, "wb") as file:
    pkl.dump(data, file)

def load_pkl_data(filepath="/content/drive/MyDrive/TPS August/logs/model-stats.p"):
  """Load and return the object pickled at `filepath`.

  Args:
    filepath: path of a pickle file; opened in binary read mode.

  Returns:
    The unpickled object.
  """
  # NOTE: unpickling can execute arbitrary code; only load files you created or trust.
  # The context manager closes the handle as soon as the object has been read.
  with open(filepath, "rb") as file:
    return pkl.load(file)

In [None]:
def single_split_scoring(X_train, X_val, y_train, y_val, max_depths=[3], learning_rates=[0.1], gammas=[0], min_child_weights=[1], subsamples=[1], colsample_bytrees=[1], n_estimatorss=[100], filepath="/content/drive/MyDrive/TPS August/logs/model-stats.p"):
  """Grid-search XGBRegressor hyperparameters on one train/validation split.

  One model is trained for every combination (Cartesian product) of the given
  hyperparameter lists. After each model the accumulated results are pickled to
  `filepath`, so an interrupted search keeps everything finished so far.

  Args:
    X_train, y_train: data the models are fitted on.
    X_val, y_val: held-out data used in `eval_set` and for the validation score.
    max_depths, learning_rates, gammas, min_child_weights, subsamples,
      colsample_bytrees, n_estimatorss: candidate values for the XGBRegressor
      parameter of the same (singular) name. The default lists are never mutated.
    filepath: where the results dict is pickled after each combination.

  Returns:
    A dict with keys "parameters", "train_scores" and "validation_scores"; each
    maps the combination's index to the model's `get_params()` / its
    `score_model` value on the training / validation data. All three are empty
    if any candidate list is empty.
  """
  model_params = {}
  model_train_scores = {}
  model_valid_scores = {}
  # Built once, before the loop: the inner dicts are filled in place, and the function
  # can still return a valid (empty) result when there are no combinations to test
  # instead of failing on an unbound `model_stats`.
  model_stats = {
    "parameters": model_params,
    "train_scores": model_train_scores,
    "validation_scores": model_valid_scores
  }

  eval_set = [(X_train, y_train), (X_val, y_val)]
  eval_metric = "rmse"

  # Materialise the grid so the total is counted from the combinations themselves.
  param_combinations = list(product(max_depths, learning_rates, gammas, min_child_weights, subsamples, colsample_bytrees, n_estimatorss))
  num_combinations = len(param_combinations)
  print(f"Testing {num_combinations} total combinations")

  for i, (max_depth, learning_rate, gamma, min_child_weight, subsample, colsample_bytree, n_estimators) in enumerate(param_combinations):
    print(f"\nTest {i+1} of {num_combinations}\n")

    model = XGBRegressor(max_depth=max_depth,
                         learning_rate=learning_rate,
                         gamma=gamma,
                         min_child_weight=min_child_weight,
                         subsample=subsample,
                         colsample_bytree=colsample_bytree,
                         n_estimators=n_estimators,
                         objective="reg:squarederror",
                         verbosity=1,
                         seed=3)

    # NOTE(review): XGBoost 1.6 deprecated passing `eval_metric` / `early_stopping_rounds`
    # to `fit()` in favour of the constructor. Left as-is because the constructor form is
    # not honoured by older releases; confirm the installed version before running.
    model.fit(X=X_train,
              y=y_train,
              eval_set=eval_set,
              eval_metric=eval_metric,
              early_stopping_rounds=10)

    model_params[i] = model.get_params()
    model_train_scores[i] = score_model(model, X_train, y_train)
    model_valid_scores[i] = score_model(model, X_val, y_val)

    # Checkpoint after every model so partial results survive an interruption.
    save_pkl_data(model_stats, filepath)

  return model_stats

In [None]:
# model_stats = single_split_scoring(X_train, X_val, y_train, y_val,
#                                    max_depths=[3, 4], 
#                                    learning_rates=[0.1, 0.05], 
#                                    gammas=[0, 1], 
#                                    min_child_weights=[0, 1], 
#                                    subsamples=[0.5, 0.6], 
#                                    colsample_bytrees=[0, 0.5], 
#                                    n_estimatorss=[100,120],
#                                    filepath="/content/drive/MyDrive/TPS August/logs/model-stats-1.p")

### Evaluation

In [None]:
# Colab-only path, kept for reference; the active line loads the saved `model-stats-1.p`
# results from ../input/modelstats instead of re-running the search.
#loaded_model_stats = load_pkl_data("/content/drive/MyDrive/TPS August/logs/model-stats-1.p")
loaded_model_stats = load_pkl_data("../input/modelstats/model-stats-1.p")

In [None]:
def show_top_models(model_stats):
  """Print the (up to) five models with the lowest validation score, best first.

  `model_stats` is a dict with "parameters", "train_scores" and
  "validation_scores" entries; each is a dict whose values are read in
  insertion order, so position k in all three refers to the same model.
  """
  all_params = np.array(list(model_stats["parameters"].values()))
  all_train = np.array(list(model_stats["train_scores"].values()))
  all_valid = np.array(list(model_stats["validation_scores"].values()))

  # Ascending order: the smallest validation score gets rank 1.
  best_first = np.argsort(all_valid)[:5]

  for rank, model_idx in enumerate(best_first, start=1):
    report = (
      f"Model index: {model_idx} - Ranking: {rank}",
      f"Validation score: {all_valid[model_idx]}",
      f"Training score: {all_train[model_idx]}",
      f"Parameters: {all_params[model_idx]}",
      "\n",
    )
    print(*report, sep="\n")

In [None]:
# Print the best models (lowest validation score first) from the loaded `model-stats-1.p` results.
show_top_models(loaded_model_stats)

## 6. Experimentation

### Second tune

In [None]:
# single_split_scoring(X_train, X_val, y_train, y_val,
#                       max_depths=[4,5], 
#                       learning_rates=[0.1], 
#                       gammas=[0.5, 0, 1, 2], 
#                       min_child_weights=[0, 1, 2], 
#                       subsamples=[0.5, 0.4], 
#                       colsample_bytrees=[0.7, 0.5], 
#                       n_estimatorss=[150,200],
#                       filepath="/content/drive/MyDrive/TPS August/logs/model-stats-2.p")

In [None]:
# Colab-only path, kept for reference; the active line loads the saved `model-stats-2.p`
# results from ../input/modelstats instead of re-running the search.
#model_stats_2 = load_pkl_data("/content/drive/MyDrive/TPS August/logs/model-stats-2.p")
model_stats_2 = load_pkl_data("../input/modelstats/model-stats-2.p")

In [None]:
# Print the best models (lowest validation score first) from the loaded `model-stats-2.p` results.
show_top_models(model_stats_2)

### Third tune

In [None]:
# single_split_scoring(X_train, X_val, y_train, y_val,
#                       max_depths=[4, 5], 
#                       learning_rates=[0.1], 
#                       gammas=[0.5], 
#                       min_child_weights=[2], 
#                       subsamples=[0.5], 
#                       colsample_bytrees=[0.5, 0.7], 
#                       n_estimatorss=[200, 250],
#                       filepath="/content/drive/MyDrive/TPS August/logs/model-stats-3.p")

In [None]:
# Colab-only path, kept for reference; the active line loads the saved `model-stats-3.p`
# results from ../input/modelstats instead of re-running the search.
#model_stats_3 = load_pkl_data("/content/drive/MyDrive/TPS August/logs/model-stats-3.p")
model_stats_3 = load_pkl_data("../input/modelstats/model-stats-3.p")

In [None]:
# Print the best models (lowest validation score first) from the loaded `model-stats-3.p` results.
show_top_models(model_stats_3)

## 7. Model Finalizing

### Building and training a final model

In [None]:
def train_model(X, y, max_depth, learning_rate, gamma, min_child_weight, subsample, colsample_bytree, n_estimators):
  """Fit an XGBRegressor with the given hyperparameters on (X, y) and return it.

  The objective ("reg:squarederror"), verbosity (3) and seed (3) are fixed;
  every other argument is forwarded to the XGBRegressor parameter of the same
  name. No evaluation set or early stopping is used here.
  """
  tuned_params = dict(
    max_depth=max_depth,
    learning_rate=learning_rate,
    gamma=gamma,
    min_child_weight=min_child_weight,
    subsample=subsample,
    colsample_bytree=colsample_bytree,
    n_estimators=n_estimators,
  )
  regressor = XGBRegressor(objective="reg:squarederror",
                           verbosity=3,
                           seed=3,
                           **tuned_params)
  regressor.fit(X, y)
  return regressor

In [None]:
# Best params: {'colsample_bytree': 0.5, 'gamma': 0.5, 'learning_rate': 0.1, 'max_depth': 4, 'min_child_weight': 2, 'n_estimators': 200, 'subsample': 0.5}
# NOTE(review): the recorded best params list n_estimators=200, but the call below uses
# n_estimators=250 — confirm which value is intended and update the other.

# The final model is fitted on the full (X, y), i.e. including the rows that were held
# out as the validation split during the hyperparameter search.
final_model = train_model(X, y,
                          max_depth=4,
                          learning_rate=0.1,
                          gamma=0.5,
                          min_child_weight=2,
                          subsample=0.5,
                          colsample_bytree=0.5,
                          n_estimators=250)

In [None]:
#save_pkl_data(final_model, "drive/MyDrive/TPS August/models/hyperparameter-trained-3-tunes.p")

In [None]:
#loaded_model = load_pkl_data("drive/MyDrive/TPS August/models/hyperparameter-trained-3-tunes.p")

In [None]:
#score_model(loaded_model, X, y)
# RMSE on the same (X, y) the final model was fitted on: this is a training score,
# not an estimate of performance on unseen data.
score_model(final_model, X, y)

### Getting predictions

In [None]:
#predictions = loaded_model.predict(test_df)
# The model was fitted on plain NumPy arrays (no feature names), so predict on the same
# representation. Passing the DataFrame would hand XGBoost column names that the fitted
# model never saw, which some XGBoost releases reject during feature validation.
# Column order is unchanged, so each value still lines up with the training features.
predictions = final_model.predict(test_df.to_numpy())

In [None]:
# Sample submission as loaded, before its `loss` column is overwritten with the predictions.
submit_df.head()

In [None]:
# Bracket assignment always writes the `loss` column. Attribute-style assignment
# (`submit_df.loss = ...`) would only set a Python attribute, leaving the CSV without
# predictions, if the column were ever missing.
# Predictions are assigned positionally, so `submit_df` must list rows in the same order as `test_df`.
submit_df["loss"] = predictions
submit_df.head()

In [None]:
# Write the submission file to the working directory; the DataFrame index is not written.
submit_df.to_csv("submission.csv", index=False)