In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # data visualisation
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
    
import os
# Recursively walk the Kaggle input directory and echo the full path of
# every file found, one per line.
for input_path in (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
):
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#Libraries that will be used for training and predicting
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from catboost import CatBoostRegressor, FeaturesData, Pool
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

## Objective: 
Improve the model to achieve a better score, and gain a better understanding of CatBoost, XGBoost and cross-validation.

## Strategy:
To use the XGBoost and Catboost regressors with Cross validation

## Tactics:

1) Re-Write a function for scoring the models, based on the new models

2) Tune the hyperparameters of each model and identify the best values

3) After Hyper parameter tuning start the Cross Validation

4) Apply the CV model to the final X_test and submit

5) We will consider the reduced dimensions from the beginning i.e base_features = ['f52','f77','f81','f13']

In [None]:
# Using standard naming convention will improve the understandability of the notebook and "human" learning. 

# Reduced feature set used throughout the notebook.
base_features = ['f52','f77','f81','f13']

# Competition files; the first CSV column becomes the index of train/test.
train_data = pd.read_csv(
    '../input/tabular-playground-series-aug-2021/train.csv',
    index_col=0,
)
test_data = pd.read_csv(
    '../input/tabular-playground-series-aug-2021/test.csv',
    index_col=0,
)
sample = pd.read_csv('../input/tabular-playground-series-aug-2021/sample_submission.csv')

# Target: 'loss' is removed from train_data and kept as its own Series.
y_full = train_data.pop('loss')
# Features: an independent copy of just the selected columns.
X_full = train_data.loc[:, base_features].copy()

# Hold out a validation set. With no test_size given, scikit-learn keeps 25%
# of the rows for validation (i.e. validation is ~33.33% the size of the
# training part, which is what the next cell prints).
split_parts = train_test_split(X_full, y_full, random_state=0)
train_X, val_X, train_y, val_y = split_parts

In [None]:
# Show the shape of each piece of the split, in this order:
#   train_X (features used for fitting), val_X (features used for scoring),
#   train_y (target used for fitting),   val_y (target used for scoring).
for split_part in (train_X, val_X, train_y, val_y):
    print(split_part.shape)

# Size of the validation set relative to the training set, as a percentage.
val_to_train_pct = len(val_y) / len(train_y) * 100
print("Percentage of validation to training data is:", val_to_train_pct)

## XGBoost Regressor

The Random Forest model improved the score further, though only marginally. The next model, XGBoost (eXtreme Gradient Boosting), is one I keep hearing is among the best in the industry and is often used to win competitions.

This algorithm also has multiple unrelated parameters that need to be tuned individually.

In this model we will only use the base features that were selected from the Decision Tree Regressor, and improve the model.

## There are 4 important parameters to be considered for XGBoost

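1) n_estimators: number of boosting rounds (trees) the learning process runs for, usually 100 to 1000

2) learning rate: how strongly each new tree corrects the previous ones; smaller values learn more slowly (this notebook tries values from 0.01 to 1)

3) n_jobs: number of parallel threads used for training (e.g. 3 to 5)

4) Early stopping: stop training early if the validation score has not improved for a given number of rounds (5 minimum)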

All these parameters are independent. 

In [None]:
# Considering the following parameters, and tuning only the learning rate and the number of estimators
# 1) n_estimators : 500
# 2) learning rate: 0.1
# 3) n_jobs : 4 (running on Kaggle, so unsure how many cores)
# 4) Early stopping : 5

In [None]:
def get_estimator(est, train_X, val_X, train_y, val_y, learning_rate=0.1, n_jobs=4):
    """Fit an XGBRegressor with ``est`` estimators and return its validation RMSE.

    Parameters
    ----------
    est : int
        Value passed to the model as ``n_estimators``.
    train_X, train_y
        Features and target the model is fitted on.
    val_X, val_y
        Held-out features and target the fitted model is scored on.
    learning_rate : float, default 0.1
        Value passed to the model as ``learning_rate``.
    n_jobs : int, default 4
        Value passed to the model as ``n_jobs``.

    Returns
    -------
    float
        Root mean squared error of the predictions for ``val_X`` against ``val_y``.
    """
    base_model = XGBRegressor(n_estimators=est, learning_rate=learning_rate, n_jobs=n_jobs)
    base_model.fit(train_X, train_y)
    preds_val = base_model.predict(val_X)
    # Take the square root explicitly instead of passing `squared=False`:
    # that flag was deprecated in scikit-learn 1.4 and later removed, so
    # relying on it raises a TypeError on current releases.
    return float(np.sqrt(mean_squared_error(val_y, preds_val)))

In [None]:
#Taking best estimator 50 and running best learning rate
def get_lr(lr, train_X, val_X, train_y, val_y, n_estimators=50, n_jobs=4):
    """Fit an XGBRegressor with learning rate ``lr`` and return its validation RMSE.

    Parameters
    ----------
    lr : float
        Value passed to the model as ``learning_rate``.
    train_X, train_y
        Features and target the model is fitted on.
    val_X, val_y
        Held-out features and target the fitted model is scored on.
    n_estimators : int, default 50
        Value passed to the model as ``n_estimators``.
    n_jobs : int, default 4
        Value passed to the model as ``n_jobs``.

    Returns
    -------
    float
        Root mean squared error of the predictions for ``val_X`` against ``val_y``.
    """
    base_model = XGBRegressor(n_estimators=n_estimators, learning_rate=lr, n_jobs=n_jobs)
    base_model.fit(train_X, train_y)
    preds_val = base_model.predict(val_X)
    # Take the square root explicitly instead of passing `squared=False`:
    # that flag was deprecated in scikit-learn 1.4 and later removed, so
    # relying on it raises a TypeError on current releases.
    return float(np.sqrt(mean_squared_error(val_y, preds_val)))

In [None]:
# Candidate n_estimators values to try.
max_est = [10, 50, 100, 200, 500]
# Validation RMSE for each candidate, in the same order as max_est.
best_est = [
    get_estimator(n_trees, train_X, val_X, train_y, val_y)
    for n_trees in max_est
]

In [None]:
# Candidate learning rates to try.
max_lr = [0.01, 0.05, 0.1, 0.5, 0.7, 1]
# Validation RMSE for each candidate, in the same order as max_lr.
best_lr = [
    get_lr(rate, train_X, val_X, train_y, val_y)
    for rate in max_lr
]

In [None]:
# Pick the n_estimators candidate with the lowest validation RMSE
# (the earliest candidate wins if several share the minimum).
best_est_idx = min(range(len(best_est)), key=best_est.__getitem__)
final_est = max_est[best_est_idx]
print("The best estimator is :", final_est)

In [None]:
# Pick the learning-rate candidate with the lowest validation RMSE
# (the earliest candidate wins if several share the minimum).
best_lr_idx = min(range(len(best_lr)), key=best_lr.__getitem__)
final_lr = max_lr[best_lr_idx]
print("The best Learning rate is:", final_lr)

In [None]:
# Validation RMSE for each n_estimators candidate.
# Pass max_est as the x values: with only best_est given, matplotlib would
# plot against the list positions 0..4, which does not match the axis label.
plt.title('Max estimator plotting')
plt.plot(max_est, best_est, marker='o')
plt.xlabel('estimator')
plt.ylabel('rmse')
plt.show()

In [None]:
# Validation RMSE for each learning-rate candidate.
# Pass max_lr as the x values: with only best_lr given, matplotlib would
# plot against the list positions 0..5, which does not match the axis label.
plt.title('Max learningRate plotting')
plt.plot(max_lr, best_lr, marker='o')
plt.xlabel('learningRate')
plt.ylabel('rmse')
plt.show()

In [None]:
# Tuning above settled on 50 estimators and a learning rate of 0.1.
# Wrap that model in a single-step pipeline so it can be cross-validated.
# (The cross-validation cell multiplies its scores by -1 because sklearn's
# 'neg_root_mean_squared_error' scorer reports *negative* RMSE.)
xgb_params = {'n_estimators': 50, 'learning_rate': 0.1, 'n_jobs': 4}
my_pipeline = Pipeline(steps=[('model', XGBRegressor(**xgb_params))])

In [None]:
# 5-fold cross-validation on the full training data.
# The 'neg_root_mean_squared_error' scorer returns negated RMSE (so that
# higher is better); multiply by -1 to get plain RMSE per fold.
scores = -1 * cross_val_score(my_pipeline, X_full, y=y_full,
                              cv=5,
                              scoring='neg_root_mean_squared_error')

# These are RMSE values (not MAE), matching the scorer requested above.
print("RMSE scores:\n", scores)

In [None]:
# The pipeline has already been cross-validated, so refit it on the full
# training data and predict the test set from the same feature columns.
test_pred = my_pipeline.fit(X_full, y_full).predict(test_data.loc[:, base_features])

In [None]:
# Assemble the submission: one row per test id with its predicted loss.
submission_columns = {'id': test_data.index, 'loss': test_pred}
my_submission = pd.DataFrame(submission_columns)
# Any filename would do; 'submission.csv' is used here. The frame's own
# index is left out of the file.
my_submission.to_csv('submission.csv', index=False)