This Notebook is my attempt at the Kaggle Monthly challenge for August.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from warnings import filterwarnings
filterwarnings('ignore')

# Raise both pandas display limits to 100 so wide/long frames are shown with less truncation.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 100)

import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt

import plotly.figure_factory as ff
from plotnine import *
import plotnine as pn

import seaborn as sns

In [None]:
# Load the competition's train and test tables from the read-only Kaggle input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-aug-2021/train.csv',
        '/kaggle/input/tabular-playground-series-aug-2021/test.csv',
    )
)

### 1. Data Overview

This section provides a summary of the data.

**Summary**
* Total number of observations in the train data is 250,000. Observations to be predicted on (test data) is 150,000.
* Luckily, there are no missing values in either the train or the test data set, so we can spend more time on the actual model building.
* Overall, there are 101 columns: 100 features (named f0 to f99) that will be used to predict the target variable (loss), plus an 'id' column of type integer, which from a cursory look is just a row identifier.
* Of the 100 features, not including the id column, 95 are float and 5 are integers.
* The target variable (loss) is an integer variable.

In [None]:
# Two-colour palette (amber, turquoise); later cells reuse `colors`.
colors = ['#FFBF00','#40E0D0']

# One row per data set with its number of observations.
shape_df = pd.DataFrame({
    'Data': ['Train', 'Test'],
    'Shape': [train.shape[0], test.shape[0]],
})

# Both columns are passed in reversed row order, exactly as the bar trace expects them.
row_counts = shape_df['Shape'][::-1]
set_names = shape_df['Data'][::-1]
data = go.Bar(
    x=row_counts,
    y=set_names,
    text=row_counts,
    textposition='auto',
    orientation='h',
    marker_color=colors,
)
layout = go.Layout(
    title='Train & Test Data Size',
    width=800,
    height=400,
    showlegend=False,
    font=dict(family='Arial', size=14),
    paper_bgcolor='white',
    plot_bgcolor='#FFFAFA',
)
fig = go.Figure(data=data, layout=layout)

# Draw a solid black line on both axes.
axis_line = dict(showline=True, linewidth=2, linecolor='black')
fig.update_xaxes(**axis_line)
fig.update_yaxes(**axis_line)
fig.show()

In [None]:
# Data attributes: total number of missing cells in each data set.
train_null_total, test_null_total = (
    frame.isna().sum().sum() for frame in (train, test)
)
print('Total null values in train data:', train_null_total)
print('Total null values in test data:', test_null_total)

In [None]:
# get column dtype counts
# The counts are derived from the data instead of being hard-coded, so the
# chart stays correct if the input changes. 'id' and 'loss' are excluded
# because they are the row identifier and the target, not features.
feature_dtype_names = train.dtypes.drop(['id', 'loss']).astype(str)
labels = ['float','int']
values = [
    int((feature_dtype_names == 'float64').sum()),
    int((feature_dtype_names == 'int64').sum()),
]

# `colors` is the two-colour palette defined in the train/test size cell.
data = go.Pie(labels=labels, values=values,pull=[0.2,0],textinfo='label+value', marker=dict(colors=colors))
layout = go.Layout(font=dict(family='Arial',size=14),
                  paper_bgcolor='white',
                  plot_bgcolor = '#FFFAFA',
                 showlegend=False,title='Feature Count by Data Type', height=500, width=500)
fig = go.Figure(data=data,layout=layout)
fig.show()

### 2. Target (Loss) Variable

**Summary**
* The target variable has 43 unique values i.e. from 0 - 42. Interesting!
* Target value '0' constitutes approximately 25% of the data. 
* Values 0, 1, 2, 3 and 4 together make up about 50% of the dataset. It may be worthwhile to explore the features for these values separately.

It is very tempting to look at the problem from a classification point of view. But since the evaluation metric is RMSE we have to stick with building regression models.

In [None]:
# Number of rows for each distinct loss value, and its share of the train set.
target_counts = train.groupby('loss')['loss'].count()
target_counts_df = pd.DataFrame({'Target Value':target_counts.index, 'Count':target_counts.values})
target_counts_df['Pct'] = target_counts_df['Count']/train.shape[0]

# Cast the loss values to strings before using them as y-axis labels.
target_counts_df['Target Value'] = target_counts_df['Target Value'].astype('str')

fig = make_subplots(rows=1, cols=2, subplot_titles=("Target Value Counts", "Percentage Target Values"),horizontal_spacing=0.05)

# Left panel: absolute counts per loss value.
fig.add_trace(go.Bar(x = target_counts_df.Count[::-1], y=target_counts_df['Target Value'][::-1], orientation='h',
                    text = target_counts_df.Count[::-1], textposition='outside', marker_color='#FFBF00'),row=1,col=1)

# Right panel: fraction of all train rows per loss value.
fig.add_trace(go.Bar(x = target_counts_df.Pct[::-1], y=target_counts_df['Target Value'][::-1], orientation='h',
                    text = target_counts_df.Pct[::-1], textposition='outside', marker_color='#40E0D0'),row=1,col=2)

fig.update_layout(font=dict(family='Arial'),
                  paper_bgcolor='white',
                  plot_bgcolor = '#FFFAFA',
                 showlegend=False,height=900)
fig.update_xaxes(showline=True, linewidth=2, linecolor='black')
fig.update_yaxes(showline=True, linewidth=2, linecolor='black')

fig.show()
# `fig` is a plotly figure, so plt.savefig() would only save an empty
# matplotlib canvas. Export the plotly figure itself; HTML export needs no
# extra rendering dependency.
fig.write_html('Target Values.html')

### 3. Features

We will explore the features to further understand the data.

### 3.1 Unique Values
The unique values for the integer features were evaluated for starters.

**Findings**
* Features f1 and f86 have the fewest unique values and could be treated as categorical features.
* The distribution between the train and test data for unique values for the integer features is comparable.

In [None]:
# unique value counts for the integer features
# Integer-typed columns of train, without the target ('loss') and the row identifier ('id').
int_df = train.select_dtypes(include=['int64']).drop(['loss','id'],axis=1)

int_col_list = int_df.columns

# nunique() returns one count per column, so no manual loop or parallel lists are needed.
train_unique_counts = int_df.nunique()
test_unique_counts = test[int_col_list].nunique()

train_int_feats_unique_vals = pd.DataFrame({'Feature':train_unique_counts.index,'Unique Values':train_unique_counts.values})
test_int_feats_unique_vals = pd.DataFrame({'Feature':test_unique_counts.index,'Unique Values':test_unique_counts.values})

fig = make_subplots(rows=1, cols=2, subplot_titles=("Train Data", "Test Data"),horizontal_spacing=0.05)

fig.add_trace(go.Bar(x = train_int_feats_unique_vals['Unique Values'][::-1], y=train_int_feats_unique_vals['Feature'][::-1], orientation='h',
                    text = train_int_feats_unique_vals['Unique Values'][::-1], textposition='auto', marker_color='#FFBF00'),row=1,col=1)
fig.add_trace(go.Bar(x = test_int_feats_unique_vals['Unique Values'][::-1], y=test_int_feats_unique_vals['Feature'][::-1], orientation='h',
                    text = test_int_feats_unique_vals['Unique Values'][::-1], textposition='auto', marker_color='#40E0D0'),row=1,col=2)

fig.update_layout(font=dict(family='Arial'),
                  paper_bgcolor='white',
                  plot_bgcolor = '#FFFAFA',
                 showlegend=False,title='Unique Value Count for Integer Features')

fig.update_xaxes(showline=True, linewidth=2, linecolor='black')
fig.update_yaxes(showline=True, linewidth=2, linecolor='black')

fig.show()
# `fig` is a plotly figure, so plt.savefig() would only save an empty
# matplotlib canvas. Export the plotly figure itself instead.
fig.write_html('integer_features.html')

### 3.2 Feature Distribution

We will compare the feature distribution for the train and test data. Of course, we will do this only for the 95 features that are float type. Yellow shows train data and green indicates test data.

In [None]:
# Names of all float-typed columns; the following cells slice this index into batches.
feat_list = train.select_dtypes(include=['float64']).columns
feat_list_set1 = feat_list[0:20]

fig, axes = plt.subplots(nrows=5, ncols=4, figsize=(21, 20))
sns.despine()

kde_style = dict(shade=True, alpha=0.1)
title_font = {'fontsize': 12, 'fontweight': 'bold'}

# axes.flat walks the grid row by row, so panel i shows feature i of this batch:
# the train density in amber and the test density in turquoise.
for panel_idx, ax in enumerate(axes.flat):
    feat = feat_list_set1[panel_idx]
    sns.kdeplot(train[feat], color="#FFBF00", ax=ax, **kde_style)
    sns.kdeplot(test[feat], color="#40E0D0", ax=ax, **kde_style)
    ax.set(xlabel='', ylabel='')
    ax.set_title('Feature: '+str(feat), fontdict=title_font)
plt.tight_layout()

In [None]:
# Second batch of float features (positions 20-39 of feat_list).
feat_list_set2 = feat_list[20:40]

fig, axes = plt.subplots(nrows=5, ncols=4, figsize=(20, 20))
sns.despine()

kde_style = dict(shade=True, alpha=0.1)
title_font = {'fontsize': 12, 'fontweight': 'bold'}

# axes.flat walks the grid row by row, so panel i shows feature i of this batch:
# the train density in amber and the test density in turquoise.
for panel_idx, ax in enumerate(axes.flat):
    feat = feat_list_set2[panel_idx]
    sns.kdeplot(train[feat], color="#FFBF00", ax=ax, **kde_style)
    sns.kdeplot(test[feat], color="#40E0D0", ax=ax, **kde_style)
    ax.set(xlabel='', ylabel='')
    ax.set_title('Feature: '+str(feat), fontdict=title_font)
plt.tight_layout()

In [None]:
# Third batch of float features (positions 40-59 of feat_list).
feat_list_set3 = feat_list[40:60]

fig, axes = plt.subplots(nrows=5, ncols=4, figsize=(20, 20))
sns.despine()

kde_style = dict(shade=True, alpha=0.1)
title_font = {'fontsize': 12, 'fontweight': 'bold'}

# axes.flat walks the grid row by row, so panel i shows feature i of this batch:
# the train density in amber and the test density in turquoise.
for panel_idx, ax in enumerate(axes.flat):
    feat = feat_list_set3[panel_idx]
    sns.kdeplot(train[feat], color="#FFBF00", ax=ax, **kde_style)
    sns.kdeplot(test[feat], color="#40E0D0", ax=ax, **kde_style)
    ax.set(xlabel='', ylabel='')
    ax.set_title('Feature: '+str(feat), fontdict=title_font)
plt.tight_layout()

In [None]:
# Fourth batch of float features (positions 60-79 of feat_list).
feat_list_set4 = feat_list[60:80]

fig, axes = plt.subplots(nrows=5, ncols=4, figsize=(20, 20))
sns.despine()

kde_style = dict(shade=True, alpha=0.1)
title_font = {'fontsize': 12, 'fontweight': 'bold'}

# axes.flat walks the grid row by row, so panel i shows feature i of this batch:
# the train density in amber and the test density in turquoise.
for panel_idx, ax in enumerate(axes.flat):
    feat = feat_list_set4[panel_idx]
    sns.kdeplot(train[feat], color="#FFBF00", ax=ax, **kde_style)
    sns.kdeplot(test[feat], color="#40E0D0", ax=ax, **kde_style)
    ax.set(xlabel='', ylabel='')
    ax.set_title('Feature: '+str(feat), fontdict=title_font)

plt.tight_layout()

In [None]:
# Last batch of float features (everything from position 80 up to, at most, position 95).
feat_list_set5 = feat_list[80:96]

# This batch is drawn on a 5 x 3 grid, i.e. 15 panels.
fig, axes = plt.subplots(nrows=5, ncols=3, figsize=(20, 20))
sns.despine()

kde_style = dict(shade=True, alpha=0.1)
title_font = {'fontsize': 12, 'fontweight': 'bold'}

# axes.flat walks the grid row by row, so panel i shows feature i of this batch:
# the train density in amber and the test density in turquoise.
for panel_idx, ax in enumerate(axes.flat):
    feat = feat_list_set5[panel_idx]
    sns.kdeplot(train[feat], color="#FFBF00", ax=ax, **kde_style)
    sns.kdeplot(test[feat], color="#40E0D0", ax=ax, **kde_style)
    ax.set(xlabel='', ylabel='')
    ax.set_title('Feature: '+str(feat), fontdict=title_font)

plt.tight_layout()

Overall, the features are distributed almost identically between the train and test data.


In [None]:
# get data: one subset of the train rows for each of the loss values 0 to 4
target_0, target_1, target_2, target_3, target_4 = (
    train[train.loss == loss_value] for loss_value in range(5)
)

### 3.3 Feature Correlation

In [None]:
# Pairwise correlation between the columns of the train data.
corr = train.corr()

# True on and above the diagonal: those cells are hidden, leaving only the lower triangle.
mask = np.triu(np.ones_like(corr, dtype=bool))
cmap = sns.diverging_palette(230, 20, as_cmap=True)

f, ax = plt.subplots(figsize=(16, 16))

# The colour scale is centred at 0 and capped at 0.3.
heatmap_options = dict(
    mask=mask,
    cmap=cmap,
    vmax=.3,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
)
sns.heatmap(corr, **heatmap_options)

Looks like none of the features are correlated.

Okay then, let's start modeling...

## 4.Models

This section is all about modeling. Since there is not much data cleaning involved, we can jump right in. There are a few things I would like to try:
* XGBoost
* CatBoost
* LGBM
* Hyperparameter Tuning with Optuna - since I have never done it before

This is the plan:
- Run XGBoost, CatBoost and LGBM with all features
- Get best parameters using Optuna
- Based on the best parameters run all three models and pick the top features using Boruta
- Voting regressor with all the three models based on the best estimator and the best features

At some point in time I would like to try auto-ML as well.

If these work then well and good. Otherwise I am not going to sweat it.

### 4.1 XGBoost with Optuna

In [None]:
# import libraries
import optuna 
from optuna import Trial, visualization
from optuna.samplers import TPESampler
import xgboost as xgb

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [None]:
# get data for model: the target column, and every column except 'id' and 'loss'
target, feats = train['loss'], train.drop(columns=['id', 'loss'])

In [None]:
# define function for tuning
def objective(trial,data=feats,target=target):
    """Optuna objective for XGBoost: hold-out RMSE of one sampled configuration.

    Parameters
    ----------
    trial : optuna trial object used to sample the hyperparameters.
    data : feature table; defaults to the module-level ``feats`` (bound when
        this function is defined).
    target : target values; defaults to the module-level ``target`` (bound
        when this function is defined).

    Returns
    -------
    RMSE of the fitted model on the 2% hold-out split.
    """
    # Fixed seed, so every trial is scored on the same hold-out rows.
    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.02,random_state=42)

    # suggest_float(step=...) / suggest_float(log=True) replace the deprecated
    # suggest_discrete_uniform / suggest_loguniform; names and ranges are unchanged.
    param = {
        # NOTE(review): no 'objective' is set here; tweedie_variance_power is
        # documented for the reg:tweedie objective, so it presumably has no
        # effect with the default objective - confirm.
        'tweedie_variance_power': trial.suggest_float('tweedie_variance_power', 1.0, 2.0, step=0.1),
        'max_depth': trial.suggest_int('max_depth', 6, 10), # Extremely prone to overfitting!
        'n_estimators': trial.suggest_int('n_estimators', 400, 4000, step=400), # Extremely prone to overfitting!
        'eta': trial.suggest_float('eta', 0.007, 0.013),
        'subsample': trial.suggest_float('subsample', 0.2, 0.9, step=0.1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 0.9, step=0.1),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.2, 0.9, step=0.1),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-4, 1e4, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-4, 1e4, log=True), # L2 regularization
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-4, 1e4, log=True), # L1 regularization
        'gamma': trial.suggest_float('gamma', 1e-4, 1e4, log=True),
        'tree_method':'gpu_hist'
    }
    model = xgb.XGBRegressor(**param)

    # The hold-out split serves both as the early-stopping set and as the scoring set.
    model.fit(train_x,train_y,eval_set=[(test_x,test_y)],early_stopping_rounds=100,verbose=False)

    preds = model.predict(test_x)

    # squared=False makes mean_squared_error return the root of the MSE.
    rmse = mean_squared_error(test_y, preds,squared=False)

    return rmse

In [None]:
# #Optuna tuning - commenting to save time 
# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=50)
# print('Number of finished trials:', len(study.trials))
# print('Best trial:', study.best_trial.params)

In [None]:
# %%time
# # train XGB model with best parameters from Optuna
# xgb_best_params = {'lambda': 0.014133915031135962, 'alpha': 0.1247308992037035, 'colsample_bytree': 0.8, 
#                'subsample': 0.7, 'learning_rate': 0.009, 'n_estimators': 2000, 'max_depth': 11, 'random_state': 48, 
#                'min_child_weight': 114,'tree_method':'gpu_hist'}

# # fit model with Optuna best parameters
# train_x, val_x, train_y, val_y = train_test_split(feats, target, test_size=0.2,random_state=42)

# xgb_model = xgb.XGBRegressor(**xgb_best_params)
# xgb_model.fit(train_x, train_y,
#               eval_set=[(train_x, train_y), (val_x, val_y)],
#               eval_metric="rmse",
#               early_stopping_rounds=100,
#               verbose=False)

### 4.2 Catboost

In [None]:
# Optuna objective function for Catboost
from catboost import CatBoostRegressor
def objective(trial,data=feats,target=target):
    """Optuna objective for CatBoost: hold-out RMSE of one sampled configuration.

    Parameters
    ----------
    trial : optuna trial object used to sample the hyperparameters.
    data : feature table; defaults to the module-level ``feats`` (bound when
        this function is defined).
    target : target values; defaults to the module-level ``target`` (bound
        when this function is defined).

    Returns
    -------
    RMSE of the fitted model on the 20% hold-out split.
    """
    # Fixed seed, so every trial is scored on the same hold-out rows.
    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.2,random_state=42)

    # suggest_float (optionally with log=True) replaces the deprecated
    # suggest_uniform / suggest_loguniform; names and ranges are unchanged.
    # 'task_type' used to appear twice in this dict with the same value; it is
    # now listed once.
    param = {
        'iterations': trial.suggest_int("iterations", 1000, 20000),
        'od_wait': trial.suggest_int('od_wait', 500, 2000),
        'task_type': "GPU",
        'eval_metric': 'RMSE',
        'learning_rate': trial.suggest_float('learning_rate', 0.03, 0.04),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.32, 0.33, log=True),
        'subsample': trial.suggest_float('subsample', 0.9, 1.0),
        'random_strength': trial.suggest_float('random_strength', 10, 50),
        'depth': trial.suggest_int('depth', 1, 15),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 30),
        'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations', 1, 15),
    }

    model = CatBoostRegressor(**param)

    # The hold-out split serves both as the early-stopping set and as the scoring set.
    model.fit(train_x,train_y,eval_set=[(test_x,test_y)],early_stopping_rounds=200,verbose=False)

    preds = model.predict(test_x)

    # squared=False makes mean_squared_error return the root of the MSE.
    rmse = mean_squared_error(test_y, preds,squared=False)

    return rmse

In [None]:
# commenting out since we know the parameters
# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=50)
# print('Number of finished trials:', len(study.trials))
# print('Best trial:', study.best_trial.params)

In [None]:
# fit catboost model with best trial
# best_trial_catboost = {'l2_leaf_reg': 3.8375761158656756, 'max_bin': 345, 
#                        'learning_rate': 0.008643325596038125, 'n_estimators': 10000, 'max_depth': 10, 'random_state': 2020, 
#                        'min_data_in_leaf': 85,'task_type':'GPU'}

# catboost_model = CatBoostRegressor(**best_trial_catboost)


# train_x, val_x, train_y, val_y = train_test_split(feats, target, test_size=0.2,random_state=42)

# catboost_model.fit(train_x,train_y,eval_set=[(val_x,val_y)],early_stopping_rounds=200,verbose=False)

### 4.3 LightGBM

In [None]:
from lightgbm import LGBMRegressor

def objective(trial,data=feats,target=target):
    """Optuna objective for LightGBM: hold-out RMSE of one sampled configuration.

    Parameters
    ----------
    trial : optuna trial object used to sample the hyperparameters.
    data : feature table; defaults to the module-level ``feats`` (bound when
        this function is defined).
    target : target values; defaults to the module-level ``target`` (bound
        when this function is defined).

    Returns
    -------
    RMSE of the fitted model on the 20% hold-out split.
    """
    # Fixed seed, so every trial is scored on the same hold-out rows.
    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.2,random_state=42)

    # suggest_float (optionally with log=True) replaces the deprecated
    # suggest_uniform / suggest_loguniform; names and ranges are unchanged.
    params = {
        'reg_alpha' : trial.suggest_float('reg_alpha', 0.47, 0.5, log=True),
        'reg_lambda' : trial.suggest_float('reg_lambda', 0.32, 0.33, log=True),
        'num_leaves' : trial.suggest_int('num_leaves', 50, 70),
        'learning_rate' : trial.suggest_float('learning_rate', 0.03, 0.04),
        'max_depth' : trial.suggest_int('max_depth', 30, 40),
        'n_estimators' : trial.suggest_int('n_estimators', 100, 6100),
        'min_child_weight' : trial.suggest_float('min_child_weight', 0.015, 0.02, log=True),
        'subsample' : trial.suggest_float('subsample', 0.9, 1.0),
        'colsample_bytree' : trial.suggest_float('colsample_bytree', 0.52, 1, log=True),
        'min_child_samples' : trial.suggest_int('min_child_samples', 76, 80),
        'metric' : 'rmse',
        'device_type' : 'gpu',
    }
    model = LGBMRegressor(**params, random_state=2021)

    # The hold-out split serves both as the early-stopping set and as the scoring set.
    model.fit(train_x, train_y,eval_set=[(test_x,test_y)], early_stopping_rounds=150, verbose=False)

    preds = model.predict(test_x)

    # squared=False makes mean_squared_error return the root of the MSE.
    rmse = mean_squared_error(test_y, preds,squared=False)

    return rmse

In [None]:
# commenting out since we know the parameters
# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=50)
# print('Number of finished trials:', len(study.trials))
# print('Best trial:', study.best_trial.params)

### 4.4 Feature Selection with Boruta

Boruta has shown recent success in removing noisy features. Since we have no information on the features, it may not be a bad idea to select useful features using the Optuna-optimized parameters.

In [None]:
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

In [None]:
# best parameters based on Optuna tuning; each model is built right after its parameter set

# XGBoost
xgboost_best_trial = {
    'max_depth': 10,
    'n_estimators': 3200,
    'eta': 0.012940730944195646,
    'subsample': 0.6000000000000001,
    'colsample_bytree': 0.4,
    'colsample_bylevel': 0.8,
    'min_child_weight': 0.9447548982540029,
    'reg_lambda': 0.003408075115131724,
    'reg_alpha': 62.732265933421566,
    'gamma': 24.458977256371472,
    'tree_method': 'gpu_hist',
}
xgb_model = XGBRegressor(**xgboost_best_trial)

# CatBoost
catBoost_best_trial = {
    'iterations': 9103,
    'od_wait': 1040,
    'learning_rate': 0.030857734253353943,
    'reg_lambda': 0.3227059153305604,
    'subsample': 0.9222810313136256,
    'random_strength': 41.37914819936549,
    'depth': 11,
    'min_data_in_leaf': 8,
    'leaf_estimation_iterations': 12,
    'task_type': 'GPU',
}
cat_model = CatBoostRegressor(**catBoost_best_trial)

# LightGBM
lgbm_best_trial = {
    'reg_alpha': 0.4865035063664912,
    'reg_lambda': 0.32183129255516457,
    'num_leaves': 66,
    'learning_rate': 0.03383743020793445,
    'max_depth': 35,
    'n_estimators': 3786,
    'min_child_weight': 0.015685452453452698,
    'subsample': 0.9384580741116457,
    'colsample_bytree': 0.7819836028701871,
    'min_child_samples': 77,
    'device_type': 'gpu',
}
lgb_model = LGBMRegressor(**lgbm_best_trial)

In [None]:
# Column names of the model input table as a plain Python list.
feature_names = list(feats.columns)

In [None]:
# %%time - takes 8 min to run
# we will use Boruta with three regressors and then select the best features - all models
from boruta import BorutaPy

# Hold out 20% of the rows (fixed seed); Boruta is fitted on the remaining rows only.
train_x, test_x, train_y, test_y = train_test_split(
    feats, target, random_state=42, test_size=0.2
)

# The training split is handed to BorutaPy as plain numpy arrays.
boruta_selector_xgb = BorutaPy(xgb_model, random_state=0, n_estimators='auto')
boruta_selector_xgb.fit(
    np.array(train_x),
    np.array(train_y),
)

In [None]:
# create a XGB boruta ranking df: one row per feature, sorted by ascending rank value
boruta_ranking_xgb = boruta_selector_xgb.ranking_

boruta_ranking_xgb_df = (
    pd.DataFrame({'values': boruta_ranking_xgb}, index=train_x.columns.values)
    # Repeat the feature names (the index) as a regular column for plotting.
    .assign(Variable=lambda ranking: ranking.index)
    .sort_values(['values'], ascending=True)
)

In [None]:
# boruta for cat boost
# boruta_selector_cb = BorutaPy(cat_model, n_estimators = 'auto', random_state = 0)
# boruta_selector_cb.fit(np.array(train_x),np.array(train_y))

In [None]:
# Run Boruta with the LightGBM model on the training split, passed as plain numpy arrays.
boruta_selector_lgb = BorutaPy(lgb_model, random_state=0, n_estimators='auto')
boruta_selector_lgb.fit(
    np.array(train_x),
    np.array(train_y),
)

# create a LGBM boruta ranking df: one row per feature, sorted by ascending rank value
boruta_ranking_lgb = boruta_selector_lgb.ranking_

boruta_ranking_lgb_df = (
    pd.DataFrame({'values': boruta_ranking_lgb}, index=train_x.columns.values)
    # Repeat the feature names (the index) as a regular column for plotting.
    .assign(Variable=lambda ranking: ranking.index)
    .sort_values(['values'], ascending=True)
)

In [None]:
# top features - where boruta ranked 1 or 2
# First give the rank column its display name in both ranking tables.
boruta_ranking_xgb_df, boruta_ranking_lgb_df = (
    ranking.rename(columns={'values':'Value'})
    for ranking in (boruta_ranking_xgb_df, boruta_ranking_lgb_df)
)

# Then keep only the rows whose rank is at most 2.
boruta_ranking_xgb_select = boruta_ranking_xgb_df.loc[boruta_ranking_xgb_df['Value'].le(2)]
boruta_ranking_lgb_select = boruta_ranking_lgb_df.loc[boruta_ranking_lgb_df['Value'].le(2)]

In [None]:
# plot feature importance for XGB and LGB
# The bars show the Boruta rank value of each selected feature.
fig = make_subplots(rows=1, cols=2, subplot_titles=("Feature Importance - XGB", "Feature Importance - LGBM"),horizontal_spacing=0.05)

# Left panel: features selected with the XGBoost-based Boruta run.
fig.add_trace(go.Bar(x = boruta_ranking_xgb_select.Value[::-1], y=boruta_ranking_xgb_select['Variable'][::-1], orientation='h',
                    text = boruta_ranking_xgb_select.Value[::-1], textposition='outside', marker_color='#FFBF00'),row=1,col=1)

# Right panel: features selected with the LightGBM-based Boruta run.
fig.add_trace(go.Bar(x = boruta_ranking_lgb_select.Value[::-1], y=boruta_ranking_lgb_select['Variable'][::-1], orientation='h',
                    text =boruta_ranking_lgb_select.Value[::-1], textposition='outside', marker_color='#40E0D0'),row=1,col=2)

fig.update_layout(font=dict(family='Arial'),
                  paper_bgcolor='white',
                  plot_bgcolor = '#FFFAFA',
                 showlegend=False,height=900)
fig.update_xaxes(showline=True, linewidth=2, linecolor='black')
fig.update_yaxes(showline=True, linewidth=2, linecolor='black')

fig.show()
# `fig` is a plotly figure, so plt.savefig() would only save an empty
# matplotlib canvas. The previous file name ('Target Values.png') was also
# copied from the target-distribution cell and would have overwritten that
# export, so this chart gets its own file.
fig.write_html('boruta_feature_ranking.html')

The above plot shows the ranked feature importances based on Boruta. There is a significant difference between the LGBM and XGB models. As a first step I will use the LGBM-ranked features, since I trust that model more based on the baseline results.

## 5. Predict & Submit Baseline Model

In [None]:
# predict test data
# test_model = test[feat_req]

# predictions = catboost_model.predict(test_model)

In [None]:
# make submission file
# sub = pd.read_csv('/kaggle/input/tabular-playground-series-aug-2021/sample_submission.csv')
# sub['loss'] = predictions

In [None]:
# save submission file
# sub.to_csv('submission.csv', index=False)

**Keep Learning and have fun!**