<div>
    <h1 align="center">XGBoost & CatBoost</h1>    
    <h1 align="center">Tabular Playground Series - Aug 2021</h1> 
</div>

<div class="alert alert-success">  
</div>

<div class="alert alert-success">
    <h1 align="center">If you find this work useful, please don't forget to upvote :)</h1>
</div>

In [None]:
import numpy as np 
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt
import plotly.figure_factory as ff
import plotly.express as px

%matplotlib inline

In [None]:
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import VotingRegressor

import optuna

In [None]:
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

from sklearn.preprocessing import LabelEncoder
from mlxtend.preprocessing import minmax_scaling
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import StratifiedKFold, KFold, LeaveOneGroupOut


In [None]:
import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)
        

<div class="alert alert-success">  
</div>

## Data Set

In [None]:
# Read the three competition files in one pass:
#   DF1 -> train.csv, DF2 -> test.csv, SAM -> sample_submission.csv
DF1, DF2, SAM = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-aug-2021/train.csv',
        '../input/tabular-playground-series-aug-2021/test.csv',
        '../input/tabular-playground-series-aug-2021/sample_submission.csv',
    )
)

In [None]:
# Per-column count of missing cells in the train (MV1) and test (MV2) frames.
MV1 = DF1.isna().sum()
MV2 = DF2.isna().sum()

# Report only the columns that actually contain missing values.
print(f'Missing Value 1:  {MV1[MV1 > 0]}')
print(f'Missing Value 2:  {MV2[MV2 > 0]}')

In [None]:
# Render the train frame, then the test frame.
display(DF1)
display(DF2)

# Optional summary statistics (left disabled):
# display(DF1.describe().transpose())
# display(DF2.describe().transpose())

In [None]:
# Print dtype and deep memory usage for both frames, fenced by separator lines.
for frame in (DF1, DF2):
    print('=' * 40)
    frame.info(memory_usage='deep')
print('=' * 40)

<div class="alert alert-success">  
</div>

In [None]:
# Work on copies so the frames loaded from disk stay untouched.
data1, data2 = DF1.copy(), DF2.copy()

# Every test column except the first one (`id` is dropped from this frame
# further down in the notebook).
columns = data2.columns[1:]
display(columns)

In [None]:
# Bar chart of how often each distinct `loss` value occurs in the training data.
data1['loss'].value_counts().plot.bar(figsize=(16, 8))

In [None]:
# Pie chart of the `loss` value frequencies.
data1['loss'].value_counts().plot.pie(figsize=(10, 10))

# The same frequencies as fractions of all rows (shown as the cell output).
data1['loss'].value_counts(normalize=True)

<div class="alert alert-success">  
</div>

In [None]:
# Training feature matrix: everything except the row id and the target.
X = data1.drop(['id', 'loss'], axis=1)
display(X)

In [None]:
# Regression target.
y = data1['loss']
display(y)

In [None]:
# Target range; the same bounds are used later to clip model predictions.
display(y.min() , y.max())

In [None]:
# Test feature matrix: the test frame without its row id.
XX = data2.drop('id', axis=1)
display(XX)

<div class="alert alert-success">  
</div>

In [None]:
# Clipped copy of the target: values are limited to the interval [0, 1].
# `Series.clip` returns a new Series, so `y` itself is left unchanged.
yc = y.clip(lower=0, upper=1)
display(yc)

In [None]:
# Range of the clipped target (sanity check on the clip bounds applied above).
display(yc.min() , yc.max())

In [None]:
# Bar chart of the clipped-target value counts.
yc.value_counts().plot.bar(figsize=(4, 4))

In [None]:
# Pie chart of the clipped-target value counts.
yc.value_counts().plot.pie(figsize=(5, 5))

# The same counts as fractions of all rows (shown as the cell output).
yc.value_counts(normalize=True)

<div class="alert alert-success">  
</div>

## Split

In [None]:
# 50/50 hold-out split of the features and the raw target (fixed seed).
train_X, val_X, train_y, val_y = train_test_split(
    X,
    y,
    test_size=0.50,
    random_state=123,
)

# Persist the validation half to CSV files in the working directory.
for holdout, csv_name in ((val_X, "val_X.csv"), (val_y, "val_y.csv")):
    holdout.to_csv(csv_name, index=False)

In [None]:
# Same 50/50 split (same test_size and random_state), this time paired with
# the clipped target. `train_X` / `val_X` are reassigned by this call.
train_X, val_X, train_yc, val_yc = train_test_split(
    X,
    yc,
    test_size=0.50,
    random_state=123,
)

# Persist the clipped validation target.
val_yc.to_csv("val_yc.csv", index=False)

<div class="alert alert-success">  
</div>

## Scaling

In [None]:
# Min-max scale every training feature column with mlxtend's helper.
# NOTE(review): `X_scaled` is not referenced by any later cell visible in this
# notebook — the models below are fitted on the unscaled `X`.
X_scaled = minmax_scaling(X, columns=X.columns)
# display(X_scaled)

In [None]:
# Scale the test features with the *training* minimum and range, so a given
# raw value maps to the same scaled value in both sets. Scaling the test frame
# with its own min/max (as before) makes train and test features incomparable.
# Test values outside the training range fall outside [0, 1]; that is expected.
train_min = X.min()
train_range = X.max() - train_min
XX_scaled = (XX.astype(float) - train_min) / train_range
# display(XX_scaled)

<div class="alert alert-success">  
</div>

## XGBRegressor

### Validation Model - 1

In [None]:
# XGBoost validation model: fitted on the training half of the split and
# scored on the held-out half.
# NOTE(review): both `reg_alpha=32` and `alpha=0.5` are passed; `alpha` is
# presumably XGBoost's alias for the same L1 term, so which value takes
# effect is unclear — confirm and keep only one.
model1v = XGBRegressor(
    max_depth=7,
    n_estimators=2500,
    learning_rate=0.008,
    subsample=0.84,
    booster='gbtree',
    tree_method='gpu_hist',
    colsample_bytree=0.70,
    reg_lambda=5,
    reg_alpha=32,
    n_jobs=4,
    alpha=0.5,
    random_state=123,
)

model1v.fit(train_X, train_y)

# Hold-out predictions, limited to the target range seen in the full training data.
oof_pred1 = np.clip(model1v.predict(val_X), y.min(), y.max())

# The reported figure is the root of the mean squared error on the hold-out half.
print('=' * 30)
print(f'Mean Error: {np.sqrt(mean_squared_error(val_y, oof_pred1))}')
print('=' * 30)

In [None]:
# Per-feature importances of the XGBoost validation model (shown as the cell output).
model1v.feature_importances_

## Model - 1

In [None]:
# Final XGBoost model: same hyper-parameters as the validation model, fitted
# on all training rows and used to predict the test set.
# NOTE(review): both `reg_alpha=32` and `alpha=0.5` are passed; `alpha` is
# presumably XGBoost's alias for the same L1 term, so which value takes
# effect is unclear — confirm and keep only one.
model1 = XGBRegressor(
    max_depth=7,
    n_estimators=2500,
    learning_rate=0.008,
    subsample=0.84,
    booster='gbtree',
    tree_method='gpu_hist',
    colsample_bytree=0.70,
    reg_lambda=5,
    reg_alpha=32,
    n_jobs=4,
    alpha=0.5,
    random_state=123,
)

model1.fit(X, y)

# Test predictions, limited to the target range seen in training.
pred1 = np.clip(model1.predict(XX), y.min(), y.max())
display(pred1, pred1.shape)

In [None]:
# Start from the sample submission so the output keeps its column layout.
sub1 = SAM.copy()

# Overwrite the second column with the XGBoost test predictions.
# The array is assigned directly: `pred1.data` is the array's raw buffer
# (a memoryview), not the array, and only worked through implicit coercion.
sub1.iloc[:, 1] = pred1
display(sub1)

In [None]:
# Write the XGBoost submission to the working directory (index column omitted).
sub1.to_csv("submission_xgb.csv",index=False)
# Public Score: 7.87965

<div class="alert alert-success">  
</div>

## CatBoostRegressor

### Validation Model - 2

In [None]:
# CatBoost validation model: fitted on the training half of the split and
# scored on the held-out half.
# NOTE(review): `od_type` / `early_stopping_rounds` are configured, but `fit`
# below receives no `eval_set`; presumably early stopping never triggers —
# confirm.
model2v = CatBoostRegressor(
    depth=6,
    iterations=1600,
    learning_rate=0.024,
    l2_leaf_reg=20,
    random_strength=1.5,
    grow_policy='Depthwise',
    leaf_estimation_method='Newton',
    bootstrap_type='Bernoulli',
    thread_count=4,
    verbose=False,
    loss_function='RMSE',
    eval_metric='RMSE',
    od_type='Iter',
    task_type='GPU',
    early_stopping_rounds=500,
    random_state=123,
)

# `verbose=200` is passed at fit time for this run.
model2v.fit(train_X, train_y, verbose=200)

# Hold-out predictions, limited to the target range seen in the full training data.
oof_pred2 = np.clip(model2v.predict(val_X), y.min(), y.max())

# The reported figure is the root of the mean squared error on the hold-out half.
print('=' * 30)
print(f'Mean Error: {np.sqrt(mean_squared_error(val_y, oof_pred2))}')
print('=' * 30)

In [None]:
# Per-feature importances of the CatBoost validation model (shown as the cell output).
model2v.feature_importances_

## Model - 2

In [None]:
# Final CatBoost model: same hyper-parameters as the validation model, fitted
# on all training rows and used to predict the test set.
# NOTE(review): `od_type` / `early_stopping_rounds` are configured, but `fit`
# below receives no `eval_set`; presumably early stopping never triggers —
# confirm.
model2 = CatBoostRegressor(
    depth=6,
    iterations=1600,
    learning_rate=0.024,
    l2_leaf_reg=20,
    random_strength=1.5,
    grow_policy='Depthwise',
    leaf_estimation_method='Newton',
    bootstrap_type='Bernoulli',
    thread_count=4,
    verbose=False,
    loss_function='RMSE',
    eval_metric='RMSE',
    od_type='Iter',
    task_type='GPU',
    early_stopping_rounds=500,
    random_state=123,
)

model2.fit(X, y)

# Test predictions, limited to the target range seen in training.
pred2 = np.clip(model2.predict(XX), y.min(), y.max())
display(pred2, pred2.shape)

In [None]:
# Start from the sample submission so the output keeps its column layout.
sub2 = SAM.copy()

# Overwrite the second column with the CatBoost test predictions.
# The array is assigned directly: `pred2.data` is the array's raw buffer
# (a memoryview), not the array, and only worked through implicit coercion.
sub2.iloc[:, 1] = pred2
display(sub2)

In [None]:
# Write the CatBoost submission to the working directory (index column omitted).
sub2.to_csv("submission_catboost.csv",index=False)
# Public Score: 7.87995

<div class="alert alert-success">  
</div>

## Feature Importances

In [None]:
# Per-feature importances reported by the two validation models.
a1 = model1v.feature_importances_
a2 = model2v.feature_importances_

axis_x  = X.columns.values
# Min-max scale each importance vector so both models can share one y-axis.
axis_y1 = minmax_scaling(a1, columns=[0])
axis_y2 = minmax_scaling(a2, columns=[0])

# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases removed the old names; use whichever name this install has.
style_name = 'seaborn-v0_8-whitegrid'
if style_name not in plt.style.available:
    style_name = 'seaborn-whitegrid'
plt.style.use(style_name)

plt.figure(figsize=(16, 6), facecolor='lightgray')
plt.title(f'\nX G B o o s t  &  C a t B o o s t\n\nF e a t u r e   I m p o r t a n c e s\n', fontsize=14)

plt.scatter(axis_x, axis_y1, s=20, label='XGBoost')
plt.scatter(axis_x, axis_y2, s=20, label='CatBoost')

# Label the axes and rotate the feature names so the figure stands on its own.
plt.xlabel('Feature')
plt.ylabel('Min-max scaled importance')
plt.xticks(rotation=90)

plt.legend(fontsize=12, loc=2)
plt.show()

<div class="alert alert-success">  
</div>

## Best Blend

In [None]:
def best_blend(coeff):
    """Return the hold-out RMSE of a linear blend of the two validation predictions.

    Parameters
    ----------
    coeff : float
        Weight given to `oof_pred1` (XGBoost); `oof_pred2` (CatBoost)
        receives `1.0 - coeff`.

    Returns
    -------
    float
        Root mean squared error of the blended prediction against `val_y`.
    """
    oof_pred = (oof_pred1 * coeff) + (oof_pred2 * (1.0 - coeff))
    # This is an RMSE (square root of the MSE), not a mean absolute error.
    rmse = np.sqrt(mean_squared_error(val_y, oof_pred))
    return rmse

# Score the blend on the weight grid 0.0, 0.1, ..., 1.0.
results = {}
for i in range(0, 11):
    # Round so the keys are clean decimals rather than 0.30000000000000004 etc.
    coeff = round(0.1 * i, 1)
    results[coeff] = best_blend(coeff)

# Report the weight with the lowest hold-out RMSE.
best_coeff = min(results, key=results.get)
print(f'Best XGBoost weight: {best_coeff}  (RMSE: {results[best_coeff]})')

plt.plot(list(results.keys()), list(results.values()))
plt.xlabel('XGBoost weight')
plt.ylabel('Validation RMSE')
plt.show()

In [None]:
# Blend the two test predictions; `xgb_weight` is the share given to the
# XGBoost prediction and the remainder goes to CatBoost.
xgb_weight = 0.50
pred = (pred1 * xgb_weight) + (pred2 * (1.0 - xgb_weight))

In [None]:
# Start from the sample submission so the output keeps its column layout.
sub3 = SAM.copy()

# Overwrite the second column with the blended test predictions.
# The array is assigned directly: `pred.data` is the array's raw buffer
# (a memoryview), not the array, and only worked through implicit coercion.
sub3.iloc[:, 1] = pred
display(sub3)

In [None]:
# Write the blended submission to the working directory (index column omitted).
sub3.to_csv("submission3.csv",index=False)
# Public Score: 7.87752

<div class="alert alert-success">  
</div>

<div class="alert alert-success">  
</div>

<div class="alert alert-success">  
</div>