<div>
    <h1 align="center">Tabular Playground Series - Jul 2021</h1>
    <h1 align="center">XGBoost & LeaveOneGroupOut & Ensembling</h1>
    <h4 align="center">By: Somayyeh Gholami & Mehran Kazeminia</h4>
</div>

<div class="alert alert-success">  
</div>

<div class="alert alert-success">
    <h1 align="center">If you find this work useful, please don't forget to upvote :)</h1>
</div>

## Import

In [None]:
import numpy as np 
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import Ridge

from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import LeaveOneGroupOut

In [None]:
import os

# Walk the Kaggle input mount and print the full path of every file found.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

<div class="alert alert-success">  
</div>

## Data Set

In [None]:
# Read the competition's train, test and sample-submission CSV files
# (in that order) into DF1, DF2 and SAM.
DF1, DF2, SAM = map(
    pd.read_csv,
    (
        '../input/tabular-playground-series-jul-2021/train.csv',
        '../input/tabular-playground-series-jul-2021/test.csv',
        '../input/tabular-playground-series-jul-2021/sample_submission.csv',
    ),
)

In [None]:
# Per-column count of missing values in the train (MV1) and test (MV2) frames.
MV1, MV2 = DF1.isna().sum(), DF2.isna().sum()

# Show only the columns whose missing-value count is above zero.
print(f'Missing Value 1:  {MV1.loc[MV1.gt(0)]}')
print(f'Missing Value 2:  {MV2.loc[MV2.gt(0)]}')

In [None]:
# Render the raw train frame, then the raw test frame.
for _frame in (DF1, DF2):
    display(_frame)

<div class="alert alert-success">  
</div>

In [None]:
# Work on copies so the frames loaded from disk (DF1, DF2) stay untouched.
data1, data2 = (loaded.copy() for loaded in (DF1, DF2))

In [None]:
# Feature matrix: every training column except the three targets, with
# `date_time` cast to datetime64[ns] -> int64 and divided by 10**9
# (nanoseconds to seconds) so the model receives a plain number.
X = data1.drop(
    ['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides'],
    axis=1,
).assign(
    date_time=lambda frame: frame['date_time']
    .astype('datetime64[ns]')
    .astype(np.int64)
    .div(10**9)
)

display(X)

In [None]:
# One target series per pollutant, taken from the training copy.
y1 = data1['target_carbon_monoxide']
y2 = data1['target_benzene']
y3 = data1['target_nitrogen_oxides']

In [None]:
# Test feature matrix: same `date_time` conversion as the training features
# (datetime64[ns] -> int64, divided by 10**9), applied to a new frame so
# data2 itself is left unchanged.
XX = data2.assign(
    date_time=lambda frame: frame['date_time']
    .astype('datetime64[ns]')
    .astype(np.int64)
    .div(10**9)
)

display(XX)

<div class="alert alert-success">  
</div>

## Split

In [None]:
# Split the features and all three targets in ONE call so that every piece
# is guaranteed to share the same shuffle. The previous version called
# train_test_split three times and overwrote train_X / val_X each time,
# relying on the identical random_state to keep rows aligned.
(
    train_X, val_X,
    train_y1, val_y1,
    train_y2, val_y2,
    train_y3, val_y3,
) = train_test_split(X, y1, y2, y3, test_size=0.50, random_state=123)

In [None]:
# Write the validation features and the three validation targets to CSV
# files in the working directory (row index omitted).
for _frame, _csv_name in (
    (val_X, "val_X.csv"),
    (val_y1, "val_y1.csv"),
    (val_y2, "val_y2.csv"),
    (val_y3, "val_y3.csv"),
):
    _frame.to_csv(_csv_name, index=False)

<div class="alert alert-success">
    <h1 align="center">XGBRegressor</h1>
</div>

## Validation Model - 1 

### [ target_carbon_monoxide ]

In [None]:
# Validation model for target_carbon_monoxide: fit on the training half of
# the split and score on the held-out half.
model1v = XGBRegressor(
    n_estimators=250,
    max_depth=6,
    learning_rate=0.08,
    subsample=0.7,
    alpha=0.5,
    random_state=123,
)
model1v.fit(train_X, train_y1, verbose=100)

# Clamp the held-out predictions to [0.30, largest observed target] before scoring.
oof_pred1 = np.clip(model1v.predict(val_X), 0.30, y1.max())

print(40 * '=')
print(f'Mean Absolute Error: {mean_absolute_error(val_y1, oof_pred1)}')
print(40 * '=')

In [None]:
# Show the per-feature importance scores of the fitted target_carbon_monoxide validation model.
model1v.feature_importances_

<div class="alert alert-success">  
</div>

## Validation Model - 2 

### [ target_benzene ]

In [None]:
# Validation model for target_benzene: fit on the training half of the
# split and score on the held-out half.
model2v = XGBRegressor(
    n_estimators=400,
    max_depth=6,
    learning_rate=0.07,
    subsample=0.7,
    alpha=0.7,
    random_state=123,
)
model2v.fit(train_X, train_y2, verbose=100)

# Clamp the held-out predictions to [0.10, largest observed target] before scoring.
oof_pred2 = np.clip(model2v.predict(val_X), 0.10, y2.max())

print(40 * '=')
print(f'Mean Absolute Error: {mean_absolute_error(val_y2, oof_pred2)}')
print(40 * '=')

In [None]:
# Show the per-feature importance scores of the fitted target_benzene validation model.
model2v.feature_importances_

<div class="alert alert-success">  
</div>

## Validation Model - 3 

### [ target_nitrogen_oxides ]

In [None]:
# Validation model for target_nitrogen_oxides: fit on the training half of
# the split and score on the held-out half.
model3v = XGBRegressor(
    n_estimators=500,
    max_depth=8,
    learning_rate=0.03,
    subsample=0.7,
    alpha=0.8,
    random_state=123,
)
model3v.fit(train_X, train_y3, verbose=100)

# Clamp the held-out predictions to [20.0, largest observed target] before scoring.
oof_pred3 = np.clip(model3v.predict(val_X), 20.0, y3.max())

print(40 * '=')
print(f'Mean Absolute Error: {mean_absolute_error(val_y3, oof_pred3)}')
print(40 * '=')

In [None]:
# Show the per-feature importance scores of the fitted target_nitrogen_oxides validation model.
model3v.feature_importances_

<div class="alert alert-success">  
</div>

## Feature Importances

In [None]:
# Feature names on the x axis, one importance vector per validation model.
axis_x  = X.columns.values
axis_y1 = model1v.feature_importances_
axis_y2 = model2v.feature_importances_
axis_y3 = model3v.feature_importances_

# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names, so use whichever name this install
# provides (and keep the current style if neither exists).
_whitegrid_styles = [
    style_name
    for style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
    if style_name in plt.style.available
]
if _whitegrid_styles:
    plt.style.use(_whitegrid_styles[0])

plt.figure(figsize=(16, 6), facecolor='lightgray')
plt.title('\nX G B o o s t\n\nF e a t u r e   I m p o r t a n c e s\n', fontsize=14)

plt.scatter(axis_x, axis_y1, s=120, label='target_carbon_monoxide')
plt.scatter(axis_x, axis_y2, s=120, label='target_benzene')
plt.scatter(axis_x, axis_y3, s=120, label='target_nitrogen_oxides')

# Label the axes so the figure can be read on its own.
plt.xlabel('feature')
plt.ylabel('importance')
plt.legend(fontsize=12, loc=2)
plt.show()

<div class="alert alert-success">  
</div>

## Model - 1 

### [ target_carbon_monoxide ]

In [None]:
# Final target_carbon_monoxide model: fit on the full training set and
# predict the test features.
model1 = XGBRegressor(
    n_estimators=250,
    max_depth=6,
    learning_rate=0.08,
    subsample=0.7,
    alpha=0.5,
    random_state=123,
)
model1.fit(X, y1)

# Clamp the test predictions to [0.30, largest observed target].
pred1 = np.clip(model1.predict(XX), 0.30, y1.max())
display(pred1, pred1.shape)

<div class="alert alert-success">  
</div>

## Model - 2 

### [ target_benzene ]

In [None]:
# Final target_benzene model: fit on the full training set and predict the
# test features.
model2 = XGBRegressor(
    n_estimators=400,
    max_depth=6,
    learning_rate=0.07,
    subsample=0.7,
    alpha=0.7,
    random_state=123,
)
model2.fit(X, y2)

# Clamp the test predictions to [0.10, largest observed target].
pred2 = np.clip(model2.predict(XX), 0.10, y2.max())
display(pred2, pred2.shape)

<div class="alert alert-success">  
</div>

## Model - 3 

### [ target_nitrogen_oxides ]

In [None]:
# Final target_nitrogen_oxides model: fit on the full training set and
# predict the test features.
model3 = XGBRegressor(
    n_estimators=500,
    max_depth=8,
    learning_rate=0.03,
    subsample=0.7,
    alpha=0.8,
    random_state=123,
)
model3.fit(X, y3)

# Clamp the test predictions to [20.0, largest observed target].
pred3 = np.clip(model3.predict(XX), 20.0, y3.max())
display(pred3, pred3.shape)

<div class="alert alert-success">  
</div>

In [None]:
# Build the XGBoost submission: a new frame based on the sample submission
# with the three target columns replaced by the model predictions.
sub_xgb = SAM.assign(
    target_carbon_monoxide=pred1,
    target_benzene=pred2,
    target_nitrogen_oxides=pred3,
)
display(sub_xgb)

In [None]:
# Write the XGBoost submission to CSV (row index omitted); `sub` is just
# another name for the same frame.
sub = sub_xgb
sub.to_csv("submission_xgb.csv",index=False)
# Public Score: 0.23087
# IPython shell escape: list the working directory to confirm the file exists.
!ls

<div class="alert alert-success">
    <h1 align="center">LeaveOneGroupOut</h1>
</div>

## Data Augmentation (month column extracted from `date_time`)

In [None]:
# Month number taken from characters 5:7 of the first column of data1 (the
# timestamp string), computed for the whole column at once instead of one
# row at a time.
# Month 1 is relabelled as 12, exactly as the original loop did.
# NOTE(review): the reason for merging January into December is not stated
# here; presumably January has too few rows to form its own group - confirm.
months1 = data1.iloc[:, 0].str[5:7].astype(int).replace(1, 12).tolist()

data1['months'] = months1
display(data1)

In [None]:
# Month number taken from characters 5:7 of the first column of data2 (the
# timestamp string), computed for the whole column at once instead of one
# row at a time. Unlike the training frame, month 1 is NOT relabelled here.
months2 = data2.iloc[:, 0].str[5:7].astype(int).tolist()

data2['months'] = months2
display(data2)

<div class="alert alert-success">  
</div>

In [None]:
# Group label for every training row: the month column added to data1.
groups = data1.loc[:, 'months']
display(groups)

In [None]:
logo = LeaveOneGroupOut()

# Print the number of leave-one-group-out splits reported for each target.
for _target in (y1, y2, y3):
    print(logo.get_n_splits(X, _target, groups))

In [None]:
# Print the train/test row positions of every leave-one-group-out split,
# followed by a separator line.
for train_index, test_index in logo.split(X, y1, groups):
    print(
        f'Train index:\n{train_index}',
        f'\nTest index:\n{test_index}',
        70 * '=',
        sep='\n',
    )

<div class="alert alert-success">  
</div>

In [None]:
# Average the target_carbon_monoxide test predictions over all
# leave-one-group-out folds. The divisor comes from the splitter itself
# instead of a hard-coded 10, so the average stays correct if the number of
# groups changes.
_n_folds1 = logo.get_n_splits(X, y1, groups)

pred1_leave = np.zeros(len(XX))
for train_index, _ in logo.split(X, y1, groups):
    # Only the training side of each split is used; the held-out group is
    # never scored here.
    # NOTE: this refits `model1` in place, so after the loop it holds the
    # model of the last fold, not the full-data fit.
    model1.fit(X.iloc[train_index], y1.iloc[train_index].to_numpy())
    pred1_leave += model1.predict(XX) / _n_folds1

# NOTE(review): unlike `pred1`, these averaged predictions are not clipped.
display(pred1_leave, pred1_leave.shape)

In [None]:
# Average the target_benzene test predictions over all leave-one-group-out
# folds. The divisor comes from the splitter itself instead of a hard-coded
# 10, so the average stays correct if the number of groups changes.
_n_folds2 = logo.get_n_splits(X, y2, groups)

pred2_leave = np.zeros(len(XX))
for train_index, _ in logo.split(X, y2, groups):
    # Only the training side of each split is used; the held-out group is
    # never scored here.
    # NOTE: this refits `model2` in place, so after the loop it holds the
    # model of the last fold, not the full-data fit.
    model2.fit(X.iloc[train_index], y2.iloc[train_index].to_numpy())
    pred2_leave += model2.predict(XX) / _n_folds2

# NOTE(review): unlike `pred2`, these averaged predictions are not clipped.
display(pred2_leave, pred2_leave.shape)

In [None]:
# Average the target_nitrogen_oxides test predictions over all
# leave-one-group-out folds. The divisor comes from the splitter itself
# instead of a hard-coded 10, so the average stays correct if the number of
# groups changes.
_n_folds3 = logo.get_n_splits(X, y3, groups)

pred3_leave = np.zeros(len(XX))
for train_index, _ in logo.split(X, y3, groups):
    # Only the training side of each split is used; the held-out group is
    # never scored here.
    # NOTE: this refits `model3` in place, so after the loop it holds the
    # model of the last fold, not the full-data fit.
    model3.fit(X.iloc[train_index], y3.iloc[train_index].to_numpy())
    pred3_leave += model3.predict(XX) / _n_folds3

# NOTE(review): unlike `pred3`, these averaged predictions are not clipped.
display(pred3_leave, pred3_leave.shape)

<div class="alert alert-success">  
</div>

In [None]:
# Build the leave-one-group-out submission: a new frame based on the sample
# submission with the three target columns replaced by the fold-averaged
# predictions.
sub_leave = SAM.assign(
    target_carbon_monoxide=pred1_leave,
    target_benzene=pred2_leave,
    target_nitrogen_oxides=pred3_leave,
)
display(sub_leave)

In [None]:
# Write the leave-one-group-out submission to CSV (row index omitted);
# `sub` is just another name for the same frame.
sub = sub_leave
sub.to_csv("submission_leave.csv",index=False)
# Public Score: 0.22736
# IPython shell escape: list the working directory to confirm the file exists.
!ls

<div class="alert alert-success">
    <h1 align="center">Ensembling</h1>
</div>

In [None]:
def _use_whitegrid_style():
    """Activate the seaborn whitegrid style under whichever name is installed."""
    # Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*'
    # and later releases dropped the old names, so try the new name first.
    # If neither name exists the current style is left unchanged.
    for style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
        if style_name in plt.style.available:
            plt.style.use(style_name)
            return


def _plot_blend(main_values, support_values, blended_values, title):
    """Scatter the support and blended predictions against the main ones."""
    _use_whitegrid_style()
    plt.figure(figsize=(9, 9), facecolor='lightgray')
    plt.title(title)

    plt.scatter(main_values, support_values, s=2.0, label='Support')
    plt.scatter(main_values, blended_values, s=2.0, label='Generated')
    # Identity line: the main predictions plotted against themselves.
    plt.scatter(main_values, main_values, s=0.1, label='Main(X=Y)')

    plt.legend(fontsize=12, loc=2)
    plt.show()


def ensembling(main, support, coeff1, coeff2, coeff3):
    """Blend two submission frames column by column and plot each blend.

    For the columns at positions 1, 2 and 3 the result is
    ``main * coeff + support * (1.0 - coeff)``, using ``coeff1``, ``coeff2``
    and ``coeff3`` respectively. Rows are paired by position, not by index
    label. Column 0 and any columns after position 3 are copied from
    ``main`` unchanged.

    Parameters
    ----------
    main : pandas.DataFrame
        Primary submission; its values are weighted by the coefficients.
    support : pandas.DataFrame
        Secondary submission; its values are weighted by ``1.0 - coeff``.
        Must have the same number of rows as ``main``.
    coeff1, coeff2, coeff3 : float
        Weights given to ``main`` for columns 1, 2 and 3.

    Returns
    -------
    pandas.DataFrame
        A copy of ``main`` with columns 1-3 replaced by the blended values.
        Neither input frame is modified.

    Raises
    ------
    ValueError
        If ``main`` and ``support`` have different numbers of rows.
    """
    if len(main) != len(support):
        raise ValueError(
            f'main and support must have the same number of rows '
            f'({len(main)} != {len(support)})'
        )

    ense = main.copy()
    coefficients = (coeff1, coeff2, coeff3)

    # Blend whole columns at once instead of looping over rows; to_numpy()
    # drops the index so rows are matched by position.
    for position, coeff in enumerate(coefficients, start=1):
        main_values = main.iloc[:, position].to_numpy(dtype=float)
        support_values = support.iloc[:, position].to_numpy(dtype=float)
        ense.iloc[:, position] = main_values * coeff + support_values * (1.0 - coeff)

    plot_titles = (
        '\nP R E D I C T  1\n\ntarget_carbon_monoxide\n',
        '\nP R E D I C T  2\n\ntarget_benzene\n',
        '\nP R E D I C T  3\n\ntarget_nitrogen_oxides\n',
    )
    # One diagnostic figure per blended column.
    for position, title in enumerate(plot_titles, start=1):
        _plot_blend(
            main.iloc[:, position],
            support.iloc[:, position],
            ense.iloc[:, position],
            title,
        )

    return ense


<div class="alert alert-success">  
</div>

Thanks to: @paddykb https://www.kaggle.com/paddykb/tps-07-gam-baseline 

In [None]:
# Load the external submission (presumably the GAM baseline credited above)
# and blend it with the leave-one-group-out predictions. Each coefficient is
# the weight given to the external submission for one target column; the
# leave-one-group-out predictions get the remainder.
sub21744 = pd.read_csv('../input/tps7-21744/submission_gam.csv')

sub_ense = ensembling(
    main=sub21744,
    support=sub_leave,
    coeff1=0.65,
    coeff2=0.55,
    coeff3=0.75,
)

In [None]:
# Write the blended submission to CSV (row index omitted); `sub` is just
# another name for the same frame.
sub = sub_ense
sub.to_csv("submission_ense.csv",index=False)
# Public Score: (not recorded)
# IPython shell escape: list the working directory to confirm the file exists.
!ls

<div class="alert alert-success">  
</div>

<div class="alert alert-success">  
</div>