<div style="background-color:rgba(255, 99, 71, 0.5);">
    <h1><center>Importing Libraries</center></h1>
</div>

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
import optuna
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

<div style="background-color:rgba(255, 99, 71, 0.5);">
    <h1><center>Data Exploration</center></h1>
</div>

In [None]:
# Read the competition's train, test and sample-submission CSVs (in that order)
# from the Kaggle input directory.
train, test, sub = map(
    pd.read_csv,
    (
        r'../input/tabular-playground-series-aug-2021/train.csv',
        r'../input/tabular-playground-series-aug-2021/test.csv',
        r'../input/tabular-playground-series-aug-2021/sample_submission.csv',
    ),
)

In [None]:
# (rows, columns) of the train, test and submission frames, shown as the cell output.
tuple(frame.shape for frame in (train, test, sub))

In [None]:
# Preview the first five rows of the training frame.
train.head(n=5)

In [None]:
# Remove the `id` column from both frames (presumably a row identifier rather
# than a feature - confirm against the data description).
# `errors='ignore'` makes the cell safe to re-run: once the column is gone a
# second execution is a no-op instead of raising KeyError.
for frame in (train, test):
    frame.drop(columns='id', inplace=True, errors='ignore')

In [None]:
print('train: ')
# Transposed summary statistics for `train`: in-cell bars on the mean column,
# colour gradients on the std and median (50%) columns.
(
    train.describe().T.style
    .bar(subset=['mean'], color='#606ff2')
    .background_gradient(subset=['std'], cmap='PuBu')
    .background_gradient(subset=['50%'], cmap='PuBu')
)

In [None]:
print('test: ')
# Transposed summary statistics for `test`: in-cell bars on the mean column,
# colour gradients on the std and median (50%) columns.
(
    test.describe().T.style
    .bar(subset=['mean'], color='#606ff2')
    .background_gradient(subset=['std'], cmap='PuBu')
    .background_gradient(subset=['50%'], cmap='PuBu')
)

<div style="background-color:rgba(255, 99, 71, 0.5);">
    <h1><center>Data Visualization</center></h1>
</div>

### **Target Distribution:**

In [None]:
# Frequency of every distinct `loss` value, drawn as transparent bars with a dark outline.
target_values = train['loss'].value_counts()

plt.figure(figsize=(14, 5))
sns.barplot(
    x=target_values.index,
    y=target_values.values,
    linewidth=1.5,
    facecolor=(1, 1, 1, 0),
    errcolor=".2",
    edgecolor=".2",
)
plt.title("Target unique values", fontdict=dict(fontsize=20))
plt.show()

In [None]:
# Box plot of the `loss` target distribution.
# The series is passed as `x=` explicitly: a bare positional argument is
# interpreted as `data` by newer seaborn releases, which would no longer
# match the horizontal layout implied by the x-axis label below.
plt.figure(figsize=(14, 5))
sns.boxplot(x=train['loss'], color='white', linewidth=2.5)
plt.title('loss Distribution')
plt.xlabel('loss')
plt.show()

### **Feature Distribution:**

In [None]:
# One KDE panel per column for the first 100 columns of `train`, on a 20 x 5 grid.
# The style is set once, before any axes are created, so every panel
# (including the first) is drawn with the same theme.
sns.set_style("white")
kde_columns = train.columns.tolist()[:100]  # computed once instead of on every access

fig = plt.figure(figsize=(15, 60))
for position, column in enumerate(kde_columns, start=1):
    plt.subplot(20, 5, position)
    plt.title(column, size=12, fontname='monospace')
    a = sns.kdeplot(train[column], shade=True, alpha=0.9, linewidth=1.5,
                    facecolor=(1, 1, 1, 0), edgecolor=".2")
    plt.ylabel('')
    plt.xlabel('')
    plt.xticks(fontname='monospace')
    plt.yticks([])
    # Keep only the bottom spine, slightly thickened.
    for side in ('right', 'left', 'top'):
        a.spines[side].set_visible(False)
    a.spines['bottom'].set_linewidth(1.2)

fig.tight_layout(h_pad=3)
plt.show()

In [None]:
# One horizontal box plot per column for the first 100 columns of `train`, on a 20 x 5 grid.
# The style is set once, before any axes are created, so every panel
# (including the first) is drawn with the same theme.
sns.set_style("white")
box_columns = train.columns.tolist()[:100]  # computed once instead of on every access

fig = plt.figure(figsize=(15, 60))
for position, column in enumerate(box_columns, start=1):
    plt.subplot(20, 5, position)
    plt.title(column, size=12, fontname='monospace')
    # `x=` is explicit: a bare positional series is interpreted as `data`
    # by newer seaborn releases, which changes how the box is oriented.
    a = sns.boxplot(x=train[column], linewidth=2.5, color='white')
    plt.ylabel('')
    plt.xlabel('')
    plt.xticks(fontname='monospace')
    plt.yticks([])
    # Keep only the bottom spine, slightly thickened.
    for side in ('right', 'left', 'top'):
        a.spines[side].set_visible(False)
    a.spines['bottom'].set_linewidth(1.2)

fig.tight_layout(h_pad=3)
plt.show()

<div style="background-color:rgba(255, 99, 71, 0.5);">
    <h1><center>Data Preprocessing</center></h1>
</div>

In [None]:
# Separate the `loss` target from the feature frame.
# `pop` returns the column and removes it from `train` in one step. The guard
# keeps the cell re-runnable: if `loss` was already removed, the existing `y`
# is left untouched instead of the cell failing with KeyError.
if 'loss' in train.columns:
    y = train.pop('loss')

In [None]:
# Every column still present in `train` is used as a model input.
features = train.columns.tolist()
print(features)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-feature min/max on the training frame only, then apply that
# same scaling to both train and test.
mm = MinMaxScaler()
mm.fit(train[features])
train[features] = mm.transform(train[features])
test[features] = mm.transform(test[features])

# `X` is another name for the (now scaled) `train` frame, not a copy.
X = train

<div style="background-color:rgba(255, 99, 71, 0.5);">
    <h1><center>Model Building+Optuna</center></h1>
</div>

<div style="background-color:rgba(255, 99, 71, 0.5);">
    <h2><center>lightgbm</center></h2>
</div>

In [None]:
def fit_lgb(trial, x_train, y_train, x_test, y_test):
    """Train one LightGBM regressor with hyper-parameters drawn from an Optuna trial.

    Parameters
    ----------
    trial : Optuna trial used to sample the search space below.
    x_train, y_train : data the model is fitted on.
    x_test, y_test : hold-out data, used both as the early-stopping
        evaluation set and for the reported validation score.

    Returns
    -------
    (model, log) : the fitted ``LGBMRegressor`` and a dict with the keys
        ``"train rmse"`` and ``"valid rmse"``, computed on predictions
        floored at 0.1.
    """
    # NOTE(review): LightGBM reportedly applies `subsample` only when
    # `subsample_freq` > 0 - confirm whether tuning it has any effect here.
    search_space = dict(
        reg_alpha=trial.suggest_loguniform('reg_alpha', 0.47, 0.5),
        reg_lambda=trial.suggest_loguniform('reg_lambda', 0.32, 0.33),
        num_leaves=trial.suggest_int('num_leaves', 50, 70),
        learning_rate=trial.suggest_uniform('learning_rate', 0.03, 0.04),
        max_depth=trial.suggest_int('max_depth', 30, 40),
        n_estimators=trial.suggest_int('n_estimators', 100, 6100),
        min_child_weight=trial.suggest_loguniform('min_child_weight', 0.015, 0.02),
        subsample=trial.suggest_uniform('subsample', 0.9, 1.0),
        colsample_bytree=trial.suggest_loguniform('colsample_bytree', 0.52, 1),
        min_child_samples=trial.suggest_int('min_child_samples', 76, 80),
        metric='rmse',
        device_type='gpu',
    )

    model = LGBMRegressor(**search_space, random_state=2021)
    model.fit(
        x_train,
        y_train,
        eval_set=[(x_test, y_test)],
        early_stopping_rounds=150,
        verbose=False,
    )

    # Predict on both splits and floor every prediction at 0.1 before scoring.
    fitted_pred = np.clip(model.predict(x_train), 0.1, None)
    holdout_pred = np.clip(model.predict(x_test), 0.1, None)

    scores = {
        "train rmse": mean_squared_error(y_train, fitted_pred, squared=False),
        "valid rmse": mean_squared_error(y_test, holdout_pred, squared=False),
    }
    return model, scores

In [None]:
def objective(trial):
    """Optuna objective for the LightGBM search.

    Holds out 15% of ``X``/``y``, fits a model via ``fit_lgb`` with the
    hyper-parameters sampled by ``trial`` and returns the validation RMSE
    (the value Optuna should minimise).
    """
    # The split is seeded so that every trial is scored on the same hold-out
    # set; without it, differences between trials mix hyper-parameter effects
    # with split noise and the search cannot be reproduced.
    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=0.15, random_state=2021
    )
    _, log = fit_lgb(trial, x_train, y_train, x_test, y_test)
    return log['valid rmse']

* These are the best parameters found by **Optuna**.

In [None]:
# LightGBM hyper-parameters taken from the Optuna search; training runs on the GPU.
lgb_params = dict(
    reg_alpha=0.4972562469417825,
    reg_lambda=0.3273637203281044,
    num_leaves=50,
    learning_rate=0.032108486615557354,
    max_depth=40,
    n_estimators=4060,
    min_child_weight=0.0173353329222102,
    subsample=0.9493343850444064,
    colsample_bytree=0.5328221263825876,
    min_child_samples=80,
    device='gpu',
)

In [None]:
def cross_val(X, y, model, params, folds=10):
    """Run shuffled K-fold cross-validation and return the last fold's model.

    Parameters
    ----------
    X : feature table (DataFrame, ndarray or anything ``np.asarray`` accepts).
    y : target values aligned with the rows of ``X``.
    model : estimator class; instantiated as ``model(**params, random_state=2021)``.
    params : dict of keyword arguments for the estimator constructor.
    folds : number of K-fold splits (default 10).

    Returns
    -------
    The estimator fitted on the final fold only. Models from earlier folds
    are discarded.

    Notes
    -----
    Each fold's held-out part serves both as the early-stopping evaluation
    set and as the data the printed score is computed on. The printed
    per-fold value is the RMSE (``squared=False``), despite its label.
    """
    # Convert once up front: avoids re-extracting the arrays on every fold
    # and lets plain arrays through, which have no `.values` attribute.
    features = np.asarray(X)
    target = np.asarray(y)

    kf = KFold(n_splits=folds, shuffle=True, random_state=2021)
    fold_errors = []
    for fold, (train_idx, test_idx) in enumerate(kf.split(features)):
        print(f"Fold: {fold}")
        x_train, y_train = features[train_idx], target[train_idx]
        x_test, y_test = features[test_idx], target[test_idx]

        alg = model(**params, random_state=2021)
        alg.fit(x_train, y_train,
                eval_set=[(x_test, y_test)],
                early_stopping_rounds=400,
                verbose=False)
        pred = alg.predict(x_test)
        error = mean_squared_error(y_test, pred, squared=False)
        fold_errors.append(error)
        print(f" mean_squared_error: {error}")
        print("-"*50)

    # Aggregate score, so the folds can be compared against a single number.
    print(f"Mean RMSE over {folds} folds: {np.mean(fold_errors)}")

    return alg

In [None]:
# Cross-validate LightGBM with the tuned parameters; keeps the model returned by `cross_val`.
lgb_model = cross_val(X=X, y=y, model=LGBMRegressor, params=lgb_params)

<div style="background-color:rgba(255, 99, 71, 0.5);">
    <h2><center>xgboost</center></h2>
</div>

In [None]:
def fit_xgb(trial, x_train, y_train, x_test, y_test):
    """Train one XGBoost regressor with hyper-parameters drawn from an Optuna trial.

    Parameters
    ----------
    trial : Optuna trial used to sample the search space below.
    x_train, y_train : data the model is fitted on.
    x_test, y_test : hold-out data, used both as the early-stopping
        evaluation set and for the reported validation score.

    Returns
    -------
    (model, log) : the fitted ``XGBRegressor`` and a dict with the keys
        ``"train rmse"`` and ``"valid rmse"``, computed on predictions
        floored at 0.1.
    """
    # NOTE(review): `tweedie_variance_power` presumably only matters with a
    # tweedie objective, and no objective is set here - confirm it is used.
    search_space = dict(
        tweedie_variance_power=trial.suggest_discrete_uniform('tweedie_variance_power', 1.0, 2.0, 0.1),
        # Extremely prone to overfitting!
        max_depth=trial.suggest_int('max_depth', 6, 10),
        # Extremely prone to overfitting! (sampled in steps of 400)
        n_estimators=trial.suggest_int('n_estimators', 400, 4000, 400),
        # Most important parameter.
        eta=trial.suggest_float('eta', 0.007, 0.013),
        subsample=trial.suggest_discrete_uniform('subsample', 0.2, 0.9, 0.1),
        colsample_bytree=trial.suggest_discrete_uniform('colsample_bytree', 0.2, 0.9, 0.1),
        colsample_bylevel=trial.suggest_discrete_uniform('colsample_bylevel', 0.2, 0.9, 0.1),
        # Author's note: leaderboard score was troublesome until this was tuned.
        min_child_weight=trial.suggest_loguniform('min_child_weight', 1e-4, 1e4),
        # L2 regularization
        reg_lambda=trial.suggest_loguniform('reg_lambda', 1e-4, 1e4),
        # L1 regularization
        reg_alpha=trial.suggest_loguniform('reg_alpha', 1e-4, 1e4),
        gamma=trial.suggest_loguniform('gamma', 1e-4, 1e4),
    )

    model = XGBRegressor(**search_space, tree_method='gpu_hist', random_state=2021)
    model.fit(
        x_train,
        y_train,
        eval_set=[(x_test, y_test)],
        early_stopping_rounds=150,
        verbose=False,
    )

    # Predict on both splits and floor every prediction at 0.1 before scoring.
    fitted_pred = np.clip(model.predict(x_train), 0.1, None)
    holdout_pred = np.clip(model.predict(x_test), 0.1, None)

    scores = {
        "train rmse": mean_squared_error(y_train, fitted_pred, squared=False),
        "valid rmse": mean_squared_error(y_test, holdout_pred, squared=False),
    }
    return model, scores

In [None]:
def objective(trial):
    """Optuna objective for the XGBoost search.

    Holds out 15% of ``X``/``y``, fits a model via ``fit_xgb`` with the
    hyper-parameters sampled by ``trial`` and returns the validation RMSE
    (the value Optuna should minimise).
    """
    # The split is seeded so that every trial is scored on the same hold-out
    # set; without it, differences between trials mix hyper-parameter effects
    # with split noise and the search cannot be reproduced.
    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=0.15, random_state=2021
    )
    _, log = fit_xgb(trial, x_train, y_train, x_test, y_test)
    return log['valid rmse']

* These are the best parameters found by **Optuna**.

In [None]:
# XGBoost hyper-parameters taken from the Optuna search; uses the GPU histogram tree method.
xgb_params = dict(
    tweedie_variance_power=2.0,
    max_depth=9,
    n_estimators=4000,
    eta=0.01200085275863839,
    subsample=0.8,
    colsample_bytree=0.7,
    colsample_bylevel=0.4,
    min_child_weight=2.824928835841522,
    reg_lambda=67.43522142240646,
    reg_alpha=0.00012103217663028774,
    gamma=0.012432559904494572,
    tree_method='gpu_hist',
)

In [None]:
# Cross-validate XGBoost with the tuned parameters; keeps the model returned by `cross_val`.
xgb_model = cross_val(X=X, y=y, model=XGBRegressor, params=xgb_params)

<div style="background-color:rgba(255, 99, 71, 0.5);">
    <h2><center>catboost</center></h2>
</div>

In [None]:
def fit_cat(trial, x_train, y_train, x_test, y_test):
    """Train one CatBoost regressor with hyper-parameters drawn from an Optuna trial.

    Parameters
    ----------
    trial : Optuna trial used to sample the search space below.
    x_train, y_train : data the model is fitted on.
    x_test, y_test : hold-out data, used both as the early-stopping
        evaluation set and for the reported validation score.

    Returns
    -------
    (model, log) : the fitted ``CatBoostRegressor`` and a dict with the keys
        ``"train rmse"`` and ``"valid rmse"``, computed on predictions
        floored at 0.1.
    """
    # NOTE(review): `fit(early_stopping_rounds=...)` appears to take precedence
    # over `od_wait` in CatBoost - confirm that tuning `od_wait` has an effect.
    params = {'iterations': trial.suggest_int("iterations", 1000, 20000),
              'od_wait': trial.suggest_int('od_wait', 500, 2000),
              'task_type': "GPU",
              'eval_metric': 'RMSE',
              'learning_rate': trial.suggest_uniform('learning_rate', 0.03, 0.04),
              'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.32, 0.33),
              'subsample': trial.suggest_uniform('subsample', 0.9, 1.0),
              'random_strength': trial.suggest_uniform('random_strength', 10, 50),
              'depth': trial.suggest_int('depth', 1, 15),
              'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 30),
              'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations', 1, 15),
              }

    # `task_type` is already part of `params`; passing it a second time as an
    # explicit keyword raises "got multiple values for keyword argument".
    model = CatBoostRegressor(**params, random_state=2021)
    model.fit(x_train, y_train, eval_set=[(x_test, y_test)], early_stopping_rounds=150, verbose=False)

    # Predict on both splits and floor every prediction at 0.1 before scoring.
    y_train_pred = np.clip(model.predict(x_train), 0.1, None)
    y_test_pred = np.clip(model.predict(x_test), 0.1, None)

    log = {
        "train rmse": mean_squared_error(y_train, y_train_pred, squared=False),
        "valid rmse": mean_squared_error(y_test, y_test_pred, squared=False)
    }

    return model, log

In [None]:
def objective(trial):
    """Optuna objective for the CatBoost search.

    Holds out 15% of ``X``/``y``, fits a model via ``fit_cat`` with the
    hyper-parameters sampled by ``trial`` and returns the validation RMSE
    (the value Optuna should minimise).
    """
    # The split is seeded so that every trial is scored on the same hold-out
    # set; without it, differences between trials mix hyper-parameter effects
    # with split noise and the search cannot be reproduced.
    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=0.15, random_state=2021
    )
    _, log = fit_cat(trial, x_train, y_train, x_test, y_test)
    return log['valid rmse']

* These are the best parameters found by **Optuna**.

In [None]:
# CatBoost hyper-parameters taken from the Optuna search; training runs on the GPU.
cat_params = dict(
    iterations=1224,
    od_wait=1243,
    learning_rate=0.03632022350716054,
    reg_lambda=0.3257139588327784,
    subsample=0.9741256425198503,
    random_strength=41.06792107841663,
    depth=12,
    min_data_in_leaf=27,
    leaf_estimation_iterations=10,
    task_type='GPU',
)

In [None]:
# Cross-validate CatBoost with the tuned parameters; keeps the model returned by `cross_val`.
cat_model = cross_val(X=X, y=y, model=CatBoostRegressor, params=cat_params)

<div style="background-color:rgba(255, 99, 71, 0.5);">
    <h2><center>Final Voting</center></h2>
</div>

Reference: [https://www.kaggle.com/dmitryuarov/falling-below-7-87-voting-cb-xgb-lgbm](https://www.kaggle.com/dmitryuarov/falling-below-7-87-voting-cb-xgb-lgbm)

In [None]:
# Fresh, unfitted estimators built from the tuned parameter sets.
cat, lgb, xgb = (
    CatBoostRegressor(**cat_params),
    LGBMRegressor(**lgb_params),
    XGBRegressor(**xgb_params),
)

In [None]:
from sklearn.ensemble import VotingRegressor

# 10-fold CV of a weighted LightGBM + XGBoost voting ensemble.
# The test-set predictions of every fold's ensemble are averaged into `predictions`.
# Only `lgb` and `xgb` take part in the vote.
folds = KFold(n_splits=10, shuffle=True, random_state=2021)
predictions = np.zeros(len(test))

for fold, (trn_idx, val_idx) in enumerate(folds.split(X)):
    print(f"Fold: {fold}")
    X_train, y_train = X.values[trn_idx], y.values[trn_idx]
    X_val, y_val = X.values[val_idx], y.values[val_idx]

    model = VotingRegressor(
        estimators=[('lgbm', lgb), ('xgb', xgb)],
        weights=[0.15, 0.65],
    )
    model.fit(X_train, y_train)

    # Score the ensemble on this fold's held-out rows (RMSE, since squared=False).
    error = mean_squared_error(y_val, model.predict(X_val), squared=False)
    print(f" mean_squared_error: {error}")
    print("-"*50)

    # Each fold contributes an equal share of the final test prediction.
    predictions += model.predict(test) / folds.n_splits

<div style="background-color:rgba(255, 99, 71, 0.5);">
    <h2><center>Prediction and submission</center></h2>
</div>

In [None]:
# One submission file per individual model (each is the model returned by `cross_val`).
for csv_name, fitted_model in (
    ('lgb.csv', lgb_model),
    ('xgb.csv', xgb_model),
    ('cat.csv', cat_model),
):
    sub['loss'] = fitted_model.predict(test)
    sub.to_csv(csv_name, index=False)

# Fold-averaged predictions of the voting ensemble.
sub['loss'] = predictions
sub.to_csv('vote.csv', index=False)

<div class="alert alert-warning">
<h4>If you like this notebook, an upvote would be amazing :)</h4>
</div>