In [None]:
# Core data handling and the model / tuning libraries used by the stacking pipeline.
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
import optuna
# NOTE(review): `lgb`, `preprocessing`, `sns` and `sklearn` are not referenced
# in the cells shown here; kept in case other cells rely on them.
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn import preprocessing

# Plotting defaults: large figures with the "fivethirtyeight" style sheet.
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['figure.figsize'] = (16, 8)
plt.style.use('fivethirtyeight')
import sklearn
import warnings
# Silences every warning for the rest of the session. This also hides
# deprecation and usage warnings raised by pandas / scikit-learn / xgboost,
# so problems flagged by those libraries will not be visible in the output.
warnings.filterwarnings(action="ignore")

# **Importing blended prediction files generated by:**
## https://www.kaggle.com/snikhil17/tuning-blending-of-xgb-catboost-lightgbm

In [None]:
# Fold-annotated training data, the competition test set and the submission template.
df = pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

# One entry per level-0 model: (feature name, train prediction file, test prediction file).
# The order defines the order in which the `pred_*` columns are appended.
LEVEL0_PREDICTIONS = [
    ("pred_1", "../input/fin-stack/train_pred_XGB.csv", "../input/fin-stack/test_pred_XGB.csv"),
    ("pred_2", "../input/blending-n/train_pred_XGB1(1).csv", "../input/blending-n/test_pred_XGB1(1).csv"),
    ("pred_3", "../input/fin-stack/train_pred_XGB3.csv", "../input/fin-stack/test_pred_XGB3.csv"),
    ("pred_4", "../input/blending-n/train_pred_LGB(1).csv", "../input/blending-n/test_pred_LGB(1).csv"),
    ("pred_5", "../input/catboost-nik/train_pred_CatB.csv", "../input/catboost-nik/test_pred_CatB.csv"),
]


def _read_predictions(path, feature):
    """Read a two-column prediction file and name its columns ``id`` and ``feature``."""
    preds = pd.read_csv(path)
    preds.columns = ["id", feature]
    return preds


# Attach every model's predictions as a new feature column, keyed on `id`.
# `validate="one_to_one"` makes pandas raise if an id is duplicated on either
# side, instead of silently multiplying rows (which would corrupt the folds).
for feature, train_path, test_path in LEVEL0_PREDICTIONS:
    df = df.merge(
        _read_predictions(train_path, feature), on="id", how="left", validate="one_to_one"
    )
    df_test = df_test.merge(
        _read_predictions(test_path, feature), on="id", how="left", validate="one_to_one"
    )

df.head()

# **Hyperparameter Tuning of XGBRegressor (Model 1)**

In [None]:
useful_features = ["pred_2", "pred_3", "pred_4", "pred_1", "pred_5"]
df_test = df_test[useful_features]


def run(trial):
    """Optuna objective for the level-1 XGBRegressor.

    Samples one set of hyperparameters from ``trial``, fits one model per fold
    (folds are taken from ``df.kfold``) and scores it on the held-out fold.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Mean validation RMSE over the 5 folds. (Previously only the RMSE of
        the last fold was returned because the score was overwritten on each
        iteration.)
    """
    # Sample once per trial; the same values are used for every fold.
    params = {
        "learning_rate": trial.suggest_float("learning_rate", 1e-2, 0.25, log=True),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 100.0, log=True),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 100.0, log=True),
        "subsample": trial.suggest_float("subsample", 0.1, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1.0),
        "max_depth": trial.suggest_int("max_depth", 1, 7),
    }

    fold_scores = []
    for fold in range(5):
        xtrain = df[df.kfold != fold].reset_index(drop=True)
        xvalid = df[df.kfold == fold].reset_index(drop=True)

        ytrain = xtrain.target
        yvalid = xvalid.target

        xtrain = xtrain[useful_features]
        xvalid = xvalid[useful_features]

        model = XGBRegressor(
            random_state=fold,
            tree_method="gpu_hist",
            # Same device as the Model 1 training cell (was 1, which does not
            # exist on a single-GPU machine).
            gpu_id=0,
            predictor="gpu_predictor",
            n_estimators=9000,
            **params,
        )
        model.fit(
            xtrain,
            ytrain,
            early_stopping_rounds=300,
            eval_set=[(xvalid, yvalid)],
            verbose=1000,
        )

        preds_valid = model.predict(xvalid)
        # sqrt(MSE) instead of `squared=False`, which newer scikit-learn removed.
        fold_scores.append(np.sqrt(mean_squared_error(yvalid, preds_valid)))

    return float(np.mean(fold_scores))


study = optuna.create_study(direction="minimize")
study.optimize(run, n_trials=10)

study.best_params

In [None]:
useful_features = ["pred_2", "pred_3", "pred_4", "pred_1", "pred_5"]
df_test = df_test[useful_features]

# Fixed hyperparameters for the level-1 XGBRegressor
# (presumably copied from `study.best_params` of an earlier tuning run).
params = {
    'learning_rate': 0.031354783240695655,
    'reg_lambda': 0.004484898434507873,
    'reg_alpha': 3.429535691214872e-07,
    'subsample': 0.14371213599791083,
    'colsample_bytree': 0.562816437109489,
    'max_depth': 2,
}

final_test_predictions = []   # one array of test predictions per fold
final_valid_predictions = {}  # id -> out-of-fold prediction
scores = []                   # per-fold validation RMSE
for fold in range(5):
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)

    valid_ids = xvalid.id.values.tolist()

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    model = XGBRegressor(
        n_jobs=-1,
        random_state=fold,
        tree_method="gpu_hist",
        gpu_id=0,
        predictor="gpu_predictor",
        n_estimators=9000,
        **params
    )
    model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)

    preds_valid = model.predict(xvalid)
    final_test_predictions.append(model.predict(df_test))
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))

    # sqrt(MSE) instead of `squared=False`, which newer scikit-learn removed.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

# Out-of-fold predictions become the `pred_1` feature of the next level.
final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_1"]
final_valid_predictions.to_csv("level1_train_pred_1.csv", index=False)

# Average the per-fold test predictions. The output frame is built explicitly
# rather than by overwriting and renaming columns of `sample_submission`:
# renaming its `target` column in place would leave later cells without a
# `target` column to assign to.
level1_test_pred = pd.DataFrame({
    "id": sample_submission["id"].values,
    "pred_1": np.mean(np.column_stack(final_test_predictions), axis=1),
})
level1_test_pred.to_csv("level1_test_pred_1.csv", index=False)

# **Model 2 tuning (RandomForestRegressor)**

In [None]:
useful_features = ["pred_2", "pred_3", "pred_4", "pred_1", "pred_5"]


def run(trial):
    """Optuna objective for the level-1 RandomForestRegressor.

    Samples one set of hyperparameters from ``trial``, fits one forest per fold
    (folds are taken from ``df.kfold``) and scores it on the held-out fold.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Mean validation RMSE over the 5 folds. (Previously only the RMSE of
        the last fold was returned because the score was overwritten on each
        iteration.)
    """
    # Sample once per trial; the same values are used for every fold.
    params = {
        'n_estimators': trial.suggest_int("n_estimators", 100, 105),
        'min_weight_fraction_leaf': trial.suggest_float("min_weight_fraction_leaf", 0.01, 0.05),
        'min_samples_leaf': trial.suggest_int("min_samples_leaf", 2, 7),
        'max_depth': trial.suggest_int("max_depth", 55, 65),
        'min_samples_split': trial.suggest_int("min_samples_split", 10, 15),
        'n_jobs': -1,
        'max_features': 'sqrt',
        'oob_score': False,
        'verbose': False,
        'random_state': 7,
        'warm_start': False,
        'bootstrap': True,
        'max_leaf_nodes': None,
        # `min_impurity_split=None` was dropped: None was the default and the
        # argument no longer exists in scikit-learn >= 1.0.
    }

    fold_scores = []
    for fold in range(5):
        xtrain = df[df.kfold != fold].reset_index(drop=True)
        xvalid = df[df.kfold == fold].reset_index(drop=True)

        ytrain = xtrain.target
        yvalid = xvalid.target

        xtrain = xtrain[useful_features]
        xvalid = xvalid[useful_features]

        model = RandomForestRegressor(**params)
        model.fit(xtrain, ytrain)

        preds_valid = model.predict(xvalid)
        # sqrt(MSE) instead of `squared=False`, which newer scikit-learn removed.
        fold_scores.append(np.sqrt(mean_squared_error(yvalid, preds_valid)))

    return float(np.mean(fold_scores))


study = optuna.create_study(direction="minimize")
study.optimize(run, n_trials=10)

study.best_params

# **Model 2: RandomForestRegressor**

In [None]:
useful_features = ["pred_2", "pred_3", "pred_4", "pred_1", "pred_5"]
df_test = df_test[useful_features]

# Fixed hyperparameters for the level-1 RandomForestRegressor
# (presumably copied from `study.best_params` of an earlier tuning run).
params = {
    'n_estimators': 102,
    'min_weight_fraction_leaf': 0.014050550614029426,
    'min_samples_leaf': 3,
    'max_depth': 64,
    'min_samples_split': 13,
    'n_jobs': -1,
    'max_features': 'sqrt',
    'oob_score': False,
    'verbose': False,
    'random_state': 7,
    'warm_start': False,
    'bootstrap': True,
    'max_leaf_nodes': None,
    # `min_impurity_split=None` was dropped: None was the default and the
    # argument no longer exists in scikit-learn >= 1.0.
}

final_test_predictions = []   # one array of test predictions per fold
final_valid_predictions = {}  # id -> out-of-fold prediction
scores = []                   # per-fold validation RMSE
for fold in range(5):
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)

    valid_ids = xvalid.id.values.tolist()

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    model = RandomForestRegressor(**params)
    model.fit(xtrain, ytrain)

    preds_valid = model.predict(xvalid)
    final_test_predictions.append(model.predict(df_test))
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))

    # sqrt(MSE) instead of `squared=False`, which newer scikit-learn removed.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

# Out-of-fold predictions become the `pred_2` feature of the next level.
final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_2"]
final_valid_predictions.to_csv("level1_train_pred_2.csv", index=False)

# Build the output frame explicitly instead of assigning
# `sample_submission.target` and renaming columns. If `sample_submission` no
# longer has a `target` column (e.g. it was renamed by a previously executed
# cell), attribute assignment does not create a column, and the renamed frame
# would be written with stale values rather than this model's predictions.
level1_test_pred = pd.DataFrame({
    "id": sample_submission["id"].values,
    "pred_2": np.mean(np.column_stack(final_test_predictions), axis=1),
})
level1_test_pred.to_csv("level1_test_pred_2.csv", index=False)

# **Tuning 3rd Model**

In [None]:
useful_features = ["pred_2", "pred_3", "pred_4", "pred_1", "pred_5"]


def run(trial):
    """Optuna objective for the level-1 CatBoostRegressor.

    Samples one set of hyperparameters from ``trial``, fits one model per fold
    (folds are taken from ``df.kfold``) and scores it on the held-out fold.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Mean validation RMSE over the 5 folds. (Previously only the RMSE of
        the last fold was returned because the score was overwritten on each
        iteration.)
    """
    # Sample once per trial; the same values are used for every fold.
    cat_parameters_1 = {
        'iterations': trial.suggest_int("iterations", 5000, 8000),
        'learning_rate': trial.suggest_float("learning_rate", 1e-2, 0.25, log=True),
        'l2_leaf_reg': trial.suggest_int("l2_leaf_reg", 5, 200),
        'random_strength': trial.suggest_float("random_strength", 0.1, 5),
        'grow_policy': 'Depthwise',
        'leaf_estimation_method': 'Newton',
        'od_type': 'Iter',
        'bootstrap_type': 'Bayesian',
        'thread_count': -1,
        'verbose': False,
        'loss_function': 'RMSE',
        'eval_metric': 'RMSE',
    }

    fold_scores = []
    for fold in range(5):
        xtrain = df[df.kfold != fold].reset_index(drop=True)
        xvalid = df[df.kfold == fold].reset_index(drop=True)

        ytrain = xtrain.target
        yvalid = xvalid.target

        xtrain = xtrain[useful_features]
        xvalid = xvalid[useful_features]

        model = CatBoostRegressor(**cat_parameters_1, task_type="GPU")
        model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)

        preds_valid = model.predict(xvalid)
        # sqrt(MSE) instead of `squared=False`, which newer scikit-learn removed.
        fold_scores.append(np.sqrt(mean_squared_error(yvalid, preds_valid)))

    return float(np.mean(fold_scores))


study = optuna.create_study(direction="minimize")
study.optimize(run, n_trials=10)

study.best_params

# **Model 3: CatBoost**

In [None]:
useful_features = ["pred_2", "pred_3", "pred_4", "pred_1", "pred_5"]
df_test = df_test[useful_features]

# Fixed hyperparameters for the level-1 CatBoostRegressor
# (presumably copied from `study.best_params` of an earlier tuning run).
params = {
    'iterations': 7126,
    'learning_rate': 0.2433541796869192,
    'l2_leaf_reg': 177,
    'random_strength': 4.029534673251209,
    'grow_policy': 'Depthwise',
    'leaf_estimation_method': 'Newton',
    'od_type': 'Iter',
    'bootstrap_type': 'Bayesian',
    'thread_count': -1,
    'verbose': False,
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
}

final_test_predictions = []   # one array of test predictions per fold
final_valid_predictions = {}  # id -> out-of-fold prediction
scores = []                   # per-fold validation RMSE
for fold in range(5):
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)

    valid_ids = xvalid.id.values.tolist()

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    model = CatBoostRegressor(**params, task_type="GPU")
    model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)

    preds_valid = model.predict(xvalid)
    final_test_predictions.append(model.predict(df_test))
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))

    # sqrt(MSE) instead of `squared=False`, which newer scikit-learn removed.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

# Out-of-fold predictions become the `pred_3` feature of the next level.
final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_3"]
final_valid_predictions.to_csv("level1_train_pred_3.csv", index=False)

# Build the output frame explicitly instead of assigning
# `sample_submission.target` and renaming columns. If `sample_submission` no
# longer has a `target` column (e.g. it was renamed by a previously executed
# cell), attribute assignment does not create a column, and the renamed frame
# would be written with stale values rather than this model's predictions.
level1_test_pred = pd.DataFrame({
    "id": sample_submission["id"].values,
    "pred_3": np.mean(np.column_stack(final_test_predictions), axis=1),
})
level1_test_pred.to_csv("level1_test_pred_3.csv", index=False)

# Final Model

In [None]:
# Reload the raw inputs so the final model starts from clean frames, then
# attach the three level-1 prediction files written by the cells above.
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
df = pd.read_csv("../input/30days-folds/train_folds.csv")

# Out-of-fold (train) predictions of the level-1 models.
df1 = pd.read_csv("./level1_train_pred_1.csv")
df2 = pd.read_csv("./level1_train_pred_2.csv")
df3 = pd.read_csv("./level1_train_pred_3.csv")

# Fold-averaged test predictions of the level-1 models.
df_test1 = pd.read_csv("./level1_test_pred_1.csv")
df_test2 = pd.read_csv("./level1_test_pred_2.csv")
df_test3 = pd.read_csv("./level1_test_pred_3.csv")

# Left-join each prediction frame on `id`, in model order.
for level1_train in (df1, df2, df3):
    df = df.merge(level1_train, on="id", how="left")

for level1_test in (df_test1, df_test2, df_test3):
    df_test = df_test.merge(level1_test, on="id", how="left")

df.head()

# Hyperparameter Tuning of the Final Model

In [None]:
useful_features = ["pred_2", "pred_3", "pred_1"]


def run(trial):
    """Optuna objective for the final (level-2) CatBoostRegressor.

    Samples one set of hyperparameters from ``trial``, fits one model per fold
    (folds are taken from ``df.kfold``) on the level-1 predictions and scores
    it on the held-out fold.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Mean validation RMSE over the 5 folds. (Previously only the RMSE of
        the last fold was returned because the score was overwritten on each
        iteration.)
    """
    # Sample once per trial; the same values are used for every fold.
    cat_parameters_1 = {
        'iterations': trial.suggest_int("iterations", 5000, 8000),
        'learning_rate': trial.suggest_float("learning_rate", 1e-2, 0.25, log=True),
        'l2_leaf_reg': trial.suggest_int("l2_leaf_reg", 5, 200),
        'random_strength': trial.suggest_float("random_strength", 0.1, 5),
        'grow_policy': 'Depthwise',
        'leaf_estimation_method': 'Newton',
        'od_type': 'Iter',
        'bootstrap_type': 'Bayesian',
        'thread_count': -1,
        'verbose': False,
        'loss_function': 'RMSE',
        'eval_metric': 'RMSE',
    }

    fold_scores = []
    for fold in range(5):
        xtrain = df[df.kfold != fold].reset_index(drop=True)
        xvalid = df[df.kfold == fold].reset_index(drop=True)

        ytrain = xtrain.target
        yvalid = xvalid.target

        xtrain = xtrain[useful_features]
        xvalid = xvalid[useful_features]

        model = CatBoostRegressor(**cat_parameters_1, task_type="GPU")
        model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)

        preds_valid = model.predict(xvalid)
        # sqrt(MSE) instead of `squared=False`, which newer scikit-learn removed.
        fold_scores.append(np.sqrt(mean_squared_error(yvalid, preds_valid)))

    return float(np.mean(fold_scores))


study = optuna.create_study(direction="minimize")
study.optimize(run, n_trials=10)

study.best_params

In [None]:
useful_features = ["pred_1", "pred_2", "pred_3"]
df_test = df_test[useful_features]

# Fixed hyperparameters for the final CatBoostRegressor
# (presumably copied from `study.best_params` of an earlier tuning run).
cat_parameters_1 = {
    'iterations': 5958,
    'learning_rate': 0.05235006201456179,
    'l2_leaf_reg': 140,
    'random_strength': 2.2544196494134883,
    'grow_policy': 'Depthwise',
    'leaf_estimation_method': 'Newton',
    'od_type': 'Iter',
    'bootstrap_type': 'Bayesian',
    'thread_count': -1,
    'verbose': False,
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
}

# The test features do not change between folds, so take the copy once.
xtest = df_test.copy()

final_predictions = []  # one array of test predictions per fold
scores = []             # per-fold validation RMSE
for fold in range(5):
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    model = CatBoostRegressor(**cat_parameters_1, task_type="GPU")
    model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)

    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_predictions.append(test_preds)

    # sqrt(MSE) instead of `squared=False`, which newer scikit-learn removed.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

In [None]:
# Average the per-fold test predictions of the final model.
# Use explicit column assignment: `sample_submission.target = ...` only
# updates a column when one named `target` already exists; otherwise it sets a
# plain Python attribute and the CSV would be written without the predictions.
sample_submission["target"] = np.mean(np.column_stack(final_predictions), axis=1)
# Write only the two submission columns so a stale extra column cannot leak in.
sample_submission[["id", "target"]].to_csv("submission.csv", index=False)