## References:
1. https://www.kaggle.com/abhishek/code : Optimization, Blending etc
2. https://www.kaggle.com/kingoffitpredict/time-optimization#Featuretools : Feature Tools

In [None]:
# Data Preprocessing and Model Building Libraries
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn.preprocessing import OrdinalEncoder
import lightgbm as lgb
import optuna
from catboost import CatBoostRegressor

# Libraries for Visualization
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['figure.figsize'] = (16, 8)
plt.style.use('fivethirtyeight')

# Featuretools: for feature engineering
import featuretools as ft
import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances

# For Not displaying Warnings
import warnings
warnings.filterwarnings(action="ignore")

In [None]:
# Load the fold-annotated training data, the test set and the submission template.
df_train = pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

# Every column except the bookkeeping ones is treated as a model feature.
non_feature_cols = ("id", "target", "kfold")
useful_features = [name for name in df_train.columns if name not in non_feature_cols]
object_cols = [name for name in useful_features if 'cat' in name]
numerical_cols = [name for name in useful_features if name.startswith("cont")]
df_test = df_test[useful_features]

# One distribution plot per continuous feature (cont0..cont13), laid out
# row by row on a 4x4 grid.
f, ax = plt.subplots(nrows=4, ncols=4, figsize=(12, 12))
f.suptitle('Distribution of Numerical Features', fontsize=16)
for position in range(14):
    grid_row, grid_col = divmod(position, 4)
    sns.distplot(df_train[f'cont{position}'], ax=ax[grid_row, grid_col])

# Only 14 of the 16 grid cells are used; remove the two empty axes.
for grid_col in (2, 3):
    f.delaxes(ax[3, grid_col])
plt.tight_layout()
plt.show()

In [None]:
# Category frequency plots for cat0..cat8, laid out row by row on a 3x3 grid.
f, ax = plt.subplots(nrows=3, ncols=3, figsize=(12, 12))
f.suptitle('Distribution of Categorical Features', fontsize=16)
# `ax.flat` walks the grid in row-major order, so position k lands on
# ax[k // 3, k % 3].
for position, axis in enumerate(ax.flat):
    sns.countplot(df_train[f'cat{position}'], ax=axis)

plt.tight_layout()
plt.show()

In [None]:
df = pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if 'cat' in col]
numerical_cols = [col for col in useful_features if col.startswith("cont")]
df_test = df_test[useful_features]


def _add_pairwise_numeric_features(frame, base_cols, entity_id):
    """Append featuretools add/multiply features of ``base_cols`` to ``frame``.

    Args:
        frame: DataFrame to extend in place with the generated columns.
        base_cols: names of the numeric columns handed to featuretools.
        entity_id: name of the featuretools entity created for ``frame``.

    Returns:
        The ``(feature_matrix, feature_defs)`` pair returned by ``ft.dfs``.
    """
    es = ft.EntitySet(id='data')
    # The selected columns do not include an 'id' column, so ask featuretools
    # explicitly to create a positional integer index instead of relying on
    # its warn-and-create fallback (warnings are silenced in this notebook).
    es.entity_from_dataframe(entity_id=entity_id,
                             dataframe=frame[base_cols],
                             index='id',
                             make_index=True)

    feature_matrix, feature_defs = ft.dfs(entityset=es,
                                          target_entity=entity_id,
                                          trans_primitives=['add_numeric', 'multiply_numeric'],
                                          verbose=1)

    # Copy over only the generated columns, selected by name rather than by a
    # hard-coded column offset. Assignment aligns the matrix's 0..n-1 index
    # with the frame's default RangeIndex.
    for name in feature_matrix.columns:
        if name not in base_cols:
            frame[name] = feature_matrix[name]
    return feature_matrix, feature_defs


# Apply the same feature engineering to the training and the test set.
feature_matrix_train, feature_defs_train = _add_pairwise_numeric_features(
    df, numerical_cols, 'original_train_data')
feature_matrix_test, feature_defs_test = _add_pairwise_numeric_features(
    df_test, numerical_cols, 'original_test_data')

# The numerical feature list now includes the generated columns.
numerical_cols = list(feature_matrix_train.columns)
useful_features = object_cols + numerical_cols

# Model 1: XGB using FeatureTool
# One model is trained per fold. Each model predicts its held-out fold
# (out-of-fold predictions, saved for blending) and the full test set
# (averaged over the folds at the end).
final_test_predictions = []
final_valid_predictions = {}
scores = []
for fold in range(5):
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    # Select the test columns by name so they are guaranteed to match the
    # training matrix column for column; `useful_features` was rebuilt after
    # feature engineering and df_test's own column order is incidental.
    xtest = df_test[useful_features].copy()

    valid_ids = xvalid.id.values.tolist()

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    # Fit the encoder on the training fold only, then reuse it everywhere.
    ordinal_encoder = preprocessing.OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])

    model = XGBRegressor(
        random_state=fold,
        tree_method="gpu_hist",
        gpu_id=0,
        predictor="gpu_predictor",
        n_estimators=11000,
        learning_rate=0.03875678489649957,
        reg_lambda=0.000899770960119882,
        reg_alpha=0.00026578249068599785,
        subsample=0.9532248864902615,
        colsample_bytree=0.16561394428070153,
        max_depth=3,
    )

    # No eval_set / early stopping here: all 11000 rounds are trained.
    model.fit(xtrain, ytrain)
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_test_predictions.append(test_preds)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))
    # Take the root explicitly so the metric does not depend on the
    # deprecated `squared=` argument of mean_squared_error.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))
# Out-of-fold predictions keyed by row id, used later as a blending feature.
final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_XGB"]
final_valid_predictions.to_csv("train_pred_XGB.csv", index=False)

# Test prediction = mean over the five fold models.
sample_submission.target = np.mean(np.column_stack(final_test_predictions), axis=1)
sample_submission.columns = ["id", "pred_XGB"]
sample_submission.to_csv("test_pred_XGB.csv", index=False)

# **Model 2: Light GBM**

In [None]:
# Reload the data so this model does not depend on columns added by earlier cells.
df = pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

non_feature_cols = ("id", "target", "kfold")
useful_features = [name for name in df.columns if name not in non_feature_cols]
object_cols = [name for name in useful_features if name.startswith("cat")]
df_test = df_test[useful_features]


# Train one LightGBM model per fold; keep out-of-fold and test predictions.
final_test_predictions = []
final_valid_predictions = {}
scores = []
for fold in range(5):
    held_out = df.kfold == fold
    xtrain = df[~held_out].reset_index(drop=True)
    xvalid = df[held_out].reset_index(drop=True)
    xtest = df_test.copy()

    valid_ids = xvalid["id"].tolist()

    ytrain, yvalid = xtrain["target"], xvalid["target"]
    xtrain, xvalid = xtrain[useful_features], xvalid[useful_features]

    # The encoder is fitted on the training fold and applied to all three sets.
    ordinal_encoder = preprocessing.OrdinalEncoder()
    ordinal_encoder.fit(xtrain[object_cols])
    for frame in (xtrain, xvalid, xtest):
        frame[object_cols] = ordinal_encoder.transform(frame[object_cols])

    # Fixed training setup first, tuned hyper-parameters second.
    param = {
        "objective": "regression",
        "metric": "rmse",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "n_estimators": 10000,
        "early_stopping_round": 300,
        "device": "gpu",
        "gpu_platform_id": 0,
        "gpu_device_id": 0,
        'lambda_l1': 0.00472279780583036,
        'lambda_l2': 2.9095205689488508e-05,
        'num_leaves': 158,
        'feature_fraction': 0.7386878356648194,
        'bagging_fraction': 0.8459744550725283,
        'bagging_freq': 2,
        'max_depth': 2,
        'max_bin': 249,
        'learning_rate': 0.044738463593017294,
        'min_child_samples': 13,
    }

    lgb_train = lgb.Dataset(xtrain, ytrain)
    lgb_valid = lgb.Dataset(xvalid, yvalid, reference=lgb_train)

    model = lgb.train(param, lgb_train, valid_sets=[lgb_valid], verbose_eval=1000)
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_test_predictions.append(test_preds)
    final_valid_predictions.update(zip(valid_ids, preds_valid))
    rmse = mean_squared_error(yvalid, preds_valid, squared=False)
    print(fold, rmse)
    scores.append(rmse)


print(np.mean(scores), np.std(scores))
# Out-of-fold predictions keyed by row id, saved for the blending stage.
final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_LGB"]
final_valid_predictions.to_csv("train_pred_LGB.csv", index=False)

# Test prediction = mean over the five fold models.
sample_submission["target"] = np.column_stack(final_test_predictions).mean(axis=1)
sample_submission.columns = ["id", "pred_LGB"]
sample_submission.to_csv("test_pred_LGB.csv", index=False)

# **Model 3: XGB + Target Encoding**

In [None]:
df = pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

non_feature_cols = ("id", "target", "kfold")
useful_features = [name for name in df.columns if name not in non_feature_cols]
object_cols = [name for name in useful_features if name.startswith("cat")]
df_test = df_test[useful_features]

# Out-of-fold target encoding: each row's category is replaced by the mean
# target of that category computed on the other four folds. The test set
# receives the average of the five per-fold mappings.
for col in object_cols:
    encoded_name = f"tar_enc_{col}"
    temp_df = []
    temp_test_feat = None
    for fold in range(5):
        held_out = df.kfold == fold
        xtrain = df[~held_out].reset_index(drop=True)
        xvalid = df[held_out].reset_index(drop=True)
        feat = xtrain.groupby(col)["target"].mean().to_dict()
        xvalid.loc[:, encoded_name] = xvalid[col].map(feat)
        temp_df.append(xvalid)
        test_mapped = df_test[col].map(feat)
        temp_test_feat = test_mapped if temp_test_feat is None else temp_test_feat + test_mapped

    df_test.loc[:, encoded_name] = temp_test_feat / 5
    # Reassemble the training frame from the encoded validation folds.
    df = pd.concat(temp_df)


# Refresh the feature lists so the new tar_enc_* columns are included.
# `startswith("cat")` keeps them out of the categorical list.
useful_features = [name for name in df.columns if name not in non_feature_cols]
object_cols = [name for name in useful_features if name.startswith("cat")]
df_test = df_test[useful_features]


# Train one XGBoost model per fold on the target-encoded features.
final_test_predictions = []
final_valid_predictions = {}
scores = []
for fold in range(5):
    held_out = df.kfold == fold
    xtrain = df[~held_out].reset_index(drop=True)
    xvalid = df[held_out].reset_index(drop=True)
    xtest = df_test.copy()

    valid_ids = xvalid["id"].tolist()

    ytrain, yvalid = xtrain["target"], xvalid["target"]
    xtrain, xvalid = xtrain[useful_features], xvalid[useful_features]

    # The encoder is fitted on the training fold and applied to all three sets.
    ordinal_encoder = preprocessing.OrdinalEncoder()
    ordinal_encoder.fit(xtrain[object_cols])
    for frame in (xtrain, xvalid, xtest):
        frame[object_cols] = ordinal_encoder.transform(frame[object_cols])

    params = {
        'learning_rate': 0.07853392035787837,
        'reg_lambda': 1.7549293092194938e-05,
        'reg_alpha': 14.68267919457715,
        'subsample': 0.8031450486786944,
        'colsample_bytree': 0.170759104940733,
        'max_depth': 3,
    }

    model = XGBRegressor(
        random_state=0,
        tree_method='gpu_hist',
        gpu_id=0,
        predictor="gpu_predictor",
        n_estimators=5000,
        **params
    )
    # The validation fold doubles as the early-stopping monitor.
    model.fit(
        xtrain,
        ytrain,
        early_stopping_rounds=300,
        eval_set=[(xvalid, yvalid)],
        verbose=1000,
    )
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_test_predictions.append(test_preds)
    final_valid_predictions.update(zip(valid_ids, preds_valid))
    rmse = mean_squared_error(yvalid, preds_valid, squared=False)
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))
# Out-of-fold predictions keyed by row id, saved for the blending stage.
final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_XGB1"]
final_valid_predictions.to_csv("train_pred_XGB1.csv", index=False)

# Test prediction = mean over the five fold models.
sample_submission["target"] = np.column_stack(final_test_predictions).mean(axis=1)
sample_submission.columns = ["id", "pred_XGB1"]
sample_submission.to_csv("test_pred_XGB1.csv", index=False)

# **XGB3: Ordinal Encoding**



In [None]:
# Reload the data so this model starts from the original columns.
df = pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

non_feature_cols = ("id", "target", "kfold")
useful_features = [name for name in df.columns if name not in non_feature_cols]
object_cols = [name for name in useful_features if 'cat' in name]
df_test = df_test[useful_features]

# Train one XGBoost model per fold on ordinal-encoded categoricals.
final_test_predictions = []
final_valid_predictions = {}
scores = []
for fold in range(5):
    held_out = df.kfold == fold
    xtrain = df[~held_out].reset_index(drop=True)
    xvalid = df[held_out].reset_index(drop=True)
    xtest = df_test.copy()

    valid_ids = xvalid["id"].tolist()

    ytrain, yvalid = xtrain["target"], xvalid["target"]
    xtrain, xvalid = xtrain[useful_features], xvalid[useful_features]

    # The encoder is fitted on the training fold and applied to all three sets.
    ordinal_encoder = preprocessing.OrdinalEncoder()
    ordinal_encoder.fit(xtrain[object_cols])
    for frame in (xtrain, xvalid, xtest):
        frame[object_cols] = ordinal_encoder.transform(frame[object_cols])

    params = dict(
        random_state=1,
        booster='gbtree',
        n_estimators=10000,
        learning_rate=0.03628302216953097,
        reg_lambda=0.0008746338866473539,
        reg_alpha=23.13181079976304,
        subsample=0.7875490025178415,
        colsample_bytree=0.11807135201147481,
        max_depth=3,
    )

    # No GPU settings are passed for this model; n_jobs=-1 is used instead.
    model = XGBRegressor(n_jobs=-1, **params)
    model.fit(
        xtrain,
        ytrain,
        early_stopping_rounds=300,
        eval_set=[(xvalid, yvalid)],
        verbose=1000,
    )
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_test_predictions.append(test_preds)
    final_valid_predictions.update(zip(valid_ids, preds_valid))
    rmse = mean_squared_error(yvalid, preds_valid, squared=False)
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))
# Out-of-fold predictions keyed by row id, saved for the blending stage.
final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_XGB3"]
final_valid_predictions.to_csv("train_pred_XGB3.csv", index=False)

# Test prediction = mean over the five fold models.
sample_submission["target"] = np.column_stack(final_test_predictions).mean(axis=1)
sample_submission.columns = ["id", "pred_XGB3"]
sample_submission.to_csv("test_pred_XGB3.csv", index=False)

# **CatBoost using Target Encoding**
### **Model Tuning**

In [None]:
df = pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

non_feature_cols = ("id", "target", "kfold")
useful_features = [name for name in df.columns if name not in non_feature_cols]
object_cols = [name for name in useful_features if 'cat' in name]
df_test = df_test[useful_features]

# Out-of-fold target encoding: each row's category is replaced by the mean
# target of that category computed on the other four folds. The test set
# receives the average of the five per-fold mappings.
for col in object_cols:
    encoded_name = f"tar_enc_{col}"
    temp_df = []
    temp_test_feat = None
    for fold in range(5):
        held_out = df.kfold == fold
        xtrain = df[~held_out].reset_index(drop=True)
        xvalid = df[held_out].reset_index(drop=True)
        feat = xtrain.groupby(col)["target"].mean().to_dict()
        xvalid.loc[:, encoded_name] = xvalid[col].map(feat)
        temp_df.append(xvalid)
        test_mapped = df_test[col].map(feat)
        temp_test_feat = test_mapped if temp_test_feat is None else temp_test_feat + test_mapped

    df_test.loc[:, encoded_name] = temp_test_feat / 5
    # Reassemble the training frame from the encoded validation folds.
    df = pd.concat(temp_df)


# Refresh the feature lists so the new tar_enc_* columns are included.
# `startswith("cat")` (not `'cat' in name`) keeps tar_enc_cat* columns out of
# the categorical list.
useful_features = [name for name in df.columns if name not in non_feature_cols]
object_cols = [name for name in useful_features if name.startswith("cat")]
df_test = df_test[useful_features]
    
def run(trial):
    """Optuna objective: mean 5-fold validation RMSE of a GPU CatBoost model.

    Args:
        trial: the optuna trial used to sample the hyper-parameters.

    Returns:
        The validation RMSE averaged over the five folds. The previous
        version trained all five folds but returned only the last fold's
        score.
    """
    # Sample once per trial; the same values are used for every fold.
    cat_parameters_1 = {
        'iterations': trial.suggest_int("iterations", 5000, 8000),
        'learning_rate': trial.suggest_float("learning_rate", 1e-2, 0.25, log=True),
        'l2_leaf_reg': trial.suggest_int("l2_leaf_reg", 5, 200),
        'random_strength': trial.suggest_float("random_strength", 0.1, 5),
        'grow_policy': 'Depthwise',
        'leaf_estimation_method': 'Newton',
        'od_type': 'Iter',
        'bootstrap_type': 'Bayesian',
        'thread_count': -1,
        'verbose': False,
        'loss_function': 'RMSE',
        'eval_metric': 'RMSE',
    }

    fold_scores = []
    for fold in range(5):
        xtrain = df[df.kfold != fold].reset_index(drop=True)
        xvalid = df[df.kfold == fold].reset_index(drop=True)

        ytrain = xtrain['target']
        yvalid = xvalid['target']

        xtrain = xtrain[useful_features]
        xvalid = xvalid[useful_features]

        # Fit the encoder on the training fold only.
        ordinal_encoder = preprocessing.OrdinalEncoder()
        xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
        xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])

        model = CatBoostRegressor(**cat_parameters_1, task_type='GPU')
        model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)

        preds_valid = model.predict(xvalid)
        # Explicit root so the metric does not depend on the deprecated
        # `squared=` argument of mean_squared_error.
        fold_scores.append(np.sqrt(mean_squared_error(yvalid, preds_valid)))

    return float(np.mean(fold_scores))


study = optuna.create_study(direction="minimize")
study.optimize(run, n_trials=10)

study.best_params

In [None]:
# Plot the hyper-parameter importances of the current `study`.
# The call is left as the cell's final expression so the notebook can display the figure.
plot_param_importances(study)

In [None]:
# Plot the optimization history (objective value per trial) of the current `study`.
# The call is left as the cell's final expression so the notebook can display the figure.
plot_optimization_history(study)

### **Model building using tuned Parameters**

In [None]:
# CatBoost

df = pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

non_feature_cols = ("id", "target", "kfold")
useful_features = [name for name in df.columns if name not in non_feature_cols]
object_cols = [name for name in useful_features if 'cat' in name]
numerical_cols = [name for name in useful_features if name.startswith("cont")]
df_test = df_test[useful_features]


# Out-of-fold target encoding: each row's category is replaced by the mean
# target of that category computed on the other four folds. The test set
# receives the average of the five per-fold mappings.
for col in object_cols:
    encoded_name = f"tar_enc_{col}"
    temp_df = []
    temp_test_feat = None
    for fold in range(5):
        held_out = df.kfold == fold
        xtrain = df[~held_out].reset_index(drop=True)
        xvalid = df[held_out].reset_index(drop=True)
        feat = xtrain.groupby(col)["target"].mean().to_dict()
        xvalid.loc[:, encoded_name] = xvalid[col].map(feat)
        temp_df.append(xvalid)
        test_mapped = df_test[col].map(feat)
        temp_test_feat = test_mapped if temp_test_feat is None else temp_test_feat + test_mapped

    df_test.loc[:, encoded_name] = temp_test_feat / 5
    # Reassemble the training frame from the encoded validation folds.
    df = pd.concat(temp_df)


# Refresh the feature lists so the new tar_enc_* columns are included.
# `startswith("cat")` (not `'cat' in name`) keeps tar_enc_cat* columns out of
# the categorical list.
useful_features = [name for name in df.columns if name not in non_feature_cols]
object_cols = [name for name in useful_features if name.startswith("cat")]
df_test = df_test[useful_features]


# Train one GPU CatBoost model per fold with the hard-coded hyper-parameters below.
final_test_predictions = []
final_valid_predictions = {}
scores = []
for fold in range(5):
    held_out = df.kfold == fold
    xtrain = df[~held_out].reset_index(drop=True)
    xvalid = df[held_out].reset_index(drop=True)
    xtest = df_test.copy()

    valid_ids = xvalid["id"].tolist()

    ytrain, yvalid = xtrain["target"], xvalid["target"]
    xtrain, xvalid = xtrain[useful_features], xvalid[useful_features]

    # The encoder is fitted on the training fold and applied to all three sets.
    ordinal_encoder = preprocessing.OrdinalEncoder()
    ordinal_encoder.fit(xtrain[object_cols])
    for frame in (xtrain, xvalid, xtest):
        frame[object_cols] = ordinal_encoder.transform(frame[object_cols])

    params = {
        'od_type': 'Iter',
        'iterations': 5376,
        'learning_rate': 0.023320327820924816,
        'l2_leaf_reg': 181,
        'random_strength': 3.391745124598448,
        'grow_policy': 'Lossguide',
        'leaf_estimation_method': 'Newton',
        'bootstrap_type': 'Bayesian',
        'thread_count': -1,
        'verbose': False,
        'loss_function': 'RMSE',
        'eval_metric': 'RMSE',
    }

    # No eval_set is passed here, so the validation fold is used for scoring only.
    model = CatBoostRegressor(**params, task_type='GPU')
    model.fit(xtrain, ytrain, verbose=500)

    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_test_predictions.append(test_preds)
    final_valid_predictions.update(zip(valid_ids, preds_valid))
    rmse = mean_squared_error(yvalid, preds_valid, squared=False)
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))
# Out-of-fold predictions keyed by row id, saved for the blending stage.
final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_CatB"]
final_valid_predictions.to_csv("train_pred_CatB.csv", index=False)

# Test prediction = mean over the five fold models.
sample_submission["target"] = np.column_stack(final_test_predictions).mean(axis=1)
sample_submission.columns = ["id", "pred_CatB"]
sample_submission.to_csv("test_pred_CatB.csv", index=False)


# **Blending All Models**

In [None]:
# Start again from the raw data and attach every base model's predictions.
df = pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

# File suffixes of the five base models, in the order their columns are appended.
model_tags = ("XGB", "LGB", "XGB1", "XGB3", "CatB")

# Out-of-fold predictions become extra columns of the training frame ...
for tag in model_tags:
    df = df.merge(pd.read_csv(f"./train_pred_{tag}.csv"), on="id", how="left")

# ... and the fold-averaged test predictions extra columns of the test frame.
for tag in model_tags:
    df_test = df_test.merge(pd.read_csv(f"./test_pred_{tag}.csv"), on="id", how="left")

df.head()

# **Final Model 1: Tuning XGB**

In [None]:
useful_features = ["pred_XGB", "pred_XGB3", "pred_XGB1", "pred_LGB", "pred_CatB"]


def run(trial):
    """Optuna objective: mean 5-fold validation RMSE of the XGBoost blender.

    Args:
        trial: the optuna trial used to sample the hyper-parameters.

    Returns:
        The validation RMSE averaged over the five folds. The previous
        version trained all five folds but returned only the last fold's
        score.
    """
    # Sample once per trial; the same values are used for every fold.
    learning_rate = trial.suggest_float("learning_rate", 1e-2, 0.25, log=True)
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
    reg_lambda = trial.suggest_float("reg_lambda", 1e-8, 100.0, log=True)
    reg_alpha = trial.suggest_float("reg_alpha", 1e-8, 100.0, log=True)
    subsample = trial.suggest_float("subsample", 0.1, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.0)
    max_depth = trial.suggest_int("max_depth", 1, 7)

    fold_scores = []
    for fold in range(5):
        xtrain = df[df.kfold != fold].reset_index(drop=True)
        xvalid = df[df.kfold == fold].reset_index(drop=True)

        ytrain = xtrain['target']
        yvalid = xvalid['target']

        xtrain = xtrain[useful_features]
        xvalid = xvalid[useful_features]

        model = XGBRegressor(
            random_state=fold,
            tree_method="gpu_hist",
            gpu_id=1,
            predictor="gpu_predictor",
            n_estimators=12000,
            learning_rate=learning_rate,
            reg_lambda=reg_lambda,
            reg_alpha=reg_alpha,
            subsample=subsample,
            colsample_bytree=colsample_bytree,
            max_depth=max_depth)

        model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)

        preds_valid = model.predict(xvalid)
        # Explicit root so the metric does not depend on the deprecated
        # `squared=` argument of mean_squared_error.
        fold_scores.append(np.sqrt(mean_squared_error(yvalid, preds_valid)))

    return float(np.mean(fold_scores))


study = optuna.create_study(direction="minimize")
study.optimize(run, n_trials=10)

study.best_params

In [None]:
# Plot the hyper-parameter importances of the current `study`.
# The call is left as the cell's final expression so the notebook can display the figure.
plot_param_importances(study)

In [None]:
# Plot the optimization history (objective value per trial) of the current `study`.
# The call is left as the cell's final expression so the notebook can display the figure.
plot_optimization_history(study)

# **Final Model 2: Tuning CatBoost**

In [None]:
useful_features = ["pred_XGB", "pred_XGB3", "pred_XGB1", "pred_LGB", "pred_CatB"]


def run(trial):
    """Optuna objective: mean 5-fold validation RMSE of the CatBoost blender.

    Args:
        trial: the optuna trial used to sample the hyper-parameters.

    Returns:
        The validation RMSE averaged over the five folds. The previous
        version trained all five folds but returned only the last fold's
        score.
    """
    # Sample once per trial; the same values are used for every fold.
    cat_parameters_1 = {
        'iterations': trial.suggest_int("iterations", 5000, 8000),
        'learning_rate': trial.suggest_float("learning_rate", 1e-2, 0.25, log=True),
        'l2_leaf_reg': trial.suggest_int("l2_leaf_reg", 5, 200),
        'random_strength': trial.suggest_float("random_strength", 0.1, 5),
        'grow_policy': 'Depthwise',
        'leaf_estimation_method': 'Newton',
        'od_type': 'Iter',
        'bootstrap_type': 'Bayesian',
        'thread_count': -1,
        'verbose': False,
        'loss_function': 'RMSE',
        'eval_metric': 'RMSE',
    }

    fold_scores = []
    for fold in range(5):
        xtrain = df[df.kfold != fold].reset_index(drop=True)
        xvalid = df[df.kfold == fold].reset_index(drop=True)

        ytrain = xtrain['target']
        yvalid = xvalid['target']

        xtrain = xtrain[useful_features]
        xvalid = xvalid[useful_features]

        model = CatBoostRegressor(**cat_parameters_1)
        model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)

        preds_valid = model.predict(xvalid)
        # Explicit root so the metric does not depend on the deprecated
        # `squared=` argument of mean_squared_error.
        fold_scores.append(np.sqrt(mean_squared_error(yvalid, preds_valid)))

    return float(np.mean(fold_scores))


study = optuna.create_study(direction="minimize")
study.optimize(run, n_trials=10)

study.best_params

In [None]:
# Plot the hyper-parameter importances of the current `study`.
# The call is left as the cell's final expression so the notebook can display the figure.
plot_param_importances(study)

In [None]:
# Plot the optimization history (objective value per trial) of the current `study`.
# The call is left as the cell's final expression so the notebook can display the figure.
plot_optimization_history(study)

# **Final Model XGB**

In [None]:
# Second-level XGBoost model trained on the base models' predictions.
# NOTE(review): "pred_CatB" was part of the feature list during tuning but is
# not used here - confirm this is intentional.
useful_features = ["pred_XGB", "pred_XGB1", "pred_XGB3", "pred_LGB"]
df_test = df_test[useful_features]


final_test_predictions = []
scores = []
for fold in range(5):
    held_out = df.kfold == fold
    xtrain = df[~held_out].reset_index(drop=True)
    xvalid = df[held_out].reset_index(drop=True)
    xtest = df_test.copy()

    ytrain, yvalid = xtrain["target"], xvalid["target"]
    xtrain, xvalid = xtrain[useful_features], xvalid[useful_features]

    xgb_settings = dict(
        random_state=fold,
        tree_method="gpu_hist",
        gpu_id=1,
        predictor="gpu_predictor",
        n_estimators=12000,
        learning_rate=0.024882747879237756,
        reg_lambda=0.000827524788778563,
        reg_alpha=8.633187860723039e-07,
        subsample=0.16151014613960882,
        colsample_bytree=0.6057298386505088,
        max_depth=1,
    )
    model = XGBRegressor(**xgb_settings)

    # The validation fold doubles as the early-stopping monitor.
    model.fit(
        xtrain,
        ytrain,
        early_stopping_rounds=300,
        eval_set=[(xvalid, yvalid)],
        verbose=1000,
    )
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_test_predictions.append(test_preds)
    rmse = mean_squared_error(yvalid, preds_valid, squared=False)
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

# **Final Model CatBoost**

In [None]:
# Second-level CatBoost model trained on the base models' predictions.
# NOTE(review): "pred_CatB" was part of the feature list during tuning but is
# not used here - confirm this is intentional.
useful_features = ["pred_XGB", "pred_XGB1", "pred_XGB3", "pred_LGB"]
df_test = df_test[useful_features]


# This cell overwrites `final_test_predictions`, so a later cell that reads it
# sees this model's predictions if this cell ran last.
final_test_predictions = []
scores = []
for fold in range(5):
    held_out = df.kfold == fold
    xtrain = df[~held_out].reset_index(drop=True)
    xvalid = df[held_out].reset_index(drop=True)
    xtest = df_test.copy()

    ytrain, yvalid = xtrain["target"], xvalid["target"]
    xtrain, xvalid = xtrain[useful_features], xvalid[useful_features]

    cat_parameters_1 = {
        'iterations': 6487,
        'learning_rate': 0.040692021622714805,
        'l2_leaf_reg': 99,
        'random_strength': 2.0689175540262017,
        'grow_policy': 'Depthwise',
        'leaf_estimation_method': 'Newton',
        'od_type': 'Iter',
        'bootstrap_type': 'Bayesian',
        'thread_count': -1,
        'verbose': False,
        'loss_function': 'RMSE',
        'eval_metric': 'RMSE',
    }

    model = CatBoostRegressor(**cat_parameters_1)
    # The validation fold doubles as the early-stopping monitor.
    model.fit(
        xtrain,
        ytrain,
        early_stopping_rounds=300,
        eval_set=[(xvalid, yvalid)],
        verbose=1000,
    )
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_test_predictions.append(test_preds)
    rmse = mean_squared_error(yvalid, preds_valid, squared=False)
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

# **Select the best of two**

In [None]:
# Average the per-fold test predictions currently held in `final_test_predictions`
# (filled by whichever final-model cell ran last) and write the submission file.
fold_predictions = np.column_stack(final_test_predictions)
sample_submission["target"] = fold_predictions.mean(axis=1)
sample_submission.to_csv("submission.csv", index=False)

# **Thank you**