In [None]:
#!pip install category-encoders

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import optuna 
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OrdinalEncoder
from category_encoders import MEstimateEncoder, TargetEncoder
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression, ElasticNet

In [None]:
# Training data (already carrying a `kfold` column), the competition test set,
# and the sample submission that is reused as a template for prediction files.
df = pd.read_csv('../input/30days-10folds/train_10_folds.csv')
df_test = pd.read_csv('../input/30-days-of-ml/test.csv')
df_submission = pd.read_csv('../input/30-days-of-ml/sample_submission.csv')

# categorical columns
cat_cols = [c for c in df.columns if c.startswith('cat')]

# numerical columns apart from 'id', 'target' and 'kfold'
num_cols = [c for c in df.columns if c.startswith('cont')]

useful_features = [c for c in df.columns if c not in ['id', 'target', 'kfold']]

# One-hot encoded copy of the training data and the feature list derived from it.
df_ohe = pd.get_dummies(df, columns=cat_cols)
cat_cols2 = [c for c in df_ohe.columns if c.startswith('cat')]
useful_features2 = [c for c in df_ohe.columns if c not in ['id', 'target', 'kfold']]

# get_dummies is applied to train and test independently, so a category level
# that never occurs in the test set yields no column there. Add any such
# column as all zeros so that df_test_ohe[useful_features2] cannot raise KeyError.
df_test_ohe = pd.get_dummies(df_test, columns=cat_cols)
missing_in_test = [c for c in useful_features2 if c not in df_test_ohe.columns]
if missing_in_test:
    df_test_ohe = df_test_ohe.reindex(
        columns=list(df_test_ohe.columns) + missing_in_test,
        fill_value=0,
    )

In [None]:
# Number of cross-validation folds; the training loops iterate `fold` over range(N_FOLDS).
# Presumably matches the distinct values of the `kfold` column in the training CSV — TODO confirm.
N_FOLDS = 10

## XGBRegressor Hyperparameter Tuning

In [None]:
def run_xgbr(trial):
    """Optuna objective: validation RMSE of an XGBRegressor on one held-out fold.

    Trains on every row of ``df_ohe`` whose ``kfold`` differs from the fixed
    ``fold`` and scores on the remaining rows, using the one-hot encoded
    feature list ``useful_features2``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        RMSE of the fitted model on the held-out fold (lower is better).
    """
    # Single fixed validation fold: every trial is scored on the same rows.
    fold = 6

    xgbr_params = {
        'random_state': trial.suggest_int("random_state", 1, 39501, step=500),
        'n_estimators': trial.suggest_int("n_estimators", 500, 40000, step=500),
        'learning_rate': trial.suggest_float("learning_rate", 1e-2, 0.25, log=True),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
        # and matches how learning_rate is sampled above.
        'reg_alpha': trial.suggest_float("reg_alpha", 1e-8, 100.0, log=True),
        'reg_lambda': trial.suggest_float("reg_lambda", 1e-8, 100.0, log=True),
        'colsample_bytree': trial.suggest_float("colsample_bytree", 0.1, 1.0),
        'subsample': trial.suggest_float("subsample", 0.1, 1.0),
        'max_depth': trial.suggest_int("max_depth", 1, 15),
    }

    is_valid = df_ohe.kfold == fold

    X_train = df_ohe.loc[~is_valid, useful_features2].copy()
    y_train = df_ohe.loc[~is_valid, 'target'].copy()

    X_valid = df_ohe.loc[is_valid, useful_features2].copy()
    y_valid = df_ohe.loc[is_valid, 'target'].copy()

    xgbr = XGBRegressor(
        tree_method="gpu_hist",
        gpu_id=0,
        predictor="gpu_predictor",
        n_jobs=-1,
        **xgbr_params
    )

    # NOTE(review): recent XGBoost releases expect early_stopping_rounds on the
    # estimator constructor rather than fit() — confirm against the installed version.
    xgbr.fit(
        X_train, y_train,
        early_stopping_rounds=300,
        eval_set=[(X_valid, y_valid)],
        verbose=2000
    )
    valid_pred = xgbr.predict(X_valid)
    # sqrt(MSE) instead of mean_squared_error(..., squared=False): same value,
    # but does not depend on the `squared` keyword that newer scikit-learn dropped.
    rmse = float(np.sqrt(mean_squared_error(y_valid, valid_pred)))
    return rmse

In [None]:
# Run 30 Optuna trials of run_xgbr, minimizing the RMSE it returns.
study = optuna.create_study(direction="minimize")
study.optimize(run_xgbr, n_trials=30)

In [None]:
# Display the hyperparameters of the best trial of the current `study`.
study.best_params

## LightGBMRegressor Hyperparameter Tuning

In [None]:
def run_lgbmr(trial):
    """Optuna objective: validation RMSE of an LGBMRegressor on one held-out fold.

    Trains on every row of ``df`` whose ``kfold`` differs from the fixed
    ``fold`` and scores on the remaining rows. Categorical columns are
    ordinal-encoded with an encoder fitted on the training split only.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        RMSE of the fitted model on the held-out fold (lower is better).
    """
    # Single fixed validation fold: every trial is scored on the same rows.
    fold = 6

    lgbmr_params = {
        'random_state': trial.suggest_int("random_state", 1, 39501, step=500),
        'metric': 'rmse',
        'objective': 'regression',
        'n_estimators': trial.suggest_int("n_estimators", 500, 30000, step=500),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
        # and matches how learning_rate is sampled below.
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_float("colsample_bytree", 0.1, 1.0),
        # NOTE(review): LightGBM only bags rows when subsample_freq (bagging_freq)
        # is non-zero; it is not set here, so `subsample` may have no effect — confirm.
        'subsample': trial.suggest_float("subsample", 0.1, 1.0),
        'learning_rate': trial.suggest_float("learning_rate", 1e-2, 0.25, log=True),
        'max_depth': trial.suggest_int("max_depth", 1, 100),
        # LightGBM requires num_leaves > 1, so the search starts at 2; sampling 1
        # would make the trial fail.
        'num_leaves' : trial.suggest_int('num_leaves', 2, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
        'cat_smooth' : trial.suggest_int('cat_smooth', 1, 100)
    }

    is_valid = df.kfold == fold

    X_train = df.loc[~is_valid, useful_features].copy()
    y_train = df.loc[~is_valid, 'target'].copy()

    X_valid = df.loc[is_valid, useful_features].copy()
    y_valid = df.loc[is_valid, 'target'].copy()

    # Fit the encoder on the training split only, then apply it to validation.
    oe = OrdinalEncoder()
    X_train[cat_cols] = oe.fit_transform(X_train[cat_cols])
    X_valid[cat_cols] = oe.transform(X_valid[cat_cols])

    lgbmr = LGBMRegressor(
        device = 'gpu',
        gpu_platform_id = 0,
        gpu_device_id = 0,
        n_jobs=-1,
        **lgbmr_params
    )

    # NOTE(review): recent LightGBM releases take early stopping / logging via
    # callbacks instead of fit() keywords — confirm against the installed version.
    lgbmr.fit(
        X_train, y_train,
        early_stopping_rounds=300,
        eval_set=[(X_valid, y_valid)],
        verbose=2000
    )
    valid_pred = lgbmr.predict(X_valid)
    # sqrt(MSE) instead of mean_squared_error(..., squared=False): same value,
    # but does not depend on the `squared` keyword that newer scikit-learn dropped.
    rmse = float(np.sqrt(mean_squared_error(y_valid, valid_pred)))
    return rmse

In [None]:
# Run 30 Optuna trials of run_lgbmr, minimizing the RMSE it returns.
# This rebinds `study`, replacing any previous study object.
study = optuna.create_study(direction="minimize")
study.optimize(run_lgbmr, n_trials=30)

In [None]:
# Display the hyperparameters of the best trial of the current `study`.
study.best_params

## XGBRegressor Predictions

### One-Hot Encoding (`pd.get_dummies`)

In [None]:
# Cross-validated XGBRegressor on the one-hot encoded features.
# Writes out-of-fold predictions to train_pred_1.csv and fold-averaged test
# predictions to test_pred_1.csv (column "pred_1" in both).
valid_preds = {}
test_preds = []
rmse_scores = []

# Optimized params
xgbr_params = {'random_state': 1,
 'n_estimators': 24000,
 'learning_rate': 0.010570184115565436,
 'reg_alpha': 0.011885501879436232,
 'reg_lambda': 83.0988767019388,
 'colsample_bytree': 0.10582207063540948,
 'subsample': 0.6505186664831886,
 'max_depth': 7}

# The test matrix does not depend on the fold, so build it once.
X_test = df_test_ohe[useful_features2].copy()

for fold in range(N_FOLDS):
    is_valid = df_ohe.kfold == fold

    X_train = df_ohe.loc[~is_valid, useful_features2].copy()
    y_train = df_ohe.loc[~is_valid, 'target'].copy()

    X_valid = df_ohe.loc[is_valid, useful_features2].copy()
    y_valid = df_ohe.loc[is_valid, 'target'].copy()

    valid_ids = df_ohe.loc[is_valid, 'id'].values.tolist()

    xgbr = XGBRegressor(
        n_jobs=-1,
        **xgbr_params
    )

    xgbr.fit(
        X_train, y_train,
        early_stopping_rounds=300,
        eval_set=[(X_valid, y_valid)],
        verbose=2000
    )

    # Out-of-fold predictions, keyed by row id.
    valid_pred = xgbr.predict(X_valid)
    valid_preds.update(dict(zip(valid_ids, valid_pred)))

    # sqrt(MSE) gives the same value as mean_squared_error(..., squared=False)
    # without relying on the `squared` keyword that newer scikit-learn dropped.
    rmse = np.sqrt(mean_squared_error(y_valid, valid_pred))
    rmse_scores.append(rmse)

    test_pred = xgbr.predict(X_test)
    test_preds.append(test_pred)

    print(fold, rmse)

print("Mean RMSE: {}".format(np.mean(rmse_scores)))

valid_preds = pd.DataFrame.from_dict(valid_preds, orient="index").reset_index()
valid_preds.columns = ["id", "pred_1"]
valid_preds.to_csv("train_pred_1.csv", index=False)

# Average the per-fold test predictions row-wise.
df_submission_xgbr = df_submission.copy()
df_submission_xgbr['target'] = np.mean(np.column_stack(test_preds), axis=1)
df_submission_xgbr.columns = ["id", "pred_1"]
df_submission_xgbr.to_csv('test_pred_1.csv', index=False)

### OrdinalEncoder

In [None]:
# Cross-validated XGBRegressor on ordinal-encoded categorical features.
# Writes out-of-fold predictions to train_pred_2.csv and fold-averaged test
# predictions to test_pred_2.csv (column "pred_2" in both).
valid_preds = {}
test_preds = []
rmse_scores = []

# Optimized params
xgbr_params = {'n_estimators': 25000,
 'learning_rate': 0.04131767863898798,
 'reg_alpha': 0.00426659506577938,
 'reg_lambda': 9.993333999320106,
 'colsample_bytree': 0.12151130135748905,
 'subsample': 0.7321334740173127,
 'max_depth': 2}

for fold in range(N_FOLDS):
    is_valid = df.kfold == fold

    X_train = df.loc[~is_valid, useful_features].copy()
    y_train = df.loc[~is_valid, 'target'].copy()

    X_valid = df.loc[is_valid, useful_features].copy()
    y_valid = df.loc[is_valid, 'target'].copy()

    # Fresh copy per fold: the categorical columns are overwritten below with
    # the encoding learned from this fold's training split.
    X_test = df_test[useful_features].copy()

    valid_ids = df.loc[is_valid, 'id'].values.tolist()

    # Fit the encoder on the training split only, then apply it to validation and test.
    oe = OrdinalEncoder()
    X_train[cat_cols] = oe.fit_transform(X_train[cat_cols])
    X_valid[cat_cols] = oe.transform(X_valid[cat_cols])
    X_test[cat_cols] = oe.transform(X_test[cat_cols])

    xgbr = XGBRegressor(
        random_state=42,
        n_jobs=-1,
        **xgbr_params
    )

    xgbr.fit(
        X_train, y_train,
        early_stopping_rounds=300,
        eval_set=[(X_valid, y_valid)],
        verbose=2000
    )

    # Out-of-fold predictions, keyed by row id.
    valid_pred = xgbr.predict(X_valid)
    valid_preds.update(dict(zip(valid_ids, valid_pred)))

    # sqrt(MSE) gives the same value as mean_squared_error(..., squared=False)
    # without relying on the `squared` keyword that newer scikit-learn dropped.
    rmse = np.sqrt(mean_squared_error(y_valid, valid_pred))
    rmse_scores.append(rmse)

    test_pred = xgbr.predict(X_test)
    test_preds.append(test_pred)

    print(fold, rmse)

print("Mean RMSE: {}".format(np.mean(rmse_scores)))

valid_preds = pd.DataFrame.from_dict(valid_preds, orient="index").reset_index()
valid_preds.columns = ["id", "pred_2"]
valid_preds.to_csv("train_pred_2.csv", index=False)

# Average the per-fold test predictions row-wise.
df_submission_xgbr = df_submission.copy()
df_submission_xgbr['target'] = np.mean(np.column_stack(test_preds), axis=1)
df_submission_xgbr.columns = ["id", "pred_2"]
df_submission_xgbr.to_csv('test_pred_2.csv', index=False)

### MEstimateEncoder

In [None]:
# Cross-validated XGBRegressor on M-estimate (target) encoded categorical features.
# Writes out-of-fold predictions to train_pred_3.csv and fold-averaged test
# predictions to test_pred_3.csv (column "pred_3" in both).
valid_preds = {}
test_preds = []
rmse_scores = []

# Optimized params
xgbr_params = {'random_state': 36001,
 'n_estimators': 31500,
 'learning_rate': 0.06662729228509383,
 'reg_alpha': 1.7300508237054107e-08,
 'reg_lambda': 0.0228401079048692,
 'colsample_bytree': 0.10765459397825078,
 'subsample': 0.9961467305926972,
 'max_depth': 2}

for fold in range(N_FOLDS):
    is_valid = df.kfold == fold

    X_train = df.loc[~is_valid, useful_features].copy()
    y_train = df.loc[~is_valid, 'target'].copy()

    X_valid = df.loc[is_valid, useful_features].copy()
    y_valid = df.loc[is_valid, 'target'].copy()

    # Fresh copy per fold: the categorical columns are overwritten below with
    # the encoding learned from this fold's training split.
    X_test = df_test[useful_features].copy()

    valid_ids = df.loc[is_valid, 'id'].values.tolist()

    # The encoder sees the target, so it is fitted on the training split only
    # and merely applied to validation and test.
    mee = MEstimateEncoder()
    X_train[cat_cols] = mee.fit_transform(X_train[cat_cols], y_train)
    X_valid[cat_cols] = mee.transform(X_valid[cat_cols])
    X_test[cat_cols] = mee.transform(X_test[cat_cols])

    xgbr = XGBRegressor(
        n_jobs=-1,
        **xgbr_params
    )

    xgbr.fit(
        X_train, y_train,
        early_stopping_rounds=300,
        eval_set=[(X_valid, y_valid)],
        verbose=2000
    )

    # Out-of-fold predictions, keyed by row id.
    valid_pred = xgbr.predict(X_valid)
    valid_preds.update(dict(zip(valid_ids, valid_pred)))

    # sqrt(MSE) gives the same value as mean_squared_error(..., squared=False)
    # without relying on the `squared` keyword that newer scikit-learn dropped.
    rmse = np.sqrt(mean_squared_error(y_valid, valid_pred))
    rmse_scores.append(rmse)

    test_pred = xgbr.predict(X_test)
    test_preds.append(test_pred)

    print(fold, rmse)

print("Mean RMSE: {}".format(np.mean(rmse_scores)))

valid_preds = pd.DataFrame.from_dict(valid_preds, orient="index").reset_index()
valid_preds.columns = ["id", "pred_3"]
valid_preds.to_csv("train_pred_3.csv", index=False)

# Average the per-fold test predictions row-wise.
df_submission_xgbr = df_submission.copy()
df_submission_xgbr['target'] = np.mean(np.column_stack(test_preds), axis=1)
df_submission_xgbr.columns = ["id", "pred_3"]
df_submission_xgbr.to_csv('test_pred_3.csv', index=False)

## LightGBM Predictions

### OrdinalEncoder

In [None]:
# Cross-validated LGBMRegressor on ordinal-encoded categorical features.
# Writes out-of-fold predictions to train_pred_4.csv and fold-averaged test
# predictions to test_pred_4.csv (column "pred_4" in both).
valid_preds = {}
test_preds = []
rmse_scores = []

# Optimized params
lgbmr_params = {'random_state': 15501,
 'n_estimators': 11500,
 'reg_alpha': 1.0923032842333036,
 'reg_lambda': 0.0011961670045703618,
 'colsample_bytree': 0.10678842002938174,
 'subsample': 0.36744208881458135,
 'learning_rate': 0.04337334530673734,
 'max_depth': 91,
 'num_leaves': 7,
 'min_child_samples': 264,
 'cat_smooth': 36}

for fold in range(N_FOLDS):
    is_valid = df.kfold == fold

    X_train = df.loc[~is_valid, useful_features].copy()
    y_train = df.loc[~is_valid, 'target'].copy()

    X_valid = df.loc[is_valid, useful_features].copy()
    y_valid = df.loc[is_valid, 'target'].copy()

    # Fresh copy per fold: the categorical columns are overwritten below with
    # the encoding learned from this fold's training split.
    X_test = df_test[useful_features].copy()

    valid_ids = df.loc[is_valid, 'id'].values.tolist()

    # Fit the encoder on the training split only, then apply it to validation and test.
    oe = OrdinalEncoder()
    X_train[cat_cols] = oe.fit_transform(X_train[cat_cols])
    X_valid[cat_cols] = oe.transform(X_valid[cat_cols])
    X_test[cat_cols] = oe.transform(X_test[cat_cols])

    lgbmr = LGBMRegressor(
        n_jobs=-1,
        **lgbmr_params
    )

    lgbmr.fit(
        X_train, y_train,
        early_stopping_rounds=300,
        eval_set=[(X_valid, y_valid)],
        verbose=2000
    )

    # Out-of-fold predictions, keyed by row id.
    valid_pred = lgbmr.predict(X_valid)
    valid_preds.update(dict(zip(valid_ids, valid_pred)))

    # sqrt(MSE) gives the same value as mean_squared_error(..., squared=False)
    # without relying on the `squared` keyword that newer scikit-learn dropped.
    rmse = np.sqrt(mean_squared_error(y_valid, valid_pred))
    rmse_scores.append(rmse)

    test_pred = lgbmr.predict(X_test)
    test_preds.append(test_pred)

    print(fold, rmse)

print("Mean RMSE: {}".format(np.mean(rmse_scores)))

valid_preds = pd.DataFrame.from_dict(valid_preds, orient="index").reset_index()
valid_preds.columns = ["id", "pred_4"]
valid_preds.to_csv("train_pred_4.csv", index=False)

# Average the per-fold test predictions row-wise.
df_submission_lgbmr = df_submission.copy()
df_submission_lgbmr['target'] = np.mean(np.column_stack(test_preds), axis=1)
df_submission_lgbmr.columns = ["id", "pred_4"]
df_submission_lgbmr.to_csv('test_pred_4.csv', index=False)

## Loading Train and Test Predictions

In [None]:
# Base-model predictions used as features for the blending models.
# Presumably these are the train_pred_*/test_pred_* files written by the
# base-model cells, re-uploaded as a dataset — verify.
df1 = pd.read_csv("../input/30days-blend-data/train_pred_1.csv")
df2 = pd.read_csv("../input/30days-blend-data/train_pred_2.csv")
df3 = pd.read_csv("../input/30days-blend-data/train_pred_3.csv")
df4 = pd.read_csv("../input/30days-blend-data/train_pred_4.csv")

df_test1 = pd.read_csv("../input/30days-blend-data/test_pred_1.csv")
df_test2 = pd.read_csv("../input/30days-blend-data/test_pred_2.csv")
df_test3 = pd.read_csv("../input/30days-blend-data/test_pred_3.csv")
df_test4 = pd.read_csv("../input/30days-blend-data/test_pred_4.csv")

# To blend more base models, load their prediction files above and append
# the frames to these two lists.
train_pred_frames = [df1, df2, df3, df4]
test_pred_frames = [df_test1, df_test2, df_test3, df_test4]

for train_part, test_part in zip(train_pred_frames, test_pred_frames):
    # Drop prediction columns left behind by an earlier run of this cell.
    # Without this, re-running would produce suffixed duplicates
    # (pred_1_x / pred_1_y) and later selections by "pred_1" would fail.
    train_cols = [c for c in train_part.columns if c != "id"]
    df = df.drop(columns=train_cols, errors="ignore").merge(train_part, on="id", how="left")

    test_cols = [c for c in test_part.columns if c != "id"]
    df_test = df_test.drop(columns=test_cols, errors="ignore").merge(test_part, on="id", how="left")

df.head()

In [None]:
# Preview the test frame after the prediction columns were merged in.
df_test.head()

In [None]:
# Shapes of the merged training frame and of one train / one test prediction file.
df.shape, df1.shape, df_test1.shape

In [None]:
# Feature columns for the blending models: one prediction column per base model.
# Placeholders for additional base models: "pred_5", "pred_6", "pred_7", "pred_8", "pred_9", "pred_10"
useful_features3 = ["pred_1", "pred_2", "pred_3", "pred_4"]

## XGBRegressor Hyperparameter Tuning for Blended Dataset

In [None]:
def run_xgbr(trial):
    """Optuna objective: validation RMSE of an XGBRegressor blender on one fold.

    Uses the base-model prediction columns in ``useful_features3`` as the only
    features. Trains on every row of ``df`` whose ``kfold`` differs from the
    fixed ``fold`` and scores on the remaining rows.

    This redefines ``run_xgbr`` from the base-model tuning section; once this
    cell has run, the name refers to this version.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        RMSE of the fitted model on the held-out fold (lower is better).
    """
    # Single fixed validation fold: every trial is scored on the same rows.
    fold = 6

    xgbr_params = {
        'random_state': trial.suggest_int("random_state", 1, 39501, step=500),
        'n_estimators': trial.suggest_int("n_estimators", 500, 40000, step=500),
        'learning_rate': trial.suggest_float("learning_rate", 1e-2, 0.25, log=True),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
        # and matches how learning_rate is sampled above.
        'reg_alpha': trial.suggest_float("reg_alpha", 1e-8, 100.0, log=True),
        'reg_lambda': trial.suggest_float("reg_lambda", 1e-8, 100.0, log=True),
        'colsample_bytree': trial.suggest_float("colsample_bytree", 0.1, 1.0),
        'subsample': trial.suggest_float("subsample", 0.1, 1.0),
        'max_depth': trial.suggest_int("max_depth", 1, 15),
        'booster': 'gbtree'
    }

    is_valid = df.kfold == fold

    X_train = df.loc[~is_valid, useful_features3].copy()
    y_train = df.loc[~is_valid, 'target'].copy()

    X_valid = df.loc[is_valid, useful_features3].copy()
    y_valid = df.loc[is_valid, 'target'].copy()

    xgbr = XGBRegressor(
        tree_method="gpu_hist",
        gpu_id=0,
        predictor="gpu_predictor",
        n_jobs=-1,
        **xgbr_params
    )

    xgbr.fit(
        X_train, y_train,
        early_stopping_rounds=300,
        eval_set=[(X_valid, y_valid)],
        verbose=2000
    )
    valid_pred = xgbr.predict(X_valid)
    # sqrt(MSE) instead of mean_squared_error(..., squared=False): same value,
    # but does not depend on the `squared` keyword that newer scikit-learn dropped.
    rmse = float(np.sqrt(mean_squared_error(y_valid, valid_pred)))
    return rmse

In [None]:
# Run 30 Optuna trials of the blending run_xgbr objective, minimizing RMSE.
# This rebinds `study`, replacing any previous study object.
study = optuna.create_study(direction="minimize")
study.optimize(run_xgbr, n_trials=30)

In [None]:
# Display the hyperparameters of the best trial of the current `study`.
study.best_params

## LightGBMRegressor Hyperparameter Tuning for Blended Dataset

In [None]:
def run_lgbmr(trial):
    """Optuna objective: validation RMSE of an LGBMRegressor blender on one fold.

    Uses the base-model prediction columns in ``useful_features3`` as the only
    features. Trains on every row of ``df`` whose ``kfold`` differs from the
    fixed ``fold`` and scores on the remaining rows.

    This redefines ``run_lgbmr`` from the base-model tuning section; once this
    cell has run, the name refers to this version.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        RMSE of the fitted model on the held-out fold (lower is better).
    """
    # Single fixed validation fold: every trial is scored on the same rows.
    fold = 6

    lgbmr_params = {
        'random_state': trial.suggest_int("random_state", 1, 39501, step=500),
        'metric': 'rmse',
        'objective': 'regression',
        'n_estimators': trial.suggest_int("n_estimators", 500, 30000, step=500),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
        # and matches how learning_rate is sampled below.
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_float("colsample_bytree", 0.1, 1.0),
        # NOTE(review): LightGBM only bags rows when subsample_freq (bagging_freq)
        # is non-zero; it is not set here, so `subsample` may have no effect — confirm.
        'subsample': trial.suggest_float("subsample", 0.1, 1.0),
        'learning_rate': trial.suggest_float("learning_rate", 1e-2, 0.25, log=True),
        'max_depth': trial.suggest_int("max_depth", 1, 100),
        # LightGBM requires num_leaves > 1, so the search starts at 2; sampling 1
        # would make the trial fail.
        'num_leaves' : trial.suggest_int('num_leaves', 2, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
        'cat_smooth' : trial.suggest_int('cat_smooth', 1, 100)
    }

    is_valid = df.kfold == fold

    X_train = df.loc[~is_valid, useful_features3].copy()
    y_train = df.loc[~is_valid, 'target'].copy()

    X_valid = df.loc[is_valid, useful_features3].copy()
    y_valid = df.loc[is_valid, 'target'].copy()

    lgbmr = LGBMRegressor(
        device = 'gpu',
        gpu_platform_id = 0,
        gpu_device_id = 0,
        n_jobs=-1,
        **lgbmr_params
    )

    lgbmr.fit(
        X_train, y_train,
        early_stopping_rounds=300,
        eval_set=[(X_valid, y_valid)],
        verbose=2000
    )
    valid_pred = lgbmr.predict(X_valid)
    # sqrt(MSE) instead of mean_squared_error(..., squared=False): same value,
    # but does not depend on the `squared` keyword that newer scikit-learn dropped.
    rmse = float(np.sqrt(mean_squared_error(y_valid, valid_pred)))
    return rmse

In [None]:
# Run 30 Optuna trials of the blending run_lgbmr objective, minimizing RMSE.
# This rebinds `study`, replacing any previous study object.
study = optuna.create_study(direction="minimize")
study.optimize(run_lgbmr, n_trials=30)

In [None]:
# Display the hyperparameters of the best trial of the current `study`.
study.best_params

## XGBRegressor Predictions for Blended Dataset

In [None]:
# Cross-validated XGBRegressor blender trained on the base-model prediction columns.
# Writes out-of-fold predictions to blend_train_pred_1.csv and fold-averaged
# test predictions to blend_test_pred_1.csv (column "blend_pred_1" in both).
valid_preds = {}
test_preds = []
rmse_scores = []

# Optimized params
xgbr_params = {'random_state': 501,
 'n_estimators': 13500,
 'learning_rate': 0.016009131284386868,
 'reg_alpha': 8.934466248992485e-08,
 'reg_lambda': 1.889567898176322e-07,
 'colsample_bytree': 0.8819901735790897,
 'subsample': 0.3510814160409993,
 'max_depth': 1,
 'booster': 'gbtree'}

# The test matrix does not depend on the fold, so build it once.
X_test = df_test[useful_features3].copy()

for fold in range(N_FOLDS):
    is_valid = df.kfold == fold

    X_train = df.loc[~is_valid, useful_features3].copy()
    y_train = df.loc[~is_valid, 'target'].copy()

    X_valid = df.loc[is_valid, useful_features3].copy()
    y_valid = df.loc[is_valid, 'target'].copy()

    valid_ids = df.loc[is_valid, 'id'].values.tolist()

    xgbr = XGBRegressor(
        n_jobs=-1,
        **xgbr_params
    )

    xgbr.fit(
        X_train, y_train,
        early_stopping_rounds=300,
        eval_set=[(X_valid, y_valid)],
        verbose=2000
    )

    # Out-of-fold predictions, keyed by row id.
    valid_pred = xgbr.predict(X_valid)
    valid_preds.update(dict(zip(valid_ids, valid_pred)))

    # sqrt(MSE) gives the same value as mean_squared_error(..., squared=False)
    # without relying on the `squared` keyword that newer scikit-learn dropped.
    rmse = np.sqrt(mean_squared_error(y_valid, valid_pred))
    rmse_scores.append(rmse)

    test_pred = xgbr.predict(X_test)
    test_preds.append(test_pred)

    print(fold, rmse)

print("Mean RMSE: {}".format(np.mean(rmse_scores)))

valid_preds = pd.DataFrame.from_dict(valid_preds, orient="index").reset_index()
valid_preds.columns = ["id", "blend_pred_1"]
valid_preds.to_csv("blend_train_pred_1.csv", index=False)

# Average the per-fold test predictions row-wise.
df_submission_xgbr = df_submission.copy()
df_submission_xgbr['target'] = np.mean(np.column_stack(test_preds), axis=1)
df_submission_xgbr.columns = ["id", "blend_pred_1"]
df_submission_xgbr.to_csv('blend_test_pred_1.csv', index=False)

## LightGBMRegressor Predictions for Blended Dataset

In [None]:
# Cross-validated LGBMRegressor blender trained on the base-model prediction columns.
# Writes out-of-fold predictions to blend_train_pred_2.csv and fold-averaged
# test predictions to blend_test_pred_2.csv (column "blend_pred_2" in both).
valid_preds = {}
test_preds = []
rmse_scores = []

# Optimized params
lgbmr_params = {'random_state': 31501,
 'n_estimators': 6500,
 'reg_alpha': 0.05821336452138286,
 'reg_lambda': 0.020420330390252723,
 'colsample_bytree': 0.4510821411914482,
 'subsample': 0.24089341054929064,
 'learning_rate': 0.010884158656739853,
 'max_depth': 84,
 'num_leaves': 10,
 'min_child_samples': 143,
 'cat_smooth': 19}

# The test matrix does not depend on the fold, so build it once.
X_test = df_test[useful_features3].copy()

for fold in range(N_FOLDS):
    is_valid = df.kfold == fold

    X_train = df.loc[~is_valid, useful_features3].copy()
    y_train = df.loc[~is_valid, 'target'].copy()

    X_valid = df.loc[is_valid, useful_features3].copy()
    y_valid = df.loc[is_valid, 'target'].copy()

    valid_ids = df.loc[is_valid, 'id'].values.tolist()

    lgbmr = LGBMRegressor(
        n_jobs=-1,
        **lgbmr_params
    )

    lgbmr.fit(
        X_train, y_train,
        early_stopping_rounds=300,
        eval_set=[(X_valid, y_valid)],
        verbose=2000
    )

    # Out-of-fold predictions, keyed by row id.
    valid_pred = lgbmr.predict(X_valid)
    valid_preds.update(dict(zip(valid_ids, valid_pred)))

    # sqrt(MSE) gives the same value as mean_squared_error(..., squared=False)
    # without relying on the `squared` keyword that newer scikit-learn dropped.
    rmse = np.sqrt(mean_squared_error(y_valid, valid_pred))
    rmse_scores.append(rmse)

    test_pred = lgbmr.predict(X_test)
    test_preds.append(test_pred)

    print(fold, rmse)

print("Mean RMSE: {}".format(np.mean(rmse_scores)))

valid_preds = pd.DataFrame.from_dict(valid_preds, orient="index").reset_index()
valid_preds.columns = ["id", "blend_pred_2"]
valid_preds.to_csv("blend_train_pred_2.csv", index=False)

# Average the per-fold test predictions row-wise.
df_submission_lgbmr = df_submission.copy()
df_submission_lgbmr['target'] = np.mean(np.column_stack(test_preds), axis=1)
df_submission_lgbmr.columns = ["id", "blend_pred_2"]
df_submission_lgbmr.to_csv('blend_test_pred_2.csv', index=False)

## Loading Blended Train and Test Predictions

In [None]:
# Blender predictions used as features for the final stacking model.
# Presumably these are the blend_train_pred_*/blend_test_pred_* files written
# by the blending cells, re-uploaded as a dataset — verify.
# Note: df1/df2/df_test1/df_test2 are rebound here and no longer refer to the
# base-model prediction frames.
df1 = pd.read_csv("../input/30days-stack-data/blend_train_pred_1.csv")
df2 = pd.read_csv("../input/30days-stack-data/blend_train_pred_2.csv")

df_test1 = pd.read_csv("../input/30days-stack-data/blend_test_pred_1.csv")
df_test2 = pd.read_csv("../input/30days-stack-data/blend_test_pred_2.csv")

for train_part, test_part in zip([df1, df2], [df_test1, df_test2]):
    # Drop prediction columns left behind by an earlier run of this cell.
    # Without this, re-running would produce suffixed duplicates
    # (blend_pred_1_x / blend_pred_1_y) and later selections by name would fail.
    train_cols = [c for c in train_part.columns if c != "id"]
    df = df.drop(columns=train_cols, errors="ignore").merge(train_part, on="id", how="left")

    test_cols = [c for c in test_part.columns if c != "id"]
    df_test = df_test.drop(columns=test_cols, errors="ignore").merge(test_part, on="id", how="left")

df.head()

In [None]:
# Preview the test frame after the blend prediction columns were merged in.
df_test.head()

In [None]:
# Shapes of the merged training frame and of one train / one test blend prediction file.
df.shape, df1.shape, df_test1.shape

In [None]:
# Feature columns for the final stacking model: one prediction column per blender.
useful_features4 = ["blend_pred_1", "blend_pred_2"]

## LinearRegression Predictions on Stacked Dataset

In [None]:
# Final stacking layer: a LinearRegression fitted per fold on the two blender
# prediction columns. The per-fold test predictions are averaged and written
# to stack_submission_1.csv.
final_predictions = []
scores = []

# The test matrix does not depend on the fold, so build it once.
X_test = df_test[useful_features4].copy()

for fold in range(N_FOLDS):
    is_valid = df.kfold == fold

    X_train = df.loc[~is_valid, useful_features4].copy()
    y_train = df.loc[~is_valid, 'target'].copy()

    X_valid = df.loc[is_valid, useful_features4].copy()
    y_valid = df.loc[is_valid, 'target'].copy()

    model = LinearRegression()
    model.fit(X_train, y_train)

    preds_valid = model.predict(X_valid)
    # Named test_pred (not test_preds) so the per-fold array is not confused
    # with the list of that name used by the earlier prediction cells.
    test_pred = model.predict(X_test)
    final_predictions.append(test_pred)

    # sqrt(MSE) gives the same value as mean_squared_error(..., squared=False)
    # without relying on the `squared` keyword that newer scikit-learn dropped.
    rmse = np.sqrt(mean_squared_error(y_valid, preds_valid))
    scores.append(rmse)
    print(fold, rmse)

print(np.mean(scores), np.std(scores))

# Explicit column assignment; attribute-style assignment only works when the
# column already exists and silently creates a plain attribute otherwise.
df_submission['target'] = np.mean(np.column_stack(final_predictions), axis=1)
df_submission.to_csv("stack_submission_1.csv", index=False)