# Blending to Stacking...

Based on Abhishek Thakur's [notebook "blending blending blending"](https://www.kaggle.com/abhishek/blending-blending-blending)

## Record of Submissions

| Date            | Mean RMSE | Std Dev  | Public Score | Rank | Comment                                                 |
|-----------------|-----------|----------|--------------|------|---------------------------------------------------------|
| 27 Aug at 11:33 | 0.716401  | 0.000905 | 0.71747      | 218  | Passive: Based on Abhishek's blending-blending-blending |
| 27 Aug at 14:19 | 0.716391  | 0.000906 | 0.71744      | 163  | Added untuned `CatBoostRegressor` |
| 27 Aug at 16:25 | 0.716396  | 0.000899 | 0.71745      | 173  | Added untuned `LGBMRegressor`; competition heats up |
| 28 Aug at 10:59 | 0.716268  | 0.000985 | 0.71731      | 147  | First run of stacker; top score is 0.71697 |
| 28 Aug at 21:07 | 0.715997  | 0.000933 | 0.71722      | 48  | Added Level 1 data from [notestack](https://www.kaggle.com/shilpabadge/note-stacking) and [cd6-stacking](https://www.kaggle.com/jhsoft/competition-day-6-stacking10) notebooks |
| 28 Aug at 21:44 | 0.715138  | 0.000925 | 0.71823      | 48  | Added Google AutoML preds again. Score worse. |
| 29 Aug at 18:14 | 0.715990  | 0.000928 | 0.71721      | 52  | `CatBoostRegressor` handling cat data natively |




In [None]:
from pathlib import Path

import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression

from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

---

# Data Loading and Saving

In [None]:
# Directory holding train_folds.csv (training rows with their kfold column).
FOLDS_DATA_DIR = Path("../input/30days-folds")
# Directory holding the competition's test.csv and sample_submission.csv.
TEST_DATA_DIR = Path("../input/30-days-of-ml")
# Directory holding level 1 prediction files 4-9 taken from other notebooks.
L1_DATA = Path("../input/l1-data")

In [None]:
def load_data():
    """Read the fold-annotated training set, the test set and the submission template.

    Returns:
        A 5-tuple ``(df, df_test, sample_submission, useful_features,
        object_cols)``. ``useful_features`` lists every training column
        except ``id``, ``target`` and ``kfold``; ``object_cols`` is the subset
        of those whose name contains ``"cat"``; ``df_test`` is narrowed to
        ``useful_features`` (so it no longer carries an ``id`` column).
    """
    train_frame = pd.read_csv(FOLDS_DATA_DIR / "train_folds.csv")
    test_frame = pd.read_csv(TEST_DATA_DIR / "test.csv")
    submission_template = pd.read_csv(TEST_DATA_DIR / "sample_submission.csv")

    non_feature_cols = ("id", "target", "kfold")
    useful_features = []
    object_cols = []
    for name in train_frame.columns:
        if name in non_feature_cols:
            continue
        useful_features.append(name)
        if 'cat' in name:
            object_cols.append(name)

    test_frame = test_frame[useful_features]

    return train_frame, test_frame, submission_template, useful_features, object_cols

In [None]:
def save_data(final_valid_predictions, final_test_predictions, sample_submission, level, pred_number):
    """Write out-of-fold and fold-averaged test predictions as CSV files.

    Two files are written to the current working directory,
    ``level{level}_train_pred_{pred_number}.csv`` and
    ``level{level}_test_pred_{pred_number}.csv``, each with the columns
    ``id`` and ``pred_{pred_number}``.

    Args:
        final_valid_predictions: Mapping of training-row id to its
            out-of-fold prediction.
        final_test_predictions: Sequence of equal-length 1-D prediction
            arrays, one per fold; they are averaged element-wise.
        sample_submission: Two-column DataFrame (``id``, ``target``) used as
            the template for the test file. It is not modified.
        level: Stacking level, used in the output file names.
        pred_number: Model number, used in the file and column names.
    """
    pred_col = f"pred_{ pred_number }"

    valid_frame = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
    valid_frame.columns = ["id", pred_col]
    valid_frame.to_csv(f"level{ level }_train_pred_{ pred_number }.csv", index=False)

    # Work on a copy so the caller's template keeps its original column
    # names and target values.
    test_frame = sample_submission.copy()
    test_frame["target"] = np.mean(np.column_stack(final_test_predictions), axis=1)
    test_frame.columns = ["id", pred_col]
    test_frame.to_csv(f"level{ level }_test_pred_{ pred_number }.csv", index=False)

    print("\nSaved prediction files to output directory.")

---

# Models Included in the Blend

## XGBRegressor Model #1

- Mean RMSE score: 0.716423430212675 
- Standard Deviation: 0.0008883537054028858

In [None]:
# Level 0, model 1: XGBRegressor trained per fold on ordinal-encoded features.
df, df_test, sample_submission, useful_features, object_cols = load_data()

# Hyper-parameters are identical for every fold, so build them once.
params = {
    'random_state': 1,
    'booster': 'gbtree',
    'n_estimators': 10000,
    'learning_rate': 0.03628302216953097,
    'reg_lambda': 0.0008746338866473539,
    'reg_alpha': 23.13181079976304,
    'subsample': 0.7875490025178415,
    'colsample_bytree': 0.11807135201147481,
    'max_depth': 3
}

final_test_predictions = []   # one array of test predictions per fold
final_valid_predictions = {}  # training-row id -> out-of-fold prediction
scores = []                   # per-fold validation RMSE
for fold in range(5):
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    valid_ids = xvalid.id.values.tolist()

    ytrain = xtrain.target
    yvalid = xvalid.target

    # Explicit copies: the categorical columns are overwritten below, and
    # assigning into a bare column selection can trigger pandas'
    # SettingWithCopyWarning.
    xtrain = xtrain[useful_features].copy()
    xvalid = xvalid[useful_features].copy()

    # Fit the encoder on the training fold only, then reuse it for the
    # validation fold and the test set.
    ordinal_encoder = preprocessing.OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])

    model = XGBRegressor(
        n_jobs=4,
        **params
    )
    model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_test_predictions.append(test_preds)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))
    # RMSE as sqrt(MSE): the `squared=False` flag is deprecated and no longer
    # accepted by newer scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

save_data(final_valid_predictions, final_test_predictions, sample_submission, level=0, pred_number=1)

## XGBRegressor Model #2

- Mean RMSE score: 0.716751801740166  
- Standard Deviation: 0.0009449448387318704

In [None]:
# Level 0, model 2: a second XGBRegressor with different hyper-parameters
# and a per-fold random seed.
df, df_test, sample_submission, useful_features, object_cols = load_data()

# Hyper-parameters shared by every fold; the seed is set per fold below.
params = {
    'learning_rate': 0.07853392035787837,
    'reg_lambda': 1.7549293092194938e-05,
    'reg_alpha': 14.68267919457715,
    'subsample': 0.8031450486786944,
    'colsample_bytree': 0.170759104940733,
    'max_depth': 3
}

final_test_predictions = []   # one array of test predictions per fold
final_valid_predictions = {}  # training-row id -> out-of-fold prediction
scores = []                   # per-fold validation RMSE
for fold in range(5):
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    valid_ids = xvalid.id.values.tolist()

    ytrain = xtrain.target
    yvalid = xvalid.target

    # Explicit copies: the categorical columns are overwritten below, and
    # assigning into a bare column selection can trigger pandas'
    # SettingWithCopyWarning.
    xtrain = xtrain[useful_features].copy()
    xvalid = xvalid[useful_features].copy()

    # Fit the encoder on the training fold only, then reuse it for the
    # validation fold and the test set.
    ordinal_encoder = preprocessing.OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])

    model = XGBRegressor(
        random_state=fold,
        n_jobs=4,
        n_estimators=5000,
        **params
    )
    model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)

    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_test_predictions.append(test_preds)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))
    # RMSE as sqrt(MSE): the `squared=False` flag is deprecated and no longer
    # accepted by newer scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

save_data(final_valid_predictions, final_test_predictions, sample_submission, level=0, pred_number=2)

## CatBoostRegressor

Loss indicator improved after allowing `CatBoostRegressor` to run on CPU and to handle categorical features natively, rather than receiving them with ordinal encoding.

- Mean RMSE score: ~~0.7357652266177294~~ 0.72463705597803   
- Standard Deviation: ~~0.0005995252889098014~~ 0.0006605264189716528

In [None]:
# Level 0, model 3: CatBoostRegressor fed the raw categorical columns
# (no ordinal encoding) via `cat_features`.
df, df_test, sample_submission, useful_features, object_cols = load_data()

final_test_predictions = []   # one array of test predictions per fold
final_valid_predictions = {}  # training-row id -> out-of-fold prediction
scores = []                   # per-fold validation RMSE
for fold in range(5):
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    valid_ids = xvalid.id.values.tolist()

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    # Built inside the loop because the seed depends on the fold.
    params = dict(
        # Abhishek reported (rightly) that running on CPU is more accurate
        #task_type = "GPU",
        #devices = '0',
        cat_features = object_cols,
        iterations=6800,
        learning_rate=0.07853392035787837,
        loss_function="RMSE",
        verbose=1000,
        thread_count=4,
        depth=1,
        l2_leaf_reg=3.28,
        random_state=fold,
    )

    model = CatBoostRegressor( **params )
    model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)])

    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_test_predictions.append(test_preds)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))
    # RMSE as sqrt(MSE): the `squared=False` flag is deprecated and no longer
    # accepted by newer scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

save_data(final_valid_predictions, final_test_predictions, sample_submission, level=0, pred_number=3)

## LGBMRegressor

- Mean RMSE score: nnn   
- Standard Deviation: nnn

In [None]:
# Level 0, model 4: LGBMRegressor trained per fold on ordinal-encoded features.
df, df_test, sample_submission, useful_features, object_cols = load_data()

final_test_predictions = []   # one array of test predictions per fold
final_valid_predictions = {}  # training-row id -> out-of-fold prediction
scores = []                   # per-fold validation RMSE
for fold in range(5):
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    valid_ids = xvalid.id.values.tolist()

    ytrain = xtrain.target
    yvalid = xvalid.target

    # Explicit copies: the categorical columns are overwritten below, and
    # assigning into a bare column selection can trigger pandas'
    # SettingWithCopyWarning.
    xtrain = xtrain[useful_features].copy()
    xvalid = xvalid[useful_features].copy()

    # Fit the encoder on the training fold only, then reuse it for the
    # validation fold and the test set.
    ordinal_encoder = preprocessing.OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])

    # baseline model of https://www.kaggle.com/xaviernuelgavald/xgboost-and-lgbm-comparison
    # Built inside the loop because the seed depends on the fold.
    params = dict(
        n_estimators = 3000,
        learning_rate = 0.05,
        num_leaves = 256,
        max_depth = 7,
        min_data_in_leaf = 40,
        # Abhishek reported that running on CPU is more accurate
        #device='gpu',
        random_state=fold,
        verbose = -1,
    )

    model = LGBMRegressor(**params)
    model.fit(xtrain, ytrain,
              early_stopping_rounds=300,
              eval_set=[(xvalid, yvalid)],
              verbose=False,
             )

    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_test_predictions.append(test_preds)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))
    # RMSE as sqrt(MSE): the `squared=False` flag is deprecated and no longer
    # accepted by newer scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

save_data(final_valid_predictions, final_test_predictions, sample_submission, level=0, pred_number=4)

---

# Blending Stage: Level 0 Submission


In [None]:
# Attach every level 0 model's predictions to the train and test frames,
# matching rows on "id".

df = pd.read_csv(FOLDS_DATA_DIR / "train_folds.csv")
df_test = pd.read_csv(TEST_DATA_DIR / "test.csv")
sample_submission = pd.read_csv(TEST_DATA_DIR / "sample_submission.csv")

for model_no in (1, 2, 3, 4):
    train_part = pd.read_csv(f"level0_train_pred_{ model_no }.csv")
    test_part = pd.read_csv(f"level0_test_pred_{ model_no }.csv")

    df = pd.merge(df, train_part, on="id", how="left")
    df_test = pd.merge(df_test, test_part, on="id", how="left")

In [None]:
# Blend the four level 0 prediction columns with a per-fold LinearRegression.
useful_features = ["pred_1", "pred_2", "pred_3", "pred_4"]
df_test = df_test[useful_features]

final_predictions = []  # one array of blended test predictions per fold
scores = []             # per-fold validation RMSE
for fold in range(5):
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    model = LinearRegression()
    model.fit(xtrain, ytrain)

    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_predictions.append(test_preds)
    # RMSE as sqrt(MSE): the `squared=False` flag is deprecated and no longer
    # accepted by newer scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

In [None]:
# Average the per-fold test predictions row-wise and write the submission.
# Item assignment guarantees the values land in the "target" column;
# attribute assignment would only set a Python attribute if it were missing.
sample_submission["target"] = np.mean(np.column_stack(final_predictions), axis=1)
sample_submission.to_csv("submission_level_0.csv", index=False)

---

# Stacking Stage: Level 1 Predictions


In [None]:
# Start again from fresh train/test frames and re-attach the level 0
# predictions, matching rows on "id".

df = pd.read_csv(FOLDS_DATA_DIR / "train_folds.csv")
df_test = pd.read_csv(TEST_DATA_DIR / "test.csv")

for model_no in (1, 2, 3, 4):
    train_part = pd.read_csv(f"level0_train_pred_{ model_no }.csv")
    test_part = pd.read_csv(f"level0_test_pred_{ model_no }.csv")

    df = pd.merge(df, train_part, on="id", how="left")
    df_test = pd.merge(df_test, test_part, on="id", how="left")

## L1 Stacking: XGBRegressor

In [None]:
# Level 1 stacker 1: XGBRegressor trained on the level 0 prediction columns.
sample_submission = pd.read_csv(TEST_DATA_DIR / "sample_submission.csv")
useful_features = ["pred_1", "pred_2", "pred_3", "pred_4"]
df_test = df_test[useful_features]

# Hyper-parameters are identical for every fold, so build them once.
params = {
    'random_state': 1,
    'booster': 'gbtree',
    'n_estimators': 7000,
    'learning_rate': 0.03,
    'max_depth': 2
}

final_test_predictions = []   # one array of test predictions per fold
final_valid_predictions = {}  # training-row id -> out-of-fold prediction
scores = []                   # per-fold validation RMSE
for fold in range(5):
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    valid_ids = xvalid.id.values.tolist()

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    model = XGBRegressor(
        n_jobs=4,
        **params
    )
    model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_test_predictions.append(test_preds)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))
    # RMSE as sqrt(MSE): the `squared=False` flag is deprecated and no longer
    # accepted by newer scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

save_data(final_valid_predictions, final_test_predictions, sample_submission, level=1, pred_number=1)

## L1 Stacking: RandomForestRegressor

In [None]:
# Level 1 stacker 2: RandomForestRegressor trained on the level 0
# prediction columns.
sample_submission = pd.read_csv(TEST_DATA_DIR / "sample_submission.csv")
useful_features = ["pred_1", "pred_2", "pred_3", "pred_4"]
df_test = df_test[useful_features]

final_test_predictions = []   # one array of test predictions per fold
final_valid_predictions = {}  # training-row id -> out-of-fold prediction
scores = []                   # per-fold validation RMSE
for fold in range(5):
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    valid_ids = xvalid.id.values.tolist()

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    # Seeded (per fold, like the other models in this notebook) so repeated
    # runs reproduce the same scores and prediction files.
    model = RandomForestRegressor(n_estimators=500, n_jobs=-1, max_depth=3, random_state=fold)
    model.fit(xtrain, ytrain)
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_test_predictions.append(test_preds)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))
    # RMSE as sqrt(MSE): the `squared=False` flag is deprecated and no longer
    # accepted by newer scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

save_data(final_valid_predictions, final_test_predictions, sample_submission, level=1, pred_number=2)

## L1 Stacking: GradientBoostingRegressor

In [None]:
# Level 1 stacker 3: GradientBoostingRegressor trained on the level 0
# prediction columns.
sample_submission = pd.read_csv(TEST_DATA_DIR / "sample_submission.csv")
useful_features = ["pred_1", "pred_2", "pred_3", "pred_4"]
df_test = df_test[useful_features]

final_test_predictions = []   # one array of test predictions per fold
final_valid_predictions = {}  # training-row id -> out-of-fold prediction
scores = []                   # per-fold validation RMSE
for fold in range(5):
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    valid_ids = xvalid.id.values.tolist()

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    # Seeded (per fold, like the other models in this notebook) so repeated
    # runs reproduce the same scores and prediction files.
    model = GradientBoostingRegressor(n_estimators=500, max_depth=3, random_state=fold)
    model.fit(xtrain, ytrain)
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_test_predictions.append(test_preds)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))
    # RMSE as sqrt(MSE): the `squared=False` flag is deprecated and no longer
    # accepted by newer scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

save_data(final_valid_predictions, final_test_predictions, sample_submission, level=1, pred_number=3)

---

# L1 Stacking: LinearRegression Submission

At this stage, some public domain datasets are added to the mix from other notebooks with thanks to @shilpabadge and @jhsoft.

- https://www.kaggle.com/shilpabadge/note-stacking
- https://www.kaggle.com/jhsoft/competition-day-6-stacking10/notebook

In [None]:
# Build the level 1 blending frames: fresh train/test data with nine level 1
# prediction columns merged in on "id".
df = pd.read_csv(FOLDS_DATA_DIR / "train_folds.csv")
df_test = pd.read_csv(TEST_DATA_DIR / "test.csv")
sample_submission = pd.read_csv(TEST_DATA_DIR / "sample_submission.csv")

# Predictions 1-3 were written to the working directory by the stacking
# cells above; predictions 4-9 are read from the L1_DATA input directory.
for fn in range(1, 10):
    pred_dir = Path(".") if fn <= 3 else L1_DATA

    tmp_df = pd.read_csv(pred_dir / f"level1_train_pred_{ fn }.csv")
    df = df.merge(tmp_df, on="id", how="left")

    tmp_test_df = pd.read_csv(pred_dir / f"level1_test_pred_{ fn }.csv")
    df_test = df_test.merge(tmp_test_df, on="id", how="left")

df.head()

In [None]:
# Blend every merged "pred_*" column with a per-fold LinearRegression.
useful_features = [c for c in df.columns if c.startswith('pred_')]
df_test = df_test[useful_features]

final_predictions = []  # one array of blended test predictions per fold
scores = []             # per-fold validation RMSE
for fold in range(5):
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    model = LinearRegression()
    model.fit(xtrain, ytrain)

    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_predictions.append(test_preds)
    # RMSE as sqrt(MSE): the `squared=False` flag is deprecated and no longer
    # accepted by newer scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

In [None]:
# Average the per-fold test predictions row-wise and write the submission.
# Item assignment guarantees the values land in the "target" column;
# attribute assignment would only set a Python attribute if it were missing.
sample_submission["target"] = np.mean(np.column_stack(final_predictions), axis=1)
sample_submission.to_csv("submission_level_1.csv", index=False)

---

# Stacking Stage: Level 2 Predictions

This is an experiment to generate yet another blend, possibly risking over-fitting... every effort made below here went crazy.


In [None]:
# Attach the three level 1 stacker outputs written by this notebook to fresh
# train and test frames, matching rows on "id".

df = pd.read_csv(FOLDS_DATA_DIR / "train_folds.csv")
df_test = pd.read_csv(TEST_DATA_DIR / "test.csv")

for model_no in (1, 2, 3):
    train_part = pd.read_csv(f"level1_train_pred_{ model_no }.csv")
    test_part = pd.read_csv(f"level1_test_pred_{ model_no }.csv")

    df = pd.merge(df, train_part, on="id", how="left")
    df_test = pd.merge(df_test, test_part, on="id", how="left")

In [None]:
# Level 2 stacker 1: XGBRegressor trained on the level 1 prediction columns.
sample_submission = pd.read_csv(TEST_DATA_DIR / "sample_submission.csv")
useful_features = ["pred_1", "pred_2", "pred_3"]
df_test = df_test[useful_features]

# Hyper-parameters are identical for every fold, so build them once.
params = {
    'random_state': 1,
    'booster': 'gbtree',
    'n_estimators': 7000,
    'learning_rate': 0.03,
    'max_depth': 2
}

final_test_predictions = []   # one array of test predictions per fold
final_valid_predictions = {}  # training-row id -> out-of-fold prediction
scores = []                   # per-fold validation RMSE
for fold in range(5):
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    valid_ids = xvalid.id.values.tolist()

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    model = XGBRegressor(
        n_jobs=4,
        **params
    )
    model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_test_predictions.append(test_preds)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))
    # RMSE as sqrt(MSE): the `squared=False` flag is deprecated and no longer
    # accepted by newer scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

save_data(final_valid_predictions, final_test_predictions, sample_submission, level=2, pred_number=1)

In [None]:
# Level 2 stacker 2: RandomForestRegressor trained on the level 1
# prediction columns.
sample_submission = pd.read_csv(TEST_DATA_DIR / "sample_submission.csv")
useful_features = ["pred_1", "pred_2", "pred_3"]
df_test = df_test[useful_features]

final_test_predictions = []   # one array of test predictions per fold
final_valid_predictions = {}  # training-row id -> out-of-fold prediction
scores = []                   # per-fold validation RMSE
for fold in range(5):
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    valid_ids = xvalid.id.values.tolist()

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    # Seeded (per fold, like the other models in this notebook) so repeated
    # runs reproduce the same scores and prediction files.
    model = RandomForestRegressor(n_estimators=500, n_jobs=-1, max_depth=3, random_state=fold)
    model.fit(xtrain, ytrain)
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_test_predictions.append(test_preds)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))
    # RMSE as sqrt(MSE): the `squared=False` flag is deprecated and no longer
    # accepted by newer scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

save_data(final_valid_predictions, final_test_predictions, sample_submission, level=2, pred_number=2)

In [None]:
# Level 2 stacker 3: GradientBoostingRegressor trained on the level 1
# prediction columns.
sample_submission = pd.read_csv(TEST_DATA_DIR / "sample_submission.csv")
useful_features = ["pred_1", "pred_2", "pred_3"]
df_test = df_test[useful_features]

final_test_predictions = []   # one array of test predictions per fold
final_valid_predictions = {}  # training-row id -> out-of-fold prediction
scores = []                   # per-fold validation RMSE
for fold in range(5):
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    valid_ids = xvalid.id.values.tolist()

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    # Seeded (per fold, like the other models in this notebook) so repeated
    # runs reproduce the same scores and prediction files.
    model = GradientBoostingRegressor(n_estimators=500, max_depth=3, random_state=fold)
    model.fit(xtrain, ytrain)
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_test_predictions.append(test_preds)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))
    # RMSE as sqrt(MSE): the `squared=False` flag is deprecated and no longer
    # accepted by newer scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

save_data(final_valid_predictions, final_test_predictions, sample_submission, level=2, pred_number=3)

In [None]:
# Build the level 2 blending frames from fresh train/test data.
df = pd.read_csv(FOLDS_DATA_DIR / "train_folds.csv")
df_test = pd.read_csv(TEST_DATA_DIR / "test.csv")
sample_submission = pd.read_csv(TEST_DATA_DIR / "sample_submission.csv")

# Only level 2 predictions 1 and 2 are merged; prediction 3 is deliberately
# left out of this blend.
for model_no in (1, 2):
    train_part = pd.read_csv(f"level2_train_pred_{ model_no }.csv")
    df = pd.merge(df, train_part, on="id", how="left")

for model_no in (1, 2):
    test_part = pd.read_csv(f"level2_test_pred_{ model_no }.csv")
    df_test = pd.merge(df_test, test_part, on="id", how="left")

df.head()

In [None]:
# Blend every merged "pred_*" column with a per-fold LinearRegression.
useful_features = [c for c in df.columns if c.startswith('pred_')]
df_test = df_test[useful_features]

final_predictions = []  # one array of blended test predictions per fold
scores = []             # per-fold validation RMSE
for fold in range(5):
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    model = LinearRegression()
    model.fit(xtrain, ytrain)

    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_predictions.append(test_preds)
    # RMSE as sqrt(MSE): the `squared=False` flag is deprecated and no longer
    # accepted by newer scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

In [None]:
# Average the per-fold test predictions row-wise and write the submission.
# Item assignment guarantees the values land in the "target" column;
# attribute assignment would only set a Python attribute if it were missing.
sample_submission["target"] = np.mean(np.column_stack(final_predictions), axis=1)
sample_submission.to_csv("submission_level_2.csv", index=False)