In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Level-0 base model 6: 5-fold XGBoost on the ordinal-encoded raw features.
# Outputs:
#   train_pred_6.csv - out-of-fold predictions keyed by id (column "pred_6")
#   test_pred_6.csv  - test predictions averaged over the 5 fold models
df = pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

# Every column except the row id, the label and the fold assignment is a feature;
# feature columns whose name contains "cat" are ordinal-encoded below.
useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if 'cat' in col]
df_test = df_test[useful_features]

final_test_predictions = []   # one array of test predictions per fold
final_valid_predictions = {}  # id -> out-of-fold prediction
scores = []                   # validation RMSE per fold
for fold in range(5):
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    valid_ids = xvalid.id.values.tolist()

    ytrain = xtrain.target
    yvalid = xvalid.target

    # Take explicit copies of the feature slices: the encoded columns are
    # assigned into these frames next, and writing into a bare column-slice
    # raises pandas' SettingWithCopyWarning.
    xtrain = xtrain[useful_features].copy()
    xvalid = xvalid[useful_features].copy()

    # The encoder is fitted on the training part of the fold only and then
    # applied unchanged to the validation and test frames.
    ordinal_encoder = preprocessing.OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])

    params = {
            'n_estimators': 4000,
            'learning_rate': 0.11201986769683442,
            'max_depth': 2,
            'subsample': 0.95,
            'colsample_bytree': 0.31641470495382984,
            'min_child_weight': 29,
            # Left disabled as in the original; note the key is misspelled
            # ('gammma'), so it would not set XGBoost's `gamma` if re-enabled.
#             'gammma': 0.002281092454035713,
            'reg_alpha': 24.755545205748753,
            'reg_lambda': 0.0019617732677993317,
    }

    # random_state=fold matches the other base-model cells (pred_7 .. pred_10),
    # which all seed the booster with the fold number.
    model = XGBRegressor(
        random_state=fold,
        tree_method='gpu_hist',
        gpu_id=0,
        n_jobs=4,
        **params
    )
    # NOTE(review): early stopping monitors the same fold that is scored below,
    # so the reported RMSE is probably slightly optimistic.
    model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_test_predictions.append(test_preds)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))
    rmse = mean_squared_error(yvalid, preds_valid, squared=False)
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))
# Turn the id -> prediction mapping into a two-column frame and persist it.
final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_6"]
final_valid_predictions.to_csv("train_pred_6.csv", index=False)

# Average the per-fold test predictions row-wise and save them under the same column name.
sample_submission.target = np.mean(np.column_stack(final_test_predictions), axis=1)
sample_submission.columns = ["id", "pred_6"]
sample_submission.to_csv("test_pred_6.csv", index=False)

In [None]:
# Level-0 base model 7: 5-fold XGBoost on the ordinal-encoded raw features.
# Outputs:
#   train_pred_7.csv - out-of-fold predictions keyed by id (column "pred_7")
#   test_pred_7.csv  - test predictions averaged over the 5 fold models
df = pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

# Every column except the row id, the label and the fold assignment is a feature;
# feature columns whose name contains "cat" are ordinal-encoded below.
useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if 'cat' in col]
df_test = df_test[useful_features]

final_test_predictions = []   # one array of test predictions per fold
final_valid_predictions = {}  # id -> out-of-fold prediction
scores = []                   # validation RMSE per fold
for fold in range(5):
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    valid_ids = xvalid.id.values.tolist()

    ytrain = xtrain.target
    yvalid = xvalid.target

    # Explicit copies: the encoded columns are written into these frames, and
    # assigning into a bare column-slice raises SettingWithCopyWarning.
    xtrain = xtrain[useful_features].copy()
    xvalid = xvalid[useful_features].copy()

    # Fit the encoder on the training part only; reuse it for validation and test.
    ordinal_encoder = preprocessing.OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])

    params = {
            'n_estimators': 5500,
            'learning_rate': 0.08513702795576736,
            'max_depth': 3,
            'subsample': 0.84,
            'colsample_bytree': 0.4306550188086389,
            'min_child_weight': 62,
            # Left disabled as in the original; note the key is misspelled
            # ('gammma'), so it would not set XGBoost's `gamma` if re-enabled.
#             'gammma': 2.5255984324552263,
            'reg_alpha': 0.01767047929131846,
            'reg_lambda': 0.099725265314572,

    }

    # A different seed per fold (the fold number itself).
    model = XGBRegressor(
        random_state=fold,
        tree_method='gpu_hist',
        gpu_id=0,
        n_jobs=4,
        **params
    )
    # NOTE(review): early stopping monitors the same fold that is scored below,
    # so the reported RMSE is probably slightly optimistic.
    model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)

    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_test_predictions.append(test_preds)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))
    rmse = mean_squared_error(yvalid, preds_valid, squared=False)
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))
# Turn the id -> prediction mapping into a two-column frame and persist it.
final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_7"]
final_valid_predictions.to_csv("train_pred_7.csv", index=False)

# Average the per-fold test predictions row-wise and save them under the same column name.
sample_submission.target = np.mean(np.column_stack(final_test_predictions), axis=1)
sample_submission.columns = ["id", "pred_7"]
sample_submission.to_csv("test_pred_7.csv", index=False)

In [None]:
# Level-0 base model 8: 5-fold XGBoost on the ordinal-encoded raw features.
# Outputs:
#   train_pred_8.csv - out-of-fold predictions keyed by id (column "pred_8")
#   test_pred_8.csv  - test predictions averaged over the 5 fold models
df = pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

# Every column except the row id, the label and the fold assignment is a feature;
# feature columns whose name contains "cat" are ordinal-encoded below.
useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if 'cat' in col]
df_test = df_test[useful_features]

final_test_predictions = []   # one array of test predictions per fold
final_valid_predictions = {}  # id -> out-of-fold prediction
scores = []                   # validation RMSE per fold
for fold in range(5):
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    valid_ids = xvalid.id.values.tolist()

    ytrain = xtrain.target
    yvalid = xvalid.target

    # Explicit copies: the encoded columns are written into these frames, and
    # assigning into a bare column-slice raises SettingWithCopyWarning.
    xtrain = xtrain[useful_features].copy()
    xvalid = xvalid[useful_features].copy()

    # Fit the encoder on the training part only; reuse it for validation and test.
    ordinal_encoder = preprocessing.OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])

    params = {
            'n_estimators': 6000,
            'learning_rate': 0.03854075470695709,
            'max_depth': 4,
            'subsample': 0.78,
            'colsample_bytree': 0.1752978982571639,
            'min_child_weight': 37,
            # Left disabled as in the original; note the key is misspelled
            # ('gammma'), so it would not set XGBoost's `gamma` if re-enabled.
#             'gammma': 3.8394925536670776e-07,
            'reg_alpha': 0.000697976480249658,
            'reg_lambda': 0.04651536374944249,
    }

    # A different seed per fold (the fold number itself).
    model = XGBRegressor(
        random_state=fold,
        tree_method='gpu_hist',
        gpu_id=0,
        n_jobs=4,
        **params
    )
    # NOTE(review): early stopping monitors the same fold that is scored below,
    # so the reported RMSE is probably slightly optimistic.
    model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)

    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_test_predictions.append(test_preds)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))
    rmse = mean_squared_error(yvalid, preds_valid, squared=False)
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))
# Turn the id -> prediction mapping into a two-column frame and persist it.
final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_8"]
final_valid_predictions.to_csv("train_pred_8.csv", index=False)

# Average the per-fold test predictions row-wise and save them under the same column name.
sample_submission.target = np.mean(np.column_stack(final_test_predictions), axis=1)
sample_submission.columns = ["id", "pred_8"]
sample_submission.to_csv("test_pred_8.csv", index=False)

In [None]:
# Level-0 base model 9: 5-fold XGBoost on the ordinal-encoded raw features.
# Outputs:
#   train_pred_9.csv - out-of-fold predictions keyed by id (column "pred_9")
#   test_pred_9.csv  - test predictions averaged over the 5 fold models
df = pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

# Every column except the row id, the label and the fold assignment is a feature;
# feature columns whose name contains "cat" are ordinal-encoded below.
useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if 'cat' in col]
df_test = df_test[useful_features]

final_test_predictions = []   # one array of test predictions per fold
final_valid_predictions = {}  # id -> out-of-fold prediction
scores = []                   # validation RMSE per fold
for fold in range(5):
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    valid_ids = xvalid.id.values.tolist()

    ytrain = xtrain.target
    yvalid = xvalid.target

    # Explicit copies: the encoded columns are written into these frames, and
    # assigning into a bare column-slice raises SettingWithCopyWarning.
    xtrain = xtrain[useful_features].copy()
    xvalid = xvalid[useful_features].copy()

    # Fit the encoder on the training part only; reuse it for validation and test.
    ordinal_encoder = preprocessing.OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])

    params = {
            'n_estimators': 10000,
            'learning_rate': 0.026023888072592722,
            'max_depth': 4,
            'subsample': 0.7267737013954487,
            'colsample_bytree': 0.15329790252866196,
            'min_child_weight': 17,
            # Left disabled as in the original; note the key is misspelled
            # ('gammma'), so it would not set XGBoost's `gamma` if re-enabled.
#             'gammma': 4.272786590161498e-08,
            'reg_alpha': 16.696943145087175,
            'reg_lambda': 5.518560426875543e-06,
    }

    # A different seed per fold (the fold number itself).
    model = XGBRegressor(
        random_state=fold,
        tree_method='gpu_hist',
        gpu_id=0,
        n_jobs=4,
        **params
    )
    # NOTE(review): early stopping monitors the same fold that is scored below,
    # so the reported RMSE is probably slightly optimistic.
    model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)

    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_test_predictions.append(test_preds)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))
    rmse = mean_squared_error(yvalid, preds_valid, squared=False)
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))
# Turn the id -> prediction mapping into a two-column frame and persist it.
final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_9"]
final_valid_predictions.to_csv("train_pred_9.csv", index=False)

# Average the per-fold test predictions row-wise and save them under the same column name.
sample_submission.target = np.mean(np.column_stack(final_test_predictions), axis=1)
sample_submission.columns = ["id", "pred_9"]
sample_submission.to_csv("test_pred_9.csv", index=False)

In [None]:
# Level-0 base model 10: 5-fold XGBoost on the ordinal-encoded raw features.
# Outputs:
#   train_pred_10.csv - out-of-fold predictions keyed by id (column "pred_10")
#   test_pred_10.csv  - test predictions averaged over the 5 fold models
df = pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

# Every column except the row id, the label and the fold assignment is a feature;
# feature columns whose name contains "cat" are ordinal-encoded below.
useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if 'cat' in col]
df_test = df_test[useful_features]

final_test_predictions = []   # one array of test predictions per fold
final_valid_predictions = {}  # id -> out-of-fold prediction
scores = []                   # validation RMSE per fold
for fold in range(5):
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    valid_ids = xvalid.id.values.tolist()

    ytrain = xtrain.target
    yvalid = xvalid.target

    # Explicit copies: the encoded columns are written into these frames, and
    # assigning into a bare column-slice raises SettingWithCopyWarning.
    xtrain = xtrain[useful_features].copy()
    xvalid = xvalid[useful_features].copy()

    # Fit the encoder on the training part only; reuse it for validation and test.
    ordinal_encoder = preprocessing.OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])

    params = {
            'n_estimators': 9500,
            'learning_rate': 0.016568168505392528,
            'max_depth': 4,
            'subsample': 0.634269259918919,
            'colsample_bytree': 0.10129905466017634,
            'min_child_weight': 45,
            # Left disabled as in the original; note the key is misspelled
            # ('gammma'), so it would not set XGBoost's `gamma` if re-enabled.
#             'gammma': 1.0082704930240635e-06,
            'reg_alpha': 0.01979371230334188,
            'reg_lambda': 6.5066354761049e-08,
    }

    # A different seed per fold (the fold number itself).
    model = XGBRegressor(
        random_state=fold,
        tree_method='gpu_hist',
        gpu_id=0,
        n_jobs=4,
        **params
    )
    # NOTE(review): early stopping monitors the same fold that is scored below,
    # so the reported RMSE is probably slightly optimistic.
    model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)

    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_test_predictions.append(test_preds)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))
    rmse = mean_squared_error(yvalid, preds_valid, squared=False)
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))
# Turn the id -> prediction mapping into a two-column frame and persist it.
final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_10"]
final_valid_predictions.to_csv("train_pred_10.csv", index=False)

# Average the per-fold test predictions row-wise and save them under the same column name.
sample_submission.target = np.mean(np.column_stack(final_test_predictions), axis=1)
sample_submission.columns = ["id", "pred_10"]
sample_submission.to_csv("test_pred_10.csv", index=False)

In [None]:
# Build the level-1 training/test frames: reload the fold and test tables and
# attach each level-0 prediction column (pred_6 .. pred_10) by id.
df = pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

# Only base models 6-10 are used; the read/merge lines for models 1-5 were
# commented out in the original cell and are therefore not loaded here.
df6, df7, df8, df9, df10 = (
    pd.read_csv(f"train_pred_{model_no}.csv") for model_no in range(6, 11)
)
df_test6, df_test7, df_test8, df_test9, df_test10 = (
    pd.read_csv(f"test_pred_{model_no}.csv") for model_no in range(6, 11)
)

# Left joins keep every row of the base tables, in model order 6 -> 10.
for oof_frame in (df6, df7, df8, df9, df10):
    df = df.merge(oof_frame, on="id", how="left")

for test_frame in (df_test6, df_test7, df_test8, df_test9, df_test10):
    df_test = df_test.merge(test_frame, on="id", how="left")


df.head()

In [None]:
# sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")
# useful_features = ["pred_1", "pred_2", "pred_3", "pred_4", "pred_5"]
# df_test = df_test[useful_features]

# final_predictions = []
# scores = []
# for fold in range(5):
#     xtrain =  df[df.kfold != fold].reset_index(drop=True)
#     xvalid = df[df.kfold == fold].reset_index(drop=True)
#     xtest = df_test.copy()

#     ytrain = xtrain.target
#     yvalid = xvalid.target
    
#     xtrain = xtrain[useful_features]
#     xvalid = xvalid[useful_features]
    
#     model = LinearRegression()
#     model.fit(xtrain, ytrain)
    
#     preds_valid = model.predict(xvalid)
#     test_preds = model.predict(xtest)
#     final_predictions.append(test_preds)
#     rmse = mean_squared_error(yvalid, preds_valid, squared=False)
#     print(fold, rmse)
#     scores.append(rmse)

# print(np.mean(scores), np.std(scores))

In [None]:
# NOTE(review): not referenced by any cell visible in this notebook; presumably
# a fold/feature count left over from an experiment - confirm before relying on it.
F_S = 5

In [None]:
# Level-1 stacker 1: XGBoost trained on the five level-0 prediction columns.
# Outputs level1_train_pred_1.csv (out-of-fold) and level1_test_pred_1.csv
# (test predictions averaged over folds), both under the column "pred_1".
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")
useful_features = ["pred_6", "pred_7", "pred_8", "pred_9", "pred_10"]
df_test = df_test[useful_features]

final_test_predictions = []
final_valid_predictions = {}
scores = []
for fold in range(5):
    # Rows of the current fold are held out; all other rows are used for fitting.
    in_fold = df.kfold == fold
    fit_rows = df[~in_fold].reset_index(drop=True)
    held_out = df[in_fold].reset_index(drop=True)

    valid_ids = held_out.id.values.tolist()
    yvalid = held_out.target
    xvalid = held_out[useful_features]

    model = XGBRegressor(
        n_jobs=4,
        tree_method='gpu_hist',
        gpu_id=0,
        random_state=1,
        booster='gbtree',
        n_estimators=6000,
        learning_rate=0.03854,
        max_depth=4,
    )
    model.fit(
        fit_rows[useful_features],
        fit_rows.target,
        early_stopping_rounds=300,
        eval_set=[(xvalid, yvalid)],
        verbose=1000,
    )

    preds_valid = model.predict(xvalid)
    final_test_predictions.append(model.predict(df_test.copy()))
    # Record the out-of-fold prediction of every held-out id.
    final_valid_predictions.update(zip(valid_ids, preds_valid))

    rmse = mean_squared_error(yvalid, preds_valid, squared=False)
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

# id -> prediction mapping becomes a two-column table on disk.
final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_1"]
final_valid_predictions.to_csv("level1_train_pred_1.csv", index=False)

# Row-wise mean of the per-fold test predictions.
sample_submission.target = np.column_stack(final_test_predictions).mean(axis=1)
sample_submission.columns = ["id", "pred_1"]
sample_submission.to_csv("level1_test_pred_1.csv", index=False)

In [None]:
# Level-1 stacker 2: random forest trained on the five level-0 prediction columns.
# Outputs level1_train_pred_2.csv (out-of-fold) and level1_test_pred_2.csv
# (test predictions averaged over folds), both under the column "pred_2".
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")
useful_features = ["pred_6", "pred_7", "pred_8", "pred_9", "pred_10"]
df_test = df_test[useful_features]

final_test_predictions = []   # one array of test predictions per fold
final_valid_predictions = {}  # id -> out-of-fold prediction
scores = []                   # validation RMSE per fold
for fold in range(5):
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    valid_ids = xvalid.id.values.tolist()

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    # Seeded so the bootstrap samples (and therefore the saved predictions) are
    # reproducible across runs; 1 matches the seed of the level-1 XGBoost stacker.
    model = RandomForestRegressor(n_estimators=500, n_jobs=-1, max_depth=4, random_state=1)
    model.fit(xtrain, ytrain)
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_test_predictions.append(test_preds)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))
    rmse = mean_squared_error(yvalid, preds_valid, squared=False)
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))
# Turn the id -> prediction mapping into a two-column frame and persist it.
final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_2"]
final_valid_predictions.to_csv("level1_train_pred_2.csv", index=False)

# Average the per-fold test predictions row-wise and save them under the same column name.
sample_submission.target = np.mean(np.column_stack(final_test_predictions), axis=1)
sample_submission.columns = ["id", "pred_2"]
sample_submission.to_csv("level1_test_pred_2.csv", index=False)

In [None]:
# Level-1 stacker 3: sklearn gradient boosting trained on the five level-0
# prediction columns.
# Outputs level1_train_pred_3.csv (out-of-fold) and level1_test_pred_3.csv
# (test predictions averaged over folds), both under the column "pred_3".
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")
useful_features = ["pred_6", "pred_7", "pred_8", "pred_9", "pred_10"]
df_test = df_test[useful_features]

final_test_predictions = []   # one array of test predictions per fold
final_valid_predictions = {}  # id -> out-of-fold prediction
scores = []                   # validation RMSE per fold
for fold in range(5):
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    valid_ids = xvalid.id.values.tolist()

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    # Seeded so repeated runs give the same trees (the estimator draws on
    # random_state while searching splits); 1 matches the seed of the
    # level-1 XGBoost stacker.
    model = GradientBoostingRegressor(n_estimators=500, max_depth=4, random_state=1)
    model.fit(xtrain, ytrain)
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_test_predictions.append(test_preds)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))
    rmse = mean_squared_error(yvalid, preds_valid, squared=False)
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))
# Turn the id -> prediction mapping into a two-column frame and persist it.
final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_3"]
final_valid_predictions.to_csv("level1_train_pred_3.csv", index=False)

# Average the per-fold test predictions row-wise and save them under the same column name.
sample_submission.target = np.mean(np.column_stack(final_test_predictions), axis=1)
sample_submission.columns = ["id", "pred_3"]
sample_submission.to_csv("level1_test_pred_3.csv", index=False)

In [None]:
# sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")
# useful_features = ["pred_6", "pred_7", "pred_8", "pred_9", "pred_10"]
# df_test = df_test[useful_features]

# final_test_predictions = []
# final_valid_predictions = {}
# scores = []
# for fold in range(5):
#     xtrain =  df[df.kfold != fold].reset_index(drop=True)
#     xvalid = df[df.kfold == fold].reset_index(drop=True)
#     xtest = df_test.copy()

#     valid_ids = xvalid.id.values.tolist()

#     ytrain = xtrain.target
#     yvalid = xvalid.target
    
#     xtrain = xtrain[useful_features]
#     xvalid = xvalid[useful_features]
    
#     model = GradientBoostingRegressor(n_estimators=500, max_depth=4)
#     model.fit(xtrain, ytrain)
#     preds_valid = model.predict(xvalid)
#     test_preds = model.predict(xtest)
#     final_test_predictions.append(test_preds)
#     final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))
#     rmse = mean_squared_error(yvalid, preds_valid, squared=False)
#     print(fold, rmse)
#     scores.append(rmse)

# print(np.mean(scores), np.std(scores))
# final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
# final_valid_predictions.columns = ["id", "pred_3"]
# final_valid_predictions.to_csv("level1_train_pred_3.csv", index=False)

# sample_submission.target = np.mean(np.column_stack(final_test_predictions), axis=1)
# sample_submission.columns = ["id", "pred_3"]
# sample_submission.to_csv("level1_test_pred_3.csv", index=False)

In [None]:
# Build the final blending frames: reload the fold and test tables and attach
# the three level-1 prediction columns (pred_1 .. pred_3) by id.
df = pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

# Only stackers 1-3 are loaded; the lines for a fourth stacker were commented
# out in the original cell.
df1, df2, df3 = (
    pd.read_csv(f"level1_train_pred_{stacker_no}.csv") for stacker_no in (1, 2, 3)
)
df_test1, df_test2, df_test3 = (
    pd.read_csv(f"level1_test_pred_{stacker_no}.csv") for stacker_no in (1, 2, 3)
)

# Left joins keep every row of the base tables, in stacker order 1 -> 3.
df = (
    df.merge(df1, on="id", how="left")
      .merge(df2, on="id", how="left")
      .merge(df3, on="id", how="left")
)

df_test = (
    df_test.merge(df_test1, on="id", how="left")
           .merge(df_test2, on="id", how="left")
           .merge(df_test3, on="id", how="left")
)

df.head()

In [None]:
# Final blend: per-fold linear regression over the three level-1 prediction
# columns. Collects one array of test predictions per fold in
# `final_predictions` and prints the validation RMSE of every fold.
useful_features = ["pred_1", "pred_2", "pred_3"]
df_test = df_test[useful_features]

final_predictions = []
scores = []
for fold in range(5):
    # Rows of the current fold are held out; all other rows are used for fitting.
    in_fold = df.kfold == fold
    fit_rows = df[~in_fold].reset_index(drop=True)
    held_out = df[in_fold].reset_index(drop=True)

    model = LinearRegression()
    model.fit(fit_rows[useful_features], fit_rows.target)

    preds_valid = model.predict(held_out[useful_features])
    final_predictions.append(model.predict(df_test.copy()))

    rmse = mean_squared_error(held_out.target, preds_valid, squared=False)
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

In [None]:
# Average the per-fold test predictions of the final blender row-wise and
# write them out as the submission file.
blended_target = np.column_stack(final_predictions).mean(axis=1)
sample_submission.target = blended_target
sample_submission.to_csv("submission.csv", index=False)