In [None]:
#Setup libraries
import pandas as my_pd
import numpy as my_np
import optuna
import plotly

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder #categorical
from sklearn.preprocessing import OneHotEncoder #categorical
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from category_encoders.cat_boost import CatBoostEncoder #categorical
from sklearn.preprocessing import StandardScaler #numerical
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor


In [None]:
#Number of cross-validation folds. Every training loop in this notebook iterates
#over range(num_of_folds) and selects rows through the `kfold` column of the data.
#NOTE(review): presumably equals the number of distinct `kfold` values stored in
#train_folds.csv - confirm against that file.
num_of_folds = 5

In [None]:
######## Prep 1 (level 0): load the inputs ########
#df                - training data; the cells below read its `id`, `target` and `kfold` columns
#df_test           - test data
#sample_submission - template frame; its `target` column is overwritten with predictions later
df = my_pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = my_pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = my_pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

In [None]:
#Model inputs: every column of df apart from the id, the label and the fold marker
useful_features = [
    name for name in df.columns
    if name not in ("id", "target", "kfold")
]
#Categorical features are the ones whose name contains 'cat'
object_cols = [name for name in useful_features if 'cat' in name]
#Numerical features are the ones whose name contains 'cont'
numerical_cols = [name for name in useful_features if 'cont' in name]
#Keep only the model inputs in the test frame, in the same column order
df_test = df_test[useful_features]

In [None]:
#Fresh accumulators for the prep 1 run:
#  final_test_predictions  - list receiving one array of test predictions per fold
#  final_valid_predictions - dict mapping row id -> out-of-fold prediction
#  scores                  - list receiving the validation RMSE of each fold
final_test_predictions, final_valid_predictions, scores = [], {}, []

In [None]:
#Prep 1 (level 0): LightGBM on ordinal-encoded categorical and standardized numerical columns.
#Per fold, the encoder and the scaler are fitted on the training folds and then applied to
#the held-out fold and to the test set. Out-of-fold predictions (keyed by id), test
#predictions and the fold RMSE are appended to the accumulators initialised earlier.
for fold in range(num_of_folds):
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    #ids of the held-out rows; they key the out-of-fold predictions
    valid_ids = xvalid.id.values.tolist()

    ytrain = xtrain.target
    yvalid = xvalid.target

    #Explicit copies: the encoder/scaler output is assigned back into these frames, and
    #assigning into a bare column selection raises pandas' SettingWithCopyWarning
    xtrain = xtrain[useful_features].copy()
    xvalid = xvalid[useful_features].copy()

    #Categorical Features ordinal approach
    ordinal_encoder = OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])

    #standardization approach to numerical columns
    num_scaler = StandardScaler()
    xtrain[numerical_cols] = num_scaler.fit_transform(xtrain[numerical_cols])
    xvalid[numerical_cols] = num_scaler.transform(xvalid[numerical_cols])
    xtest[numerical_cols] = num_scaler.transform(xtest[numerical_cols])

    #Best tuned parameters based on fold 0
    #NOTE(review): per the LightGBM parameter docs, feature_fraction/colsample_bytree,
    #bagging_fraction/subsample and min_data_in_leaf/min_child_samples are aliases of the
    #same setting; each pair is given two different values here, so only one value per
    #pair can take effect - confirm which one LightGBM picks.
    lgbm_params = {
        'n_estimators': 11692,
        'max_depth': 4,
        'learning_rate': 0.014266620585256642,
        'num_leaves': 271,
        'feature_fraction': 0.15945513102315187,
        'bagging_fraction': 0.4728139218848143,
        'bagging_freq': 6,
        'max_bin': 2787,
        'subsample': 1.0,
        'colsample_bytree': 0.6,
        'reg_alpha': 24.800000000000004,
        'reg_lambda': 52.300000000000004,
        'min_data_in_leaf': 10,
        'min_child_samples': 73
    }

    LGBM_model_1 = LGBMRegressor(**lgbm_params, random_state=42, n_jobs=-1)
    #The held-out fold doubles as the early-stopping set
    LGBM_model_1.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=False)
    preds_valid = LGBM_model_1.predict(xvalid)
    test_preds = LGBM_model_1.predict(xtest)
    final_test_predictions.append(test_preds)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))
    rmse = mean_squared_error(yvalid, preds_valid, squared=False)
    print("###1### fold = ", fold," RMSE = ", rmse)
    scores.append(rmse)

print("Mean RMSE = ", my_np.mean(scores)," Standard deviation = ", my_np.std(scores))

In [None]:
#Turn the {id: prediction} mapping into an (id, pred_1) table and store it as the
#prep 1 out-of-fold predictions
final_valid_predictions = my_pd.DataFrame.from_dict(final_valid_predictions, orient="index")
final_valid_predictions = final_valid_predictions.reset_index()
final_valid_predictions.columns = ["id", "pred_1"]
final_valid_predictions.to_csv("train_pred_1.csv", index=False)

In [None]:
#Average the per-fold test predictions row-wise, store them in the submission template
#and write it out with the prediction column named pred_1
sample_submission["target"] = my_np.column_stack(final_test_predictions).mean(axis=1)
sample_submission.columns = ["id", "pred_1"]
sample_submission.to_csv("test_pred_1.csv", index=False)

In [None]:
######## Prep 2 (level 0): reload the inputs ########
#Re-reading the CSVs rebinds df, df_test and sample_submission, discarding the column
#selection and renaming applied to those names by the earlier cells.
df = my_pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = my_pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = my_pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

In [None]:
#Model inputs: every column of df apart from the id, the label and the fold marker
useful_features = [
    name for name in df.columns
    if name not in ("id", "target", "kfold")
]
#Categorical features contain 'cat' in their name; numerical ones start with "cont"
object_cols = [name for name in useful_features if 'cat' in name]
numerical_cols = [name for name in useful_features if name.startswith("cont")]
#Keep only the model inputs in the test frame
df_test = df_test[useful_features]

In [None]:
#Create interaction features (products of up to 3 distinct numerical columns, no bias
#column, no pure powers) and append them to the train and test frames as poly_<i>.
poly = PolynomialFeatures(degree=3, interaction_only=True, include_bias=False)
train_poly = poly.fit_transform(df[numerical_cols])
#The transformer is fitted on the training data only; the test set is merely transformed
#so that both frames are guaranteed to share one fitted feature layout
test_poly = poly.transform(df_test[numerical_cols])

df_poly = my_pd.DataFrame(train_poly, columns=[f"poly_{i}" for i in range(train_poly.shape[1])])
df_test_poly = my_pd.DataFrame(test_poly, columns=[f"poly_{i}" for i in range(test_poly.shape[1])])

#Column-wise concatenation aligns on the row index of the frames being joined
df = my_pd.concat([df, df_poly], axis=1)
df_test = my_pd.concat([df_test, df_test_poly], axis=1)

In [None]:
#Rebuild the feature lists so that the freshly added poly_* columns are part of the inputs
useful_features = [
    name for name in df.columns
    if name not in ("id", "target", "kfold")
]
object_cols = [name for name in useful_features if 'cat' in name]
#Reorder/restrict the test frame to exactly those inputs
df_test = df_test[useful_features]

In [None]:
#Fresh accumulators for the prep 2 run (per-fold test predictions, id -> out-of-fold
#prediction, per-fold RMSE)
final_test_predictions, final_valid_predictions, scores = [], {}, []

In [None]:
#Prep 2 (level 0): LightGBM on ordinal-encoded categorical columns plus all remaining
#columns in useful_features, which are passed through unscaled. Per fold, the encoder is
#fitted on the training folds and applied to the held-out fold and to the test set.
for fold in range(num_of_folds):
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    #ids of the held-out rows; they key the out-of-fold predictions
    valid_ids = xvalid.id.values.tolist()

    ytrain = xtrain.target
    yvalid = xvalid.target

    #Explicit copies: the encoder output is assigned back into these frames, and
    #assigning into a bare column selection raises pandas' SettingWithCopyWarning
    xtrain = xtrain[useful_features].copy()
    xvalid = xvalid[useful_features].copy()

    #Ordinal encoding for categorical features
    ordinal_encoder = OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])

    #Best tuned parameters based on fold 0
    #NOTE(review): per the LightGBM parameter docs, feature_fraction/colsample_bytree,
    #bagging_fraction/subsample and min_data_in_leaf/min_child_samples are aliases of the
    #same setting; each pair is given two different values here, so only one value per
    #pair can take effect - confirm which one LightGBM picks.
    lgbm_params = {
        'n_estimators': 8843,
        'max_depth': 3,
        'learning_rate': 0.04029137517665557,
        'num_leaves': 251,
        'feature_fraction': 0.9196251401488119,
        'bagging_fraction': 0.9993541403213315,
        'bagging_freq': 14,
        'max_bin': 2795,
        'subsample': 0.1,
        'colsample_bytree': 0.2,
        'reg_alpha': 67.6,
        'reg_lambda': 106.6,
        'min_data_in_leaf': 106,
        'min_child_samples': 59
    }

    LGBM_model_2 = LGBMRegressor(**lgbm_params, random_state=42, n_jobs=-1)
    #The held-out fold doubles as the early-stopping set
    LGBM_model_2.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=False)
    preds_valid = LGBM_model_2.predict(xvalid)
    test_preds = LGBM_model_2.predict(xtest)
    final_test_predictions.append(test_preds)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))
    rmse = mean_squared_error(yvalid, preds_valid, squared=False)
    print("###2### fold = ", fold," RMSE = ", rmse)
    scores.append(rmse)

print("Mean RMSE = ", my_np.mean(scores)," Standard deviation = ", my_np.std(scores))

In [None]:
#Turn the {id: prediction} mapping into an (id, pred_2) table and store it as the
#prep 2 out-of-fold predictions
final_valid_predictions = my_pd.DataFrame.from_dict(final_valid_predictions, orient="index")
final_valid_predictions = final_valid_predictions.reset_index()
final_valid_predictions.columns = ["id", "pred_2"]
final_valid_predictions.to_csv("train_pred_2.csv", index=False)

In [None]:
#Average the per-fold test predictions row-wise, store them in the submission template
#and write it out with the prediction column named pred_2
sample_submission["target"] = my_np.column_stack(final_test_predictions).mean(axis=1)
sample_submission.columns = ["id", "pred_2"]
sample_submission.to_csv("test_pred_2.csv", index=False)

In [None]:
######## Prep 3 (level 0): reload the inputs ########
#Re-reading the CSVs rebinds df, df_test and sample_submission, discarding the extra
#columns and the renaming applied to those names by the earlier cells.
df = my_pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = my_pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = my_pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

In [None]:
#Out-of-fold target encoding with additive smoothing for the categorical columns.
#For each categorical column and each fold, the per-category statistic
#    (count * category_mean + smoothing * prior) / (count + smoothing)
#is computed on the training folds and mapped onto the held-out fold. The test set
#receives the average of the per-fold encodings. New columns are named tar_enc_<col>.

#Derive the categorical columns from the freshly loaded frame so that this cell does not
#depend on a list left behind by an earlier section of the notebook
object_cols = [c for c in df.columns if c.startswith("cat")]
#Weight of the prior, expressed as a number of pseudo-observations
smoothing = 300

for col in object_cols:
    temp_df = []
    temp_test_feat = None
    for fold in range(num_of_folds):
        xtrain = df[df.kfold != fold].reset_index(drop=True)
        xvalid = df[df.kfold == fold].reset_index(drop=True)
        #The prior is the mean target of the training folds only; using the mean of the
        #whole frame would let the held-out targets leak into their own encoding
        prior = xtrain["target"].mean()
        my_agg = xtrain.groupby(col)["target"].agg(["count", "mean"])
        counts = my_agg["count"]
        means = my_agg["mean"]
        feat = ((counts * means + smoothing * prior) / (counts + smoothing)).to_dict()
        #A category absent from the training folds has count 0, for which the smoothing
        #formula reduces to the prior - use that value instead of leaving NaN
        xvalid.loc[:, f"tar_enc_{col}"] = xvalid[col].map(feat).fillna(prior)
        temp_df.append(xvalid)
        fold_test_feat = df_test[col].map(feat).fillna(prior)
        if temp_test_feat is None:
            temp_test_feat = fold_test_feat
        else:
            temp_test_feat += fold_test_feat

    #Average of the per-fold encodings
    temp_test_feat /= num_of_folds
    df_test.loc[:, f"tar_enc_{col}"] = temp_test_feat
    #Reassemble the training frame from the encoded held-out folds (rows end up grouped
    #by fold); the result is the input for the next column
    df = my_pd.concat(temp_df)

In [None]:
#Model inputs: every column of df apart from the id, the label and the fold marker;
#this picks up the tar_enc_* columns as well
useful_features = [
    name for name in df.columns
    if name not in ("id", "target", "kfold")
]
#Prefix matching keeps the tar_enc_cat* columns out of the categorical list
object_cols = [name for name in useful_features if name.startswith("cat")]
numerical_cols = [name for name in useful_features if name.startswith("cont")]
#Keep only the model inputs in the test frame
df_test = df_test[useful_features]

In [None]:
#Fresh accumulators for the prep 3 run (per-fold test predictions, id -> out-of-fold
#prediction, per-fold RMSE)
final_test_predictions, final_valid_predictions, scores = [], {}, []

In [None]:
#Prep 3 (level 0): LightGBM on one-hot encoded categorical columns and standardized
#numerical columns; any other column listed in useful_features is passed through as is.
#Encoder and scaler are fitted on the training folds of each split.
for fold in range(num_of_folds):
    xtrain = df.loc[df["kfold"] != fold].reset_index(drop=True)
    xvalid = df.loc[df["kfold"] == fold].reset_index(drop=True)
    xtest = df_test.copy()

    #ids of the held-out rows; they key the out-of-fold predictions
    valid_ids = xvalid["id"].tolist()

    ytrain, yvalid = xtrain["target"], xvalid["target"]
    xtrain, xvalid = xtrain[useful_features], xvalid[useful_features]

    #One-hot encode the categorical columns; the encoder returns plain arrays, so the
    #row index of the source frame is attached while building each DataFrame
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
    OH_cols_train = my_pd.DataFrame(OH_encoder.fit_transform(xtrain[object_cols]), index=xtrain.index)
    OH_cols_valid = my_pd.DataFrame(OH_encoder.transform(xvalid[object_cols]), index=xvalid.index)
    OH_cols_test = my_pd.DataFrame(OH_encoder.transform(xtest[object_cols]), index=xtest.index)

    #Everything except the raw categorical columns
    num_X_train = xtrain.drop(columns=object_cols)
    num_X_valid = xvalid.drop(columns=object_cols)
    num_X_test = xtest.drop(columns=object_cols)

    #Side-by-side join of the remaining columns and the one-hot columns
    OH_X_train = my_pd.concat([num_X_train, OH_cols_train], axis=1)
    OH_X_valid = my_pd.concat([num_X_valid, OH_cols_valid], axis=1)
    OH_X_test = my_pd.concat([num_X_test, OH_cols_test], axis=1)

    #Standardize the numerical columns with statistics of the training folds
    num_scaler = StandardScaler()
    OH_X_train[numerical_cols] = num_scaler.fit_transform(OH_X_train[numerical_cols])
    OH_X_valid[numerical_cols] = num_scaler.transform(OH_X_valid[numerical_cols])
    OH_X_test[numerical_cols] = num_scaler.transform(OH_X_test[numerical_cols])

    #Best tuned parameters based on fold 0
    lgbm_params = {
        'n_estimators': 3929,
        'max_depth': 3,
        'learning_rate': 0.07366853778420801,
        'num_leaves': 153,
        'feature_fraction': 0.11974384505538949,
        'bagging_fraction': 0.6957549032324082,
        'bagging_freq': 11,
        'max_bin': 4479,
        'subsample': 0.30000000000000004,
        'colsample_bytree': 0.30000000000000004,
        'reg_alpha': 20.500000000000004,
        'reg_lambda': 92.7,
        'min_data_in_leaf': 105,
        'min_child_samples': 135
    }

    LGBM_model_3 = LGBMRegressor(**lgbm_params, random_state=42, n_jobs=-1)
    #The held-out fold doubles as the early-stopping set
    LGBM_model_3.fit(
        OH_X_train, ytrain,
        early_stopping_rounds=300,
        eval_set=[(OH_X_valid, yvalid)],
        verbose=False,
    )
    preds_valid = LGBM_model_3.predict(OH_X_valid)
    test_preds = LGBM_model_3.predict(OH_X_test)
    final_test_predictions.append(test_preds)
    final_valid_predictions.update(zip(valid_ids, preds_valid))
    rmse = mean_squared_error(yvalid, preds_valid, squared=False)
    print("###3### fold = ", fold, " RMSE = ", rmse)
    scores.append(rmse)

print("Mean RMSE = ", my_np.mean(scores), " Standard deviation = ", my_np.std(scores))

In [None]:
#Turn the {id: prediction} mapping into an (id, pred_3) table and store it as the
#prep 3 out-of-fold predictions
final_valid_predictions = my_pd.DataFrame.from_dict(final_valid_predictions, orient="index")
final_valid_predictions = final_valid_predictions.reset_index()
final_valid_predictions.columns = ["id", "pred_3"]
final_valid_predictions.to_csv("train_pred_3.csv", index=False)

In [None]:
#Average the per-fold test predictions row-wise, store them in the submission template
#and write it out with the prediction column named pred_3
sample_submission["target"] = my_np.column_stack(final_test_predictions).mean(axis=1)
sample_submission.columns = ["id", "pred_3"]
sample_submission.to_csv("test_pred_3.csv", index=False)

In [None]:
######## Prep 4 (level 0): reload the inputs ########
#Re-reading the CSVs rebinds df, df_test and sample_submission, discarding the extra
#columns and the renaming applied to those names by the earlier cells.
df = my_pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = my_pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = my_pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

In [None]:
#Model inputs: every column of df apart from the id, the label and the fold marker
useful_features = [
    name for name in df.columns
    if name not in ("id", "target", "kfold")
]
#Categorical features contain 'cat' in their name; numerical ones start with "cont"
object_cols = [name for name in useful_features if 'cat' in name]
numerical_cols = [name for name in useful_features if name.startswith("cont")]
#Keep only the model inputs in the test frame
df_test = df_test[useful_features]

In [None]:
#Fresh accumulators for the prep 4 run (per-fold test predictions, id -> out-of-fold
#prediction, per-fold RMSE)
final_test_predictions, final_valid_predictions, scores = [], {}, []

In [None]:
#Prep 4 (level 0): LightGBM on ordinal-encoded categorical columns and log1p-transformed
#numerical columns. After the loop the out-of-fold predictions and the fold-averaged
#test predictions are written to train_pred_4.csv / test_pred_4.csv.
for fold in range(num_of_folds):
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    #ids of the held-out rows; they key the out-of-fold predictions
    valid_ids = xvalid.id.values.tolist()

    ytrain = xtrain.target
    yvalid = xvalid.target

    #Explicit copies: the transformed columns are assigned back into these frames, and
    #assigning into a bare column selection raises pandas' SettingWithCopyWarning
    xtrain = xtrain[useful_features].copy()
    xvalid = xvalid[useful_features].copy()

    #Categorical Features ordinal approach
    ordinal_encoder = OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])

    #Log transformation to numerical columns
    #NOTE(review): log1p is only defined for values > -1 - confirm the value range of
    #the numerical columns.
    logtransformer = FunctionTransformer(my_np.log1p)
    xtrain[numerical_cols] = logtransformer.fit_transform(xtrain[numerical_cols])
    xvalid[numerical_cols] = logtransformer.transform(xvalid[numerical_cols])
    xtest[numerical_cols] = logtransformer.transform(xtest[numerical_cols])

    #Best tuned parameters based on fold 1
    #NOTE(review): per the LightGBM parameter docs, feature_fraction/colsample_bytree,
    #bagging_fraction/subsample and min_data_in_leaf/min_child_samples are aliases of the
    #same setting; each pair is given two different values here, so only one value per
    #pair can take effect - confirm which one LightGBM picks.
    lgbm_params = {
        'n_estimators': 3625,
        'max_depth': 11,
        'learning_rate': 0.034230498273737156,
        'num_leaves': 21,
        'feature_fraction': 0.13619234154861645,
        'bagging_fraction': 0.6606506089603802,
        'bagging_freq': 12,
        'max_bin': 2498,
        'subsample': 0.6,
        'colsample_bytree': 0.2,
        'reg_alpha': 24.500000000000004,
        'reg_lambda': 12.6,
        'min_data_in_leaf': 113,
        'min_child_samples': 136
    }

    LGBM_model_4 = LGBMRegressor(**lgbm_params, random_state=42, n_jobs=-1)
    #The held-out fold doubles as the early-stopping set
    LGBM_model_4.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=False)
    preds_valid = LGBM_model_4.predict(xvalid)
    test_preds = LGBM_model_4.predict(xtest)
    final_test_predictions.append(test_preds)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))
    rmse = mean_squared_error(yvalid, preds_valid, squared=False)
    print("###4### fold = ", fold," RMSE = ", rmse)
    scores.append(rmse)

print("Mean RMSE = ", my_np.mean(scores)," Standard deviation = ", my_np.std(scores))

#Out-of-fold predictions as an (id, pred_4) table
final_valid_predictions = my_pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_4"]
final_valid_predictions.to_csv("train_pred_4.csv", index=False)

#Test predictions: row-wise mean over the folds
sample_submission.target = my_np.mean(my_np.column_stack(final_test_predictions), axis=1)
sample_submission.columns = ["id", "pred_4"]
sample_submission.to_csv("test_pred_4.csv", index=False)

In [None]:
######## Prep 5 (level 0): reload the inputs ########
#Re-reading the CSVs rebinds df, df_test and sample_submission, discarding the column
#selection and renaming applied to those names by the earlier cells.
df = my_pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = my_pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = my_pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

In [None]:
#Model inputs: every column of df apart from the id, the label and the fold marker
useful_features = [
    name for name in df.columns
    if name not in ("id", "target", "kfold")
]
#Categorical features are the ones whose name contains 'cat'
object_cols = [name for name in useful_features if 'cat' in name]
#Numerical features are the ones whose name contains 'cont'
numerical_cols = [name for name in useful_features if 'cont' in name]
#Keep only the model inputs in the test frame, in the same column order
df_test = df_test[useful_features]

In [None]:
#Fresh accumulators for the prep 5 run (per-fold test predictions, id -> out-of-fold
#prediction, per-fold RMSE)
final_test_predictions, final_valid_predictions, scores = [], {}, []

In [None]:
#Prep 5 (level 0): LightGBM on CatBoost-encoded categorical columns and standardized
#numerical columns. Per fold, the encoder (which also sees the training targets) and the
#scaler are fitted on the training folds and applied to the held-out fold and test set.
for fold in range(num_of_folds):
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    #ids of the held-out rows; they key the out-of-fold predictions
    valid_ids = xvalid.id.values.tolist()

    ytrain = xtrain.target
    yvalid = xvalid.target

    #Explicit copies: the encoder/scaler output is assigned back into these frames, and
    #assigning into a bare column selection raises pandas' SettingWithCopyWarning
    xtrain = xtrain[useful_features].copy()
    xvalid = xvalid[useful_features].copy()

    #Catboost Encoder Categorical columns
    CBE_encoder = CatBoostEncoder()
    xtrain[object_cols] = CBE_encoder.fit_transform(xtrain[object_cols], ytrain)
    xvalid[object_cols] = CBE_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = CBE_encoder.transform(xtest[object_cols])

    #standardization approach to numerical columns
    num_scaler = StandardScaler()
    xtrain[numerical_cols] = num_scaler.fit_transform(xtrain[numerical_cols])
    xvalid[numerical_cols] = num_scaler.transform(xvalid[numerical_cols])
    xtest[numerical_cols] = num_scaler.transform(xtest[numerical_cols])

    #Best tuned parameters based on fold 1
    #NOTE(review): per the LightGBM parameter docs, feature_fraction/colsample_bytree,
    #bagging_fraction/subsample and min_data_in_leaf/min_child_samples are aliases of the
    #same setting; each pair is given two different values here, so only one value per
    #pair can take effect - confirm which one LightGBM picks.
    lgbm_params = {
        'n_estimators': 14625,
        'max_depth': 9,
        'learning_rate': 0.01168710233322345,
        'num_leaves': 10,
        'feature_fraction': 0.32238328745223305,
        'bagging_fraction': 0.7090167545870307,
        'bagging_freq': 4,
        'max_bin': 2065,
        'subsample': 0.4,
        'colsample_bytree': 1.0,
        'reg_alpha': 17.6,
        'reg_lambda': 53.400000000000006,
        'min_data_in_leaf': 78,
        'min_child_samples': 110
    }

    LGBM_model_5 = LGBMRegressor(**lgbm_params, random_state=42, n_jobs=-1)
    #The held-out fold doubles as the early-stopping set
    LGBM_model_5.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=False)
    preds_valid = LGBM_model_5.predict(xvalid)
    test_preds = LGBM_model_5.predict(xtest)
    final_test_predictions.append(test_preds)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))
    rmse = mean_squared_error(yvalid, preds_valid, squared=False)
    print("###5### fold = ", fold," RMSE = ", rmse)
    scores.append(rmse)

print("Mean RMSE = ", my_np.mean(scores)," Standard deviation = ", my_np.std(scores))

In [None]:
#Turn the {id: prediction} mapping into an (id, pred_5) table and store it as the
#prep 5 out-of-fold predictions
final_valid_predictions = my_pd.DataFrame.from_dict(final_valid_predictions, orient="index")
final_valid_predictions = final_valid_predictions.reset_index()
final_valid_predictions.columns = ["id", "pred_5"]
final_valid_predictions.to_csv("train_pred_5.csv", index=False)

In [None]:
#Average the per-fold test predictions row-wise, store them in the submission template
#and write it out with the prediction column named pred_5
sample_submission["target"] = my_np.column_stack(final_test_predictions).mean(axis=1)
sample_submission.columns = ["id", "pred_5"]
sample_submission.to_csv("test_pred_5.csv", index=False)

In [None]:
######## Level 1: reload the inputs ########
#Fresh copies of the train/test frames; the level-0 prediction files are merged onto
#them by `id` in the next cell. sample_submission is re-read because the earlier cells
#renamed its columns.
df = my_pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = my_pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = my_pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

In [None]:
#Attach the five level-0 prediction columns (pred_1 .. pred_5) to the train and test
#frames. Each file is left-joined on `id`, so every row of df / df_test is kept.

#Out-of-fold predictions for the training rows
df1 = my_pd.read_csv("train_pred_1.csv")
df2 = my_pd.read_csv("train_pred_2.csv")
df3 = my_pd.read_csv("train_pred_3.csv")
df4 = my_pd.read_csv("train_pred_4.csv")
df5 = my_pd.read_csv("train_pred_5.csv")

#Fold-averaged predictions for the test rows
df_test1 = my_pd.read_csv("test_pred_1.csv")
df_test2 = my_pd.read_csv("test_pred_2.csv")
df_test3 = my_pd.read_csv("test_pred_3.csv")
df_test4 = my_pd.read_csv("test_pred_4.csv")
df_test5 = my_pd.read_csv("test_pred_5.csv")

df = (
    df.merge(df1, on="id", how="left")
      .merge(df2, on="id", how="left")
      .merge(df3, on="id", how="left")
      .merge(df4, on="id", how="left")
      .merge(df5, on="id", how="left")
)

df_test = (
    df_test.merge(df_test1, on="id", how="left")
           .merge(df_test2, on="id", how="left")
           .merge(df_test3, on="id", how="left")
           .merge(df_test4, on="id", how="left")
           .merge(df_test5, on="id", how="left")
)

In [None]:
#The level-1 models see nothing but the five level-0 prediction columns
useful_features = ["pred_1", "pred_2", "pred_3", "pred_4", "pred_5"]
df_test = df_test.loc[:, useful_features]
#Quick look at the first rows of both frames
print(df.head())
print(df_test.head())

In [None]:
#Fresh accumulators for the first level-1 model (per-fold test predictions,
#id -> out-of-fold prediction, per-fold RMSE)
final_test_predictions, final_valid_predictions, scores = [], {}, []

In [None]:
#Level 1, model 1: random forest trained on the level-0 prediction columns.
#Writes lvl1_train_pred_1.csv (out-of-fold) and lvl1_test_pred_1.csv (fold average).
for fold in range(num_of_folds):
    xtrain = df.loc[df["kfold"] != fold].reset_index(drop=True)
    xvalid = df.loc[df["kfold"] == fold].reset_index(drop=True)
    xtest = df_test.copy()

    #ids of the held-out rows; they key the out-of-fold predictions
    valid_ids = xvalid["id"].tolist()

    ytrain, yvalid = xtrain["target"], xvalid["target"]
    xtrain, xvalid = xtrain[useful_features], xvalid[useful_features]

    #Best tuned parameters based on fold 0
    rf_params = {
        'n_estimators': 600,
        'max_depth': 5,
        'max_features': 0.5700919740571448,
        'min_samples_split': 15,
        'min_samples_leaf': 15,
    }

    RF_model = RandomForestRegressor(**rf_params, random_state=42, n_jobs=-1)
    RF_model.fit(xtrain, ytrain)
    preds_valid = RF_model.predict(xvalid)
    test_preds = RF_model.predict(xtest)
    final_test_predictions.append(test_preds)
    final_valid_predictions.update(zip(valid_ids, preds_valid))
    rmse = mean_squared_error(yvalid, preds_valid, squared=False)
    print("###RF### fold = ", fold, " RMSE = ", rmse)
    scores.append(rmse)

print("Mean RMSE = ", my_np.mean(scores), " Standard deviation = ", my_np.std(scores))

#Out-of-fold predictions as an (id, lvl1pred_1) table
final_valid_predictions = my_pd.DataFrame.from_dict(final_valid_predictions, orient="index")
final_valid_predictions = final_valid_predictions.reset_index()
final_valid_predictions.columns = ["id", "lvl1pred_1"]
final_valid_predictions.to_csv("lvl1_train_pred_1.csv", index=False)

#Test predictions: row-wise mean over the folds
sample_submission["target"] = my_np.column_stack(final_test_predictions).mean(axis=1)
sample_submission.columns = ["id", "lvl1pred_1"]
sample_submission.to_csv("lvl1_test_pred_1.csv", index=False)

In [None]:
#Reset the accumulators for the XGB model and re-read the submission template, whose
#columns were renamed when the previous model's test predictions were saved
final_test_predictions, final_valid_predictions, scores = [], {}, []
sample_submission = my_pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

In [None]:
#Level 1, model 2: XGBoost trained on the level-0 prediction columns.
#Writes lvl1_train_pred_2.csv (out-of-fold) and lvl1_test_pred_2.csv (fold average).
for fold in range(num_of_folds):
    xtrain = df.loc[df["kfold"] != fold].reset_index(drop=True)
    xvalid = df.loc[df["kfold"] == fold].reset_index(drop=True)
    xtest = df_test.copy()

    #ids of the held-out rows; they key the out-of-fold predictions
    valid_ids = xvalid["id"].tolist()

    ytrain, yvalid = xtrain["target"], xvalid["target"]
    xtrain, xvalid = xtrain[useful_features], xvalid[useful_features]

    #Best tuned parameters based on fold 0
    xgb_params = {
        'n_estimators': 3630,
        'max_depth': 2,
        'learning_rate': 0.04303064869486619,
        'colsample_bytree': 0.677745729203771,
        'subsample': 0.2482377859197017,
        'alpha': 0.019836014804809354,
        'lambda': 1.1315039974499819e-08,
        'min_child_weight': 692.2453262362711
    }

    XGB_model = XGBRegressor(**xgb_params, random_state=42, n_jobs=-1)
    #The held-out fold doubles as the early-stopping set
    XGB_model.fit(
        xtrain, ytrain,
        early_stopping_rounds=200,
        eval_set=[(xvalid, yvalid)],
        verbose=False,
    )
    preds_valid = XGB_model.predict(xvalid)
    test_preds = XGB_model.predict(xtest)
    final_test_predictions.append(test_preds)
    final_valid_predictions.update(zip(valid_ids, preds_valid))
    rmse = mean_squared_error(yvalid, preds_valid, squared=False)
    print("###XGB### fold = ", fold, " RMSE = ", rmse)
    scores.append(rmse)

print("Mean RMSE = ", my_np.mean(scores), " Standard deviation = ", my_np.std(scores))

#Out-of-fold predictions as an (id, lvl1pred_2) table
final_valid_predictions = my_pd.DataFrame.from_dict(final_valid_predictions, orient="index")
final_valid_predictions = final_valid_predictions.reset_index()
final_valid_predictions.columns = ["id", "lvl1pred_2"]
final_valid_predictions.to_csv("lvl1_train_pred_2.csv", index=False)

#Test predictions: row-wise mean over the folds
sample_submission["target"] = my_np.column_stack(final_test_predictions).mean(axis=1)
sample_submission.columns = ["id", "lvl1pred_2"]
sample_submission.to_csv("lvl1_test_pred_2.csv", index=False)

In [None]:
#Reset the accumulators for the LightGBM model and re-read the submission template,
#whose columns were renamed when the previous model's test predictions were saved
final_test_predictions, final_valid_predictions, scores = [], {}, []
sample_submission = my_pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

In [None]:
#Level 1, model 3: LightGBM trained on the level-0 prediction columns.
#Writes lvl1_train_pred_3.csv (out-of-fold) and lvl1_test_pred_3.csv (fold average).
for fold in range(num_of_folds):
    xtrain = df.loc[df["kfold"] != fold].reset_index(drop=True)
    xvalid = df.loc[df["kfold"] == fold].reset_index(drop=True)
    xtest = df_test.copy()

    #ids of the held-out rows; they key the out-of-fold predictions
    valid_ids = xvalid["id"].tolist()

    ytrain, yvalid = xtrain["target"], xvalid["target"]
    xtrain, xvalid = xtrain[useful_features], xvalid[useful_features]

    #Best tuned parameters based on fold 0
    lgbm_params = {
        'n_estimators': 13103,
        'max_depth': 1,
        'learning_rate': 0.01007362838130684,
        'num_leaves': 46,
        'feature_fraction': 0.1011590209710771,
        'bagging_fraction': 0.8409771566248516,
        'bagging_freq': 1,
        'max_bin': 954,
        'subsample': 0.1,
        'colsample_bytree': 1.0,
        'reg_alpha': 42.900000000000006,
        'reg_lambda': 133.2,
        'min_data_in_leaf': 50,
        'min_child_samples': 20
    }

    LGBM_model = LGBMRegressor(**lgbm_params, random_state=42, n_jobs=-1)
    #The held-out fold doubles as the early-stopping set
    LGBM_model.fit(
        xtrain, ytrain,
        early_stopping_rounds=200,
        eval_set=[(xvalid, yvalid)],
        verbose=False,
    )
    preds_valid = LGBM_model.predict(xvalid)
    test_preds = LGBM_model.predict(xtest)
    final_test_predictions.append(test_preds)
    final_valid_predictions.update(zip(valid_ids, preds_valid))
    rmse = mean_squared_error(yvalid, preds_valid, squared=False)
    print("###LGBM### fold = ", fold, " RMSE = ", rmse)
    scores.append(rmse)

print("Mean RMSE = ", my_np.mean(scores), " Standard deviation = ", my_np.std(scores))

#Out-of-fold predictions as an (id, lvl1pred_3) table
final_valid_predictions = my_pd.DataFrame.from_dict(final_valid_predictions, orient="index")
final_valid_predictions = final_valid_predictions.reset_index()
final_valid_predictions.columns = ["id", "lvl1pred_3"]
final_valid_predictions.to_csv("lvl1_train_pred_3.csv", index=False)

#Test predictions: row-wise mean over the folds
sample_submission["target"] = my_np.column_stack(final_test_predictions).mean(axis=1)
sample_submission.columns = ["id", "lvl1pred_3"]
sample_submission.to_csv("lvl1_test_pred_3.csv", index=False)

In [None]:
######## Level 2: reload the inputs ########
#Fresh copies of the train/test frames; the level-1 prediction files are merged onto
#them by `id` in the next cell. sample_submission is re-read so that it has its
#original columns again for the final submission file.
df = my_pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = my_pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = my_pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

In [None]:
#Attach the three level-1 prediction columns (lvl1pred_1 .. lvl1pred_3) to the train
#and test frames. Each file is left-joined on `id`, so every row of df / df_test is kept.

#Out-of-fold predictions for the training rows
df1 = my_pd.read_csv("lvl1_train_pred_1.csv")
df2 = my_pd.read_csv("lvl1_train_pred_2.csv")
df3 = my_pd.read_csv("lvl1_train_pred_3.csv")

#Fold-averaged predictions for the test rows
df_test1 = my_pd.read_csv("lvl1_test_pred_1.csv")
df_test2 = my_pd.read_csv("lvl1_test_pred_2.csv")
df_test3 = my_pd.read_csv("lvl1_test_pred_3.csv")

df = (
    df.merge(df1, on="id", how="left")
      .merge(df2, on="id", how="left")
      .merge(df3, on="id", how="left")
)

df_test = (
    df_test.merge(df_test1, on="id", how="left")
           .merge(df_test2, on="id", how="left")
           .merge(df_test3, on="id", how="left")
)

In [None]:
#The final blender sees nothing but the three level-1 prediction columns
useful_features = ["lvl1pred_1", "lvl1pred_2", "lvl1pred_3"]
df_test = df_test.loc[:, useful_features]
#Quick look at the first rows of the test inputs
print(df_test.head())

In [None]:
#Level 2 (final blender): Ridge regression on the level-1 prediction columns.
#One model per fold; its test predictions are collected in final_predictions_ridge and
#its RMSE on the held-out fold in scores.
final_predictions_ridge, scores = [], []
for fold in range(num_of_folds):
    xtrain = df.loc[df["kfold"] != fold].reset_index(drop=True)
    xvalid = df.loc[df["kfold"] == fold].reset_index(drop=True)
    xtest = df_test.copy()

    ytrain, yvalid = xtrain["target"], xvalid["target"]
    xtrain, xvalid = xtrain[useful_features], xvalid[useful_features]

    #Tuned Alpha parameter
    model = Ridge(alpha=3.4583520855110406)
    model.fit(xtrain, ytrain)

    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_predictions_ridge.append(test_preds)
    rmse = mean_squared_error(yvalid, preds_valid, squared=False)
    print("###Ridge### fold = ", fold, " RMSE = ", rmse)
    scores.append(rmse)

print("Mean RMSE = ", my_np.mean(scores), " Standard deviation = ", my_np.std(scores))

In [None]:
#Final submission: row-wise mean of the per-fold Ridge test predictions, written into
#the `target` column of the submission template
sample_submission["target"] = my_np.column_stack(final_predictions_ridge).mean(axis=1)
sample_submission.to_csv("submission.csv", index=False)