In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from xgboost import XGBRegressor
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.ensemble import RandomForestRegressor

import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression


# Competition data: training rows, test rows and the submission template.
df = pd.read_csv("../input/tabular-playground-series-sep-2021/train.csv")
df_test = pd.read_csv("../input/tabular-playground-series-sep-2021/test.csv")
sample_submission = pd.read_csv("../input/tabular-playground-series-sep-2021/sample_solution.csv")

# Per-model predictions for the training ids. Every file stores its values in a
# generic "pred_1" column, so each one is renamed after the model it came from.
df1 = pd.read_csv("../input/ensambleyay/train_pred_xgb.csv").rename(columns={"pred_1": "pred_xgb"})
df2 = pd.read_csv("../input/ensambleyay/train_pred_lgbm.csv").rename(columns={"pred_1": "pred_lgbm"})
df3 = pd.read_csv("../input/ensambleyay/train_pred_nn.csv").rename(columns={"pred_1": "pred_nn"})

# Per-model predictions for the test ids; here the value column is "claim".
df_test1 = pd.read_csv("../input/ensambleyay/test_xgb.csv").rename(columns={"claim": "pred_xgb"})
df_test2 = pd.read_csv("../input/ensambleyay/test_lgbm.csv").rename(columns={"claim": "pred_lgbm"})
df_test3 = pd.read_csv("../input/ensambleyay/test_nn.csv").rename(columns={"claim": "pred_nn"})

# Attach the three prediction columns to the original frames by id.
df = (
    df.merge(df1, on="id", how="left")
    .merge(df2, on="id", how="left")
    .merge(df3, on="id", how="left")
)

df_test = (
    df_test.merge(df_test1, on="id", how="left")
    .merge(df_test2, on="id", how="left")
    .merge(df_test3, on="id", how="left")
)

df.head(10)

In [None]:
# Preview the test frame after the level-0 prediction columns were merged in.
df_test.head(10)

# **Straight To Blending**
Two models were tried for blending: `LinearRegression` and `XGBRegressor`.
`LinearRegression` won.

In [None]:
# from sklearn.linear_model import LinearRegression
# from xgboost import XGBRegressor
# useful_features = ["pred_xgb", "pred_lgbm"]
# test = df_test[useful_features]

# final_preds = []

# X_train = df[useful_features]
# y_train = df.claim

# model = LinearRegression()
# model.fit(X_train, y_train)

# # model = XGBRegressor(n_estimators=100)
# # model.fit(X_train, y_train)

# predictions = model.predict(test)

# **Blending but 10-fold**
This cell averages the predictions of 10 models, each trained on a different 9/10 of the training set, instead of using a single model trained on the entire training set. Averaging over folds can help reduce overfitting.

In [None]:
# useful_features = ["pred_xgb", "pred_lgbm"]
# test = df_test[useful_features]
# X = df[useful_features]
# y = df.claim

# kfold = StratifiedKFold(n_splits = 10, random_state=42, shuffle=True)
# preds = [0.0]*test.shape[0]
# for idx in kfold.split(X=X, y=y):
#     train_idx, val_idx = idx[0], idx[1]
#     X_train = X.iloc[train_idx]
#     y_train = y.iloc[train_idx]
#     X_val = X.iloc[val_idx]
#     y_val = y.iloc[val_idx]
    
#     model = LinearRegression()
#     model.fit(X_train, y_train)
    
#     val_preds = model.predict(X_val)
#     print(roc_auc_score(y_val, val_preds))
#     predictions = model.predict(test)
#     print(predictions.shape, predictions[:10])
#     preds += predictions / kfold.n_splits
# print("final", preds[:10])

# **Choosing the models**
Even though the initial plan was to use three models, the vanilla dense neural network model always dropped the score. So, the new plan was to discard the predictions made by neural networks. These will be the Level-0 models.

In [None]:
# Only the XGBoost and LightGBM prediction columns feed the next level.
useful_features = ["pred_xgb", "pred_lgbm"]
X, test = df[useful_features], df_test[useful_features]
y = df["claim"]

# **Looking at the predictions**
If we take a peek at the explained variance, we can see that there is not much difference between the two predictions.

In [None]:
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8-*"
# and later releases removed the old names. Use the new name when this
# installation provides it and fall back to the original one otherwise.
_whitegrid_style = next(
    (
        name
        for name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid")
        if name in plt.style.available
    ),
    "seaborn-whitegrid",
)
plt.style.use(_whitegrid_style)

# Global figure defaults shared by every plot in the notebook.
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)


def plot_variance(pca, width=8, dpi=100):
    """Plot per-component and cumulative explained variance of a fitted PCA.

    Parameters
    ----------
    pca : fitted PCA-like object
        Must expose ``n_components_`` and ``explained_variance_ratio_``.
    width : float, default 8
        Width passed to the figure as ``figwidth``.
    dpi : float, default 100
        Resolution passed to the figure as ``dpi``.

    Returns
    -------
    The pair of axes created by ``plt.subplots(1, 2)``: the bar chart of the
    explained-variance ratio first, the cumulative curve second.
    """
    # Create figure
    fig, axs = plt.subplots(1, 2)
    n = pca.n_components_
    grid = np.arange(1, n + 1)
    # Explained variance
    evr = pca.explained_variance_ratio_
    axs[0].bar(grid, evr)
    axs[0].set(
        xlabel="Component", title="% Explained Variance", ylim=(0.0, 1.0)
    )
    # Cumulative Variance (a leading 0 anchors the curve at the origin)
    cv = np.cumsum(evr)
    axs[1].plot(np.r_[0, grid], np.r_[0, cv], "o-")
    axs[1].set(
        xlabel="Component", title="% Cumulative Variance", ylim=(0.0, 1.0)
    )
    # Set up figure: honour the caller's size/resolution instead of fixed values
    fig.set(figwidth=width, dpi=dpi)
    return axs

def make_mi_scores(X, y, discrete_features):
    """Return one mutual-information score per column of ``X``, highest first.

    The scores come from ``mutual_info_regression(X, y, ...)`` and are returned
    as a Series named "MI Scores", indexed by ``X.columns``.
    """
    raw_scores = mutual_info_regression(X, y, discrete_features=discrete_features)
    labelled = pd.Series(raw_scores, name="MI Scores", index=X.columns)
    return labelled.sort_values(ascending=False)

# Fit a full PCA on the level-0 prediction columns and keep the projection
# as a DataFrame whose columns are labelled PC1, PC2, ...
pca = PCA()
X_pca = pca.fit_transform(X)
component_names = ["PC" + str(k) for k in range(1, X_pca.shape[1] + 1)]
X_pca = pd.DataFrame(data=X_pca, columns=component_names)

# Show the original features and their projection one after the other.
print(X.head(), X_pca.head(), sep="\n")

In [None]:
# Loadings table: rows are the original features, columns are the principal
# components (pca.components_ stores one component per row, hence the transpose).
loadings = pd.DataFrame(
    data=pca.components_.T,
    index=X.columns,
    columns=component_names,
)
loadings

In [None]:
# Draw the explained-variance and cumulative-variance charts for the fitted PCA.
plot_variance(pca)

In [None]:
# mi_scores = make_mi_scores(X_pca, y, discrete_features=False)
# mi_scores

# **GridSearch for Level-1 models**

**The first model is a Histogram Gradient Boosting Regressor, a lightweight model for making predictions on only two features. A visualization of the parameter search is also generated.**

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor

# Search space: number of boosting iterations x learning rate.
param_grid = {
    "max_iter": [100, 200, 300, 400, 500, 600],
    "learning_rate": [1e-1, 1e-2, 1e-3],
}
estimator = HistGradientBoostingRegressor()

# Each candidate is scored with ROC AUC through a scorer built by make_scorer.
my_scorer = make_scorer(roc_auc_score)

grid = GridSearchCV(
    estimator=estimator,
    param_grid=param_grid,
    scoring=my_scorer,
    cv=5,
    n_jobs=-1,
    verbose=10,
)
grid.fit(X, y)

print(grid.best_score_ , grid.best_params_)

In [None]:
from sklearn.svm import SVC
from sklearn import datasets
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

def plot_grid_search(cv_results, grid_param_1, grid_param_2, name_param_1, name_param_2):
    """Plot mean CV score against one hyper-parameter, one curve per value of another.

    Parameters
    ----------
    cv_results : mapping
        Must contain ``'mean_test_score'`` (for example ``GridSearchCV.cv_results_``).
        The scores are reshaped to ``(len(grid_param_2), len(grid_param_1))``, so
        they must be ordered with ``grid_param_1`` varying fastest.
    grid_param_1 : sequence
        Values placed on the x-axis.
    grid_param_2 : sequence
        Values that each get their own curve.
    name_param_1 : str
        Label of the x-axis.
    name_param_2 : str
        Prefix used for the legend entries.

    Raises
    ------
    ValueError
        From the reshape, when the number of scores is not
        ``len(grid_param_1) * len(grid_param_2)``.
    """
    # Only the mean scores are drawn; the per-candidate standard deviation was
    # previously extracted but never used, so it is no longer required.
    scores_mean = np.array(cv_results['mean_test_score']).reshape(
        len(grid_param_2), len(grid_param_1)
    )

    # Plot Grid search scores
    _, ax = plt.subplots(1, 1)

    # Param1 is the X-axis, Param 2 is represented as a different curve (color line)
    for idx, val in enumerate(grid_param_2):
        ax.plot(grid_param_1, scores_mean[idx, :], '-o', label=name_param_2 + ': ' + str(val))

    ax.set_title("Grid Search Scores", fontsize=20, fontweight='bold')
    ax.set_xlabel(name_param_1, fontsize=16)
    ax.set_ylabel('CV Average Score', fontsize=16)
    ax.legend(loc="best", fontsize=15)
    # Pass a real boolean rather than the string 'on'.
    ax.grid(True)

# Draw the CV curves for the search above; the two value lists repeat the
# searched grid so the reshaped scores line up with the axes and the legend.
plot_grid_search(
    cv_results=grid.cv_results_,
    grid_param_1=[100, 200, 300, 400, 500, 600],
    grid_param_2=[1e-1, 1e-2, 1e-3],
    name_param_1='max_iters',
    name_param_2='learning rate',
)

**With the optimal parameters, out-of-fold validation predictions and test predictions are made with a 10-fold split for the final blending.**

In [None]:
# Level-1 model #1: HistGradientBoostingRegressor trained on the level-0
# predictions with 10 stratified folds. Writes:
#   train_pred_L2_1.csv - out-of-fold predictions, one row per training id
#   test_L2_1.csv       - test predictions averaged over the 10 fold models
# useful_features = ["pred_xgb", "pred_lgbm", "pred_nn"]
useful_features = ["pred_xgb", "pred_lgbm"]
test = df_test[useful_features]
X = df[useful_features]
y = df.claim

kfold = StratifiedKFold(n_splits = 10, random_state=41, shuffle=True)
final_test_predictions = []
final_valid_predictions = {}
for train_idx, val_idx in kfold.split(X=X, y=y):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    model = HistGradientBoostingRegressor(max_iter = 200, learning_rate = 0.01)
    model.fit(X_train, y_train)
    val_preds = model.predict(X_val)
    predictions = model.predict(test)
    # Fold AUC plus a peek at this fold's test predictions.
    print(roc_auc_score(y_val, val_preds), predictions[:10])

    final_test_predictions.append(predictions)
    # Key the out-of-fold predictions by the real "id" value, not by row
    # position: the file written below is merged back on "id", which is only
    # equivalent to the position when ids happen to be 0..n-1 in file order.
    val_ids = df["id"].iloc[val_idx].to_numpy()
    final_valid_predictions.update(zip(val_ids, val_preds))

final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_1"]
final_valid_predictions.to_csv("train_pred_L2_1.csv", index=False)

submission = pd.read_csv("../input/tabular-playground-series-sep-2021/sample_solution.csv")
# Average the fold models' test predictions row by row.
submission["claim"] = np.mean(np.column_stack(final_test_predictions), axis=1)
submission.to_csv("test_L2_1.csv", index=False)

# Report the averaged predictions that were written, not the last fold's.
print("final", submission["claim"].to_numpy()[:10])

# **GridSearch and visualization for Level-1 XGBRegressor**

In [None]:
# Search space for the level-1 XGBoost model: number of trees x learning rate.
param_grid = {
    "n_estimators": [50, 100, 150, 200, 250, 300],
    "learning_rate": [1e-1, 1e-2, 1e-3],
}

# GPU-backed regressor with a fixed seed.
estimator = XGBRegressor(
    tree_method="gpu_hist",
    predictor="gpu_predictor",
    random_state=41,
)

# Each candidate is scored with ROC AUC through a scorer built by make_scorer.
my_scorer = make_scorer(roc_auc_score)

grid = GridSearchCV(
    estimator=estimator,
    param_grid=param_grid,
    scoring=my_scorer,
    cv=5,
    n_jobs=-1,
    verbose=10,
)
grid.fit(X, y)

print(grid.best_score_ , grid.best_params_)

In [None]:
# Draw the CV curves for the XGBoost search; the two value lists repeat the
# searched grid so the reshaped scores line up with the axes and the legend.
plot_grid_search(
    cv_results=grid.cv_results_,
    grid_param_1=[50, 100, 150, 200, 250, 300],
    grid_param_2=[1e-1, 1e-2, 1e-3],
    name_param_1='n_estimators',
    name_param_2='learning rate',
)

In [None]:
# Level-1 model #2: XGBRegressor trained on the level-0 predictions with 10
# stratified folds. Writes:
#   train_pred_L2_2.csv - out-of-fold predictions, one row per training id
#   test_L2_2.csv       - test predictions averaged over the 10 fold models
useful_features = ["pred_xgb", "pred_lgbm"]
test = df_test[useful_features]
X = df[useful_features]
y = df.claim

kfold = StratifiedKFold(n_splits = 10, random_state=41, shuffle=True)
final_test_predictions = []
final_valid_predictions = {}
for train_idx, val_idx in kfold.split(X=X, y=y):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    model = XGBRegressor(n_estimators=150,
                         learning_rate = 0.01,
                         random_state = 41,
                         tree_method="gpu_hist",
                         predictor="gpu_predictor",
                         verbosity=0, n_jobs=-1)
    model.fit(X_train, y_train)
    val_preds = model.predict(X_val)
    predictions = model.predict(test)
    # Fold AUC plus a peek at this fold's test predictions.
    print(roc_auc_score(y_val, val_preds), predictions[:10])

    final_test_predictions.append(predictions)
    # Key the out-of-fold predictions by the real "id" value, not by row
    # position: the file written below is merged back on "id", which is only
    # equivalent to the position when ids happen to be 0..n-1 in file order.
    val_ids = df["id"].iloc[val_idx].to_numpy()
    final_valid_predictions.update(zip(val_ids, val_preds))

final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_1"]
final_valid_predictions.to_csv("train_pred_L2_2.csv", index=False)

submission = pd.read_csv("../input/tabular-playground-series-sep-2021/sample_solution.csv")
# Average the fold models' test predictions row by row.
submission["claim"] = np.mean(np.column_stack(final_test_predictions), axis=1)
submission.to_csv("test_L2_2.csv", index=False)

# Report the averaged predictions that were written, not the last fold's.
print("final", submission["claim"].to_numpy()[:10])

In [None]:
# Start again from the raw competition files so only level-1 columns get attached.
df = pd.read_csv("../input/tabular-playground-series-sep-2021/train.csv")
df_test = pd.read_csv("../input/tabular-playground-series-sep-2021/test.csv")
sample_submission = pd.read_csv("../input/tabular-playground-series-sep-2021/sample_solution.csv")

# Validation predictions written by the two level-1 cells, renamed per model.
df1 = pd.read_csv("train_pred_L2_1.csv").rename(columns={"pred_1": "pred_hist"})
df2 = pd.read_csv("train_pred_L2_2.csv").rename(columns={"pred_1": "pred_xgb2"})

# Matching test predictions; their value column is "claim".
df_test1 = pd.read_csv("test_L2_1.csv").rename(columns={"claim": "pred_hist"})
df_test2 = pd.read_csv("test_L2_2.csv").rename(columns={"claim": "pred_xgb2"})

# Attach both prediction columns by id.
df = df.merge(df1, on="id", how="left").merge(df2, on="id", how="left")

df_test = df_test.merge(df_test1, on="id", how="left").merge(df_test2, on="id", how="left")

In [None]:
# Preview the training frame with the pred_hist / pred_xgb2 columns merged in.
df.head(10)

In [None]:
# Preview the test frame with the pred_hist / pred_xgb2 columns merged in.
df_test.head(10)

# **Final Blending**

In [None]:
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
# Blend the two level-1 prediction columns with one linear model fitted on
# every training row.
# useful_features = ["pred_xgb", "pred_lgbm", "pred_nn"]
useful_features = ["pred_hist", "pred_xgb2"]
X_train, y_train = df[useful_features], df["claim"]
test = df_test[useful_features]

final_preds = []

# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X_train, y_train)

# model = XGBRegressor(n_estimators=100)
# model.fit(X_train, y_train)

predictions = model.predict(test)

# **Final Blending but 10-fold**

In [None]:
# Final blend: a LinearRegression per stratified fold on the two level-1
# prediction columns; `predictions` ends up holding the mean of the 10 fold
# models' test predictions.
# useful_features = ["pred_xgb", "pred_lgbm", "pred_nn"]
useful_features = ["pred_hist", "pred_xgb2"]
test = df_test[useful_features]
X = df[useful_features]
y = df.claim

kfold = StratifiedKFold(n_splits = 10, random_state=42, shuffle=True)
# Start from a float array so the accumulation below is an explicit
# element-wise add rather than relying on list/ndarray "+=" coercion.
predictions = np.zeros(test.shape[0])
for train_idx, val_idx in kfold.split(X=X, y=y):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    model = LinearRegression()
    model.fit(X_train, y_train)

    val_preds = model.predict(X_val)
    print(roc_auc_score(y_val, val_preds))
    preds = model.predict(test)
    # Show this fold's test predictions (the running sum was printed before).
    print(preds.shape, preds[:10])
    # Each fold contributes 1/n_splits of the final average.
    predictions += preds / kfold.n_splits
# Show the averaged blend that gets submitted, not the last fold's output.
print("final", predictions[:10])

In [None]:
# Put the blended predictions into the submission template and save it.
# submission = pd.read_csv("../input/submissions-ensemble/submission_nn.csv")
sample_submission["claim"] = predictions
sample_submission.to_csv("submission_ens_stack3.csv", index=False)
sample_submission