## Import Necessary Libraries

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path

import os
# Echo the full path of every file found beneath the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)
        
import matplotlib.pyplot as plt
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, roc_auc_score
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression

# explicitly require this experimental feature
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
# now you can import normally from ensemble
from sklearn.ensemble import HistGradientBoostingRegressor

## Read the data files

In [None]:
# Locations of the three input CSVs.
TRAIN_CSV = '../input/tpssep2021dataset10folds/train_10_folds.csv'
TEST_CSV = '/kaggle/input/tabular-playground-series-sep-2021/test.csv'
SUBMISSION_CSV = '/kaggle/input/tabular-playground-series-sep-2021/sample_solution.csv'

# train and test use their "id" column as the index; submission keeps "id"
# as an ordinary column.
train = pd.read_csv(TRAIN_CSV, index_col='id')
test = pd.read_csv(TEST_CSV, index_col='id')
submission = pd.read_csv(SUBMISSION_CSV)

# Report the shapes in the order: test, train, submission.
for frame in (test, train, submission):
    print(frame.shape)

## Introducing Additional Features

In [None]:
# Per-row count of null cells, added as an extra feature to both frames.
# (Author's note: this feature increases the score significantly.)
# The count is taken over every column present in the frame at this point.
for frame in (train, test):
    frame["missing_value_cnt"] = frame.isnull().sum(axis=1)

# Stacking Level 1

## Read the Prediction Files

In [None]:
# Level-0 model predictions: out-of-fold predictions for the training rows and
# predictions for the test rows, each keyed by an "id" column.
xgb_train_preds = pd.read_csv("../input/tps-sep2021-model-predictions-for-blending/XGB_train_predictions.csv")
lgbm_train_preds = pd.read_csv("../input/tps-sep2021-model-predictions-for-blending/LGBM_train_predictions.csv")

xgb_test_preds = pd.read_csv("../input/tps-sep2021-model-predictions-for-blending/XGB_test_predictions.csv")
lgbm_test_preds = pd.read_csv("../input/tps-sep2021-model-predictions-for-blending/LGBM_test_predictions.csv")

# train/test carry "id" as their index, while the prediction files carry it as
# a column. DataFrame.merge(on="id") accepts that mix, but it returns a frame
# with a fresh 0..n-1 RangeIndex and "id" demoted to a column. The level-1
# cells below treat train.index as the row id (it is written out as the "id"
# column of the *_train_predictions_level_1.csv files and merged back on "id"
# in level 2), so a positional index would attach predictions to the wrong
# rows whenever the training file is not stored in id order.
# Merge column-on-column and then restore "id" as the index.
train = (
    train.reset_index()
    .merge(xgb_train_preds, on="id", how="left")
    .merge(lgbm_train_preds, on="id", how="left")
    .set_index("id")
)

test = (
    test.reset_index()
    .merge(xgb_test_preds, on="id", how="left")
    .merge(lgbm_test_preds, on="id", how="left")
    .set_index("id")
)

In [None]:
# Display the first rows of train after the level-0 predictions were merged in.
train.head()

In [None]:
# Display the first rows of test after the level-0 predictions were merged in.
test.head()

## Training XGBRegressor model with 10 Fold Cross Validation

In [None]:
# Level-1 XGBoost stacker. For each of the 10 folds: fit on the level-0
# prediction columns of the other folds, predict the held-out fold and the
# test set, and score the held-out fold with ROC AUC.
level_0_models = ['XGB', 'LGBM']
feature_cols = ['{}_preds'.format(name) for name in level_0_models]
oof_col = 'XGB_preds_level_1'

all_test_predictions = []
auc_scores = []
# Out-of-fold predictions: one row per row of train, filled in fold by fold.
valid_predictions = pd.DataFrame(
    np.zeros(train.index.shape), index=train.index, columns=[oof_col]
)

# The test feature matrix is the same for every fold, so select it once.
X_test = test.loc[:, feature_cols]

for fold in range(10):
    held_out = train.fold == fold
    X_valid = train[held_out]
    X_train = train[~held_out]

    # Remember which rows of valid_predictions this fold fills in.
    valid_ids = X_valid.index.tolist()

    y_train, y_valid = X_train.claim, X_valid.claim
    X_train = X_train.loc[:, feature_cols]
    X_valid = X_valid.loc[:, feature_cols]

    # Requests the GPU histogram tree method and GPU predictor.
    model = XGBRegressor(
        tree_method="gpu_hist",
        random_state=1234,
        predictor="gpu_predictor",
    )
    model.fit(X_train, y_train)

    valid_preds = model.predict(X_valid)
    test_preds = model.predict(X_test)
    valid_predictions.loc[valid_ids, oof_col] = valid_preds
    all_test_predictions.append(test_preds)

    roc_auc = roc_auc_score(y_valid, valid_preds)
    auc_scores.append(roc_auc)
    print("Validation score for fold {}: {}".format(fold, roc_auc))

print("Validation scores mean : {} and Standard deviation : {}".format(np.mean(auc_scores), np.std(auc_scores)))

In [None]:
# Move the out-of-fold frame's index into an explicit "id" column and save it.
# The "id" values are whatever labels the frame's index carried when it was
# created.
valid_predictions = valid_predictions.rename_axis("id").reset_index()
valid_predictions.to_csv("XGB_train_predictions_level_1.csv", index=False)

print(valid_predictions.shape)
valid_predictions.head()

In [None]:
# Average the per-fold test predictions (rows = folds) and save them on a copy
# of the sample submission, whose two columns are relabelled below.
per_fold_test = np.array(all_test_predictions)
test_predictions = submission.copy()
test_predictions["claim"] = per_fold_test.mean(axis=0)
test_predictions.columns = ["id", "XGB_preds_level_1"]
test_predictions.to_csv("XGB_test_predictions_level_1.csv", index=False)

print(test_predictions.shape)
test_predictions.head()

## Training Random Forest model with 10 Fold Cross Validation

In [None]:
# level_0_models = ['XGB', 'LGBM']
# all_test_predictions = []
# valid_predictions = pd.DataFrame(np.zeros(train.index.shape), index = train.index, columns=['RF_preds_level_1'])
# auc_scores = []

# for fold in range(10):
#     X_train =  train[train.fold != fold]
#     X_valid = train[train.fold == fold]
#     X_test = test.copy()
    
#     valid_ids = X_valid.index.tolist()

#     y_train = X_train.claim
#     y_valid = X_valid.claim
    
#     X_train = X_train.loc[:, ['{}_preds'.format(model) for model in level_0_models]]
#     X_valid = X_valid.loc[:, ['{}_preds'.format(model) for model in level_0_models]]
#     X_test = X_test.loc[:, ['{}_preds'.format(model) for model in level_0_models]]
    
#     model = RandomForestRegressor(random_state=1234, verbose = 10, n_jobs = -1)

#     model.fit(X_train, y_train)
#     valid_preds = model.predict(X_valid)
#     test_preds = model.predict(X_test)
#     all_test_predictions.append(test_preds)
#     valid_predictions.loc[valid_ids, 'RF_preds_level_1'] = valid_preds
    
#     roc_auc = roc_auc_score(y_valid, valid_preds)
#     print("Validation score for fold {}: {}".format(fold, roc_auc))
#     auc_scores.append(roc_auc)

# print("Validation scores mean : {} and Standard deviation : {}".format(np.mean(auc_scores), np.std(auc_scores)))

In [None]:
# valid_predictions = valid_predictions.reset_index()
# valid_predictions.columns = ["id", "RF_preds_level_1"]
# valid_predictions.to_csv("RF_train_predictions_level_1.csv", index=False)

# print(valid_predictions.shape)
# valid_predictions.head()

In [None]:
# test_predictions = submission.copy()
# test_predictions.claim = np.mean(np.array(all_test_predictions), axis=0)
# test_predictions.columns = ["id", "RF_preds_level_1"]
# test_predictions.to_csv("RF_test_predictions_level_1.csv", index=False)

# print(test_predictions.shape)
# test_predictions.head()

## Training Logistic Regression model with 10 Fold Cross Validation

In [None]:
# Level-1 logistic-regression stacker. Same 10-fold scheme as the XGBoost
# stacker: fit on the level-0 prediction columns of nine folds, then store the
# positive-class probability for the held-out fold and for the test set.
level_0_models = ['XGB', 'LGBM']
feature_cols = ['{}_preds'.format(name) for name in level_0_models]
oof_col = 'LoR_preds_level_1'

all_test_predictions = []
auc_scores = []
# Out-of-fold predictions: one row per row of train, filled in fold by fold.
valid_predictions = pd.DataFrame(
    np.zeros(train.index.shape), index=train.index, columns=[oof_col]
)

# The test feature matrix is the same for every fold, so select it once.
X_test = test.loc[:, feature_cols]

for fold in range(10):
    held_out = train.fold == fold
    X_valid = train[held_out]
    X_train = train[~held_out]

    # Remember which rows of valid_predictions this fold fills in.
    valid_ids = X_valid.index.tolist()

    y_train, y_valid = X_train.claim, X_valid.claim
    X_train = X_train.loc[:, feature_cols]
    X_valid = X_valid.loc[:, feature_cols]

    model = LogisticRegression(max_iter = 1000, random_state=1234, n_jobs = -1)
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the second class.
    valid_preds = model.predict_proba(X_valid)[:, 1]
    test_preds = model.predict_proba(X_test)[:, 1]
    valid_predictions.loc[valid_ids, oof_col] = valid_preds
    all_test_predictions.append(test_preds)

    roc_auc = roc_auc_score(y_valid, valid_preds)
    auc_scores.append(roc_auc)
    print("Validation score for fold {}: {}".format(fold, roc_auc))

print("Validation scores mean : {} and Standard deviation : {}".format(np.mean(auc_scores), np.std(auc_scores)))

In [None]:
# Move the out-of-fold frame's index into an explicit "id" column and save it.
# The "id" values are whatever labels the frame's index carried when it was
# created.
valid_predictions = valid_predictions.rename_axis("id").reset_index()
valid_predictions.to_csv("LoR_train_predictions_level_1.csv", index=False)

print(valid_predictions.shape)
valid_predictions.head()

In [None]:
# Average the per-fold test predictions (rows = folds) and save them on a copy
# of the sample submission, whose two columns are relabelled below.
per_fold_test = np.array(all_test_predictions)
test_predictions = submission.copy()
test_predictions["claim"] = per_fold_test.mean(axis=0)
test_predictions.columns = ["id", "LoR_preds_level_1"]
test_predictions.to_csv("LoR_test_predictions_level_1.csv", index=False)

print(test_predictions.shape)
test_predictions.head()

# Stacking Level 2

## Read the Predictions From Stacking Level 1

In [None]:
# Replace the level-0 prediction columns on train/test with the level-1
# predictions saved by the cells above.
level_0_models = ['XGB', 'LGBM']
level_0_cols = ['{}_preds'.format(name) for name in level_0_models]
train = train.drop(columns=level_0_cols)
test = test.drop(columns=level_0_cols)

# Level-1 predictions for the training rows (RF is currently disabled).
xgb_train_preds = pd.read_csv("XGB_train_predictions_level_1.csv")
# rf_train_preds = pd.read_csv("RF_train_predictions_level_1.csv")
lor_train_preds = pd.read_csv("LoR_train_predictions_level_1.csv")

# Level-1 predictions for the test rows (RF is currently disabled).
xgb_test_preds = pd.read_csv("XGB_test_predictions_level_1.csv")
# rf_test_preds = pd.read_csv("RF_test_predictions_level_1.csv")
lor_test_preds = pd.read_csv("LoR_test_predictions_level_1.csv")

# Left-join every level-1 prediction file onto train and test by "id".
train = (
    train
    .merge(xgb_train_preds, on="id", how="left")
    # .merge(rf_train_preds, on="id", how="left")
    .merge(lor_train_preds, on="id", how="left")
)

test = (
    test
    .merge(xgb_test_preds, on="id", how="left")
    # .merge(rf_test_preds, on="id", how="left")
    .merge(lor_test_preds, on="id", how="left")
)

## Training Logistic Regression model with 10 Fold Cross Validation

In [None]:
# Level-2 stacker: logistic regression on the level-1 prediction columns,
# using the same 10-fold split. Each fold contributes one set of test-set
# probabilities and one held-out ROC AUC score.
level_1_models = ['XGB', 'LoR']
feature_cols = ['{}_preds_level_1'.format(name) for name in level_1_models]

all_test_predictions = []
auc_scores = []

# The test feature matrix is the same for every fold, so select it once.
X_test = test.loc[:, feature_cols]

for fold in range(10):
    held_out = train.fold == fold
    X_valid = train[held_out]
    X_train = train[~held_out]

    y_train, y_valid = X_train.claim, X_valid.claim
    X_train = X_train.loc[:, feature_cols]
    X_valid = X_valid.loc[:, feature_cols]

    model = LogisticRegression(max_iter = 1000, random_state=1234, n_jobs = -1)
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the second class.
    valid_preds = model.predict_proba(X_valid)[:, 1]
    test_preds = model.predict_proba(X_test)[:, 1]
    all_test_predictions.append(test_preds)

    roc_auc = roc_auc_score(y_valid, valid_preds)
    auc_scores.append(roc_auc)
    print("Validation score for fold {}: {}".format(fold, roc_auc))

print("Validation scores mean : {} and Standard deviation : {}".format(np.mean(auc_scores), np.std(auc_scores)))

In [None]:
# Final submission: the element-wise mean of the per-fold test predictions.
per_fold_test = np.array(all_test_predictions)
submission["claim"] = per_fold_test.mean(axis=0)
submission.to_csv("stacking_output_kfold_cv.csv", index=False)

In [None]:
# Print the shape of the submission frame and display its first rows.
print(submission.shape)
submission.head()

## Training Model with Whole Training Data

In [None]:
# level_1_models = ['XGB', 'RF']

# X_train = train.copy()
# X_test = test.copy()

# y_train = train.claim
# X_train = X_train.loc[:, ['{}_preds_level_1'.format(model) for model in level_1_models]]
# X_test = X_test.loc[:, ['{}_preds_level_1'.format(model) for model in level_1_models]]

In [None]:
# model = LinearRegression()
# model.fit(X_train, y_train)
# test_preds = model.predict(X_test)

In [None]:
# print(roc_auc_score(y_train, model.predict(X_train)))

## Submission

In [None]:
# submission['claim'] = test_preds
# submission.to_csv('stacking_output_with_whole_data.csv', index = False)