## Import Necessary Libraries

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path

import os
# Walk the Kaggle input mount and print the full path of each file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))
        
import matplotlib.pyplot as plt
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, roc_auc_score
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression

## Read the data files

In [None]:
# Load the training set (the cross-validation loop later reads its `fold`
# column), the competition test set and the sample submission.  `train` and
# `test` use the `id` column as their index; `submission` keeps it as a column.
train = pd.read_csv(
    '../input/tpssep2021dataset10folds/train_10_folds.csv',
    index_col='id',
)
test = pd.read_csv(
    '/kaggle/input/tabular-playground-series-sep-2021/test.csv',
    index_col='id',
)
submission = pd.read_csv(
    '/kaggle/input/tabular-playground-series-sep-2021/sample_solution.csv'
)

# Print the (rows, columns) shape of each frame: test, train, submission.
for frame in (test, train, submission):
    print(frame.shape)

## Introducing Additional Features

In [None]:
# Store the number of missing cells in each row as an extra column on both
# frames.  The original note states that adding this feature increases the
# score significantly.
# NOTE(review): the count runs over every column currently in each frame, so
# for `train` it also covers non-feature columns such as `claim` and `fold`.
# NOTE(review): the blending loop in this notebook selects only the `*_preds`
# columns, so it does not read this column.
for frame in (train, test):
    frame["missing_value_cnt"] = frame.isna().sum(axis="columns")

## Read the Prediction Files

In [None]:
# Load the saved predictions of the two base models for the train and test
# rows.  NOTE(review): these files presumably supply the `XGB_preds` /
# `LGBM_preds` columns read by the blending loop -- confirm against the CSV
# headers.
xgb_train_preds = pd.read_csv(
    "../input/tps-sep2021-model-predictions-for-blending/XGB_train_predictions.csv"
)
lgbm_train_preds = pd.read_csv(
    "../input/tps-sep2021-model-predictions-for-blending/LGBM_train_predictions.csv"
)
xgb_test_preds = pd.read_csv(
    "../input/tps-sep2021-model-predictions-for-blending/XGB_test_predictions.csv"
)
lgbm_test_preds = pd.read_csv(
    "../input/tps-sep2021-model-predictions-for-blending/LGBM_test_predictions.csv"
)

# Left-join each prediction table on `id`, XGB first and then LGBM, so every
# row of the original frame is kept.
train = (
    train
    .merge(xgb_train_preds, on="id", how="left")
    .merge(lgbm_train_preds, on="id", how="left")
)
test = (
    test
    .merge(xgb_test_preds, on="id", how="left")
    .merge(lgbm_test_preds, on="id", how="left")
)

In [None]:
# Preview the first five rows of the training frame after the merges.
train.head(n=5)

In [None]:
# Preview the first five rows of the test frame after the merges.
test.head(n=5)

## Training the Model with 10-Fold Cross-Validation

In [None]:
# Blender: for each of the 10 fold ids, fit a logistic regression on the base
# models' prediction columns of the other folds, score it on the held-out fold
# with ROC AUC, and keep its test-set probabilities so they can be averaged
# after the loop.
models = ['XGB', 'LGBM']
# Prediction columns fed to the blender, one per base model.
pred_cols = [f'{name}_preds' for name in models]
all_test_predictions = []
auc_scores = []

# The test features do not depend on the fold, so select them once instead of
# copying the whole test frame on every iteration.
X_test = test.loc[:, pred_cols]

for fold in range(10):
    # Rows whose fold id equals `fold` are held out; all other rows train.
    is_valid = train.fold == fold

    X_train = train.loc[~is_valid, pred_cols]
    X_valid = train.loc[is_valid, pred_cols]
    y_train = train.loc[~is_valid, 'claim']
    y_valid = train.loc[is_valid, 'claim']

    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    # Column 1 of predict_proba corresponds to model.classes_[1]
    # (the positive class when `claim` is coded 0/1).
    valid_preds = model.predict_proba(X_valid)[:, 1]
    test_preds = model.predict_proba(X_test)[:, 1]
    all_test_predictions.append(test_preds)

    roc_auc = roc_auc_score(y_valid, valid_preds)
    print(f"Validation score for fold {fold}: {roc_auc}")
    auc_scores.append(roc_auc)

print(f"Validation scores mean : {np.mean(auc_scores)} and Standard deviation : {np.std(auc_scores)}")

In [None]:
# Average the per-fold test probabilities into the final blended prediction.
# Item assignment is used because it always targets the DataFrame column;
# attribute assignment (`submission.claim = ...`) only does so when the column
# already exists and otherwise sets a plain Python attribute.
# NOTE(review): the values are assigned by position, so this assumes the rows
# of `submission` are in the same order as the rows of `test` -- confirm.
submission["claim"] = np.mean(all_test_predictions, axis=0)
submission.to_csv("blending_output_kfold_cv.csv", index=False)

In [None]:
# Sanity-check the submission frame: print its shape, then show its first
# five rows.
print(submission.shape)
submission.head(n=5)

## Training Model with Whole Training Data

In [None]:
# models = ['XGB', 'LGBM']

# X_train = train.copy()
# X_test = test.copy()

# y_train = train.claim
# X_train = X_train.loc[:, ['{}_preds'.format(model) for model in models]]
# X_test = X_test.loc[:, ['{}_preds'.format(model) for model in models]]

In [None]:
# model = LinearRegression()
# model.fit(X_train, y_train)
# test_preds = model.predict(X_test)

In [None]:
# print(roc_auc_score(y_train, model.predict(X_train)))

## Submission

In [None]:
# submission['claim'] = test_preds
# submission.to_csv('blending_output_with_whole_data.csv', index = False)