## Import Necessary Libraries

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, name) for name in filenames)
    for full_path in full_paths:
        print(full_path)
        
import matplotlib.pyplot as plt
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, roc_auc_score
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

## Read the data files

In [None]:
# Load the training CSV, using its `id` column as the row index,
# then show its dimensions and first rows.
train_csv = '../input/tpssep2021dataset10folds/train_10_folds.csv'
train = pd.read_csv(train_csv, index_col='id')
print(train.shape)
train.head()

In [None]:
# Display pandas' descriptive statistics for the training frame.
train.describe()

In [None]:
# Load the competition test CSV, using its `id` column as the row index,
# then show its dimensions and first rows.
test_csv = '/kaggle/input/tabular-playground-series-sep-2021/test.csv'
test = pd.read_csv(test_csv, index_col='id')
print(test.shape)
test.head()

In [None]:
# Display pandas' descriptive statistics for the test frame.
test.describe()

In [None]:
# Load the sample solution file; it is reused later as the template for the
# prediction file that gets written out.
sample_solution_csv = '/kaggle/input/tabular-playground-series-sep-2021/sample_solution.csv'
submission = pd.read_csv(sample_solution_csv)
submission.head()

## Introducing Additional Features

In [None]:
# Extra feature: how many cells are missing in each row. The author notes that
# adding this count increases the score significantly.
for frame in (train, test):
    frame["missing_value_cnt"] = frame.isna().sum(axis=1)

train.head()

## Imputation for Handling Missing Values

In [None]:
def imputation(X_train, X_valid, X_test = None):
    """Fill missing values with column means learned from the training split.

    A ``SimpleImputer(strategy='mean')`` is fitted on ``X_train`` only and then
    applied to ``X_valid`` (and ``X_test`` when given), so no statistics are
    taken from the validation or test rows.

    Args:
        X_train: DataFrame used to fit the imputer.
        X_valid: DataFrame transformed with the fitted imputer.
        X_test: Optional DataFrame transformed with the fitted imputer.

    Returns:
        Tuple ``(imputed_X_train, imputed_X_valid, imputed_X_test)`` of
        DataFrames; ``imputed_X_test`` is ``None`` when ``X_test`` is ``None``.
        Each returned frame keeps the column labels and the row index of the
        frame it was derived from.
    """
    imputer = SimpleImputer(strategy='mean')

    def _relabel(values, source):
        # The imputer hands back a bare ndarray; restore both the column names
        # and the row index so rows stay aligned with the caller's targets/ids.
        return pd.DataFrame(values, columns=source.columns, index=source.index)

    imputed_X_train = _relabel(imputer.fit_transform(X_train), X_train)
    imputed_X_valid = _relabel(imputer.transform(X_valid), X_valid)

    imputed_X_test = None
    if X_test is not None:
        imputed_X_test = _relabel(imputer.transform(X_test), X_test)

    return imputed_X_train, imputed_X_valid, imputed_X_test


## Feature Scaling

In [None]:
def feature_scaling(X_train, X_valid, X_test = None):
    """Standardise features using statistics learned from the training split.

    A ``StandardScaler`` is fitted on ``X_train`` only and then applied to
    ``X_valid`` (and ``X_test`` when given).

    Args:
        X_train: DataFrame used to fit the scaler.
        X_valid: DataFrame transformed with the fitted scaler.
        X_test: Optional DataFrame transformed with the fitted scaler.

    Returns:
        Tuple ``(scaled_X_train, scaled_X_valid, scaled_X_test)`` of
        DataFrames; ``scaled_X_test`` is ``None`` when ``X_test`` is ``None``.
        Each returned frame keeps the column labels and the row index of the
        frame it was derived from.
    """
    scaler = StandardScaler()

    def _relabel(values, source):
        # The scaler hands back a bare ndarray; restore both the column names
        # and the row index so rows stay aligned with the caller's targets/ids.
        return pd.DataFrame(values, columns=source.columns, index=source.index)

    scaled_X_train = _relabel(scaler.fit_transform(X_train), X_train)
    scaled_X_valid = _relabel(scaler.transform(X_valid), X_valid)

    scaled_X_test = None
    if X_test is not None:
        scaled_X_test = _relabel(scaler.transform(X_test), X_test)

    return scaled_X_train, scaled_X_valid, scaled_X_test

## HyperParameter Tuning using Optuna

In [None]:
# def objective(trial):
    
#     y = train.claim
#     X = train.drop(columns = ['claim'])
#     X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size = 0.8, random_state = 1234)
    
#     # Perform imputation
#     imputed_X_train, imputed_X_valid, _ = imputation(X_train, X_valid)
    
#     # Perform Feature Scaling
#     scaled_X_train, scaled_X_valid, _ = feature_scaling(imputed_X_train, imputed_X_valid)  

#     X_train = scaled_X_train.copy()
#     X_valid = scaled_X_valid.copy()

#     params = {
#         "n_estimators": trial.suggest_int("n_estimators", 300, 10000),
#         "max_depth": trial.suggest_int("max_depth", 3, 12),
#         "min_child_weight": trial.suggest_int("min_child_weight", 5, 12),
#         "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.5, log=True),
#         "gamma": trial.suggest_float("gamma", 1e-9, 1e-5, log=True),
#         "reg_lambda": trial.suggest_float("lambda", 1e-8, 1e-1, log=True),
#         "reg_alpha": trial.suggest_float("alpha", 1e-8, 1e-1, log=True),
#         "subsample": trial.suggest_float("subsample", 0.2, 1.0),
#         "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
#         "grow_policy": trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
#     }

#     xgb_regressor = XGBRegressor(**params,
#                                  tree_method="gpu_hist",
#                                  random_state=1234,
#                                  gpu_id=0,
#                                  predictor="gpu_predictor",
#                                  verbosity=0)

#     pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "validation_0-rmse")
    
#     xgb_regressor.fit(X_train, 
#                       y_train,
#                       eval_set=[(X_valid, y_valid)],
#                       eval_metric="rmse",
#                       early_stopping_rounds=15,
#                       verbose=False,
#                       callbacks=[pruning_callback])

#     y_pred = xgb_regressor.predict(X_valid)
#     roc_auc = roc_auc_score(y_valid, y_pred)

#     return roc_auc

In [None]:
# pruner = optuna.pruners.MedianPruner(n_warmup_steps=15)
# study = optuna.create_study(pruner= pruner, study_name="xgbr-study", direction="maximize")
# study.optimize(objective, n_trials=100, timeout=3600)

In [None]:
# print("Number of finished trials: ", len(study.trials))
# trial = study.best_trial
# print("Best trial validation score: {}".format(trial.value))

# print("The best parameters are: ")
# study.best_params

In [None]:
# Hyperparameters for the final XGBRegressor (unpacked with **best_params);
# presumably copied from the best trial of the Optuna study in this notebook.
# The L2/L1 regularisation strengths use the scikit-learn wrapper's own
# argument names (reg_lambda / reg_alpha) instead of the native booster
# aliases 'lambda' / 'alpha', so they are set as regular constructor
# arguments rather than relying on the wrapper's **kwargs pass-through.
best_params = {
    'n_estimators': 9727,
    'max_depth': 5,
    'min_child_weight': 6,
    'learning_rate': 0.011278075450219378,
    'gamma': 1.9201053461331828e-07,
    'reg_lambda': 3.2282518444851405e-06,
    'reg_alpha': 1.2871612752393361e-06,
    'subsample': 0.604103572661558,
    'colsample_bytree': 0.9805549632981628,
    'grow_policy': 'lossguide',
}

print(best_params)

## Training model with 10 Fold Cross Validation

In [None]:
# Per-fold collectors: test-set predictions and validation AUC scores.
all_test_predictions = []
auc_scores = []
# Out-of-fold predictions, one row per training id, initialised to zero and
# filled in fold by fold.
valid_predictions = pd.DataFrame(
    np.zeros(train.index.shape),
    index=train.index,
    columns=['XGB_preds'],
)
# print(valid_predictions.shape)

for fold in range(10):
    # Rows whose `fold` value equals the current fold are held out for
    # validation; every other row is used for training.
    in_valid_fold = train.fold == fold
    X_valid = train[in_valid_fold]
    X_train = train[~in_valid_fold]
    X_test = test.copy()

    # Remember the validation ids so the predictions can be written back to
    # the matching rows of `valid_predictions`.
    valid_ids = X_valid.index.tolist()

    y_train, y_valid = X_train.claim, X_valid.claim

    # Drop the target and the fold label so only features remain.
    non_feature_columns = ['claim', 'fold']
    X_train = X_train.drop(columns=non_feature_columns)
    X_valid = X_valid.drop(columns=non_feature_columns)

    # Impute first, then scale; both steps are fitted on the training split.
    X_train, X_valid, X_test = feature_scaling(
        *imputation(X_train, X_valid, X_test)
    )

    model = XGBRegressor(
        **best_params,
        random_state=1234,
        tree_method="gpu_hist",
        predictor="gpu_predictor",
        verbosity=0,
    )
    model.fit(X_train, y_train)

    valid_preds = model.predict(X_valid)
    test_preds = model.predict(X_test)

    valid_predictions.loc[valid_ids, 'XGB_preds'] = valid_preds
    all_test_predictions.append(test_preds)

    roc_auc = roc_auc_score(y_valid, valid_preds)
    auc_scores.append(roc_auc)
    print("Validation score for fold {}: {}".format(fold, roc_auc))

print("Validation scores mean : {} and Standard deviation : {}".format(np.mean(auc_scores), np.std(auc_scores)))

In [None]:
# Turn the id index into a regular column, label the two columns, and write
# the out-of-fold predictions to disk.
oof_column_names = ["id", "XGB_preds"]
valid_predictions = valid_predictions.reset_index().set_axis(oof_column_names, axis=1)
valid_predictions.to_csv("XGB_train_predictions.csv", index=False)

In [None]:
# Show the dimensions and first rows of the out-of-fold prediction frame.
print(valid_predictions.shape)
valid_predictions.head()

In [None]:
# Average the per-fold test predictions element-wise (one row per fold in the
# stacked array) and write the result out in the sample-solution layout.
fold_test_predictions = np.array(all_test_predictions)
submission.claim = fold_test_predictions.mean(axis=0)
# submission.columns = ["id", "XGB_preds"]
submission.to_csv("XGB_test_predictions.csv", index=False)

In [None]:
# Show the dimensions and first rows of the frame that was written out.
print(submission.shape)
submission.head()

## Training Model with Whole Training Data

In [None]:
# X_train = train.copy()
# X_test = test.copy()

# y_train = train.claim
# X_train = X_train.drop(columns = ['claim'])

# # Perform imputation
# imputed_X_train, imputed_X_test, _ = imputation(X_train, X_test)

# # Perform Feature Scaling
# scaled_X_train, scaled_X_test, _ = feature_scaling(imputed_X_train, imputed_X_test) 

In [None]:
# model = XGBRegressor(**best_params,
#                      verbosity = 0,
#                      tree_method="gpu_hist",
#                      random_state=1234,
#                      predictor="gpu_predictor")

# model.fit(scaled_X_train, y_train)
# test_predictions = model.predict(scaled_X_test)

In [None]:
# print(roc_auc_score(y_train, model.predict(scaled_X_train)))

## Submission

In [None]:
# submission['claim'] = test_predictions
# submission.to_csv('rf_output.csv', index = False)