The original dataset deals with predicting the amount of an insurance claim.

In [None]:
#Importing libraries
import pandas as pd
import numpy as np

from sklearn import model_selection
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

from xgboost import XGBRegressor
import optuna

In [None]:
# Load the competition's training split and preview its first five rows
df_train = pd.read_csv(
    filepath_or_buffer="../input/tabular-playground-series-feb-2021/train.csv"
)
df_train.head(5)

For this regression problem, we'll use K-fold cross-validation.

### Creating 5 Folds

In [None]:
# Tag every training row with the fold in which it is held out for validation,
# then persist the annotated frame so later cells can reload it.
df_train["K_Fold"] = -1  # -1 marks "not assigned yet"
kf = model_selection.KFold(n_splits = 5, shuffle= True, random_state = 42)
k_fold_pos = df_train.columns.get_loc("K_Fold")
for fold, (train_indicies, valid_indicies) in enumerate(kf.split(X = df_train)):
    print(fold, train_indicies, valid_indicies)
    # KFold.split yields *positional* row indices, so write through .iloc.
    # Label-based .loc only gives the same rows while the frame happens to
    # carry a default 0..n-1 index.
    df_train.iloc[valid_indicies, k_fold_pos] = fold

# Every row must have landed in exactly one validation fold.
assert (df_train["K_Fold"] >= 0).all(), "some rows were not assigned to a fold"

df_train.to_csv("KFolds_5.csv", index = False)

In [None]:
# Load the fold-annotated training data, then the competition's test set and
# its sample-submission template (read in that order)
df = pd.read_csv("./KFolds_5.csv")
df_test, df_samSub = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/tabular-playground-series-feb-2021/test.csv",
        "../input/tabular-playground-series-feb-2021/sample_submission.csv",
    )
)

#### Target Encoding of Categorical Data

In [None]:
# Out-of-fold target encoding.
# For each categorical column, every validation fold receives the per-category
# mean of `target` computed on the *other* folds only. The test set receives
# the average of the per-fold encodings.
useful_features = [uc for uc in df.columns if uc not in ("id","target","K_Fold")]
# Categorical features. Prefix match (the same rule the later cells use) so a
# generated `tar_enc_cat*` column can never be mistaken for a raw categorical.
object_cols = [col for col in useful_features if col.startswith('cat')]

# Explicit copy: columns are added to this frame below, and assigning into a
# column-slice of another frame raises pandas' SettingWithCopyWarning.
df_test = df_test[useful_features].copy()

# Take the fold labels from the data instead of hard-coding their count.
fold_ids = sorted(df["K_Fold"].unique())

for col in object_cols: #go through each column in object columns
    encoded_folds = [] #validation parts of every fold, each with the new column
    test_enc_sum = None #running sum of the per-fold encodings of the test set
    for fold in fold_ids:
        x_train = df[df.K_Fold != fold]
        x_valid = df[df.K_Fold == fold].reset_index(drop = True)
        # category -> mean target, learned without looking at the validation fold
        feat_enc = x_train.groupby(col)["target"].agg("mean").to_dict()
        # categories absent from the training folds map to NaN
        x_valid[f"tar_enc_{col}"] = x_valid[col].map(feat_enc)
        encoded_folds.append(x_valid)

        fold_test_enc = df_test[col].map(feat_enc)
        if test_enc_sum is None:   #first fold
            test_enc_sum = fold_test_enc
        else:
            test_enc_sum = test_enc_sum + fold_test_enc

    df_test[f"tar_enc_{col}"] = test_enc_sum / len(fold_ids)
    # ignore_index: each part was re-indexed from 0, so a plain concat would
    # leave duplicated index labels on the combined frame.
    df = pd.concat(encoded_folds, ignore_index = True)

In [None]:
# Rebuild the feature lists from the current columns of `df`:
# everything except the id, the label and the fold marker is a model input.
useful_features = [
    name for name in df.columns
    if name not in {"id", "target", "K_Fold"}
]
# Raw categoricals carry the `cat` prefix, continuous features the `cont` prefix.
object_cols = [name for name in useful_features if name.startswith("cat")]
numerical_cols = [name for name in useful_features if name.startswith("cont")]
# Keep the test frame aligned with the training feature order.
df_test = df_test[useful_features]

#### Hyperparameter Tuning using Optuna

In [None]:
def obj_func(trial, fold = 1):
    """Optuna objective: validation RMSE of an XGBoost regressor on one fold.

    Samples a set of XGBoost hyperparameters from ``trial``, fits the model on
    every row of the global ``df`` whose ``K_Fold`` differs from ``fold`` and
    scores it on the rows of that fold.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.
    fold : int, default 1
        Fold held out for validation. A single fold is used per trial because
        fitting on all folds means five model fits per trial, which is slow on
        data of this size.

    Returns
    -------
    float
        Root mean squared error on the validation fold (lower is better).
    """
    # Search space. Rates and regularisation strengths are sampled on a log scale.
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 0.25, log = True) #from 1e-5 to 0.25
    reg_lambda = trial.suggest_float("reg_lambda", 1e-8, 100.0, log = True) #regularisation lambda
    reg_alpha = trial.suggest_float("reg_alpha", 1e-8, 100.0, log = True)
    subsample = trial.suggest_float("subsample",0.1,1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree",0.1,1.0)
    max_depth = trial.suggest_int("max_depth",1,7)

    x_train = df[df.K_Fold != fold].reset_index(drop = True)
    x_valid = df[df.K_Fold == fold].reset_index(drop = True)

    y_train = x_train.target
    y_valid = x_valid.target

    # Copies: the columns are overwritten below, and assigning into a
    # column-slice of another frame raises pandas' SettingWithCopyWarning.
    x_train = x_train[useful_features].copy()
    x_valid = x_valid[useful_features].copy()

    # Encoders are fitted on the training part only and re-applied to validation.
    ord_enc = preprocessing.OrdinalEncoder()
    x_train[object_cols] = ord_enc.fit_transform(x_train[object_cols])
    x_valid[object_cols] = ord_enc.transform(x_valid[object_cols])

    std_scaler = preprocessing.StandardScaler()
    x_train[numerical_cols] = std_scaler.fit_transform(x_train[numerical_cols])
    x_valid[numerical_cols] = std_scaler.transform(x_valid[numerical_cols])

    xgb_model = XGBRegressor(random_state = 42,
                             tree_method = "gpu_hist",
                             gpu_id = 0,
                             predictor = "gpu_predictor",
                             n_estimators = 7000, #keep it low(7k/10k/15k) to get good results
                             learning_rate = learning_rate,
                             reg_lambda = reg_lambda,
                             reg_alpha = reg_alpha,
                             subsample = subsample,
                             colsample_bytree = colsample_bytree,
                             max_depth = max_depth)

    # NOTE(review): newer XGBoost releases expect `early_stopping_rounds` on the
    # estimator constructor rather than on fit() - confirm against the installed version.
    xgb_model.fit(x_train,y_train, early_stopping_rounds=500, eval_set=[(x_valid,y_valid)],verbose = 1000)
    pred_valid = xgb_model.predict(x_valid)
    # sqrt(MSE) rather than `squared=False`, which recent scikit-learn no longer accepts.
    rmse = float(np.sqrt(mean_squared_error(y_valid, pred_valid)))
    return rmse

In [None]:
# Minimise the validation RMSE returned by obj_func.
# The sampler is seeded so that it proposes the same sequence of hyperparameters
# on every fresh run of the notebook.
study = optuna.create_study(direction = "minimize",
                            sampler = optuna.samplers.TPESampler(seed = 42))
study.optimize(obj_func, n_trials=7)
study.best_params

#### Optimized XGBoost 

In [None]:
# Train one XGBoost model per fold with the tuned hyperparameters, score each on
# its held-out fold and collect the test-set predictions for averaging.
#
# NOTE(review): this cell reloads the raw fold file, so the `tar_enc_*` columns
# built earlier in the notebook are not among the final model's features, while
# the Optuna objective above did use them - confirm this is intended.
df = pd.read_csv("./KFolds_5.csv")
df_test = pd.read_csv("../input/tabular-playground-series-feb-2021/test.csv")
df_samSub = pd.read_csv("../input/tabular-playground-series-feb-2021/sample_submission.csv")

useful_features = [uc for uc in df.columns if uc not in ("id","target","K_Fold")]
object_cols = [col for col in useful_features if col.startswith('cat')]
numerical_cols = [clm for clm in useful_features if clm.startswith('cont')]
df_test = df_test[useful_features]

#using the optimized parameters to train the model
#(identical for every fold, so defined once outside the loop)
params = {'learning_rate': 0.014521078138264957,
          'reg_lambda': 0.031156935545308233,
          'reg_alpha': 23.3283725457658,
          'subsample': 0.6477780845025514,
          'colsample_bytree': 0.55158309424606,
          'max_depth': 2}

final_predictions = []  # one array of test predictions per fold
scores = []             # validation RMSE per fold

# Fold labels are taken from the data instead of hard-coding their count.
for fold in sorted(df["K_Fold"].unique()):
    x_train = df[df.K_Fold != fold].reset_index(drop = True)
    x_valid = df[df.K_Fold == fold].reset_index(drop = True)
    x_test = df_test.copy()

    y_train = x_train.target
    y_valid = x_valid.target

    # Copies: the columns are overwritten below, and assigning into a
    # column-slice of another frame raises pandas' SettingWithCopyWarning.
    x_train = x_train[useful_features].copy()
    x_valid = x_valid[useful_features].copy()

    # Encoders are fitted on this fold's training part only, then re-applied
    # to the validation part and to the test set.
    ordinal_encoder = preprocessing.OrdinalEncoder()
    x_train[object_cols] = ordinal_encoder.fit_transform(x_train[object_cols])
    x_valid[object_cols] = ordinal_encoder.transform(x_valid[object_cols])
    x_test[object_cols] = ordinal_encoder.transform(x_test[object_cols])

    #Standardizing the Numerical Data
    std_scaler = preprocessing.StandardScaler()
    x_train[numerical_cols] = std_scaler.fit_transform(x_train[numerical_cols])
    x_valid[numerical_cols] = std_scaler.transform(x_valid[numerical_cols])
    x_test[numerical_cols] = std_scaler.transform(x_test[numerical_cols])

    model = XGBRegressor(random_state = 42,
                         tree_method = "gpu_hist",
                         gpu_id = 0,
                         predictor = "gpu_predictor",
                         n_estimators = 7000,
                         **params)

    model.fit(x_train,y_train)
    valid_pred = model.predict(x_valid)
    test_pred = model.predict(x_test)
    final_predictions.append(test_pred)
    # Root mean squared error on the validation data: sqrt(MSE) rather than
    # `squared=False`, which recent scikit-learn no longer accepts.
    rsme = float(np.sqrt(mean_squared_error(y_valid, valid_pred)))
    print(f"Fold {fold} RSME : {rsme}")
    scores.append(rsme)

print(f"Mean of Scores : {np.mean(scores)}  and Standard Deviation of Scores : {np.std(scores)}")
    

In [None]:
# Build the submission: stack the per-fold test predictions as columns and
# average across them so each test row gets the mean of all fold models.
stacked_predictions = np.column_stack(final_predictions)
df_samSub["target"] = stacked_predictions.mean(axis=1)
df_samSub.to_csv("submission.csv", index=False)