# Extreme Gradient Boosting Model testing
The aim of this notebook is to review the Extreme Gradient Boosting (XGBoost) model, which can be used for a binary classification challenge.

In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt

# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# Extreme Gradient Boosting

In [None]:
# Import modules for model analysis
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

# Import xgb modules
import xgboost as xgb

In [None]:
# Read in the competition data; index_col=0 uses the first CSV column as the row index
train = pd.read_csv('../input/tabular-playground-series-sep-2021/train.csv',index_col=0)
test  = pd.read_csv('../input/tabular-playground-series-sep-2021/test.csv', index_col=0)

# Preview the first rows of the training frame (echoed as the cell output)
train.head()

In [None]:
# Check the memory consumed by the training DataFrame.
# memory_usage='deep' requests a full memory count instead of a quick estimate.
train.info(memory_usage='deep')

In [None]:
# Memory usage by variable in MB (memory_usage returns bytes, hence the 1e-6 factor)
train.memory_usage(deep=True) * 1e-6

In [None]:
# Lets reduce the memory usage of the features
# First - check the integer values and downcast
def int_downcast(df):
    """Downcast every int64 column of ``df`` to the smallest integer dtype that fits.

    The value range of each affected column is printed before conversion.
    The frame is modified in place and also returned for convenience.
    """
    for name in df.select_dtypes(include=['int64']).columns:
        series = df[name]
        print(name, 'min:', series.min(), '; max:', series.max())
        df[name] = pd.to_numeric(series, downcast='integer')
    return df

# Downcast the integer columns of train in place, then re-check per-column memory in MB
int_downcast(train)
train.memory_usage(deep=True) * 1e-6

In [None]:
# Second - check the float values and downcast. Method will have to be applied to the train and test DataFrames
def float_downcast(df):
    """Downcast every float64 column of ``df`` to a smaller float dtype where possible.

    The frame is modified in place and also returned for convenience.
    """
    wide_float_columns = df.select_dtypes(include=['float64']).columns
    for name in wide_float_columns:
        df[name] = pd.to_numeric(df[name], downcast='float')
    return df

# Downcast the float columns of both frames in place.
# The second call is the cell's last expression, so the returned test frame is echoed.
float_downcast(train)
float_downcast(test)

In [None]:
# Check the memory usage by feature (MB) for both frames.
# A notebook cell only echoes its final expression, so each Series is displayed
# explicitly; otherwise the train figures would be computed and silently dropped.
display(train.memory_usage(deep=True) * 1e-6)
display(test.memory_usage(deep=True) * 1e-6)

In [None]:
# Review the memory usage by DataFrame after downcasting.
# info() prints its report directly, so both summaries appear in the cell output.
train.info(memory_usage='deep')
test.info(memory_usage='deep')

# Missing value treatment

In [None]:
# Check for missing values per column. Each result is displayed explicitly because
# a notebook cell only echoes its final expression, and this cell ends in assignments.
display(train.isnull().sum())
display(test.isnull().sum())

# Feature columns are the ones whose name starts with "f"
# (this leaves out the 'claim' target column of train).
features = [col for col in train.columns if col.startswith("f")]

# Engineered feature: number of feature values missing in each row.
train['n_missing'] = train[features].isna().sum(axis=1)
test['n_missing'] = test[features].isna().sum(axis=1)

# Model Analysis

In [None]:
# Separate the 'claim' target from the predictor columns
y = train['claim']
X = train.drop(columns='claim')

In [None]:
# Impute the missing value as the median value

# Create function for the missing value review
def impute_miss_values(df, strategy='median'):
    """Return a new DataFrame in which the NaN entries of ``df`` are imputed per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute.
    strategy : str, default 'median'
        Statistic forwarded to ``SimpleImputer`` (for example 'mean' or 'median').

    Returns
    -------
    pandas.DataFrame
        Imputed values carrying the same column labels and the same row index as ``df``.

    Notes
    -----
    The imputer is fitted on ``df`` itself, so every call uses the statistics of
    the frame it is given.
    """
    # Fit and apply in one step; the statistics come from the frame being imputed.
    imputer = SimpleImputer(missing_values=np.nan, strategy=strategy)
    imputed_values = imputer.fit_transform(df)

    # Keep the caller's row index: rebuilding the frame without it would replace
    # the original ids with 0..n-1 and break label-based alignment with other
    # objects (such as the target Series) that still carry the original index.
    return pd.DataFrame(imputed_values, columns=df.columns, index=df.index)

In [None]:
# Impute missing values for the X and test DataFrames.
# NOTE(review): impute_miss_values fits a fresh imputer on whatever frame it receives,
# so test is filled with statistics computed from test rather than from X — confirm
# that this is intended.
X = impute_miss_values(X)
test = impute_miss_values(test)

In [None]:
# Prepare the data to be used within the model: hold out 30% of the rows for validation.
# stratify=y keeps the class proportions of the target the same in both splits and
# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=6, stratify=y)

# Baseline model

In [None]:
# Baseline XGBClassifier: 10 boosting rounds, logistic objective, GPU histogram tree method
xg_cl = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=10,
    seed=123,
    use_label_encoder=False,
    eval_metric='auc',
    tree_method='gpu_hist',
)

# Train on the training split, then predict labels for the hold-out split
xg_cl.fit(X_train, y_train)
preds = xg_cl.predict(X_test)

# Accuracy = number of matching predictions divided by the number of hold-out rows
accuracy = float((preds == y_test).sum()) / len(y_test)
print("accuracy: %f" % accuracy)

In [None]:
# Evaluate models
def eval_model(model):
    """Fit ``model`` on the training split and return its ROC AUC on the hold-out split.

    Uses the notebook-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
    The model is refitted in place by this call.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` and at least one of ``predict_proba``,
        ``decision_function`` or ``predict``.

    Returns
    -------
    float
        ROC AUC of the model's scores against ``y_test``.
    """
    model.fit(X_train, y_train)

    # ROC AUC measures how well continuous scores rank positives above negatives.
    # Feeding it hard 0/1 predictions collapses the curve to a single threshold,
    # so prefer the positive-class probability (or a decision score) and only
    # fall back to predicted labels when the model offers nothing else.
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        y_score = model.decision_function(X_test)
    else:
        y_score = model.predict(X_test)
    return roc_auc_score(y_test, y_score)

In [None]:
# Hold-out ROC AUC of the baseline classifier (eval_model refits it before scoring)
eval_model(xg_cl)

In [None]:
# Let's use the native boosting API and its built-in cross-validation

# Wrap the splits in DMatrix objects: d_train / d_test carry labels,
# xgd_test holds the unlabeled competition test set used for the submission
d_train = xgb.DMatrix(data=X_train, label=y_train)
d_test = xgb.DMatrix(data=X_test, label=y_test)
xgd_test = xgb.DMatrix(data=test)

# Create the parameter dictionary: params. NOTE: have to explicitly provide the objective param
params = {"objective":"binary:logistic", 
          "max_depth":3,
#           "use_label_encoder":False, 
          "eval_metric":'auc', 
          "tree_method":'gpu_hist'
         }

# Reviewing the AUC metric
# Perform 3-fold cross-validation over 10 boosting rounds on the training split: cv_results
cv_results = xgb.cv(dtrain=d_train, params=params,
                  nfold=3, num_boost_round=10, 
                  metrics="auc", as_pandas=True, seed=123)

# Print cv_results (one row per boosting round)
print(cv_results)

# Print the mean test-fold AUC of the final boosting round
print((cv_results["test-auc-mean"]).iloc[-1])

In [None]:
# Review the native train method, with the same parameters as the CV run above
params = {
    "objective": "binary:logistic", 
    "max_depth": 3,
    "eval_metric": 'auc', 
    "tree_method": 'gpu_hist'
}

# train - verbose_eval option switches off the log outputs.
# Up to 5000 rounds are requested; early_stopping_rounds=100 stops training once the
# monitored evaluation metric has not improved for 100 consecutive rounds.
# NOTE(review): d_test is in the evals list used for early stopping and is also the set
# scored below, so the printed AUC may be optimistic — confirm this is acceptable.
xgb_clf = xgb.train(
    params,
    d_train,
    num_boost_round=5000,
    evals=[(d_train, 'train'), (d_test, 'test')],
    early_stopping_rounds=100,
    verbose_eval=0
)

# predict scores for the hold-out split
y_pred = xgb_clf.predict(d_test)
# Compute and print metrics
print(f"AUC : {roc_auc_score(y_test, y_pred)}")

# Make submission

In [None]:
def submission_sample(model, df_test, model_name):
    """Write a submission CSV holding ``model``'s predictions for ``df_test``.

    The competition's sample solution file is loaded as a template, its 'claim'
    column is overwritten with ``model.predict(df_test)`` and the result is
    saved as ``submission_<model_name>.csv`` without the index column.

    Returns the value returned by ``DataFrame.to_csv``.
    """
    submission = pd.read_csv('../input/tabular-playground-series-sep-2021/sample_solution.csv')
    submission['claim'] = model.predict(df_test)
    output_path = f'submission_{model_name}.csv'
    return submission.to_csv(output_path, index=False)

In [None]:
# Baseline submission - original code versions
# submission_sample(xgb_clf, xgd_test, 'xgb_base')

### Perform Hyperparameter tuning

In [None]:
# Max_depth - maximum number of nodes from root to leaves. Larger the more complex the model will be.
# Min_child_weight - minimum weight required to create a new node
# NOTE: the grid search below is disabled; it is kept for reference only.

# params_grid = [
#     (max_depth, min_child_weight)
#     for max_depth in np.arange(3, 11, 1)
#     for min_child_weight in np.arange(5, 9, 1)
# ]

# # Create the parameter dictionary: params.
# params = {"objective":"binary:logistic", 
#           "eval_metric":'auc', 
#           "tree_method":'gpu_hist'
#          }

# # Define initial best params and AUC (higher is better, so start from -Inf)
# auc_mean = float("-Inf")
# best_params = None

# for max_depth, min_child_weight in params_grid:
#     print(f'max_depth: {max_depth} & min_child_weight {min_child_weight}')
#     params['max_depth'] = max_depth
#     params['min_child_weight'] = min_child_weight
#     # Reviewing the AUC metric
#     # Perform cross_validation: cv_results
#     cv_results = xgb.cv(dtrain=d_train, params=params,
#                       nfold=3, num_boost_round=10, 
#                       metrics="auc", as_pandas=True, seed=123)

#     # Print the AUC
#     print((cv_results["test-auc-mean"]).iloc[-1])
#     # Update best AUC
#     mean_auc = cv_results["test-auc-mean"].iloc[-1]
#     if mean_auc > auc_mean:
#         auc_mean = mean_auc
#         best_params = (max_depth, min_child_weight)
# print("Best params: {}, {}, AUC: {}".format(best_params[0], best_params[1], auc_mean))

In [None]:
# Review the train method with adjusted parameters:
# min_child_weight is set and the learning rate (eta) is lowered to 0.05
params = {
    "objective": "binary:logistic", 
    "eval_metric": 'auc', 
    "tree_method": 'gpu_hist',
    "max_depth": 3,
    "min_child_weight": 4,
#     "subsample": .8
    "eta": 0.05
}

# train - verbose_eval option switches off the log outputs.
# This rebinds xgb_clf, replacing the booster trained in the earlier cell.
# NOTE(review): d_test is in the evals list used for early stopping and is also the set
# scored below, so the printed AUC may be optimistic — confirm this is acceptable.
xgb_clf = xgb.train(
    params,
    d_train,
    num_boost_round=5000,
    evals=[(d_train, 'train'), (d_test, 'test')],
    early_stopping_rounds=100,
    verbose_eval=0
)

# predict scores for the hold-out split
y_pred = xgb_clf.predict(d_test)
# Compute and print metrics
print(f"AUC : {roc_auc_score(y_test, y_pred)}")

In [None]:
# Adjusted-ETA submission: writes 'submission_xgb_eta.csv' with the predictions of
# the booster trained in the previous cell on the competition test DMatrix
submission_sample(xgb_clf, xgd_test, 'xgb_eta')