## Starter notebook | TPS September - XGBoost

In [None]:
# Importing necessary libraries   
import pandas as pd
import numpy as np
from scipy import stats
import warnings
from collections import Counter
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn import model_selection
from sklearn import metrics
from sklearn.model_selection import KFold

import xgboost as xgb

# Silence every warning for the rest of the session (this also hides
# deprecation notices emitted by the libraries used below).
warnings.filterwarnings('ignore')
# Single seed shared by the fold splitter and the model parameters.
RANDOM_STATE = 1234

## Data reading and basic preprocessing

In [None]:
# Load the training table, the test table and the submission template
# (in that order) from the competition input directory.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-sep-2021/train.csv',
        '../input/tabular-playground-series-sep-2021/test.csv',
        '../input/tabular-playground-series-sep-2021/sample_solution.csv',
    )
)

In [None]:
# Preview the first rows of the freshly loaded training table.
train.head()

In [None]:
# This code is adapted from https://www.kaggle.com/gemartin/load-data-reduce-memory-usage. Thanks to the original author.

def _smallest_fitting_dtype(c_min, c_max, candidates, limits):
    """Return the first dtype in ``candidates`` whose range holds [c_min, c_max].

    ``limits`` is ``np.iinfo`` or ``np.finfo``. Returns ``None`` when no
    candidate fits (this includes NaN bounds, for which every comparison is
    False).
    """
    for candidate in candidates:
        bounds = limits(candidate)
        if c_min >= bounds.min and c_max <= bounds.max:
            return candidate
    return None


def reduce_mem_usage(df, allow_float16=True):
    """Downcast the columns of ``df`` to smaller dtypes to reduce memory usage.

    The frame is modified in place and also returned. Memory usage before and
    after, plus the percentage saved, is printed.

    Per column:
      * object columns are converted to ``category``;
      * signed integer columns become the narrowest of int8/16/32/64 that
        holds the column's min and max (bounds are inclusive);
      * unsigned integer columns become the narrowest of uint8/16/32/64;
      * float columns become float16 (only if ``allow_float16``), else
        float32, else float64;
      * every other dtype (bool, datetime, category, extension dtypes) is
        left untouched.

    Args:
        df: pandas DataFrame to shrink.
        allow_float16: when True (the default) float columns may be stored as
            float16. float16 keeps only about three significant decimal
            digits, so values are rounded; pass False to stop at float32.

    Returns:
        The same DataFrame object, with downcast columns.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32) if allow_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy numeric dtypes are downcast. Anything else (bool,
        # datetime, category, pandas extension dtypes) is skipped so that the
        # function neither turns flags into floats nor fails when it is run a
        # second time on an already-converted frame.
        kind = col_type.kind if isinstance(col_type, np.dtype) else None
        if kind not in ('i', 'u', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if kind == 'i':
            target = _smallest_fitting_dtype(c_min, c_max, signed_types, np.iinfo)
        elif kind == 'u':
            target = _smallest_fitting_dtype(c_min, c_max, unsigned_types, np.iinfo)
        else:
            target = _smallest_fitting_dtype(c_min, c_max, float_types, np.finfo)
            if target is None:
                # Out of range for the smaller floats, or NaN/inf bounds.
                target = np.float64

        if target is not None and np.dtype(target) != col_type:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
# Shrink both tables (train first, then test) before any further processing.
train, test = (reduce_mem_usage(frame) for frame in (train, test))

In [None]:
# Per-feature summary table (one row per column except 'id' and 'claim'):
# standard deviation, maximum, minimum and number of missing values.
df_info = pd.DataFrame.from_dict(
    {col: np.std(train[col]) for col in train.columns if col not in ['id', 'claim']},
    orient='index',
    columns=['std'],
)
summary_cols = list(df_info.index)
df_info['max'] = [np.max(train[name]) for name in summary_cols]
df_info['min'] = [np.min(train[name]) for name in summary_cols]
df_info['null_count'] = [train[name].isna().sum() for name in summary_cols]

# Render the table with a separate colour gradient per statistic.
(
    df_info.style
    .background_gradient(cmap="Blues", subset=['std'])
    .background_gradient(cmap="YlOrRd", subset=['max'])
    .background_gradient(cmap="bone", subset=['min'])
    .background_gradient(cmap="Greens", subset=['null_count'])
)

In [None]:
# Add the per-row count of missing values as an extra column; its name starts
# with 'f', so it is picked up by the feature list built later.
for frame in (train, test):
    frame['f_missing'] = frame.isna().sum(axis=1)

In [None]:
# Assign every training row to one of 5 shuffled folds, stored in 'kfold'.
kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# KFold.split yields *positional* row indices, so the fold ids are collected
# in a positional array instead of being written through label-based .loc.
# This stays correct even if the frame's index is not the default 0..n-1
# range, and gives the column an integer dtype instead of object.
fold_ids = np.full(len(train), -1, dtype=np.int64)
for fold, (train_idx, valid_idx) in enumerate(kf.split(X=train)):
    fold_ids[valid_idx] = fold
train['kfold'] = fold_ids

In [None]:
# Preview the training table again, now including the 'f_missing' and 'kfold' columns.
train.head()

In [None]:
# Model inputs: every training column whose name starts with 'f'.
features = train.columns[train.columns.str.startswith('f')].tolist()

# Modeling

In [None]:
# Start from the default parameter dict of the scikit-learn style classifier;
# selected entries are overridden in a later cell.
params = xgb.XGBClassifier().get_params()

In [None]:
# Display the default parameter dict.
params

In [None]:
# Class-balance weight later passed to XGBoost as `scale_pos_weight`.
# The XGBoost documentation gives the typical value as
# sum(negative instances) / sum(positive instances), so the negative count is
# the numerator (the previous positives/negatives ratio was inverted).
n_positive = (train['claim'] == 1).sum()
n_negative = (train['claim'] == 0).sum()
ratio = n_negative / n_positive

In [None]:
# Hold out fold 1 for validation and train on the remaining folds.
is_valid = train['kfold'] == 1

xtrain = train.loc[~is_valid, features]
ytrain = train.loc[~is_valid, 'claim']

xvalid = train.loc[is_valid, features]
yvalid = train.loc[is_valid, 'claim']

# Test matrix restricted to the same feature columns.
xtest = test[features]

In [None]:
# Override the defaults in one step. Key order matches the original
# assignments, so the resulting dict (and the list built from it) is the same.
#
# NOTE(review): 'eta' is documented by XGBoost as an alias of 'learning_rate',
#   yet the two are given different values (0.1 vs 0.05); which one takes
#   effect should be confirmed, and only one should be kept.
# NOTE(review): 'seed'/'random_state' and 'nthread'/'n_jobs' look like alias
#   pairs as well; 'silent' appears to be a legacy flag superseded by
#   'verbosity' -- confirm against the installed XGBoost version.
# NOTE(review): 'n_estimators' and 'importance_type' are settings of the
#   scikit-learn wrapper; the training cell uses xgb.train with its own round
#   count, so these are presumably ignored there -- confirm.
# NOTE(review): 'gpu_hist' / 'gpu_predictor' presumably require a GPU-enabled
#   XGBoost build -- confirm for the target environment.
params.update({
    'n_estimators': 1000,
    'importance_type': 'weight',
    'max_depth': 8,
    'max_delta_step': 3,
    'colsample_bytree': 1,
    'learning_rate': 0.05,
    'booster': 'dart',
    'verbosity': 2,
    'eval_metric': 'auc',
    'tree_method': 'gpu_hist',
    'random_state': RANDOM_STATE,
    'num_parallel_tree': 10,
    'scale_pos_weight': ratio,
    'eta': 0.1,
    'subsample': 0.8,
    'objective': 'binary:logistic',
    'seed': RANDOM_STATE,
    'nthread': -1,
    'silent': False,
    'predictor': 'gpu_predictor',
})

# Model training

In [None]:
# Wrap the training and validation splits in DMatrix objects, treating NaN as
# the missing-value marker.
dtrain = xgb.DMatrix(xtrain, label=ytrain, missing=np.nan)
dvalid = xgb.DMatrix(xvalid, label=yvalid, missing=np.nan)

# Evaluation set whose metric is reported under the name 'validation'.
watchlist = [(dvalid, 'validation')]

# Parameters are handed over as a list of (name, value) pairs.
plst = list(params.items())
num_round = 50
bst = xgb.train(plst, dtrain, num_boost_round=num_round, evals=watchlist)

# Plot feature importance 

In [None]:
# Tall figure so that up to 80 feature bars remain readable.
fig, ax = plt.subplots(figsize=(12,18))
# Draw the trained booster's feature-importance bar chart on that axis.
xgb.plot_importance(bst, max_num_features=80, height=0.8, ax=ax)
plt.show()

# Final Predictions

In [None]:
# Score the test features with the trained booster.
dtest = xgb.DMatrix(xtest, missing=np.nan)
Predictions = bst.predict(dtest)

# Put the scores into the 'claim' column of the submission template and write
# it out without the index column.
# NOTE(review): this relies on the test rows being in the same order as the
# template rows -- confirm.
sample_submission = sample_submission.assign(claim=Predictions)
sample_submission.to_csv('submission.csv', index=False)