# Tabular Playground Series - March

### About

The task for this month's competition is to predict the probability of a binary target. In this notebook, I built a model using XGBoost's `XGBClassifier` class and finished with a score of 0.89133 on the private leaderboard, where the winning score was 0.90057.  

### Initial Setup


In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb


from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.base import clone

# binary classification
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier

# evaluation imports
from sklearn.metrics import roc_auc_score

import matplotlib.pyplot as plt
import seaborn as sns

# String variable that can be used to timestamp exported objects.
# Holds the local date at the time this cell runs, formatted as YYYYMMDD.
from datetime import datetime
current_tmstmp = datetime.today().strftime('%Y%m%d')

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# Read the competition's train and test sets from the Kaggle input directory.
train, test = (
    pd.read_csv(f'/kaggle/input/tabular-playground-series-mar-2021/{name}.csv')
    for name in ('train', 'test')
)

# Row counts of both sets, as a quick sanity check on the load.
print('count(*) from train: ', len(train))
print('count(*) from test: ', len(test))

### Helper Functions

In [None]:
def encodeBinaryLabel(val, one_val):
    """Encode a single value as a 0/1 label.

    Args:
        val: the value to encode.
        one_val: the value that should be encoded as 1.

    Returns:
        1 when ``val`` equals ``one_val``, otherwise 0.

    Raises:
        ValueError: if ``val`` is null according to ``pd.isna``.
    """
    if pd.isna(val):
        raise ValueError('Null value found!')
    return 1 if val == one_val else 0

# Location of the competition's sample submission, used as the output template.
sample_sub_filename = '/kaggle/input/tabular-playground-series-mar-2021/sample_submission.csv'


def quickSubmission(sample_sub_path, test_df, model, output_filename):
    """Write a submission CSV holding the model's predicted probabilities.

    Args:
        sample_sub_path: path of the sample submission CSV used as a template.
        test_df: test dataframe; its 'id' column is dropped before predicting.
        model: fitted estimator exposing ``predict_proba``.
        output_filename: path the submission CSV is written to.

    The second column of ``predict_proba``'s output is stored in the
    template's 'target' column, and the file is saved without the index.
    """
    submission = pd.read_csv(sample_sub_path)
    features = test_df.drop(columns = ['id']).values
    probabilities = model.predict_proba(features)
    submission['target'] = probabilities[:, 1]
    submission.to_csv(output_filename, index = False)

def create_folds(dataframe, n_splits = 5, random_state = None):
    """Return a shuffled copy of ``dataframe`` with a stratified 'kfold' column.

    The rows are shuffled, the 'target' column is binned with ``pd.cut``, and
    ``StratifiedKFold`` assigns every row a fold id so that each fold has a
    similar distribution of target bins.

    Args:
        dataframe: frame containing a 'target' column. It is not modified.
        n_splits: number of folds to create (default 5).
        random_state: optional seed for the shuffle, for reproducible folds.
            The default ``None`` gives a different shuffle on every call.

    Returns:
        A new dataframe with a fresh 0..n-1 index and an integer 'kfold'
        column holding fold ids in ``range(n_splits)``.
    """
    # sample() returns a new frame, so the fold column is only ever written to
    # the copy and the caller's dataframe keeps its original columns.
    data = dataframe.sample(frac = 1, random_state = random_state).reset_index(drop = True)
    data['kfold'] = -1

    # number of bins from Sturges' rule: 1 + log2(n)
    bin_num = int(np.floor(1 + np.log2(len(data))))
    # kept as a separate Series so no temporary column has to be added/dropped
    bins = pd.cut(data['target'], bins = bin_num, labels = False)

    kfold = StratifiedKFold(n_splits = n_splits)
    for f, (_, v_) in enumerate(kfold.split(X = data, y = bins.values)):
        # v_ holds positional indices, which equal the labels after reset_index
        data.loc[v_, 'kfold'] = f
    return data

def run_folds_proba(dataframe, fold, drop_cols, model):
    """Fit ``model`` on all folds except ``fold`` and print the held-out ROC AUC.

    Args:
        dataframe: frame with 'kfold' and 'target' columns.
        fold: id of the fold to hold out for validation.
        drop_cols: names of non-feature columns to drop (besides 'target',
            which is always dropped). The list is not modified.
        model: estimator exposing ``fit`` and ``predict_proba``.

    Returns:
        The fitted ``model``.
    """
    # Work on a private copy: appending to the caller's list would make it
    # grow by one 'target' entry on every call.
    cols_to_drop = list(drop_cols)
    if 'target' not in cols_to_drop:
        cols_to_drop.append('target')

    df_train = dataframe[dataframe.kfold != fold].reset_index(drop = True)
    df_val = dataframe[dataframe.kfold == fold].reset_index(drop = True)

    x_train = df_train.drop(labels = cols_to_drop, axis = 1).values
    y_train = df_train['target'].values
    x_val = df_val.drop(labels = cols_to_drop, axis = 1).values
    y_val = df_val['target'].values

    model.fit(x_train, y_train)
    y_pred = model.predict_proba(x_val)
    # AUC is computed from the second predict_proba column
    print(roc_auc_score(y_val, y_pred[:, 1]))
    return model

### Quick EDA

- labels are not balanced
- cat0, cat11, cat12, cat13, and cat14 appear to be binary
- cat10 has values that don't appear in both train and test
- all features are populated
- several categorical features contain rows that are populated with similar values
- cat17 and cat18's countplots look exactly the same, which seemed odd to me

In [None]:
# Number of rows per target value, to check the class balance.
print('count(*),', train.groupby(['target']).size())

# Compare the number of distinct values of every categorical feature in the
# train and test sets, remembering the features where the counts differ.
print('Cat Ft - Train - Test')
missing_cat_ft = []
for col in train.columns:
    if 'cat' not in col:
        continue
    n_train, n_test = train[col].nunique(), test[col].nunique()
    print(col, '-', n_train, '-', n_test)
    if n_train != n_test:
        missing_cat_ft.append(col)

print("Categorical features with values that don't exist in both train and test sets. ")
print(missing_cat_ft)

In [None]:
# Collect, as 'column-value' strings, the category values that occur in only
# one of the two sets.
cats_missing_in_test = []
cats_missing_in_train = []

for col in train.columns:
    if 'cat' in col:
        # unique() scans the whole column, so compute it once per column
        # instead of once per value being checked; sets give O(1) membership.
        train_vals = train[col].unique()
        test_vals = test[col].unique()
        train_set = set(train_vals)
        test_set = set(test_vals)
        for val in train_vals:
            if val not in test_set:
                cats_missing_in_test.append(col + '-' + val)
        for val in test_vals:
            if val not in train_set:
                cats_missing_in_train.append(col + '-' + val)

# cat10 has a lot of values that don't exist in both train and test
# cats_missing_in_test
# cats_missing_in_train

# drop cat10 from both sets so they stay consistent with each other
train = train.drop(labels = ['cat10'], axis = 1)
test = test.drop(labels = ['cat10'], axis = 1)

In [None]:
# Count the (column, set) pairs that contain at least one null value.
null_features_count = 0

for col in train.columns:
    if train[col].isnull().any():
        print(col, 'in train set contains null values')
        null_features_count += 1
    # 'target' only exists in the train set, so it is skipped for test
    if col != 'target' and test[col].isnull().any():
        print(col, 'in test set contains null values')
        null_features_count += 1

if not null_features_count:
    print('All features contain populated values in both train and test')

### Data Prep/Feature Engineering

Preparing the data for training the model(s) and getting ready to run kfolds. I also create a backup of the train dataframe in case I would like to reference it at some point, although it isn't strictly necessary. 

In [None]:
# before manipulating the train set, create a copy of the dataframe 

# before manipulating the train set, create a copy of the dataframe
train_bkup = train.copy()

# encode binary features, A => 1 (any other value => 0; nulls raise)
binary_fts = ['cat0', 'cat11', 'cat12', 'cat13', 'cat14']

for ft in binary_fts:
    train[ft] = train[ft].apply(encodeBinaryLabel, args = ('A',))
    test[ft] = test[ft].apply(encodeBinaryLabel, args = ('A',))

# every remaining categorical feature is one-hot encoded
one_hot_fts = [
    col for col in train.columns
    if 'cat' in col and col not in binary_fts
]

# dtype is pinned to uint8 (the default before pandas 2.0) so the dummy columns
# stay numeric; with bool dummies, `.values` on the mixed frame would be an
# object array.
train = pd.get_dummies(train, columns = one_hot_fts, dummy_na = False, dtype = np.uint8)
test = pd.get_dummies(test, columns = one_hot_fts, dummy_na = False, dtype = np.uint8)

# get_dummies only creates columns for values present in the frame it is given,
# so align test to train's feature columns: same names, same order, with 0 for
# any category value that never occurs in test.
test = test.reindex(
    columns = [col for col in train.columns if col != 'target'],
    fill_value = 0
)

In [None]:
# create train and validation sets
x = train.drop(labels = ['id', 'target'], axis = 1).values
y = train['target'].values

x_train, x_val, y_train, y_val = train_test_split(x, y, test_size = 0.25)

df = create_folds(train)

print('Kfold counts: \n', df.kfold.value_counts())

### Basic Models

I'm going to use some of my submissions to evaluate how well various algorithms work with the default parameters. 

XGBClassifier ended up being the best, so I am focusing on it from here on. I have seen some great notebooks where others use DMatrix objects and pass params to the xgb train method to return a classifier. I'm still gaining experience with XGBoost, so I'll just define the model using xgb.XGBClassifier and evaluate it using kfolds.  

Below is a list including how some other binary classification algorithms performed. 

- Logistic Regression - 0.87484
- Random Forest - 0.87944
- Stochastic Gradient Descent - 0.87474
- XGBoost Classifier - 0.88475

In [None]:
# non-feature columns to exclude when building the fold matrices
drops = ['id', 'kfold']

# hyperparameters for the classifier that is fitted on every fold
fold_params = {
    'n_estimators': 100,
    'learning_rate': 0.2,
    'max_depth': 10,
    'subsample': 0.9,
    'gamma': 5,
    'colsample_bytree': 0.2,
    'eval_metric': 'auc',
    'min_child_weight': 20,
    'use_label_encoder': False,
}

print('----- XGBoost Classifier -----')
# fold ids run from 0 to (number of folds - 1); a fresh model is built per fold
for f in range(df['kfold'].nunique()):
    mdl = run_folds_proba(
        dataframe = df,
        fold = f,
        drop_cols = drops,
        model = xgb.XGBClassifier(**fold_params)
    )

### Submission

Train a new instance of the xgb classifier model on the whole train set and submit predictions from the test set. 

In [None]:
# retrain new instance of model on whole training set

# same hyperparameters as the model evaluated with kfolds
final_params = {
    'n_estimators': 100,
    'learning_rate': 0.2,
    'max_depth': 10,
    'subsample': 0.9,
    'gamma': 5,
    'colsample_bytree': 0.2,
    'eval_metric': 'auc',
    'min_child_weight': 20,
    'use_label_encoder': False,
}

# fit a fresh classifier on the full feature matrix and labels
xgb_mdl = xgb.XGBClassifier(**final_params)
xgb_mdl.fit(x, y)

In [None]:
# write the final model's test-set predictions to the submission file
quickSubmission(sample_sub_filename, test, xgb_mdl, 'xgb_clf_tuned_submission.csv')