# Tabular Playground Series - May 2022

I fit an XGBoost (Regressor) model. Feature engineering consists of splitting the [f_27 string feature](https://www.kaggle.com/code/nnjjpp/eda-may-2022-exploring-the-string-feature-f-27) into one column per character, plus feature interactions taken from https://www.kaggle.com/competitions/tabular-playground-series-may-2022/discussion/323892. The XGBoost hyperparameters are tuned using RandomizedSearchCV. To speed things up, I use the GPU accelerator by passing the `tree_method = 'gpu_hist'` option to the XGBRegressor constructor.

In [None]:
import itertools as it
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

## Notebook parameters
# Number of boosting rounds (n_estimators) for each model fitted during the
# hyperparameter search.
N_ESTIMATORS_TUNING = 120
# Number of boosting rounds (n_estimators) for the final model.
N_ESTIMATORS_FITTING = 1500
# Number of random draws per hyperparameter (n_iter of RandomizedSearchCV).
N_PARAMETER_SAMPLES = 20
# Value passed as the cv argument of RandomizedSearchCV.
PARAMETER_SPLIT = 3
# tree_method handed to every XGBRegressor; 'gpu_hist' uses the GPU
# accelerator, 'auto' is the alternative left here by the author.
XGB_TREE_METHOD ='gpu_hist'#'auto'#
# random_state passed to RandomizedSearchCV.
CV_RANDOM_STATE = 123

# Load the competition files, using the `id` column as the row index.
train = pd.read_csv(
    '/kaggle/input/tabular-playground-series-may-2022/train.csv',
    index_col='id',
)
test = pd.read_csv(
    '/kaggle/input/tabular-playground-series-may-2022/test.csv',
    index_col='id',
)

# Per-column summary statistics, one row per feature.
summary_stats = train.describe().T
test_summary_stats = test.describe().T

# Quick previews of both tables.
train.head()
test.head()

# Row counts: training rows, and training + test rows combined.
Ntrain = len(train)
N = Ntrain + len(test)



In [None]:
# Pull the label column out of the training frame, leaving only features.
target = train.pop('target')

# Stack train on top of test so feature engineering is applied to both at once.
both = pd.concat([train, test], axis=0)

# Feature engineering
## split f_27 variable

In [None]:
# Show the first few values of the string feature f_27.
both['f_27'].head()

In [None]:
# Move the string feature out of the feature table into its own Series;
# it is expanded into numeric columns separately.
f27 = both.pop('f_27')
print(f27)

In [None]:
# Expand each f_27 string into one integer column per character position,
# encoding letters as their offset from 'A' (A -> 0, B -> 1, ...).
f27_labels = [f'f_27_{i}' for i in range(10)]
ordA = ord('A')

# Assigning a DataFrame to columns aligns on the row index, so the helper
# frame must carry f27's own index. A default 0..N-1 index would only line
# up with `both` if the ids happened to be exactly 0..N-1 in order.
both[f27_labels] = pd.DataFrame(
    [[ord(ch) - ordA for ch in s] for s in f27],
    index=f27.index,
    columns=f27_labels,
)

both.head()
    




## Unique string values

In [None]:
# Number of distinct characters in each f_27 string. A plain set gives the
# same count as np.unique without building and sorting an array per row.
both['unique_vals_f_27'] = f27.map(lambda s: len(set(s)))

## Interactions
See https://www.kaggle.com/competitions/tabular-playground-series-may-2022/discussion/323892

In [None]:
def _band_indicator(total, upper, lower):
    """Return +1 where total > upper, -1 where total < lower, else 0."""
    above = (total > upper).astype(int)
    below = (total < lower).astype(int)
    return above - below


# Three-valued interaction flags built from sums of raw features.
both['i_02_21'] = _band_indicator(both['f_21'] + both['f_02'], 5.2, -5.3)
both['i_05_22'] = _band_indicator(both['f_22'] + both['f_05'], 5.1, -5.4)
i_00_01_26 = both['f_00'] + both['f_01'] + both['f_26']
both['i_00_01_26'] = _band_indicator(i_00_01_26, 5.0, -5.0)


In [None]:
# Preview the combined feature table after feature engineering.
both.head()

In [None]:
# Summary statistics of the engineered features, one row per column.
both.describe().T

In [None]:
# The first Ntrain rows of the stacked table are the training rows; the
# remainder are the test rows.
Xtrain, Xtest = both.iloc[:Ntrain], both.iloc[Ntrain:]
ytrain = target

# Model fitting - XGBoost

## Explore effect of number of estimators 

In [None]:
%%time 

from xgboost import XGBRegressor

from sklearn.model_selection import cross_validate, ShuffleSplit
from sklearn.metrics import roc_auc_score, make_scorer

import matplotlib.pyplot as plt

# Numbers of estimators to compare.
X = [5,10,20,50,100]

# A single shuffled 85/15 split (equivalent to train_test_split) and the AUC
# scorer are the same for every model, so build them once.
holdout = ShuffleSplit(n_splits=1, random_state=1, test_size=0.15)
auc_scorer = make_scorer(roc_auc_score)

cv_auc = []
for n_trees in X:
    print(n_trees)
    booster = XGBRegressor(
        n_estimators=n_trees,
        objective='binary:logistic',
        eval_metric='auc',
        tree_method=XGB_TREE_METHOD,
    )
    result = cross_validate(
        booster, Xtrain, ytrain,
        n_jobs=-1,
        scoring=auc_scorer,
        cv=holdout,
    )
    cv_auc.append(result['test_score'])

# Held-out AUC as a function of the number of estimators.
plt.plot(X, cv_auc)

## Hyperparameter tuning
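I look at gamma, eta, subsample and min_child_weight. Each parameter is searched one at a time using RandomizedSearchCV; a quadratic is then fitted to the cross-validation scores and the value that maximises it within the search range is kept.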

In [None]:

%%time 
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

# Search range [low, high] for each hyperparameter; each is tuned on its own
# with the others left at the XGBoost defaults.
params = {'gamma':[0,50],
          'eta':[0,1],
          'subsample':[0,1],
          'min_child_weight':[0,50]}
# Fallback values used when no usable quadratic fit is obtained.
defaults = {'gamma':0,
          'eta':0.3,
          'subsample':1,
          'min_child_weight':1}


def _quadratic_argmax(coeffs, low, high):
    """Return the point in [low, high] that maximises a fitted quadratic.

    coeffs holds [a, b, c] for a*z**2 + b*z + c, in np.polyfit order.
    The candidates are the two range edges and, when it lies strictly inside
    the range, the stationary point -b/(2a). Evaluating all of them covers
    both a concave fit (interior maximum) and a convex or linear fit
    (maximum on an edge).
    """
    a, b = coeffs[0], coeffs[1]
    candidates = [low, high]
    if a != 0:
        vertex = -b / 2 / a
        if low < vertex < high:
            candidates.append(vertex)
    return max(candidates, key=lambda z: np.polyval(coeffs, z))


final = {}
plt.figure(figsize=(12,3))
for i, pname in enumerate(params):
    low, high = params[pname]
    print('\n',pname)
    model = XGBRegressor(n_estimators = N_ESTIMATORS_TUNING,
                         objective = 'binary:logistic',
                         eval_metric = 'auc',
                         tree_method = XGB_TREE_METHOD)
    rcv = RandomizedSearchCV(estimator = model,
                             # scipy's uniform takes (loc, scale) and samples
                             # from [loc, loc + scale], so pass the width.
                             param_distributions = {pname: uniform(low, high - low)},
                             n_iter = N_PARAMETER_SAMPLES,
                             verbose = 0,
                             n_jobs = -1,
                             scoring = make_scorer(roc_auc_score),
                             cv = PARAMETER_SPLIT,
                             random_state = CV_RANDOM_STATE)
    rcv.fit(Xtrain, ytrain)

    sampled = [trial[pname] for trial in rcv.cv_results_['params']]
    scores = rcv.cv_results_['mean_test_score']

    # Plot the results:
    plt.subplot(1, len(params), i+1)
    plt.plot(sampled, scores, 'o')
    plt.title(pname)
    g = plt.gca()

    # Fit a quadratic to the cv results and find its maximum in the range.
    try:
        p = np.polyfit(sampled, scores, 2)
    except (np.linalg.LinAlgError, ValueError):
        p = None

    if p is None or not np.all(np.isfinite(p)):
        print("  Default used (polyfit didn't work)")
        mx_x = defaults[pname]
    else:
        x = np.linspace(low, high, 20)
        plt.plot(x, np.polyval(p, x))
        # NOTE(review): an edge of the search range can be selected here,
        # including 0 for eta or subsample -- confirm XGBoost accepts it.
        mx_x = _quadratic_argmax(p, low, high)
    g.set_ylim([0.7,1.0])

    print(f'  Max value: {mx_x:.3f}')
    final[pname] = mx_x

print('Final parameters:')
print(final)


## Fit model with optimal parameters on all data with more estimators

In [None]:
# Final model: more boosting rounds, with the tuned gamma, eta, subsample and
# min_child_weight taken from `final` (its keys are the constructor
# argument names).
xgb = XGBRegressor(
    n_estimators=N_ESTIMATORS_FITTING,
    objective='binary:logistic',
    eval_metric='auc',
    tree_method=XGB_TREE_METHOD,
    **final,
)

In [None]:
%%time 
# Train on the full training set, then score the test rows.
ypred = xgb.fit(Xtrain, ytrain).predict(Xtest)
print(ypred)

## Model evaluation

In [None]:
import matplotlib.pyplot as plt

# In-sample predictions plotted against the true labels.
ytrain_predict = xgb.predict(Xtrain)
plt.plot(ytrain, ytrain_predict, 'o')

# AUC on the data the model was trained on.
train_auc = roc_auc_score(ytrain, ytrain_predict)
print(f'ROC score on training data = {train_auc:.3f}')


In [None]:
# Histogram of training residuals (prediction minus true label).
plt.hist(ytrain_predict - ytrain)

In [None]:
# Training residuals (prediction minus true label).
trainerr = ytrain_predict - ytrain

# 5 x 8 grid: residuals on the x axis against each of the first 40 feature
# columns on the y axis.
fig, axes = plt.subplots(5, 8, figsize=(25, 15))
for col_pos, ax in enumerate(axes.ravel()):
    ax.plot(trainerr, Xtrain.iloc[:, col_pos], 'o')

In [None]:
# Function courtesy of Tyrion Lannister-lzy:
# https://www.kaggle.com/code/tyrionlannisterlzy/xgboost-dnn-ensemble-lb-0-980

def plot_feature_importance(importance, names, model_type, max_features = 10):
    """Plot the largest feature importances as a bar chart.

    Args:
        importance: Importance value for each feature.
        names: Feature names, in the same order as ``importance``.
        model_type: Text placed at the start of the plot title.
        max_features: Number of top-ranked features to show.
    """
    # Pair names with importances and keep the highest-ranked rows.
    ranked = (
        pd.DataFrame({'feature_names': np.array(names),
                      'feature_importance': np.array(importance)})
        .sort_values(by=['feature_importance'], ascending=False)
        .head(max_features)
    )

    # Importance on the x axis, feature names on the y axis.
    plt.figure(figsize=(8,6))
    sns.barplot(x=ranked['feature_importance'], y=ranked['feature_names'])

    plt.title(model_type + ' feature importance plot')
    plt.xlabel('IMPORTANCE')
    plt.ylabel('FEATURE NAMES')

import seaborn as sns
import matplotlib.pyplot as plt

# Show the 25 most important features of the final model.
plot_title = f'XGBoost Regressor, {N_ESTIMATORS_FITTING} estimators'
plot_feature_importance(
    xgb.feature_importances_,
    Xtrain.columns,
    plot_title,
    max_features=25,
)

# Create submission file

In [None]:
# One row per test id with its predicted target, written without the
# DataFrame's own index.
submission_columns = {'id': test.index, 'target': ypred}
submission = pd.DataFrame(submission_columns)
submission.to_csv('submission.csv', index=False)