In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn.metrics import balanced_accuracy_score, roc_auc_score, make_scorer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import roc_curve, auc

  from pandas import MultiIndex, Int64Index


In [2]:
df = pd.read_csv('preprocessed_data.csv')
df.head()

Unnamed: 0,Unique_ID,C1,C2,C3,C4,C5,C6,C7,C8,N1,...,N19,N20,N21,N22,N23,N24,N33,N34,N35,Dependent_Variable
0,Candidate_48134,1,4,2,66,2,False,1,True,11.05,...,25856.0,17.0,0.88,1.0,40.0,10833.33333,160.0,262.1,17.0,0
1,Candidate_51717,1,0,19,2,0,False,0,True,29.0,...,11041.3,21.8,0.9,0.9,20.0,6250.0,24.0,50.29,18.0,1
2,Candidate_26401,1,1,16,47,1,False,4,True,17.99,...,1006.0,6.0,1.0,0.0,26.0,2413.666667,70.0,126.52,27.0,0
3,Candidate_34872,1,1,13,1,1,True,6,True,27.5,...,3398.0,31.0,0.96,0.0,44.0,7666.666667,100.0,205.47,21.0,0
4,Candidate_29660,1,0,30,13,2,False,2,True,13.5,...,4110.0,11.0,0.9,2.0,48.0,4250.0,150.0,254.51,13.0,0


In [3]:
df.shape

(32276, 37)

In [4]:
X = df.drop(['Unique_ID', 'Dependent_Variable'], axis=1).copy()
y = df['Dependent_Variable']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [6]:
clf = xgb.XGBClassifier(objective='binary:logistic', missing=None, seed=42)
clf.fit(X_train,
        y_train,
        verbose=True,
        early_stopping_rounds=10,
        eval_metric='auc',
        eval_set=[(X_test, y_test)])

[0]	validation_0-auc:0.70874
[1]	validation_0-auc:0.72035


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[2]	validation_0-auc:0.72988
[3]	validation_0-auc:0.73533
[4]	validation_0-auc:0.73850
[5]	validation_0-auc:0.74153
[6]	validation_0-auc:0.74315
[7]	validation_0-auc:0.74604
[8]	validation_0-auc:0.74794
[9]	validation_0-auc:0.75091
[10]	validation_0-auc:0.75158
[11]	validation_0-auc:0.75241
[12]	validation_0-auc:0.75309
[13]	validation_0-auc:0.75458
[14]	validation_0-auc:0.75585
[15]	validation_0-auc:0.75684
[16]	validation_0-auc:0.75636
[17]	validation_0-auc:0.75657
[18]	validation_0-auc:0.75685
[19]	validation_0-auc:0.75665
[20]	validation_0-auc:0.75674
[21]	validation_0-auc:0.75672
[22]	validation_0-auc:0.75748
[23]	validation_0-auc:0.75808
[24]	validation_0-auc:0.75798
[25]	validation_0-auc:0.75795
[26]	validation_0-auc:0.75836
[27]	validation_0-auc:0.75862
[28]	validation_0-auc:0.75871
[29]	validation_0-auc:0.75995
[30]	validation_0-auc:0.76058
[31]	validation_0-auc:0.76076
[32]	validation_0-auc:0.76133
[33]	validation_0-auc:0.76182
[34]	validation_0-auc:0.76177
[35]	validation_0-

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=None,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=42,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

## Optimizing parameters using Cross Validation and Grid Search()

In [7]:
# Round 1 of GridSearchCV()

param_grid = {
    'max_depth' : [6, 7, 8],
    'learning_rate' : [0.1, 0.01, 0.05],
    'gamma' : [0, 0.25, 1],
    'reg_lamda' : [0, 1.0, 10.0],
    'scale_pos_weight' : [1, 3, 5]
}

In [8]:
optimal_params = GridSearchCV(
    estimator=xgb.XGBClassifier(objective='binary:logistic',
                                seed=42,
                                subsample=0.9,
                                colsample_bytree=0.5),
    param_grid=param_grid,
    scoring='roc_auc',
    verbose=10,
    n_jobs=-1,
    cv=3
)

In [9]:
optimal_params.fit(X_train,
                   y_train,
                   early_stopping_rounds=10,
                   eval_metric='auc',
                   eval_set=[(X_test, y_test)],
                   verbose=False)

In [11]:
print(optimal_params.best_params_)

In [12]:
# Round 2 of GridSearchCV()

param_grid = {
    'max_depth' : [8, 9, 10],
    'learning_rate' : [0.05, 0.1, 0.5],
    'gamma' : [1, 1.5, 2],
    'reg_lamda' : [0],
    'scale_pos_weight' : [1]
}

In [13]:
optimal_params = GridSearchCV(
    estimator=xgb.XGBClassifier(objective='binary:logistic',
                                seed=42,
                                subsample=0.9,
                                colsample_bytree=0.5),
    param_grid=param_grid,
    scoring='roc_auc',
    verbose=10,
    n_jobs=-1,
    cv=3
)

In [14]:
optimal_params.fit(X_train,
                   y_train,
                   early_stopping_rounds=10,
                   eval_metric='auc',
                   eval_set=[(X_test, y_test)],
                   verbose=False)

Fitting 3 folds for each of 27 candidates, totalling 81 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   13.0s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   16.9s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   20.5s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   29.8s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:   37.0s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:   45.4s
[Parallel(n_jobs=-1)]: Done  75 out of  81 | elapsed:   58.1s remaining:    4.6s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   60.0s finished
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Parameters: { "reg_lamda" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




GridSearchCV(cv=3,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=0.5,
                                     enable_categorical=False, gamma=None,
                                     gpu_id=None, importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n...
                                     num_parallel_tree=None, predictor=None,
                                     random_state=None, reg_alpha=None,
                                     reg_lambda=None, scale_pos_weight=None,
                    

In [15]:
print(optimal_params.best_params_)

{'gamma': 1, 'learning_rate': 0.05, 'max_depth': 8, 'reg_lamda': 0, 'scale_pos_weight': 1}


In [12]:
# The best parameters stay the same so I take the parameters and build the final XGBoost Classifier

clf2 = xgb.XGBClassifier(seed=42,
                        objective='binary:logistic',
                        gamma=1,
                        learn_rate=0.05,
                        max_depth=8,
                        reg_lambda=0,
                        scale_pos_weight=1,
                        subsample=0.9,
                        colsample_bytree=0.5)

In [13]:
clf2.fit(X_train,
        y_train,
        verbose=True,
        early_stopping_rounds=10,
        eval_metric='auc',
        eval_set=[(X_test, y_test)])

Parameters: { "learn_rate" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	validation_0-auc:0.68126
[1]	validation_0-auc:0.71234
[2]	validation_0-auc:0.71990
[3]	validation_0-auc:0.72898
[4]	validation_0-auc:0.73674
[5]	validation_0-auc:0.73724
[6]	validation_0-auc:0.73929
[7]	validation_0-auc:0.74313
[8]	validation_0-auc:0.74541
[9]	validation_0-auc:0.74831
[10]	validation_0-auc:0.74762
[11]	validation_0-auc:0.74866
[12]	validation_0-auc:0.74846
[13]	validation_0-auc:0.75013
[14]	validation_0-auc:0.75048
[15]	validation_0-auc:0.75202
[16]	validation_0-auc:0.75160
[17]	validation_0-auc:0.75111
[18]	validation_0-auc:0.75182
[19]	validation_0-auc:0.75220
[20]	validation_0-auc:0.75006
[21]	validation_0-auc:0.75019
[22]	validation_0-auc:0.75063
[23

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5,
              enable_categorical=False, gamma=1, gpu_id=-1,
              importance_type=None, interaction_constraints='', learn_rate=0.05,
              learning_rate=0.300000012, max_delta_step=0, max_depth=8,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, predictor='auto',
              random_state=42, reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
              seed=42, subsample=0.9, tree_method='exact',
              validate_parameters=1, verbosity=None)

In [14]:
y_pred = clf2.predict(X_test)
y_pred

array([0, 0, 1, ..., 0, 0, 0], dtype=int64)

In [15]:
fpr, tpr, thresholds = roc_curve(y_test, y_pred, pos_label=1)
auc_score = auc(fpr, tpr)
print(f'AUC-ROC = {auc_score}')

AUC-ROC = 0.6458516836812946


## Predicting on provided test data 

In [16]:
test_df = pd.read_csv('preprocessed_test_data.csv')
test_df.head()

Unnamed: 0,Unique_ID,C1,C2,C3,C4,C5,C6,C7,C8,N1,...,N18,N19,N20,N21,N22,N23,N24,N33,N34,N35
0,Candidate_1602,1,0,0,23,0,True,0,True,18.0,...,0.5,10639.6,21.8,0.9,0.9,66.0,3333.333333,50.0,90.38,23.0
1,Candidate_29650,1,0,2,4,2,True,2,True,16.75,...,0.83,12165.0,19.0,0.94,2.0,36.0,5779.833333,300.0,532.93,16.0
2,Candidate_31061,1,2,3,38,1,False,4,True,29.99,...,0.79,504.0,34.0,0.7,2.0,48.0,3083.333333,80.0,169.78,22.0
3,Candidate_5768,1,1,28,20,2,False,2,True,17.7,...,0.84,1428.0,9.0,0.77,0.0,36.0,5117.083333,150.0,270.02,13.0
4,Candidate_27059,1,1,15,1,3,False,5,False,28.0,...,0.64,6324.0,25.0,0.92,2.0,57.7,0.0,50.0,103.41,14.0


In [18]:
x_test_provided = test_df.drop(['Unique_ID'], axis=1).copy()

In [19]:
test_pred = clf2.predict_proba(x_test_provided)
test_pred

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


array([[0.7141714 , 0.28582856],
       [0.7576543 , 0.24234566],
       [0.8013055 , 0.19869453],
       ...,
       [0.65986574, 0.3401343 ],
       [0.8494964 , 0.15050356],
       [0.9186997 , 0.08130033]], dtype=float32)

In [20]:
pred_prob = []
for i in test_pred:
    pred_prob.append(i[1])

In [21]:
data = {'Unique_ID': test_df['Unique_ID'], 'Class_1_Probability': pred_prob}
submission = pd.DataFrame(data)
submission

Unnamed: 0,Unique_ID,Class_1_Probability
0,Candidate_1602,0.285829
1,Candidate_29650,0.242346
2,Candidate_31061,0.198695
3,Candidate_5768,0.283343
4,Candidate_27059,0.644169
...,...,...
11012,Candidate_7453,0.571200
11013,Candidate_38211,0.133411
11014,Candidate_25020,0.340134
11015,Candidate_44501,0.150504


In [22]:
submission.to_csv('Submission.csv', index=False)