# Predictions using XGBoost on Tabular Data

In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append("..")

import numpy as np
import pandas as pd
from src.data.prepare_data import format_tabular
from xgboost import XGBClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.model_selection import RandomizedSearchCV
import pickle
from scipy import stats

In [3]:
SEED = 2718

In [4]:
train_internal = pd.read_csv('../data/internal/train.csv')
train_external = pd.read_csv('../data/external/train.csv')
test_raw = pd.read_csv('../data/internal/test.csv')

In [5]:
train_internal.head()

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target,tfrecord,width,height
0,ISIC_2637011,IP_7279968,male,45.0,head/neck,unknown,benign,0,0,6000,4000
1,ISIC_0015719,IP_3075186,female,45.0,upper extremity,unknown,benign,0,0,6000,4000
2,ISIC_0052212,IP_2842074,female,50.0,lower extremity,nevus,benign,0,6,1872,1053
3,ISIC_0068279,IP_6890425,female,45.0,head/neck,unknown,benign,0,0,1872,1053
4,ISIC_0074268,IP_8723313,female,55.0,upper extremity,unknown,benign,0,11,6000,4000


In [6]:
test_raw.head()

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,width,height
0,ISIC_0052060,IP_3579794,male,70.0,,6000,4000
1,ISIC_0052349,IP_7782715,male,40.0,lower extremity,6000,4000
2,ISIC_0058510,IP_7960270,female,55.0,torso,6000,4000
3,ISIC_0073313,IP_6375035,female,50.0,torso,6000,4000
4,ISIC_0073502,IP_0589375,female,45.0,lower extremity,1920,1080


First we train using internal data. We will try random over sampling of the minority class (malignant) and synthetic minority over-sampling (SMOTE).

In [7]:
X_train, X_test, y_train = format_tabular(train_internal, test_raw)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["sex"] = train_c["sex"].apply(lambda x: 1.0 if x == "male" else 0.0)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test["sex"] = test_c["sex"].apply(lambda x: 1.0 if x == "male" else 0.0)


In [8]:
X_train.isna().sum()

sex                0
age_approx         0
width              0
height             0
NA                 0
head/neck          0
lower extremity    0
oral/genital       0
palms/soles        0
torso              0
upper extremity    0
dtype: int64

In [9]:
X_train.head()

Unnamed: 0,sex,age_approx,width,height,NA,head/neck,lower extremity,oral/genital,palms/soles,torso,upper extremity
0,1.0,45.0,6000,4000,0,1,0,0,0,0,0
1,0.0,45.0,6000,4000,0,0,0,0,0,0,1
2,0.0,50.0,1872,1053,0,0,1,0,0,0,0
3,0.0,45.0,1872,1053,0,1,0,0,0,0,0
4,0.0,55.0,6000,4000,0,0,0,0,0,0,1


In [10]:
X_test.head()

Unnamed: 0,sex,age_approx,width,height,NA,head/neck,lower extremity,oral/genital,palms/soles,torso,upper extremity
0,1.0,70.0,6000,4000,1,0,0,0,0,0,0
1,1.0,40.0,6000,4000,0,0,1,0,0,0,0
2,0.0,55.0,6000,4000,0,0,0,0,0,1,0
3,0.0,50.0,6000,4000,0,0,0,0,0,1,0
4,0.0,45.0,1920,1080,0,0,1,0,0,0,0


In [11]:
config = {
    'NUM_FOLDS': 5
}

In [12]:
model_randoversamp = Pipeline([
    ('sampler', RandomOverSampler(random_state=SEED)),
    ('classification', XGBClassifier(verbosity=1, random_state=SEED))
])

In [13]:
params_randoversamp = {
    'sampler__sampling_strategy': [0.1, 0.3, 0.5],
    'classification__eta': [0.0001, 0.001, 0.01, 0.1, 1, 10],
    'classification__gamma': [0, 1, 2, 3, 4, 5],
    'classification__max_depth': [1, 2, 3, 4, 5, 6]
}

In [14]:
cv_iterator_int = []
skf = KFold(n_splits=config['NUM_FOLDS'], shuffle=True, random_state=SEED)
for i, (idxT,idxV) in enumerate(skf.split(np.arange(15))):
    cv_iterator_int.append((train_internal.tfrecord.isin(idxT), 
                        train_internal.tfrecord.isin(idxV)))

In [15]:
grid_randoversamp = RandomizedSearchCV(estimator=model_randoversamp, 
                          param_distributions=params_randoversamp, 
                          n_iter=100, 
                          scoring='roc_auc', 
                          cv=cv_iterator_int, 
                          verbose=1, 
                          n_jobs=-1)

In [16]:
grid_randoversamp.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   34.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  5.5min finished


RandomizedSearchCV(cv=[(0        True
1        True
2        True
3        True
4        True
         ... 
33121    True
33122    True
33123    True
33124    True
33125    True
Name: tfrecord, Length: 33126, dtype: bool,
                        0        False
1        False
2        False
3        False
4        False
         ...  
33121    False
33122    False
33123    False
33124    False
33125    False
Name: tfrecord, Length: 33126, dtype: bool),
                       (0         True
1         True
2        False
3         True
4         True
         ...  
33121     True
33122     True
33123     True
33124     True
33125     True
Name: tfrecord, Leng...
                                                            reg_lambda=None,
                                                            scale_pos_weight=None,
                                                            subsample=None,
                                                            tree_method=None,
                 

In [17]:
grid_randoversamp.best_estimator_

Pipeline(steps=[('sampler',
                 RandomOverSampler(random_state=2718, sampling_strategy=0.3)),
                ('classification',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, eta=1, gamma=2, gpu_id=-1,
                               importance_type='gain',
                               interaction_constraints='', learning_rate=1,
                               max_delta_step=0, max_depth=1,
                               min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=100,
                               n_jobs=0, num_parallel_tree=1, random_state=2718,
                               reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                               subsample=1, tree_method='exact',
                               validate_parameters=1, verbosity=1))])

In [18]:
grid_randoversamp.best_score_

0.8299249313327147

In [19]:
submission = pd.read_csv('../data/internal/sample_submission.csv')

Unnamed: 0,image_name,target
0,ISIC_0052060,0
1,ISIC_0052349,0
2,ISIC_0058510,0
3,ISIC_0073313,0
4,ISIC_0073502,0
...,...,...
10977,ISIC_9992485,0
10978,ISIC_9996992,0
10979,ISIC_9997917,0
10980,ISIC_9998234,0


In [20]:
preds_randoversamp = grid_randoversamp.predict_proba(X_test)
preds_randoversamp[:, 1]

array([0.0313531 , 0.01181102, 0.01990261, ..., 0.3367443 , 0.04801606,
       0.21986796], dtype=float32)

In [21]:
submission['target'] = preds_randoversamp[:, 1]

In [22]:
submission.to_csv('../submissions/xgboost_internal_randomoversampling.csv', index=False)

In [23]:
filename = "../models/xgboost_internal_randomoversampling.pkl"
with open(filename, 'wb') as file:
    pickle.dump(grid_randoversamp, file)

This submission achieved a score of 0.7981

In [24]:
model_smote = Pipeline([
    ('sampler', SMOTE(random_state=SEED, n_jobs=-1)),
    ('classification', XGBClassifier(verbosity=1, random_state=SEED))
])

In [25]:
params_smote = {
    'sampler__k_neighbors': [3, 5, 7, 9],
    'classification__eta': [0.0001, 0.001, 0.01, 0.1, 1, 10],
    'classification__gamma': [0, 1, 2, 3, 4, 5],
    'classification__max_depth': [1, 2, 3, 4, 5, 6]
}

In [27]:
grid_smote = RandomizedSearchCV(estimator=model_smote, 
                                param_distributions=params_smote, 
                                n_iter=100, 
                                scoring='roc_auc', 
                                cv=cv_iterator_int, 
                                verbose=1, 
                                n_jobs=-1)

In [28]:
grid_smote.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   58.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  8.6min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  9.4min finished


RandomizedSearchCV(cv=[(0        True
1        True
2        True
3        True
4        True
         ... 
33121    True
33122    True
33123    True
33124    True
33125    True
Name: tfrecord, Length: 33126, dtype: bool,
                        0        False
1        False
2        False
3        False
4        False
         ...  
33121    False
33122    False
33123    False
33124    False
33125    False
Name: tfrecord, Length: 33126, dtype: bool),
                       (0         True
1         True
2        False
3         True
4         True
         ...  
33121     True
33122     True
33123     True
33124     True
33125     True
Name: tfrecord, Leng...
                                                            reg_alpha=None,
                                                            reg_lambda=None,
                                                            scale_pos_weight=None,
                                                            subsample=None,
                   

In [29]:
grid_smote.best_estimator_

Pipeline(steps=[('sampler', SMOTE(k_neighbors=3, n_jobs=-1, random_state=2718)),
                ('classification',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, eta=0.1, gamma=2, gpu_id=-1,
                               importance_type='gain',
                               interaction_constraints='',
                               learning_rate=0.100000001, max_delta_step=0,
                               max_depth=2, min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=100,
                               n_jobs=0, num_parallel_tree=1, random_state=2718,
                               reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                               subsample=1, tree_method='exact',
                               validate_parameters=1, verbosity=1))])

In [30]:
grid_smote.best_score_

0.8113968373586058

In [31]:
preds_smote = grid_smote.predict_proba(X_test)

In [32]:
submission = pd.read_csv('../data/internal/sample_submission.csv')
submission['target'] = preds_smote[:, 1]
submission.to_csv('../submissions/xgboost_internal_smote.csv', index=False)

This achieved a score of 0.7664 on the public leaderboard.

Now let's try adding external data with random over-sampling, as that performed better than the SMOTE sampling.

In [34]:
X_ext_train, y_ext_train = format_tabular(train_external)

The 2019 data provided three separate torso areas for the lesions. The 2020 data only contained a general torso category so the three must be combined to one to match the internal training data

In [35]:
X_ext_train['torso'] = X_ext_train['anterior torso'] + X_ext_train['posterior torso'] + X_ext_train['lateral torso']

In [36]:
X_ext_train.drop(['anterior torso', 'posterior torso', 'lateral torso'], axis=1, inplace=True)

In [37]:
X_train_all = pd.concat([X_train, X_ext_train], axis=0, ignore_index=True)
y_train_all = pd.concat([y_train, y_ext_train], axis=0, ignore_index=True)

In [38]:
tf_int = train_internal['tfrecord']
tf_ext = train_external['tfrecord']
tf_ext += 20
tf = pd.concat([tf_int, tf_ext], axis=0, ignore_index=True)

In [39]:
cv_iterator_ext = []
skf = KFold(n_splits=config['NUM_FOLDS'], shuffle=True, random_state=SEED)
for i, (idxT,idxV) in enumerate(skf.split(np.arange(15))):
    cv_iterator_ext.append((tf.isin(idxT) | (tf >= 20), 
                        tf.isin(idxV)))

In [40]:
grid_randoversamp_ext = RandomizedSearchCV(estimator=model_randoversamp, 
                                       param_distributions=params_randoversamp, 
                                       n_iter=100, 
                                       scoring='roc_auc', 
                                       cv=cv_iterator_ext, 
                                       verbose=1, 
                                       n_jobs=-1)

In [41]:
grid_randoversamp_ext.fit(X=X_train_all, y=y_train_all)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  54 tasks      | elapsed:   40.2s
[Parallel(n_jobs=-1)]: Done 204 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 454 tasks      | elapsed:  6.6min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  7.6min finished


RandomizedSearchCV(cv=[(0        True
1        True
2        True
3        True
4        True
         ... 
58452    True
58453    True
58454    True
58455    True
58456    True
Name: tfrecord, Length: 58457, dtype: bool,
                        0        False
1        False
2        False
3        False
4        False
         ...  
58452    False
58453    False
58454    False
58455    False
58456    False
Name: tfrecord, Length: 58457, dtype: bool),
                       (0         True
1         True
2        False
3         True
4         True
         ...  
58452     True
58453     True
58454     True
58455     True
58456     True
Name: tfrecord, Leng...
                                                            reg_lambda=None,
                                                            scale_pos_weight=None,
                                                            subsample=None,
                                                            tree_method=None,
                 

In [42]:
grid_randoversamp_ext.best_estimator_

Pipeline(steps=[('sampler',
                 RandomOverSampler(random_state=2718, sampling_strategy=0.5)),
                ('classification',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, eta=0.1, gamma=3, gpu_id=-1,
                               importance_type='gain',
                               interaction_constraints='',
                               learning_rate=0.100000001, max_delta_step=0,
                               max_depth=3, min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=100,
                               n_jobs=0, num_parallel_tree=1, random_state=2718,
                               reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                               subsample=1, tree_method='exact',
                               validate_parameters=1, verbosity=1))])

In [43]:
grid_randoversamp_ext.best_score_

0.8279512301642118

This score is almost exactly as using the internal data so we will not try submitting. Finally, can we find any parameters for the classification which perform better by searching for longer using actual distributions for the parameters rather than lists.

In [53]:
params_randoversamp = {
    'sampler__sampling_strategy': stats.uniform(0.1, 0.5),
    'classification__eta': stats.gamma(a=2, scale=0.5),
    'classification__gamma': stats.gamma(a=2, scale=0.5),
    'classification__max_depth': stats.randint(low=1, high=10)
}

In [54]:
grid_randoversamp = RandomizedSearchCV(estimator=model_randoversamp, 
                                       param_distributions=params_randoversamp, 
                                       n_iter=100, scoring='roc_auc', 
                                       cv=cv_iterator_int, 
                                       verbose=1, 
                                       random_state=SEED, 
                                       n_jobs=-1)

In [55]:
grid_randoversamp.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   39.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  6.8min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  7.6min finished


RandomizedSearchCV(cv=[(0        True
1        True
2        True
3        True
4        True
         ... 
33121    True
33122    True
33123    True
33124    True
33125    True
Name: tfrecord, Length: 33126, dtype: bool,
                        0        False
1        False
2        False
3        False
4        False
         ...  
33121    False
33122    False
33123    False
33124    False
33125    False
Name: tfrecord, Length: 33126, dtype: bool),
                       (0         True
1         True
2        False
3         True
4         True
         ...  
33121     True
33122     True
33123     True
33124     True
33125     True
Name: tfrecord, Leng...
                   param_distributions={'classification__eta': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f304a8cf580>,
                                        'classification__gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f304a8fd130>,
                                        'classification__ma

In [56]:
grid_randoversamp.best_estimator_

Pipeline(steps=[('sampler',
                 RandomOverSampler(random_state=2718,
                                   sampling_strategy=0.3674052158657565)),
                ('classification',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, eta=0.6121981658357913,
                               gamma=0.8272470518531806, gpu_id=-1,
                               importance_type='gain',
                               interaction_constraints='',
                               learning_rate=0.612198174, max_delta_step=0,
                               max_depth=1, min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=100,
                               n_jobs=0, num_parallel_tree=1, random_state=2718,
                               reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                          

In [57]:
grid_randoversamp.best_score_

0.8308437205397838

This is slightly better than before. Let's see how it does on the test set and save the model.

In [60]:
preds_randover = grid_randoversamp.predict_proba(X_test)
submission = pd.read_csv('../data/internal/sample_submission.csv')
submission['target'] = preds_randover[:, 1]
submission.to_csv('../submissions/xgboost_internal_randover_dist.csv', index=False)

This one scored 0.8151 so a slight improvement by searching using more varied parameters.

In [61]:
filename = "../models/xgboost_internal_randomoversamplingdist.pkl"
with open(filename, 'wb') as file:
    pickle.dump(grid_randoversamp, file)