# Ensembling the Resnet18 Model and XGBoost on Metadata

In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append("..")

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
import pickle
import torch
from src.data.prepare_data import *
from src.models.model import *
from src.data.prepare_data import format_tabular
from sklearn.metrics import roc_auc_score

In [3]:
# We use a different seed than during training to create different folds
SEED = 3142

In [21]:
train = pd.read_csv("../data/internal/train.csv")
test = pd.read_csv("../data/internal/test.csv")

In [18]:
X_train, X_test, y_train = format_tabular(train, test)

In [6]:
skf = KFold(n_splits=3, shuffle=True, random_state=SEED)
transform = ImageTransform(64, True)
meta_file = "../models/xgboost_internal_randomoversamplingdist.pkl"
with open(meta_file, 'rb') as file:
    meta_model = pickle.load(file)

alpha_vals = [x / 10 for x in range(1, 10)]
alpha_scores = []
for alpha in tqdm(alpha_vals):
    fold_sum = 0
    for i, (idxT,idxV) in enumerate(skf.split(np.arange(15))):
        validation_ims = train.loc[train.tfrecord.isin(idxV)]
        validation_ims.reset_index(drop=True, 
                                   inplace=True)
        
        X_valid = MelanomaDataset("../data/internal/train", 
                                        validation_ims, 
                                        transform=transform, 
                                        phase='valid')
        
        net = load_model('resnet18', 2)
        net.load_state_dict(
            torch.load(
                f"../models/state_dict_resnet18_{i+1}.pt", 
                map_location='cpu'))
        
        validation_meta = X_train.loc[train.tfrecord.isin(idxV)]
        net_predictions = pd.DataFrame()

        net_predictions = predict(X_valid, 128, net, 3, net_predictions, "cpu")
        net_predictions = net_predictions.mean(axis=1)
        
        meta_preds = meta_model.predict_proba(validation_meta)[:, 1]
        
        preds = alpha * meta_preds + (1 - alpha) * net_predictions
        fold_sum += roc_auc_score(y_true=validation_ims['target'], y_score=preds)
    
    alpha_scores.append(fold_sum / 3)

  0%|          | 0/9 [00:00<?, ?it/s]
  0%|          | 0/3 [00:00<?, ?it/s][A
 33%|███▎      | 1/3 [02:53<05:46, 173.20s/it][A
 67%|██████▋   | 2/3 [05:37<02:50, 170.47s/it][A
100%|██████████| 3/3 [08:22<00:00, 167.40s/it][A

  0%|          | 0/3 [00:00<?, ?it/s][A
 33%|███▎      | 1/3 [02:43<05:26, 163.34s/it][A
 67%|██████▋   | 2/3 [05:28<02:43, 163.75s/it][A
100%|██████████| 3/3 [08:12<00:00, 164.12s/it][A

  0%|          | 0/3 [00:00<?, ?it/s][A
 33%|███▎      | 1/3 [02:43<05:27, 163.73s/it][A
 67%|██████▋   | 2/3 [05:29<02:44, 164.25s/it][A
100%|██████████| 3/3 [08:13<00:00, 164.67s/it][A
 11%|█         | 1/9 [24:49<3:18:37, 1489.69s/it]
  0%|          | 0/3 [00:00<?, ?it/s][A
 33%|███▎      | 1/3 [02:45<05:30, 165.00s/it][A
 67%|██████▋   | 2/3 [05:29<02:44, 164.93s/it][A
100%|██████████| 3/3 [08:14<00:00, 164.78s/it][A

  0%|          | 0/3 [00:00<?, ?it/s][A
 33%|███▎      | 1/3 [02:44<05:28, 164.07s/it][A
 67%|██████▋   | 2/3 [05:28<02:44, 164.11s/it][A
100%

In [7]:
alpha_scores

[0.8741562633061227,
 0.882168477494396,
 0.8857577326872245,
 0.8862797408211801,
 0.8840812210190078,
 0.879645277270772,
 0.8728993626479503,
 0.86422148133741,
 0.8541842695875159]

In [13]:
alpha_max_idx = alpha_scores.index(max(alpha_scores))
best_alpha = alpha_vals[alpha_max_idx]

In [14]:
best_alpha

0.4

So the ensemble which performed best on the validation sets averaged over 3 folds used 40% of the tabular result and 60% of the resnet.

Now let's get the predictions for this ensemble on the test set and see how it does on the leaderboard.

In [24]:
meta_preds = meta_model.predict_proba(X_test)[:, 1]

test_ims = MelanomaDataset("../data/internal/test", test, transform=transform, phase='test')
net_predictions = pd.DataFrame()

for i in range(3):        
    net = load_model('resnet18', 2)
    net.load_state_dict(
        torch.load(
            f"../models/state_dict_resnet18_{i+1}.pt", 
            map_location='cpu'))

    for _ in tqdm(range(3)):
        net_preds = predict(test_ims, 128, net, "cpu")
        net_preds = pd.DataFrame(torch.softmax(net_preds, 1)[:, 1].numpy())
        net_predictions = pd.concat([net_predictions, net_preds], axis=1)
net_predictions = net_predictions.mean(axis=1)

preds = best_alpha * meta_preds + (1 - best_alpha) * net_predictions

100%|██████████| 3/3 [02:48<00:00, 56.13s/it]
100%|██████████| 3/3 [02:41<00:00, 53.98s/it]
100%|██████████| 3/3 [02:46<00:00, 55.52s/it]


In [25]:
preds

0        0.023906
1        0.009402
2        0.016432
3        0.008127
4        0.101575
           ...   
10977    0.130427
10978    0.413212
10979    0.477556
10980    0.025919
10981    0.139902
Length: 10982, dtype: float32

In [26]:
sub = pd.read_csv('../data/internal/sample_submission.csv')
sub['target'] = preds
sub.to_csv(f"../submissions/submission_ensemble_resnet_tabular_alpha0.4.csv", index=False)

NameError: name 't' is not defined

This submission scored 0.9035 on the public leaderboard!