In [1]:
%reload_ext autoreload
%autoreload 2

In [3]:
import sys
sys.path.append("..")

from src.data.prepare_data import *
from src.models.model import *
import time
import random
from sklearn.model_selection import KFold
from torch import optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data.sampler import WeightedRandomSampler
from tqdm import tqdm

In [4]:
random.seed(2718)
np.random.seed(2718)
torch.manual_seed(2718)
SEED = 2718

In [5]:
train = pd.read_csv("../data/internal/train.csv")
test = pd.read_csv("../data/internal/test.csv")
train_ext = pd.read_csv('../data/external/train.csv')
sub = pd.read_csv("../data/internal/sample_submission.csv")

train_ext['tfrecord'] += 20
train = pd.concat([train, train_ext], axis=0, ignore_index=True)

In [6]:
config = {
    'INPUT_DIR'      : '',
    'MODEL'          : 'alexnet',
    'SIZE'           : 128,
    'BATCH_SIZE'     : 128,
    'NUM_FOLDS'      : 3,
    'NUM_EPOCHS'     : 10,
    'FREEZED_EPOCHS' : 3,
    'LEARNING_RATE'  : 1e-3,
    'EARLY_STOPPING' : 3,
    'UNIFORM_AUGMENT': True,
    'TTA'            : 3,
    'NUM_WORKERS'    : 16,
    'DEVICE'         : 'cpu'
}

In [8]:
t = time.time()
predictions = pd.DataFrame()
transform = ImageTransform(config['SIZE'], config['UNIFORM_AUGMENT'])

skf = KFold(n_splits=config['NUM_FOLDS'], shuffle=True, random_state=SEED)
for i, (idxT,idxV) in enumerate(skf.split(np.arange(15))):
    t_fold = time.time()
    tr = train.loc[train.tfrecord.isin(idxT) | (train.tfrecord >= 20)]
    va = train.loc[train.tfrecord.isin(idxV)]
    tr.reset_index(drop=True, inplace=True)
    va.reset_index(drop=True, inplace=True)

    # create datasets
    dataset_train = MelanomaDataset("../data/internal/train", tr, transform=transform, phase='train')
    dataset_valid = MelanomaDataset("../data/internal/train", va, transform=transform, phase='valid')
    
    # load a pretrained model
    net = load_model(config['MODEL'], 2)

    # define a loss function
    criterion = nn.CrossEntropyLoss()

    # define an optimizer
    optimizer = optim.Adam(net.parameters(), lr=config['LEARNING_RATE'])

    # define a scheduler
    scheduler = ReduceLROnPlateau(optimizer=optimizer, mode='max', patience=2, factor=0.2)

    # create a sampler
    class_sample_count = np.array([len(np.where(tr['target'] == t)[0]) for t in np.unique(tr['target'])])
    weight = 1. / class_sample_count
    samples_weight = np.array([weight[t] for t in tr['target']])
    sampler = WeightedRandomSampler(samples_weight, len(samples_weight))

    # train the network
    print(f"---- fold: {i + 1} ------------")
    train_model(
        f"{config['MODEL']}_{i + 1}",
        dataset_train,
        dataset_valid,
        config['BATCH_SIZE'],
        net,
        criterion,
        optimizer,
        scheduler,
        config['NUM_EPOCHS'],
        config['FREEZED_EPOCHS'],
        config['INPUT_DIR'],
        config['NUM_WORKERS'],
        sampler,
        config['DEVICE'],
        config['EARLY_STOPPING']
    )

    # predict on test dataset
    test['target'] = 0
    dataset_test = MelanomaDataset("../data/internal/test", test, transform=transform, phase='test')
    tta_time = time.time()
    predictions = get_predictions(dataset_test, 
                                  config["BATCH_SIZE"], 
                                  net, 
                                  config["TTA"], 
                                  predictions, 
                                  config["DEVICE"])
    print(f"TTA took {round(time.time() - tta_time, 2)}")
    predictions.to_csv(f'../submissions/{config["MODEL"]}_fold{i+1}.csv')
    print(f"fold took {round(time.time() - t_fold, 2)}")
    
# output
sub['target'] = predictions.mean(axis=1)
sub.to_csv(f"../submissions/submission{config['MODEL']}.csv", index=False)
print(f"total time: {round(time.time() - t, 4)}")

  0%|          | 0/368 [00:00<?, ?it/s]

---- fold: 1 ------------


100%|██████████| 368/368 [05:01<00:00,  1.22it/s]
  0%|          | 0/368 [00:00<?, ?it/s]

epoch: 1, loss_train: 0.7570, loss_valid: 0.2061, auc_valid: 0.6975, saved: True, 363.2218sec


100%|██████████| 368/368 [04:58<00:00,  1.23it/s]
  0%|          | 0/368 [00:00<?, ?it/s]

epoch: 2, loss_train: 0.5409, loss_valid: 0.2636, auc_valid: 0.7038, saved: True, 361.1045sec


100%|██████████| 368/368 [04:58<00:00,  1.23it/s]


epoch: 3, loss_train: 0.5382, loss_valid: 0.1937, auc_valid: 0.7064, saved: True, 362.2898sec


100%|██████████| 368/368 [15:28<00:00,  2.52s/it]
  0%|          | 0/368 [00:00<?, ?it/s]

epoch: 4, loss_train: 0.4715, loss_valid: 0.1491, auc_valid: 0.6781, saved: False, 990.5527sec


100%|██████████| 368/368 [15:03<00:00,  2.46s/it]
  0%|          | 0/368 [00:00<?, ?it/s]

epoch: 5, loss_train: 0.4261, loss_valid: 0.1507, auc_valid: 0.6006, saved: False, 971.4699sec


100%|██████████| 368/368 [17:32<00:00,  2.86s/it]
  0%|          | 0/3 [00:00<?, ?it/s]

epoch: 6, loss_train: 0.4089, loss_valid: 0.1220, auc_valid: 0.4948, saved: False, 1148.2658sec





KeyError: 'tfrecord'

This submission achieved a score of 0.8786