In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
from fastai.vision import *
from fastai.metrics import accuracy
from fastai.basic_data import *
from skimage.util import montage
import pandas as pd
from torch import optim
import re
from pathlib import Path
import numpy as np
import torch

In [2]:
# https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py
def apk(actual, predicted, k=10):
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

def map5(preds, targs):
    predicted_idxs = preds.sort(descending=True)[1]
    top_5 = predicted_idxs[:, :5]
    res = mapk([[t] for t in targs.cpu().numpy()], top_5.cpu().numpy(), 5)
    return torch.tensor(res)

def top_5_preds(preds): return np.argsort(preds.numpy())[:, ::-1][:, :5]

def top_5_pred_labels(preds, classes):
    top_5 = top_5_preds(preds)
    labels = []
    for i in range(top_5.shape[0]):
        labels.append(' '.join([classes[idx] for idx in top_5[i]]))
    return labels

def create_submission(preds, data, name, classes=None):
    if not classes: classes = data.classes
    sub = pd.DataFrame({'Image': [path.name for path in data.test_ds.x.items]})
    sub['Id'] = top_5_pred_labels(preds, classes)
    sub.to_csv(f'{name}.csv.gz', index=False, compression='gzip')

In [3]:
import fastai
from fastprogress import force_console_behavior
import fastprogress
fastprogress.fastprogress.NO_BAR = True
master_bar, progress_bar = force_console_behavior()
fastai.basic_train.master_bar, fastai.basic_train.progress_bar = master_bar, progress_bar

In [4]:
DATA_PATH = Path('../data')
list(DATA_PATH.iterdir())

[WindowsPath('../data/only_known'),
 WindowsPath('../data/oversampled_train.csv'),
 WindowsPath('../data/oversampled_train_and_val.csv'),
 WindowsPath('../data/sample_submission.csv'),
 WindowsPath('../data/test'),
 WindowsPath('../data/test.zip'),
 WindowsPath('../data/test_bbox.pk'),
 WindowsPath('../data/train'),
 WindowsPath('../data/train.csv'),
 WindowsPath('../data/train.zip'),
 WindowsPath('../data/train_bbox.pk')]

In [5]:
df = pd.read_csv(DATA_PATH/'train.csv')
val_fns = {'69823499d.jpg'}

In [6]:
fn2label = {row[1].Image: row[1].Id for row in df.iterrows()}
path2fn = lambda path: re.search('\w*\.jpg$', path).group(0)

In [7]:
name = f'res50-full-train'

In [8]:
SZ = 224
BS = 64
NUM_WORKERS = 0
SEED=0

In [9]:
data = (
    ImageItemList
        .from_df(df[df.Id != 'new_whale'], DATA_PATH/'train', cols=['Image'])
        .split_by_valid_func(lambda path: path2fn(path) in val_fns)
        .label_from_func(lambda path: fn2label[path2fn(path)])
        .add_test(ImageItemList.from_folder(DATA_PATH/'test'))
        .transform(get_transforms(do_flip=False), size=SZ, resize_method=ResizeMethod.SQUISH)
        .databunch(bs=BS, num_workers=NUM_WORKERS, path=DATA_PATH/'only_known')
        .normalize(imagenet_stats)
)

In [10]:
%%time

learn = create_cnn(data, models.resnet50, lin_ftrs=[2048])
learn.clip_grad();

learn.fit_one_cycle(7, 1e-2)
learn.save(DATA_PATH/'only_known/{name}-stage-1')

learn.unfreeze()

max_lr = 1e-3
lrs = [max_lr/100, max_lr/10, max_lr]

learn.fit_one_cycle(10, lrs)
learn.save(DATA_PATH/'only_known/{name}-stage-2')

epoch     train_loss  valid_loss


RuntimeError: Expected object of scalar type Long but got scalar type Int for argument #2 'target'

In [11]:
SZ = 224 * 2
BS = 64 // 4
NUM_WORKERS = 0
SEED=0

In [12]:
data = (
    ImageItemList
        .from_df(df[df.Id != 'new_whale'], DATA_PATH/'train', cols=['Image'])
        .split_by_valid_func(lambda path: path2fn(path) in val_fns)
        .label_from_func(lambda path: fn2label[path2fn(path)])
        .add_test(ImageItemList.from_folder(DATA_PATH/'test'))
        .transform(get_transforms(do_flip=False), size=SZ, resize_method=ResizeMethod.SQUISH)
        .databunch(bs=BS, num_workers=NUM_WORKERS, path=DATA_PATH/'only_known')
        .normalize(imagenet_stats)
)

In [13]:
%%time
learn = create_cnn(data, models.resnet50, lin_ftrs=[2048])
learn.clip_grad();
learn.load(DATA_PATH/'only_known/{name}-stage-2')
learn.freeze_to(-1)

learn.fit_one_cycle(5, 1e-2 / 4)
learn.save(DATA_PATH/'only_known/{name}-stage-3')

learn.unfreeze()

max_lr = 1e-3 / 4
lrs = [max_lr/100, max_lr/10, max_lr]

learn.fit_one_cycle(9, lrs)
learn.save(DATA_PATH/'only_known/{name}-stage-4')

FileNotFoundError: [Errno 2] No such file or directory: '..\\data\\only_known\\models\\..\\data\\only_known\\{name}-stage-2.pth'

In [14]:
# with oversampling
df = pd.read_csv(DATA_PATH/'oversampled_train_and_val.csv')

In [15]:
data = (
    ImageItemList
        .from_df(df, DATA_PATH/'train', cols=['Image'])
        .split_by_valid_func(lambda path: path2fn(path) in val_fns)
        .label_from_func(lambda path: fn2label[path2fn(path)])
        .add_test(ImageItemList.from_folder(DATA_PATH/'test'))
        .transform(get_transforms(do_flip=False), size=SZ, resize_method=ResizeMethod.SQUISH)
        .databunch(bs=BS, num_workers=NUM_WORKERS, path=DATA_PATH/'only_known')
        .normalize(imagenet_stats)
)

In [16]:
%%time
learn = create_cnn(data, models.resnet50, lin_ftrs=[2048])
learn.clip_grad();
learn.load(DATA_PATH/'only_known/{name}-stage-4')
learn.freeze_to(-1)

learn.fit_one_cycle(2, 1e-2 / 4)
learn.save(DATA_PATH/'only_known/{name}-stage-5')

learn.unfreeze()

max_lr = 1e-3 / 4
lrs = [max_lr/100, max_lr/10, max_lr]

learn.fit_one_cycle(3, lrs)
learn.save(DATA_PATH/'only_known/{name}-stage-6')

FileNotFoundError: [Errno 2] No such file or directory: '..\\data\\only_known\\models\\..\\data\\only_known\\{name}-stage-4.pth'

## Predict

In [17]:
preds, _ = learn.get_preds(DatasetType.Test)

KeyboardInterrupt: 

In [None]:
preds = torch.cat((preds, torch.ones_like(preds[:, :1])), 1)

In [None]:
preds[:, 5004] = 0.06

In [None]:
classes = learn.data.classes + ['new_whale']

In [None]:
create_submission(preds, learn.data, name, classes)

In [None]:
pd.read_csv(DATA_PATH/'only_known/{name}.csv.gz').head()

In [None]:
pd.read_csv(DATA_PATH/'only_known/{name}.csv.gz').Id.str.split().apply(lambda x: x[0] == 'new_whale').mean()

In [None]:
!kaggle competitions submit -c humpback-whale-identification -f subs/{name}.csv.gz -m "{name}"