Code taken from https://github.com/radekosmulski/whale/blob/master/oversample.ipynb

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
from fastai.vision import *
from fastai.metrics import accuracy
from fastai.basic_data import *
from skimage.util import montage

import pandas as pd
from torch import optim
import re

from utils import *

In [2]:
import fastai
from fastprogress import force_console_behavior
import fastprogress
# Route fastai's progress reporting through fastprogress' console mode.
# NO_BAR presumably turns off bar rendering so only text lines are printed — confirm in fastprogress.
fastprogress.fastprogress.NO_BAR = True
master_bar, progress_bar = force_console_behavior()
# Hand the console-mode bar objects to fastai's training module, one attribute at a time.
fastai.basic_train.master_bar = master_bar
fastai.basic_train.progress_bar = progress_bar

In [3]:
# Directory passed as model_dir= to create_cnn, i.e. where checkpoints are saved/loaded.
MODEL_PATH = "../model/"

In [4]:
# Original competition labels (columns Image, Id). This frame is the source for
# the filename -> label mapping built in the next cell.
# NOTE(review): the name `df` is rebound to other CSVs further down, so the
# notebook only works when the cells are run in the order shown.
df = pd.read_csv('../data/train.csv')
df.head()

Unnamed: 0,Image,Id
0,0000e88ab.jpg,w_f48451c
1,0001f9222.jpg,w_c3d896a
2,00029d126.jpg,w_20df2c5
3,00050a15a.jpg,new_whale
4,0005c1ef8.jpg,new_whale


In [5]:
# Map every image filename in `df` to its whale Id. Zipping the two columns
# avoids building a Series per row as DataFrame.iterrows() does; if a filename
# occurs more than once, the last occurrence wins, exactly as before.
fn2label = dict(zip(df.Image, df.Id))


def path2fn(path):
    """Return the trailing ``<name>.jpg`` filename component of ``path``.

    ``path`` must be a string. Raises AttributeError when it does not end in
    ``.jpg``, because ``re.search`` then returns ``None``.
    """
    # Raw string literal: '\w' and '\.' are invalid escape sequences in a
    # regular string and raise a warning on recent Python versions.
    return re.search(r'\w*\.jpg$', path).group(0)

In [6]:
# Collection of image filenames reserved for validation; the DataBunch cells
# test `path2fn(path) in val_fns` to decide the train/valid split.
# NOTE(review): read_pickle unpickles the file, which can execute arbitrary
# code — only load it if '../data/10_val_fns' comes from a trusted source.
val_fns = pd.read_pickle('../data/10_val_fns')

In [7]:
# Settings for the 224px training stage.
# SZ is passed to .transform(size=...); BS and NUM_WORKERS go to .databunch().
SZ = 224
BS = 64
NUM_WORKERS = 16
# NOTE(review): SEED is not referenced by any visible cell, so no RNG is
# actually seeded here — TODO confirm whether seeding was intended.
SEED=0

In [8]:
# Run identifier: used as the prefix of every saved checkpoint
# ('<name>-stage-N') and as the submission file name ('../subs/<name>.csv.gz').
name = '13-res50-full-train'

In [9]:
# Rebind `df` to the contents of 10_oversampled_train.csv (by its name, the
# oversampled training split). fn2label was already built from train.csv and
# is not affected by this reassignment.
df = pd.read_csv('../data/10_oversampled_train.csv')

In [10]:
# 224px DataBunch.
#   items  : rows of `df` whose Id is not 'new_whale'
#   split  : files whose name is in val_fns go to validation
#   labels : looked up by filename in fn2label
#   test   : every image found under ../data/test-extracted-224
data = (
    ImageItemList
        .from_df(
            df[df['Id'] != 'new_whale'],
            '../data/train-extracted-224',
            cols=['Image'],
        )
        .split_by_valid_func(lambda p: path2fn(p) in val_fns)
        .label_from_func(lambda p: fn2label[path2fn(p)])
        .add_test(ImageItemList.from_folder('../data/test-extracted-224'))
        .transform(
            get_transforms(do_flip=False, max_zoom=1, max_warp=0, max_rotate=2),
            size=SZ,
            resize_method=ResizeMethod.SQUISH,
        )
        .databunch(bs=BS, num_workers=NUM_WORKERS, path='../data')
        .normalize(imagenet_stats)
)

In [11]:
# Show the DataBunch summary (train / valid / test item counts and samples).
data

ImageDataBunch;

Train: LabelList (76174 items)
x: ImageItemList
Image (3, 224, 224),Image (3, 224, 224),Image (3, 224, 224),Image (3, 224, 224),Image (3, 224, 224)
y: CategoryList
w_0003639,w_0003639,w_0003639,w_0003639,w_0003639
Path: ../data/train-extracted-224;

Valid: LabelList (2931 items)
x: ImageItemList
Image (3, 224, 224),Image (3, 224, 224),Image (3, 224, 224),Image (3, 224, 224),Image (3, 224, 224)
y: CategoryList
w_cb622a2,w_8dddbee,w_8a6a8d5,w_3881f28,w_cee684e
Path: ../data/train-extracted-224;

Test: LabelList (7960 items)
x: ImageItemList
Image (3, 224, 224),Image (3, 224, 224),Image (3, 224, 224),Image (3, 224, 224),Image (3, 224, 224)
y: EmptyLabelList
,,,,
Path: ../data/train-extracted-224

In [12]:
%%time

# Build a ResNet-50 learner on the 224px DataBunch; model_dir points
# checkpoint save/load at MODEL_PATH.
# NOTE(review): lin_ftrs=[2048] presumably sets the hidden width of the custom head — confirm against the fastai docs.
learn = create_cnn(data, models.resnet50, lin_ftrs=[2048], model_dir=MODEL_PATH)
# Trailing ';' keeps the return value of clip_grad() out of the cell output.
learn.clip_grad();

CPU times: user 4.1 s, sys: 1.03 s, total: 5.13 s
Wall time: 3.83 s


In [13]:
%%time

# Stage 1: 24 one-cycle epochs at max LR 1e-2 on the learner as created above
# (unfreeze() has not been called yet), then checkpoint as '<name>-stage-1'.
# NOTE(review): in the recorded log train_loss falls to ~1e-4 while valid_loss
# never drops below ~3.8, i.e. the model overfits the training set heavily.
learn.fit_one_cycle(24, 1e-2)
learn.save(f'{name}-stage-1')

epoch     train_loss  valid_loss
1         0.975592    6.271087    
2         0.772474    6.506855    
3         1.080839    7.990137    
4         0.906575    8.282987    
5         0.631940    7.288753    
6         0.570418    7.019714    
7         0.551152    6.649921    
8         0.480917    7.295500    
9         0.391479    7.653440    
10        0.342429    7.667839    
11        0.321125    7.550643    
12        0.220957    7.104164    
13        0.180251    6.893097    
14        0.166522    6.713761    
15        0.101804    6.351599    
16        0.055226    5.828933    
17        0.026075    5.415598    
18        0.019603    4.993219    
19        0.009923    4.746617    
20        0.001951    4.274943    
21        0.000689    4.027005    
22        0.002590    3.894199    
23        0.000905    3.842482    
24        0.000145    3.839142    
Total time: 1:38:13
CPU times: user 1h 7min 50s, sys: 29min 33s, total: 1h 37min 23s
Wall time: 1h 38min 14s


In [14]:
# Unfreeze the model for fine-tuning.
learn.unfreeze()

# Three learning rates a factor of 10 apart, the largest being max_lr
# (presumably one per layer group — confirm against the fastai docs).
max_lr = 1e-4
lrs = [max_lr / scale for scale in (100, 10, 1)]

In [15]:
%%time

# Stage 2: 24 one-cycle epochs on the unfrozen model with the three learning
# rates in `lrs`, then checkpoint as '<name>-stage-2'.
learn.fit_one_cycle(24, lrs)
learn.save(f'{name}-stage-2')

epoch     train_loss  valid_loss
1         0.001228    3.829298    
2         0.000106    3.834505    
3         0.000655    3.843672    
4         0.000312    3.866990    
5         0.000138    3.857727    
6         0.000230    3.910743    
7         0.000198    3.889875    
8         0.000822    3.928916    
9         0.000224    3.929289    
10        0.000420    3.927934    
11        0.001068    3.797371    
12        0.000256    3.763932    
13        0.001736    3.765885    
14        0.000435    3.800071    
15        0.000141    3.755549    
16        0.000191    3.728598    
17        0.000104    3.708236    
18        0.000019    3.665085    
19        0.000030    3.655184    
20        0.000010    3.632715    
21        0.000130    3.632558    
22        0.000015    3.619079    
23        0.000062    3.618154    
24        0.000035    3.645159    
Total time: 2:09:44
CPU times: user 1h 29min 44s, sys: 39min 14s, total: 2h 8min 59s
Wall time: 2h 9min 45s


In [10]:
# Settings for the 448px stage: double the image size, quarter the batch size.
# NOTE(review): execution counts restart at In [10] from this cell on, so the
# recorded outputs were not produced by a single top-to-bottom run.
SZ = 224 * 2
BS = 64 // 4
NUM_WORKERS = 16
# NOTE(review): SEED is not referenced by any visible cell.
SEED=0

In [11]:
# 448px DataBunch. Same recipe as the 224px one, but reading from the
# *-extracted-448 folders and using the current SZ / BS values.
data = (
    ImageItemList
        .from_df(
            df[df['Id'] != 'new_whale'],
            '../data/train-extracted-448',
            cols=['Image'],
        )
        .split_by_valid_func(lambda p: path2fn(p) in val_fns)
        .label_from_func(lambda p: fn2label[path2fn(p)])
        .add_test(ImageItemList.from_folder('../data/test-extracted-448'))
        .transform(
            get_transforms(do_flip=False, max_zoom=1, max_warp=0, max_rotate=2),
            size=SZ,
            resize_method=ResizeMethod.SQUISH,
        )
        .databunch(bs=BS, num_workers=NUM_WORKERS, path='../data')
        .normalize(imagenet_stats)
)

In [12]:
# Show the 448px DataBunch summary.
data

ImageDataBunch;

Train: LabelList (76174 items)
x: ImageItemList
Image (3, 448, 448),Image (3, 448, 448),Image (3, 448, 448),Image (3, 448, 448),Image (3, 448, 448)
y: CategoryList
w_0003639,w_0003639,w_0003639,w_0003639,w_0003639
Path: ../data/train-extracted-448;

Valid: LabelList (2931 items)
x: ImageItemList
Image (3, 448, 448),Image (3, 448, 448),Image (3, 448, 448),Image (3, 448, 448),Image (3, 448, 448)
y: CategoryList
w_cb622a2,w_8dddbee,w_8a6a8d5,w_3881f28,w_cee684e
Path: ../data/train-extracted-448;

Test: LabelList (7960 items)
x: ImageItemList
Image (3, 448, 448),Image (3, 448, 448),Image (3, 448, 448),Image (3, 448, 448),Image (3, 448, 448)
y: EmptyLabelList
,,,,
Path: ../data/train-extracted-448

In [13]:
%%time

# Recreate the learner on the 448px DataBunch and continue from the weights
# saved as '<name>-stage-2'.
learn = create_cnn(data, models.resnet50, lin_ftrs=[2048], model_dir=MODEL_PATH)
learn.clip_grad();
learn.load(f'{name}-stage-2')
# freeze_to(-1): presumably leaves only the last layer group trainable — confirm.
learn.freeze_to(-1)
# NOTE(review): no visible cell trains this learner or saves a stage-3 /
# stage-4 checkpoint, although a later cell loads '<name>-stage-4'.

CPU times: user 3.83 s, sys: 1.26 s, total: 5.09 s
Wall time: 4.37 s


In [None]:
# with oversampling
# Rebind `df` to 10_oversampled_train_and_val.csv (by its name, training and
# validation rows combined) for the final fits.
df = pd.read_csv('../data/10_oversampled_train_and_val.csv')

In [None]:
# DataBunch for the final fits, built from every row of the current `df`.
# NOTE(review): unlike the earlier DataBunch cells, this one does not filter
# out 'new_whale' and uses get_transforms' default augmentation — confirm that
# both differences are intended.
# NOTE(review): files listed in val_fns are still routed to the validation
# set — confirm that this is wanted for the train_and_val CSV.
data = (
    ImageItemList
        .from_df(df, '../data/train-extracted-448', cols=['Image'])
        .split_by_valid_func(lambda p: path2fn(p) in val_fns)
        .label_from_func(lambda p: fn2label[path2fn(p)])
        .add_test(ImageItemList.from_folder('../data/test-extracted-448'))
        .transform(
            get_transforms(do_flip=False),
            size=SZ,
            resize_method=ResizeMethod.SQUISH,
        )
        .databunch(bs=BS, num_workers=NUM_WORKERS, path='../data')
        .normalize(imagenet_stats)
)

In [None]:
# Show the DataBunch summary (this cell has no recorded output).
data

In [None]:
%%time

# Recreate the learner on the latest DataBunch and resume from '<name>-stage-4'.
# NOTE(review): no visible cell saves a '<name>-stage-4' checkpoint, so on a
# fresh run this load depends on a file produced outside this notebook.
learn = create_cnn(data, models.resnet50, lin_ftrs=[2048], model_dir=MODEL_PATH)
learn.clip_grad();
learn.load(f'{name}-stage-4')
# freeze_to(-1): presumably leaves only the last layer group trainable — confirm.
learn.freeze_to(-1)

In [None]:
%%time

# Stage 5: 2 one-cycle epochs at max LR 1e-2 / 4 after freeze_to(-1),
# then checkpoint as '<name>-stage-5'.
learn.fit_one_cycle(2, 1e-2 / 4)
learn.save(f'{name}-stage-5')

In [None]:
# Unfreeze the model for the last fine-tuning pass.
learn.unfreeze()

# Three learning rates a factor of 10 apart, the largest being max_lr
# (presumably one per layer group — confirm against the fastai docs).
max_lr = 1e-3 / 4
lrs = [max_lr / scale for scale in (100, 10, 1)]

In [None]:
%%time

# Stage 6: 3 one-cycle epochs on the unfrozen model with the learning rates in
# `lrs`, then checkpoint as '<name>-stage-6'.
learn.fit_one_cycle(3, lrs)
learn.save(f'{name}-stage-6')

In [15]:
# Model outputs for the test set; the second return value is discarded.
# NOTE(review): the preceding training cells carry no execution count, so the
# recorded outputs below may come from a different state of `learn` — confirm.
preds, _ = learn.get_preds(DatasetType.Test)

In [16]:
# Append one extra column (filled with ones for now) to the per-class scores;
# it is the slot for 'new_whale', which is appended to the class list below.
# Not idempotent: each run of this cell adds another column to `preds`.
preds = torch.cat([preds, torch.ones_like(preds[:, :1])], dim=1)

In [17]:
# Give the appended 'new_whale' column a constant score of 0.06 for every test
# image. Index -1 addresses that appended (last) column directly instead of
# hard-coding 5004, so it stays aligned with `classes`, where 'new_whale' is
# also the last entry, however many classes the learner was trained on.
preds[:, -1] = 0.06

In [18]:
# Class names in prediction-column order: the learner's classes followed by
# 'new_whale' for the extra column appended to `preds`.
classes = learn.data.classes + ['new_whale']

In [19]:
# create_submission is not defined in this notebook (presumably it comes from
# `utils`); it presumably writes '../subs/<name>.csv.gz', which the following
# cells read back — confirm.
create_submission(preds, learn.data, name, classes)

In [20]:
# Sanity check: preview the first rows of the submission file.
pd.read_csv(f'../subs/{name}.csv.gz').head()

Unnamed: 0,Image,Id
0,ef60d186c.jpg,w_83cc0ec new_whale w_0815d2c w_73d71d2 w_6df47c6
1,e141fd305.jpg,new_whale w_a10e633 w_76a45de w_1788910 w_be5ee58
2,25045eeda.jpg,w_5d5c6a6 new_whale w_9da1811 w_00904a7 w_56fe328
3,d11ed8266.jpg,w_f765256 new_whale w_aa0dacc w_a8b34a0 w_23e1d57
4,98e1ea193.jpg,w_f971ecb w_f47f214 new_whale w_51e7506 w_aa0dacc


In [21]:
# Fraction of test images whose first-listed prediction is 'new_whale'.
# Uses the vectorised .str accessor instead of .apply with a Python lambda.
pd.read_csv(f'../subs/{name}.csv.gz').Id.str.split().str[0].eq('new_whale').mean()

0.13655778894472362

In [23]:
# Upload the gzipped submission through the Kaggle CLI; IPython interpolates
# {name} from the notebook variable. This has an external side effect (a real
# competition submission) each time the cell is run.
!kaggle competitions submit -c humpback-whale-identification -f ../subs/{name}.csv.gz -m "{name}"

100%|████████████████████████████████████████| 170k/170k [00:07<00:00, 23.8kB/s]
Successfully submitted to Humpback Whale Identification