Code taken from https://github.com/radekosmulski/whale/blob/master/oversample.ipynb

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
from fastai.vision import *
from fastai.metrics import accuracy
from fastai.basic_data import *
from skimage.util import montage

import pandas as pd
from torch import optim
import re

from utils import *

In [2]:
import fastai
import fastprogress
from fastprogress import force_console_behavior

# Turn off fastprogress' bar rendering and switch it to its console behaviour.
fastprogress.fastprogress.NO_BAR = True
master_bar, progress_bar = force_console_behavior()

# Point fastai's training loop at the console bar classes obtained above.
fastai.basic_train.master_bar = master_bar
fastai.basic_train.progress_bar = progress_bar

In [3]:
# Directory handed to the learner as `model_dir` when it is created further down.
MODEL_PATH = "../model/"

In [4]:
# Load the original training table; the preview shows an `Image` (file name)
# column and an `Id` (label) column, which the next cell turns into a lookup.
df = pd.read_csv('../data/train.csv')
df.head()

Unnamed: 0,Image,Id
0,0000e88ab.jpg,w_f48451c
1,0001f9222.jpg,w_c3d896a
2,00029d126.jpg,w_20df2c5
3,00050a15a.jpg,new_whale
4,0005c1ef8.jpg,new_whale


In [5]:
# Lookup from image file name (`Image` column) to its label (`Id` column).
# Zipping the two columns yields the same mapping as walking `df.iterrows()`
# without constructing a Series object for every row.
fn2label = dict(zip(df['Image'], df['Id']))
def path2fn(path):
    """Return the trailing ``<word characters>.jpg`` file name of ``path``.

    The pattern is written as a raw string so its backslash sequences reach
    the regex engine as-is rather than being treated as (invalid) Python
    string escapes.

    Raises:
        AttributeError: if ``path`` does not end in ``.jpg`` (``re.search``
            returns ``None`` in that case).
    """
    return re.search(r'\w*\.jpg$', path).group(0)

In [6]:
# Collection of validation file names; below it is only used for membership
# tests (`path2fn(path) in val_fns`) when splitting train/valid.
# NOTE(review): unpickling can execute arbitrary code - only load this file
# from a trusted source.
val_fns = pd.read_pickle('../data/10_val_fns')

In [7]:
# Image size, batch size, worker count and seed.
# NOTE(review): all four names are reassigned in a later cell before the data
# is built, so these values are not the ones used for training here. SEED is
# not referenced anywhere else in the visible notebook.
SZ = 224
BS = 64
NUM_WORKERS = 16
SEED=0

In [19]:
# Experiment name: used as the prefix of saved checkpoints and as the
# submission file name ('../subs/{name}.csv.gz').
name = '14-res50-full-train'

In [9]:
# Rebind `df` to the table stored in 10_oversampled_train.csv.
# NOTE(review): `df` is rebound again (to 10_oversampled_train_and_val.csv)
# before it is passed to ImageItemList.from_df, so this load is superseded.
df = pd.read_csv('../data/10_oversampled_train.csv')

In [10]:
# Settings in effect when the data is built: twice the earlier image size
# (224 * 2 = 448) and a quarter of the earlier batch size (64 // 4 = 16).
SZ = 224 * 2
BS = 64 // 4
NUM_WORKERS = 16
SEED=0

In [14]:
%%time

# NOTE(review): in file order `learn` is not created until a later cell, so
# this cell fails on a fresh Restart & Run All. The '-stage-3-10' suffix saved
# here matches the '13-res50-full-train-stage-3-10' checkpoint loaded later,
# so this looks like a leftover from the previous experiment - confirm before
# re-running.
# Run 10 epochs of one-cycle training at max LR 1e-2 / 4, then save the weights.
learn.fit_one_cycle(10, 1e-2 / 4)
learn.save(f'{name}-stage-3-10')

epoch     train_loss  valid_loss
1         0.148802    4.312988    
2         0.114246    3.553153    
3         0.303080    3.802882    
4         0.315811    3.498682    
5         0.245423    3.212538    
6         0.185929    3.005838    
7         0.061866    2.709668    
8         0.014739    2.369547    
9         0.008057    2.195241    
10        0.005707    2.136926    
Total time: 2:37:28
CPU times: user 1h 50min 14s, sys: 46min 47s, total: 2h 37min 1s
Wall time: 2h 37min 29s


In [14]:
# with oversampling
# This is the table that is passed to ImageItemList.from_df in the next cell.
df = pd.read_csv('../data/10_oversampled_train_and_val.csv')

In [15]:
def _in_validation_set(path):
    """Return True when the file name behind `path` is listed in `val_fns`."""
    return path2fn(path) in val_fns


def _label_for(path):
    """Return the label stored in `fn2label` for the file name behind `path`."""
    return fn2label[path2fn(path)]


# Inputs of the pipeline, named so that the chain below reads step by step.
train_items = ImageItemList.from_df(df, '../data/train-extracted-448', cols=['Image'])
test_items = ImageItemList.from_folder('../data/test-extracted-448')
augmentations = get_transforms(do_flip=False)

# Split -> label -> attach test set -> transform/resize -> batch -> normalize.
data = (
    train_items
        .split_by_valid_func(_in_validation_set)
        .label_from_func(_label_for)
        .add_test(test_items)
        .transform(augmentations, size=SZ, resize_method=ResizeMethod.SQUISH)
        .databunch(bs=BS, num_workers=NUM_WORKERS, path='../data')
        .normalize(imagenet_stats)
)

In [16]:
# Display the DataBunch summary (train / valid / test item counts and samples).
data

ImageDataBunch;

Train: LabelList (61171 items)
x: ImageItemList
Image (3, 448, 448),Image (3, 448, 448),Image (3, 448, 448),Image (3, 448, 448),Image (3, 448, 448)
y: CategoryList
w_0003639,w_0003639,w_0003639,w_0003639,w_0003639
Path: ../data/train-extracted-448;

Valid: LabelList (15116 items)
x: ImageItemList
Image (3, 448, 448),Image (3, 448, 448),Image (3, 448, 448),Image (3, 448, 448),Image (3, 448, 448)
y: CategoryList
w_0027efa,w_00289b1,w_00289b1,w_00289b1,w_00289b1
Path: ../data/train-extracted-448;

Test: LabelList (7960 items)
x: ImageItemList
Image (3, 448, 448),Image (3, 448, 448),Image (3, 448, 448),Image (3, 448, 448),Image (3, 448, 448)
y: EmptyLabelList
,,,,
Path: ../data/train-extracted-448

In [18]:
%%time

# Build a ResNet-50 based learner; checkpoints are read from / written to MODEL_PATH.
# NOTE(review): `lin_ftrs=[2048]` presumably sets the hidden size of the custom head - confirm.
learn = create_cnn(data, models.resnet50, lin_ftrs=[2048], model_dir=MODEL_PATH)
# Enable gradient clipping; no threshold is passed, so the library default applies.
learn.clip_grad();
# Start from the checkpoint of the previous ('13-...') experiment.
# NOTE(review): the f-prefix is redundant here, the string has no placeholders.
learn.load(f'13-res50-full-train-stage-3-10')
# NOTE(review): presumably leaves only the last layer group trainable - confirm.
learn.freeze_to(-1)

CPU times: user 1.84 s, sys: 341 ms, total: 2.18 s
Wall time: 1.39 s


In [20]:
%%time

# Run 2 epochs of one-cycle training at max LR 1e-2 / 4 on the learner as
# frozen in the previous cell, then save the weights as '{name}-stage-5'.
learn.fit_one_cycle(2, 1e-2 / 4)
learn.save(f'{name}-stage-5')

epoch     train_loss  valid_loss
1         1.298463    4.695878    
2         0.283940    3.454090    
Total time: 27:48
CPU times: user 19min 15s, sys: 8min 22s, total: 27min 38s
Wall time: 27min 48s


In [21]:
%%time

# Run a further 4-epoch one-cycle schedule at the same max LR and save the
# weights as '{name}-stage-5-4'.
# NOTE(review): this continues from whatever state the previous fit left in
# `learn`; the logged train loss rises from 0.28 (end of the 2-epoch run) to
# 1.61 in the first epoch here.
learn.fit_one_cycle(4, 1e-2 / 4)
learn.save(f'{name}-stage-5-4')

epoch     train_loss  valid_loss
1         1.608441    4.988077    
2         1.384822    4.662437    
3         0.576583    3.719918    
4         0.278639    3.515070    
Total time: 55:29
CPU times: user 38min 43s, sys: 16min 35s, total: 55min 18s
Wall time: 55min 29s


In [23]:
# Display the current experiment name.
name

'14-res50-full-train'

In [25]:
# Unfreeze the learner so the following fit is no longer limited to the
# layers left trainable by `freeze_to(-1)`.
learn.unfreeze()

# Peak learning rate for fine-tuning, plus a three-step ladder derived from it:
# one hundredth, one tenth and the full value.
max_lr = 1e-3 / 4
lrs = [max_lr / divisor for divisor in (100, 10, 1)]

In [27]:
# Call `Learner.purge()`; its return value is what gets printed below.
# NOTE(review): presumably used to release memory before the next fit - confirm.
learn.purge()

Learner(data=ImageDataBunch;

Train: LabelList (61171 items)
x: ImageItemList
Image (3, 448, 448),Image (3, 448, 448),Image (3, 448, 448),Image (3, 448, 448),Image (3, 448, 448)
y: CategoryList
w_0003639,w_0003639,w_0003639,w_0003639,w_0003639
Path: ../data/train-extracted-448;

Valid: LabelList (15116 items)
x: ImageItemList
Image (3, 448, 448),Image (3, 448, 448),Image (3, 448, 448),Image (3, 448, 448),Image (3, 448, 448)
y: CategoryList
w_0027efa,w_00289b1,w_00289b1,w_00289b1,w_00289b1
Path: ../data/train-extracted-448;

Test: LabelList (7960 items)
x: ImageItemList
Image (3, 448, 448),Image (3, 448, 448),Image (3, 448, 448),Image (3, 448, 448),Image (3, 448, 448)
y: EmptyLabelList
,,,,
Path: ../data/train-extracted-448, model=Sequential(
  (0): Sequential(
    (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace)
    (3): MaxPool2d(kernel_size=

In [28]:
%%time

# Run 3 epochs of one-cycle training on the unfrozen learner using the three
# learning rates in `lrs`, then save the weights as '{name}-stage-6'.
# NOTE(review): presumably one rate per layer group - confirm.
learn.fit_one_cycle(3, lrs)
learn.save(f'{name}-stage-6')

epoch     train_loss  valid_loss
1         0.401728    3.736249    
2         0.327870    3.630651    
3         0.224510    3.539768    
Total time: 54:16
CPU times: user 38min 2s, sys: 16min 6s, total: 54min 9s
Wall time: 54min 16s


In [29]:
# Predict on the test set; the second returned value is discarded.
preds, _ = learn.get_preds(DatasetType.Test)

In [30]:
# Append one extra column of ones on the right of the prediction matrix; it
# has the same dtype/device as `preds` because it is built from a slice of it.
# Note: re-running this cell appends yet another column.
extra_column = torch.ones_like(preds[:, :1])
preds = torch.cat((preds, extra_column), dim=1)

In [31]:
# Give the 'new_whale' column a constant score of 0.06 for every test image.
# Its index is the number of trained classes (the position 'new_whale' takes
# when it is appended to `learn.data.classes`), so it is derived rather than
# hard-coded as 5004.
preds[:, len(learn.data.classes)] = 0.06

In [32]:
# Class names passed to create_submission together with `preds`: the trained
# classes followed by the extra 'new_whale' entry.
classes = learn.data.classes + ['new_whale']

In [33]:
# Build the submission with `create_submission` (not defined in this notebook;
# presumably provided by `utils` - confirm). The next cell reads the result
# back from '../subs/{name}.csv.gz'.
create_submission(preds, learn.data, name, classes)

In [34]:
# Preview the submission file: one row per image, `Id` holds space-separated labels.
pd.read_csv(f'../subs/{name}.csv.gz').head()

Unnamed: 0,Image,Id
0,ef60d186c.jpg,new_whale w_3b03149 w_2df85e7 w_55b77ce w_de657c1
1,e141fd305.jpg,new_whale w_76a45de w_57acd97 w_f765256 w_4516ff1
2,25045eeda.jpg,w_5d5c6a6 new_whale w_8d9d59a w_778e474 w_fbc7895
3,d11ed8266.jpg,w_f765256 new_whale w_59052ad w_eadfd82 w_ab2ac77
4,98e1ea193.jpg,new_whale w_832917d w_584e1dc w_4690940 w_1eb67e3


In [None]:
# Fraction of test images whose first listed prediction is 'new_whale'.
# `.str[0]` takes the first token of each split Id without a Python-level
# lambda; a missing or empty Id becomes NaN and is counted as "not new_whale"
# instead of raising.
(
    pd.read_csv(f'../subs/{name}.csv.gz')
      .Id.str.split()
      .str[0]
      .eq('new_whale')
      .mean()
)

In [None]:
# Upload the gzipped submission to the competition, using the experiment name
# as the submission message. `{name}` is interpolated by IPython.
!kaggle competitions submit -c humpback-whale-identification -f ../subs/{name}.csv.gz -m "{name}"