Code taken from https://github.com/radekosmulski/whale/blob/master/oversample.ipynb

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
from fastai.vision import *
from fastai.metrics import accuracy
from fastai.basic_data import *
from skimage.util import montage

import pandas as pd
from torch import optim
import re

from utils import *

In [2]:
import fastai
from fastprogress import force_console_behavior
import fastprogress
# Configure fastprogress through `force_console_behavior()` and patch the bar
# classes it returns into `fastai.basic_train`, so training uses them.
fastprogress.fastprogress.NO_BAR = True
console_bars = force_console_behavior()
master_bar, progress_bar = console_bars
fastai.basic_train.master_bar = master_bar
fastai.basic_train.progress_bar = progress_bar

In [3]:
# Directory passed as `model_dir` to every `create_cnn(...)` call in this notebook.
MODEL_PATH = "../model/"

In [4]:
# Load the original training labels (columns `Image` and `Id`) and preview them.
train_csv = '../data/train.csv'
df = pd.read_csv(train_csv)
df.head()

Unnamed: 0,Image,Id
0,0000e88ab.jpg,w_f48451c
1,0001f9222.jpg,w_c3d896a
2,00029d126.jpg,w_20df2c5
3,00050a15a.jpg,new_whale
4,0005c1ef8.jpg,new_whale


In [5]:
# Map every image file name to its whale Id.
# Built column-wise with zip() instead of row-by-row `iterrows()`: same mapping
# (later duplicates still overwrite earlier ones), without creating a Series per row.
fn2label = dict(zip(df.Image, df.Id))
def path2fn(path):
    """Return the trailing ``<name>.jpg`` file name of `path`.

    `path` may be a string or a path-like object; it is converted with `str()`
    before matching.

    Raises:
        AttributeError: if `path` does not end in ``<word characters>.jpg``
            (the regex search returns None).
    """
    # Raw string: `\w` and `\.` are regex escapes, not Python string escapes.
    return re.search(r'\w*\.jpg$', str(path)).group(0)

In [6]:
# File names held out for validation; used for membership tests in the
# `split_by_valid_func` callbacks below.
# NOTE: unpickling executes arbitrary code — only load a file you created yourself.
val_fns_pickle = '../data/10_val_fns'
val_fns = pd.read_pickle(val_fns_pickle)

In [7]:
# Settings for the first (224px) training phase.
SZ, BS = 224, 64     # `size=` for the transforms and `bs=` for the DataBunch
NUM_WORKERS = 16     # forwarded as `num_workers=` to `.databunch(...)`
SEED = 0             # NOTE(review): not referenced by any cell visible here

In [8]:
# Experiment name: prefix of every saved checkpoint (`{name}-stage-N`) and of the submission file.
name = '11-res50-full-train'

In [9]:
# Rebind `df` to the oversampled training frame. `fn2label` was already built
# from train.csv in an earlier cell and is not affected by this reassignment.
oversampled_train_csv = '../data/10_oversampled_train.csv'
df = pd.read_csv(oversampled_train_csv)

In [10]:
# Build the 224px DataBunch from the oversampled frame, leaving out `new_whale`.
train_items = ImageItemList.from_df(
    df[df.Id != 'new_whale'], '../data/train-extracted-224', cols=['Image'])
test_items = ImageItemList.from_folder('../data/test-extracted-224')
augmentations = get_transforms(do_flip=False, max_zoom=1, max_warp=0, max_rotate=2)

data = (
    train_items
        # An image goes to the validation split when its file name is in `val_fns`.
        .split_by_valid_func(lambda item: path2fn(item) in val_fns)
        # Labels are looked up by file name in the train.csv mapping.
        .label_from_func(lambda item: fn2label[path2fn(item)])
        .add_test(test_items)
        .transform(augmentations, size=SZ, resize_method=ResizeMethod.SQUISH)
        .databunch(bs=BS, num_workers=NUM_WORKERS, path='../data')
        .normalize(imagenet_stats)
)

In [11]:
# Display the DataBunch summary (train / valid / test sizes and sample labels).
data

ImageDataBunch;

Train: LabelList (76174 items)
x: ImageItemList
Image (3, 224, 224),Image (3, 224, 224),Image (3, 224, 224),Image (3, 224, 224),Image (3, 224, 224)
y: CategoryList
w_0003639,w_0003639,w_0003639,w_0003639,w_0003639
Path: ../data/train-extracted-224;

Valid: LabelList (2931 items)
x: ImageItemList
Image (3, 224, 224),Image (3, 224, 224),Image (3, 224, 224),Image (3, 224, 224),Image (3, 224, 224)
y: CategoryList
w_cb622a2,w_8dddbee,w_8a6a8d5,w_3881f28,w_cee684e
Path: ../data/train-extracted-224;

Test: LabelList (7960 items)
x: ImageItemList
Image (3, 224, 224),Image (3, 224, 224),Image (3, 224, 224),Image (3, 224, 224),Image (3, 224, 224)
y: EmptyLabelList
,,,,
Path: ../data/train-extracted-224

In [12]:
%%time

# ResNet-50 learner on the 224px data, reporting accuracy and map5
# (`map5` is not imported explicitly; presumably it comes from `utils`).
learn = create_cnn(
    data,
    models.resnet50,
    lin_ftrs=[2048],
    model_dir=MODEL_PATH,
    metrics=[accuracy, map5],
)
# Called with its default arguments; the trailing `;` hides the returned value's repr.
learn.clip_grad();

CPU times: user 4.3 s, sys: 950 ms, total: 5.25 s
Wall time: 3.88 s


In [13]:
%%time

# Stage 1: one-cycle training of the learner as created (before `unfreeze()`),
# then checkpoint under `{name}-stage-1`.
stage1_epochs, stage1_lr = 24, 1e-2
learn.fit_one_cycle(stage1_epochs, stage1_lr)
learn.save(f'{name}-stage-1')

epoch     train_loss  valid_loss  accuracy  map5    
1         1.001561    6.261451    0.120778  0.160230  
2         0.760604    6.496376    0.146025  0.185727  
3         1.068263    7.850545    0.151484  0.196776  
4         0.884718    7.761956    0.200273  0.253457  
5         0.671748    6.976810    0.219720  0.276658  
6         0.565592    6.803068    0.250768  0.309519  
7         0.556764    7.263548    0.276015  0.336654  
8         0.438798    7.460213    0.306721  0.369658  
9         0.382827    7.761718    0.319686  0.383402  
10        0.342217    7.476464    0.347663  0.413056  
11        0.314645    7.364031    0.369157  0.432549  
12        0.269172    7.358661    0.385534  0.451097  
13        0.177433    7.373550    0.388946  0.451700  
14        0.134167    6.715727    0.426817  0.488610  
15        0.119350    6.276331    0.456499  0.518111  
16        0.050575    5.648981    0.472876  0.533458  
17        0.028072    5.464215    0.499147  0.560338  
18        0.

In [14]:
# Unfreeze the learner before fine-tuning with the per-group learning rates below
# (presumably makes every layer group trainable — see fastai `Learner.unfreeze`).
learn.unfreeze()

# Three learning rates, each 10x larger than the previous one, ending at `max_lr`.
max_lr = 1e-4
lrs = [max_lr / divisor for divisor in (100, 10, 1)]

In [15]:
%%time

# Stage 2: fine-tune the unfrozen learner with the `lrs` list, then checkpoint
# under `{name}-stage-2`.
stage2_epochs = 24
learn.fit_one_cycle(stage2_epochs, lrs)
learn.save(f'{name}-stage-2')

epoch     train_loss  valid_loss  accuracy  map5    
1         0.000107    3.860171    0.580348  0.635841  
2         0.000503    3.816877    0.586830  0.641192  
3         0.001148    3.832558    0.586489  0.641584  
4         0.001560    3.862892    0.584783  0.640185  
5         0.000835    3.901370    0.575913  0.634618  
6         0.000117    3.861731    0.585807  0.641340  
7         0.000341    3.860297    0.584783  0.638809  
8         0.000436    3.841269    0.586830  0.639059  
9         0.000120    3.788567    0.586148  0.640902  
10        0.000230    3.798464    0.590583  0.644063  
11        0.000705    3.746600    0.593654  0.645127  
12        0.000312    3.727324    0.596384  0.647310  
13        0.000117    3.735152    0.597748  0.648169  
14        0.000163    3.723325    0.595701  0.648266  
15        0.000020    3.672943    0.597066  0.651160  
16        0.000018    3.664675    0.593654  0.649016  
17        0.000086    3.644717    0.596042  0.649704  
18        0.

In [16]:
# Settings for the second phase: double the image side, quarter the batch size.
SZ, BS = 2 * 224, 64 // 4   # 448px images, batch size 16
NUM_WORKERS = 16            # unchanged from the 224px phase
SEED = 0                    # NOTE(review): not referenced by any cell visible here

In [17]:
# Build the 448px DataBunch: same split and labelling as the 224px one, but read
# from the `*-extracted-448` folders and with `get_transforms` defaults (apart
# from `do_flip=False`).
train_items = ImageItemList.from_df(
    df[df.Id != 'new_whale'], '../data/train-extracted-448', cols=['Image'])
test_items = ImageItemList.from_folder('../data/test-extracted-448')
augmentations = get_transforms(do_flip=False)

data = (
    train_items
        .split_by_valid_func(lambda item: path2fn(item) in val_fns)
        .label_from_func(lambda item: fn2label[path2fn(item)])
        .add_test(test_items)
        .transform(augmentations, size=SZ, resize_method=ResizeMethod.SQUISH)
        .databunch(bs=BS, num_workers=NUM_WORKERS, path='../data')
        .normalize(imagenet_stats)
)

In [18]:
# Display the DataBunch summary to confirm the images are now 448x448.
data

ImageDataBunch;

Train: LabelList (76174 items)
x: ImageItemList
Image (3, 448, 448),Image (3, 448, 448),Image (3, 448, 448),Image (3, 448, 448),Image (3, 448, 448)
y: CategoryList
w_0003639,w_0003639,w_0003639,w_0003639,w_0003639
Path: ../data/train-extracted-448;

Valid: LabelList (2931 items)
x: ImageItemList
Image (3, 448, 448),Image (3, 448, 448),Image (3, 448, 448),Image (3, 448, 448),Image (3, 448, 448)
y: CategoryList
w_cb622a2,w_8dddbee,w_8a6a8d5,w_3881f28,w_cee684e
Path: ../data/train-extracted-448;

Test: LabelList (7960 items)
x: ImageItemList
Image (3, 448, 448),Image (3, 448, 448),Image (3, 448, 448),Image (3, 448, 448),Image (3, 448, 448)
y: EmptyLabelList
,,,,
Path: ../data/train-extracted-448

In [None]:
%%time

# Re-create the learner on the 448px data, restore the stage-2 weights (trained
# at 224px) and freeze up to the last layer group.
learn = create_cnn(
    data,
    models.resnet50,
    lin_ftrs=[2048],
    model_dir=MODEL_PATH,
    metrics=[accuracy, map5],
)
learn.clip_grad();
learn.load(f'{name}-stage-2')
learn.freeze_to(-1)

CPU times: user 2.42 s, sys: 414 ms, total: 2.84 s
Wall time: 1.5 s


In [None]:
%%time

# Stage 3: train at 448px with a quarter of the stage-1 learning rate (the batch
# size was also divided by 4), then checkpoint under `{name}-stage-3`.
stage3_epochs = 12
stage3_lr = 1e-2 / 4
learn.fit_one_cycle(stage3_epochs, stage3_lr)
learn.save(f'{name}-stage-3')

epoch     train_loss  valid_loss  accuracy  map5    
1         0.585483    4.332168    0.472876  0.526612  
2         0.253125    3.228748    0.546230  0.600807  
3         0.543139    3.262885    0.514841  0.566678  
4         0.946631    3.263905    0.507335  0.557466  
5         0.845570    3.294701    0.504947  0.557876  
6         0.828406    3.206880    0.508700  0.559189  
7         0.598875    3.101036    0.543500  0.590737  
8         0.537704    2.916110    0.570795  0.615535  
9         0.414187    2.795772    0.622654  0.664227  
10        0.412356    3.037873    0.651314  0.690174  
11        0.539101    3.383638    0.699420  0.735870  
12        0.453008    3.523456    0.714091  0.747111  
Total time: 3:09:47
CPU times: user 2h 21min 39s, sys: 56min 36s, total: 3h 18min 15s
Wall time: 3h 9min 47s


In [None]:
%%time

# Unfreeze the learner before the 448px fine-tuning stage
# (presumably makes every layer group trainable — see fastai `Learner.unfreeze`).
learn.unfreeze()

# Same three-step learning-rate ladder as at 224px, with the top rate divided by 4.
max_lr = 1e-4 / 4
lrs = [max_lr / divisor for divisor in (100, 10, 1)]

CPU times: user 486 µs, sys: 2.7 ms, total: 3.19 ms
Wall time: 3.19 ms


In [None]:
# Stage 4: fine-tune the unfrozen 448px learner with `lrs`, then checkpoint
# under `{name}-stage-4`.
stage4_epochs = 22
learn.fit_one_cycle(stage4_epochs, lrs)
learn.save(f'{name}-stage-4')

epoch     train_loss  valid_loss  accuracy  map5    
1         0.462944    3.498866    0.710679  0.746554  
2         0.467732    3.536111    0.720914  0.754503  
3         0.460115    3.600281    0.727738  0.760213  
4         0.480248    3.632361    0.732173  0.764301  
5         0.577812    3.715545    0.740362  0.771062  
6         0.647468    3.872792    0.735585  0.770630  


In [None]:
# With oversampling: rebind `df` to the CSV that, going by its file name, covers
# both training and validation images.
oversampled_full_csv = '../data/10_oversampled_train_and_val.csv'
df = pd.read_csv(oversampled_full_csv)

In [None]:
# Rebuild the 448px DataBunch from the train+val oversampled frame.
# `new_whale` rows are filtered out exactly as in the two earlier DataBunch
# cells: the stage-4 checkpoint loaded afterwards was trained without that
# class, and `new_whale` is appended to the class list separately when the
# submission is built. If the CSV has no such rows the filter is a no-op.
# NOTE(review): images whose file name is in `val_fns` are still routed to the
# validation split here, so they are not trained on even though the CSV name
# suggests train+val — confirm this is intended.
data = (
    ImageItemList
        .from_df(df[df.Id != 'new_whale'], '../data/train-extracted-448', cols=['Image'])
        .split_by_valid_func(lambda path: path2fn(path) in val_fns)
        .label_from_func(lambda path: fn2label[path2fn(path)])
        .add_test(ImageItemList.from_folder('../data/test-extracted-448'))
        .transform(get_transforms(do_flip=False), size=SZ, resize_method=ResizeMethod.SQUISH)
        .databunch(bs=BS, num_workers=NUM_WORKERS, path='../data')
        .normalize(imagenet_stats)
)

In [None]:
# Display the DataBunch repr to check the split sizes of the train+val data.
data

In [None]:
%%time

# Re-create the learner on the train+val DataBunch, restore the stage-4 weights
# and freeze up to the last layer group.
learn = create_cnn(
    data,
    models.resnet50,
    lin_ftrs=[2048],
    model_dir=MODEL_PATH,
    metrics=[accuracy, map5],
)
learn.clip_grad();
learn.load(f'{name}-stage-4')
learn.freeze_to(-1)

In [None]:
%%time

# Stage 5: short run on the train+val data at the stage-3 learning rate, then
# checkpoint under `{name}-stage-5`.
stage5_epochs = 2
stage5_lr = 1e-2 / 4
learn.fit_one_cycle(stage5_epochs, stage5_lr)
learn.save(f'{name}-stage-5')

In [None]:
# Unfreeze the learner before the final fine-tuning stage
# (presumably makes every layer group trainable — see fastai `Learner.unfreeze`).
learn.unfreeze()

# Learning-rate ladder for the final stage: three rates, 10x apart, topping out at `max_lr`.
max_lr = 1e-4 / 4
lrs = [max_lr / divisor for divisor in (100, 10, 1)]

In [None]:
%%time

# Stage 6: final fine-tuning of the unfrozen learner with `lrs`, then checkpoint
# under `{name}-stage-6`.
stage6_epochs = 3
learn.fit_one_cycle(stage6_epochs, lrs)
learn.save(f'{name}-stage-6')

In [None]:
# Predict on the test set; only the first element of the returned pair is used.
test_outputs = learn.get_preds(DatasetType.Test)
preds, _ = test_outputs

In [None]:
# Append one extra column (initialised to 1, overwritten in the next cell) that
# will hold the score for the `new_whale` class the model was not trained on.
# The guard keeps the cell idempotent: the column is added only while `preds`
# still has exactly one column per trained class, so re-running the cell does
# not keep widening the tensor.
if preds.shape[1] == len(learn.data.classes):
    preds = torch.cat((preds, torch.ones_like(preds[:, :1])), 1)

In [None]:
# Give `new_whale` a constant score of 0.06 for every test image.
# The column index is derived from the number of trained classes instead of the
# hard-coded 5004, so it always addresses the appended column. Like the literal
# index, it raises IndexError if that column has not been appended yet.
new_whale_idx = len(learn.data.classes)
preds[:, new_whale_idx] = 0.06

In [None]:
# Class names in column order of `preds`: the trained classes, then `new_whale`.
classes = [*learn.data.classes, 'new_whale']

In [None]:
# Write the submission for this experiment. `create_submission` is not defined
# in this notebook (presumably it comes from `utils`); the cells below read
# `../subs/{name}.csv.gz`, presumably the file this call produces — confirm.
create_submission(preds, learn.data, name, classes)

In [None]:
# Sanity check: read the submission file back and preview the first rows.
submission_preview = pd.read_csv(f'../subs/{name}.csv.gz')
submission_preview.head()

In [None]:
# Share of test images whose first (top-ranked) predicted Id is `new_whale`.
submission = pd.read_csv(f'../subs/{name}.csv.gz')
ranked_ids = submission.Id.str.split()
ranked_ids.apply(lambda ids: ids[0] == 'new_whale').mean()

In [None]:
!kaggle competitions submit -c humpback-whale-identification -f subs/{name}.csv.gz -m "{name}"