Code taken from https://github.com/radekosmulski/whale/blob/master/oversample.ipynb

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
from fastai.vision import *
from fastai.metrics import accuracy
from fastai.basic_data import *
from skimage.util import montage

import pandas as pd
from torch import optim
import re

from utils import *

In [2]:
import fastai
from fastprogress import force_console_behavior
import fastprogress
# Switch fastprogress to console-style output and make fastai's training loop
# use the console variants of the progress helpers.
# NOTE(review): NO_BAR presumably suppresses the bar rendering - confirm
# against the installed fastprogress version.
fastprogress.fastprogress.NO_BAR = True
master_bar, progress_bar = force_console_behavior()
fastai.basic_train.master_bar = master_bar
fastai.basic_train.progress_bar = progress_bar

In [3]:
# Directory handed to the learner as `model_dir`; checkpoints written by
# `learn.save` / read by `learn.load` are resolved against it.
MODEL_PATH = "../model/"

In [4]:
# Original training labels: one row per image with its whale `Id`.
# This frame is only used to build the file-name -> label lookup; `df` is
# re-bound to the oversampled CSVs in later cells.
df = pd.read_csv('../data/train.csv')
df.head()

Unnamed: 0,Image,Id
0,0000e88ab.jpg,w_f48451c
1,0001f9222.jpg,w_c3d896a
2,00029d126.jpg,w_20df2c5
3,00050a15a.jpg,new_whale
4,0005c1ef8.jpg,new_whale


In [5]:
# Lookup from image file name to whale Id, built from the current `df`.
# Zipping the two columns avoids materialising a Series per row the way
# `iterrows()` does; for duplicate file names the last row still wins.
fn2label = dict(zip(df.Image, df.Id))
def path2fn(path):
    """Return the trailing ``<name>.jpg`` file name contained in `path`.

    `path` may be a string or any object whose `str()` is a path (for
    example `pathlib.Path`). Only word characters are accepted in the name,
    matching the original pattern.

    Raises:
        ValueError: if `path` does not end in a ``.jpg`` file name.
    """
    # Raw string: '\w' and '\.' are regex escapes, not Python string escapes.
    match = re.search(r'\w*\.jpg$', str(path))
    if match is None:
        raise ValueError(f'no .jpg file name found in path: {path!r}')
    return match.group(0)

In [6]:
# Collection of validation file names; the data pipeline tests membership
# with `path2fn(path) in val_fns`.
# NOTE(review): this unpickles a file - only load pickles from a trusted
# source, since unpickling can execute arbitrary code.
val_fns = pd.read_pickle('../data/10_val_fns')

In [7]:
# Base image size / batch size and loader settings.
# NOTE(review): all four names are assigned again in a later cell before the
# data pipeline reads them, and SEED is not referenced by any visible cell.
SZ, BS = 224, 64
NUM_WORKERS = 16
SEED = 0

In [8]:
# Run identifier: used as the prefix of saved checkpoints, as the submission
# file name under ../subs/, and as the Kaggle submission message.
name = '15-res50-full-train'

In [9]:
# Oversampled training set.
# NOTE(review): `df` is re-bound to '10_oversampled_train_and_val.csv' two
# cells below before anything reads this frame, so this load has no effect
# on the run shown here.
df = pd.read_csv('../data/10_oversampled_train.csv')

In [10]:
# Settings actually used by the data pipeline: double the base resolution
# (448 px) with a quarter of the base batch size (16).
# NOTE(review): SEED is defined but not referenced by any visible cell.
SZ = 2 * 224
BS = 64 // 4
NUM_WORKERS, SEED = 16, 0

In [11]:
# Oversampled list covering both the training and validation images; this is
# the frame the data pipeline is built from. The validation images are split
# back out by file name (via `val_fns`) when the databunch is created.
df = pd.read_csv('../data/10_oversampled_train_and_val.csv')

In [12]:
def _is_validation_item(path):
    """True when the item's file name is listed in `val_fns`."""
    return path2fn(path) in val_fns


def _label_for_item(path):
    """Whale Id recorded in `fn2label` for the item's file name."""
    return fn2label[path2fn(path)]


# Build the databunch: items come from `df`, the train/valid split and labels
# are derived from file names, the test folder is attached unlabelled, and
# images are squished (not cropped) to SZ x SZ without horizontal flips.
data = (
    ImageItemList
        .from_df(df, '../data/train-extracted-448', cols=['Image'])
        .split_by_valid_func(_is_validation_item)
        .label_from_func(_label_for_item)
        .add_test(ImageItemList.from_folder('../data/test-extracted-448'))
        .transform(get_transforms(do_flip=False), size=SZ, resize_method=ResizeMethod.SQUISH)
        .databunch(bs=BS, num_workers=NUM_WORKERS, path='../data')
        .normalize(imagenet_stats)
)

In [13]:
# Display the databunch summary (item counts and sample shapes per split).
data

ImageDataBunch;

Train: LabelList (61171 items)
x: ImageItemList
Image (3, 448, 448),Image (3, 448, 448),Image (3, 448, 448),Image (3, 448, 448),Image (3, 448, 448)
y: CategoryList
w_0003639,w_0003639,w_0003639,w_0003639,w_0003639
Path: ../data/train-extracted-448;

Valid: LabelList (15116 items)
x: ImageItemList
Image (3, 448, 448),Image (3, 448, 448),Image (3, 448, 448),Image (3, 448, 448),Image (3, 448, 448)
y: CategoryList
w_0027efa,w_00289b1,w_00289b1,w_00289b1,w_00289b1
Path: ../data/train-extracted-448;

Test: LabelList (7960 items)
x: ImageItemList
Image (3, 448, 448),Image (3, 448, 448),Image (3, 448, 448),Image (3, 448, 448),Image (3, 448, 448)
y: EmptyLabelList
,,,,
Path: ../data/train-extracted-448

In [14]:
%%time

learn = create_cnn(data, models.resnet50, lin_ftrs=[2048], model_dir=MODEL_PATH)
learn.clip_grad();
learn.load(f'14-res50-full-train-stage-6')
learn.freeze_to(-1)

CPU times: user 3.81 s, sys: 1.37 s, total: 5.19 s
Wall time: 4.46 s


In [15]:
%%time

learn.fit_one_cycle(11, 1e-2 / 4)
learn.save(f'{name}-stage-5-4')

epoch     train_loss  valid_loss
1         0.636203    4.144199    
2         1.619443    4.927876    
3         1.624263    5.654509    
4         1.297428    4.757435    
5         1.022825    4.574315    
6         0.811387    4.094804    
7         0.747947    4.342099    
8         0.611589    3.912366    
9         0.583150    4.421908    
10        0.716768    5.022101    
11        0.549895    5.468246    
Total time: 2:32:10
CPU times: user 1h 44min 44s, sys: 46min 52s, total: 2h 31min 36s
Wall time: 2h 32min 10s


In [16]:
# Make every layer group trainable for the full fine-tuning pass.
learn.unfreeze()

# Discriminative learning rates, smallest first: the peak rate divided by
# 100, 10 and 1.
# NOTE(review): assumes the learner has three layer groups - confirm.
max_lr = 1e-3 / 4
lrs = [max_lr / divisor for divisor in (100, 10, 1)]

In [17]:
%%time

# Fine-tune the whole network for 12 one-cycle epochs with the per-group
# learning rates in `lrs`, then save the final checkpoint for this run.
# In the recorded run valid_loss stayed above 5.0 throughout (5.01 -> 5.70).
learn.fit_one_cycle(12, lrs)
learn.save(f'{name}-stage-6')

epoch     train_loss  valid_loss
1         0.548004    5.010891    
2         1.101178    5.161486    
3         1.606798    5.655044    
4         1.718730    5.398075    
5         1.718766    5.249941    
6         1.593790    5.210988    
7         1.453841    5.055051    
8         1.490712    5.090242    
9         1.500904    5.311392    
10        1.389253    5.501446    
11        1.571398    5.660374    
12        1.270662    5.696500    
Total time: 3:36:57
CPU times: user 2h 29min 50s, sys: 1h 6min 36s, total: 3h 36min 27s
Wall time: 3h 36min 57s


In [18]:
# Per-class scores for the test set; the second return value (targets) is
# discarded because the test set is unlabelled.
# NOTE(review): presumably these are softmax probabilities, one column per
# entry of learn.data.classes - confirm.
preds, _ = learn.get_preds(DatasetType.Test)

In [19]:
# Append one extra score column (initialised to 1) for the 'new_whale'
# class, which the model does not predict. Slicing to the known class count
# first makes this cell safe to re-run: the result always has exactly
# len(learn.data.classes) + 1 columns instead of growing on every execution.
n_known_classes = len(learn.data.classes)
new_whale_column = torch.ones_like(preds[:, :1])
preds = torch.cat((preds[:, :n_known_classes], new_whale_column), 1)

In [20]:
# Constant score given to 'new_whale' for every test image: it outranks any
# known whale whose predicted score is below this value.
NEW_WHALE_SCORE = 0.06

# The new_whale column sits right after the model's own classes, so derive
# its index rather than hard-coding the class count.
new_whale_idx = len(learn.data.classes)
assert preds.shape[1] == new_whale_idx + 1, (
    'preds must contain exactly one extra column for new_whale'
)
preds[:, new_whale_idx] = NEW_WHALE_SCORE

In [21]:
# Label list matching the columns of `preds`: the model's classes followed
# by the extra 'new_whale' entry.
classes = [*learn.data.classes, 'new_whale']

In [22]:
# Write the submission file for this run.
# NOTE(review): create_submission is presumably provided by the local
# `utils` module and writes ../subs/{name}.csv.gz (the file read back in the
# next cell) - confirm.
create_submission(preds, learn.data, name, classes)

In [23]:
# Sanity check: preview the submission; each row holds an image name and a
# space-separated list of predicted Ids.
pd.read_csv(f'../subs/{name}.csv.gz').head()

Unnamed: 0,Image,Id
0,ef60d186c.jpg,new_whale w_d0528f6 w_0bc078c w_789c969 w_580ba51
1,e141fd305.jpg,new_whale w_76a45de w_0e2a5bd w_bfcad53 w_57acd97
2,25045eeda.jpg,new_whale w_5d5c6a6 w_700ebb4 w_778e474 w_60cf87c
3,d11ed8266.jpg,w_f765256 new_whale w_59052ad w_eba33fb w_0135f5f
4,98e1ea193.jpg,new_whale w_8da30ad w_71ed685 w_b035775 w_685b8e1


In [24]:
# Fraction of test images whose first-ranked prediction is 'new_whale'.
submission = pd.read_csv(f'../subs/{name}.csv.gz')
top1_is_new_whale = submission.Id.str.split().apply(lambda ids: ids[0] == 'new_whale')
top1_is_new_whale.mean()

0.6987437185929648

In [25]:
# Upload the submission through the Kaggle CLI, using the run name as the
# submission message. Requires the `kaggle` tool to be installed and
# authenticated in the notebook's shell environment.
!kaggle competitions submit -c humpback-whale-identification -f ../subs/{name}.csv.gz -m "{name}"

100%|████████████████████████████████████████| 160k/160k [00:12<00:00, 13.4kB/s]
Successfully submitted to Humpback Whale Identification