In [None]:
# Importing libraries
import gc
import random
import re

import numpy as np
import pandas as pd
gc.enable()
import matplotlib.pyplot as plt
%matplotlib inline
# Default (width, height) applied to every matplotlib figure drawn below.
plt.rcParams['figure.figsize'] = (8.0, 5.0)

import warnings
# Silence every warning for the rest of the session. This also hides
# deprecation warnings raised by the libraries used in this notebook.
warnings.filterwarnings("ignore")

from fastai import *
from fastai.vision import *

from utils import *

In [None]:
# Root folder of the competition dataset. The image folders are derived from
# it so that the dataset location is written down in exactly one place.
path = Path('../input/humpback-whale-identification/')
path_test = path / 'test'    # images to predict on
path_train = path / 'train'  # labelled images

In [None]:
# Training labels; the rest of the notebook reads the `Image` and `Id` columns.
train_df = pd.read_csv(path / 'train.csv')

# File names routed to the validation split by the data pipelines below.
val_fns = {
    '69823499d.jpg',
}

In [None]:
# (rows, columns) of the training label table.
print("Train Shape : ",train_df.shape)

In [None]:
# Number of distinct values in the `Id` column (missing values are not counted).
print("No of Whale Classes : ", train_df.Id.nunique())

In [None]:
# The most frequent labels and how many images each one has.
train_df['Id'].value_counts().head()

In [None]:
# Fraction of training images whose label is 'new_whale'.
train_df['Id'].eq('new_whale').mean()

In [None]:
# Fraction of labels that occur on exactly one training image.
train_df['Id'].value_counts().eq(1).mean()

41% of all whales have only a single image associated with them.

38% of all images contain a new whale - a whale that has not been identified as one of the known whales.

In [None]:
# Map every training image file name to its label. Zipping the two columns
# gives the same mapping as walking the frame with iterrows(), without
# building a Series object per row.
fn2label = dict(zip(train_df.Image, train_df.Id))
def path2fn(path):
    """Return the trailing ``<name>.jpg`` file name of `path`.

    Args:
        path: location of an image, as a string or a path-like object
            (it is converted with ``str`` before matching).

    Returns:
        The part of `path` matched by ``\\w*\\.jpg$``, i.e. the run of word
        characters directly in front of the final ``.jpg`` plus the extension.

    Raises:
        ValueError: if `path` does not end in ``.jpg``.
    """
    # Raw string: `\w` and `\.` are regex escapes, not Python string escapes.
    match = re.search(r'\w*\.jpg$', str(path))
    if match is None:
        raise ValueError(f'expected a path ending in .jpg, got {path!r}')
    return match.group(0)

In [None]:
# Explicitly run the garbage collector (housekeeping between notebook stages).
gc.collect()

In [None]:
# Label of this run; later used as the submission file name (`<name>.csv`).
name = 'densenet169'

SZ = 224         # image size handed to `.transform(size=...)`
BS = 64          # batch size handed to `.databunch(bs=...)`
NUM_WORKERS = 0  # handed to `.databunch(num_workers=...)`
SEED = 0

# Apply SEED to PyTorch's, Python's and NumPy's random number generators so
# that re-running the notebook draws the same random numbers.
# NOTE(review): GPU kernels may still be non-deterministic.
torch.manual_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

In [None]:
# Data pipeline over the training images that are not labelled 'new_whale':
#   - images named in `val_fns` form the validation split,
#   - labels are looked up by file name in `fn2label`,
#   - every image found under `path_test` is attached as the test set,
#   - images are squished to SZ and normalised with `imagenet_stats`.
data = (
    ImageItemList
    .from_df(train_df[train_df.Id.ne('new_whale')], path_train, cols=['Image'])
    .split_by_valid_func(lambda item: path2fn(item) in val_fns)
    .label_from_func(lambda item: fn2label[path2fn(item)])
    .add_test(ImageItemList.from_folder(path_test))
    .transform(
        get_transforms(do_flip=False, max_zoom=1, max_warp=0, max_rotate=2),
        size=SZ,
        resize_method=ResizeMethod.SQUISH,
    )
    .databunch(bs=BS, num_workers=NUM_WORKERS, path=path)
    .normalize(imagenet_stats)
)

In [None]:
# Visual sanity check of the pipeline output (requested with rows=3).
data.show_batch(rows=3)

In [None]:
# DenseNet-169 learner on `data`, created with lin_ftrs=[2048]; model files
# are directed to ../working/.
learn = create_cnn(
    data,
    models.densenet169,
    lin_ftrs=[2048],
    model_dir='../working/',
)
# Turn on gradient clipping with the library's default settings.
learn.clip_grad()

In [None]:
# Explicitly run the garbage collector (housekeeping between notebook stages).
gc.collect()

In [None]:
# Settings for the training run below: twice the image size (448) and a
# quarter of the batch size (16) compared with the earlier configuration.
SZ = 224 * 2
BS = 64 // 4
NUM_WORKERS = 0
SEED = 0

# Apply SEED to PyTorch's, Python's and NumPy's random number generators so
# that the pipeline and training below start from a known random state.
# NOTE(review): GPU kernels may still be non-deterministic.
torch.manual_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

In [None]:
# Image listing for the training run below, read from the separate
# `oversample-whale` input; the pipeline uses its `Image` column.
df = pd.read_csv('../input/oversample-whale/oversampled_train_and_val.csv')

In [None]:
# Data pipeline over the images listed in `df`:
#   - images named in `val_fns` form the validation split,
#   - labels are looked up by file name in `fn2label`,
#   - every image found under `path_test` is attached as the test set,
#   - images are squished to SZ and normalised with `imagenet_stats`.
data = (
    ImageItemList
    .from_df(df, path_train, cols=['Image'])
    .split_by_valid_func(lambda item: path2fn(item) in val_fns)
    .label_from_func(lambda item: fn2label[path2fn(item)])
    .add_test(ImageItemList.from_folder(path_test))
    .transform(
        get_transforms(do_flip=False, max_zoom=1, max_warp=0, max_rotate=2),
        size=SZ,
        resize_method=ResizeMethod.SQUISH,
    )
    .databunch(bs=BS, num_workers=NUM_WORKERS, path=path)
    .normalize(imagenet_stats)
)

In [None]:
# DenseNet-169 learner on the `data` pipeline built from `df`, created with
# lin_ftrs=[2048]; model files are directed to ../working/.
# NOTE(review): unlike the learner created earlier in the notebook,
# learn.clip_grad() is not called on this one - confirm that is intended.
learn = create_cnn(data, models.densenet169, lin_ftrs=[2048], model_dir='../working/')

In [None]:
# Train with fastai's one-cycle schedule (first argument 1) using
# slice(6.92E-06) as the learning-rate argument.
# NOTE(review): 6.92E-06 is a magic number - record where it came from
# (e.g. a learning-rate finder run).
learn.fit_one_cycle(1, slice(6.92E-06))

In [None]:
# Explicitly run the garbage collector, then checkpoint the learner as
# 'stage-1' (presumably under the model_dir given to create_cnn - TODO confirm).
gc.collect()
learn.save('stage-1')

In [None]:
# Explicitly run the garbage collector (housekeeping between notebook stages).
gc.collect()

In [None]:
# Predictions for the test set; the second return value is not needed.
preds, _ = learn.get_preds(DatasetType.Test)

# Append one extra column for 'new_whale', which is not among the learner's
# classes, holding a constant score of 0.06. When the top-5 labels are ranked,
# 'new_whale' therefore beats every class whose score is below 0.06.
# The column is created with its value directly rather than written through a
# hard-coded column index, so it stays correct for any number of classes.
preds = torch.cat((preds, torch.full_like(preds[:, :1], 0.06)), 1)

# Label list aligned with the columns of `preds` (the extra column is last).
classes = learn.data.classes + ['new_whale']

In [None]:
def top_5_preds(preds):
    """Return, for every row of `preds`, the column indices of its five
    highest scores, best first.

    `preds` must provide `.numpy()` yielding a 2-D array; rows with fewer
    than five columns return all of their indices.
    """
    ascending = np.argsort(preds.numpy())
    descending = ascending[:, ::-1]
    return descending[:, :5]

def top_5_pred_labels(preds, classes):
    """Return one space-separated string of the top-5 labels per row of `preds`.

    Args:
        preds: scores accepted by `top_5_preds`.
        classes: sequence of labels, indexed by column of `preds`.

    Returns:
        A list with one string per row, labels ordered best first.
    """
    return [
        ' '.join(classes[class_idx] for class_idx in row)
        for row in top_5_preds(preds)
    ]

def create_submission(preds, data, name, classes=None):
    """Write a submission CSV with the top-5 labels of every test image.

    Args:
        preds: per-image scores, forwarded to `top_5_pred_labels`.
        data: object exposing `test_ds.x.items` (the test image paths, assumed
            to be in the same order as the rows of `preds`) and `classes`.
        name: output path without extension; the file is written to
            ``{name}.csv`` with the columns ``Image`` and ``Id``.
        classes: optional sequence of labels indexed by column of `preds`.
            `data.classes` is used when it is omitted or empty.
    """
    # Explicit None/length check: a plain truthiness test raises ValueError
    # for array-like label containers such as a NumPy array.
    if classes is None or len(classes) == 0:
        classes = data.classes
    sub = pd.DataFrame({'Image': [path.name for path in data.test_ds.x.items]})
    sub['Id'] = top_5_pred_labels(preds, classes)
    sub.to_csv(f'{name}.csv', index=False)

In [None]:
# Write `<name>.csv` with the top-5 labels per test image, using the label
# list that includes the appended 'new_whale' entry.
create_submission(preds, learn.data, name, classes)