In [None]:
# This is required due to this error: https://www.kaggle.com/product-feedback/279990
!pip install --user torch==1.9.0 > /dev/null 2>&1

In [None]:
from fastai.vision.all import *

# Intro

I'm going to train a model to classify whether an example came from the train or the test set. If the train and test sets come from exactly the same distribution, we expect a ROC AUC of about 0.5. Anything noticeably higher than that suggests that there is something quite different about the test set.

I'm using the processed dataset [here](https://www.kaggle.com/c/happy-whale-and-dolphin/discussion/307287) to speed up training.

# Params and Dataset

In [None]:
# Seed reused for the dataset shuffle and the train/validation splitter.
SEED = 420
# Root folder of the dataset; the CSVs and image folders are resolved against it.
IMG_PATH_BASE = '../input/happy-whale-512'
# Size passed to both Resize and aug_transforms.
IMG_SIZE = 224
# Batch size passed to the dataloaders.
BS = 64
# Architecture handed to cnn_learner.
ARCH = resnet18

In [None]:
# Training rows: attach the full image path and tag them as not-test.
train_df = pd.read_csv(f'{IMG_PATH_BASE}/train.csv').assign(
    image_path=lambda frame: f'{IMG_PATH_BASE}/train_images/' + frame.image,
    is_test=False,
)

# Test rows: sample_submission.csv serves as the list of test images here.
test_df = pd.read_csv(f'{IMG_PATH_BASE}/sample_submission.csv').assign(
    image_path=lambda frame: f'{IMG_PATH_BASE}/test_images/' + frame.image,
    is_test=True,
)

# Remove corrupt examples

In [None]:
from tqdm.notebook import tqdm

def remove_corrupt_examples(df):
    """Return the rows of `df` whose `image_path` can be opened as an image.

    Args:
        df: DataFrame with an `image_path` column of file paths.

    Returns:
        A new DataFrame without the unreadable rows. Surviving rows keep their
        original index labels, column order and dtypes, and the columns are
        retained even when no row survives.

    Note:
        Assuming `Image` is PIL's module (as re-exported by fastai),
        `Image.open` is lazy: it identifies the file without decoding the
        pixel data, so damage beyond the header is not detected here.
    """
    valid_positions = []
    # Iterate the path column directly rather than `iterrows()`, which builds
    # a full Series per row just to read a single field.
    for pos, image_path in enumerate(tqdm(df['image_path'], total=len(df))):
        try:
            # The context manager releases the file handle right away instead
            # of leaving one open per image until garbage collection.
            with Image.open(image_path):
                pass
        except Exception:
            # Best effort: any failure to open counts as a corrupt example.
            continue
        valid_positions.append(pos)

    num = len(df) - len(valid_positions)
    print(f'Found {num} corrupt examples')

    # Positional selection keeps the schema intact, unlike rebuilding the
    # frame from a (possibly empty) list of rows; copy() detaches the result.
    return df.iloc[valid_positions].copy()

In [None]:
# Keep only the rows whose image file can be opened, for both splits.
train_df = remove_corrupt_examples(train_df)
test_df = remove_corrupt_examples(test_df)

In [None]:
# Stack both splits (path + label only) under a fresh 0..n-1 index, then
# shuffle all rows with the notebook seed.
all_df = pd.concat(
    [frame[['image_path', 'is_test']] for frame in (train_df, test_df)],
    ignore_index=True,
).sample(frac=1., random_state=SEED)

In [None]:
# Row counts per label (False = train, True = test).
all_df.is_test.value_counts()

I'm using the fastai library for training, as we can do so much with very little code.

# Setup Data

[DataBlock](https://docs.fast.ai/data.block.html) is a tool that lets you create a Dataset from configuration.

In [None]:
# Image in, category out: the image is read from `image_path`, the label from
# `is_test`; items are resized individually, augmentation runs per batch.
datablock = DataBlock(
    blocks=(ImageBlock, CategoryBlock),
    getters=[ColReader(column) for column in ('image_path', 'is_test')],
    splitter=RandomSplitter(seed=SEED),
    item_tfms=Resize(IMG_SIZE),
    batch_tfms=aug_transforms(
        size=IMG_SIZE,
        do_flip=True,
        flip_vert=True,
        max_rotate=30.,
        min_scale=0.75,
    ),
)

In [None]:
# Build the dataloaders from the shuffled train+test frame.
dls = datablock.dataloaders(source=all_df, bs=BS)

In [None]:
# Display a sample batch to eyeball the images and their labels.
dls.show_batch()

# Model and training

In [None]:
def get_learner(dls, lr=1e-3):
    """Build a mixed-precision CNN learner for the train-vs-test classifier.

    Args:
        dls: DataLoaders to train and validate on.
        lr: Base learning rate stored on the learner; fit calls that do not
            pass their own rate fall back to it.

    Returns:
        The learner created by `cnn_learner` on `ARCH`, tracking
        `RocAucBinary` and converted with `to_fp16()`.
    """
    # `lr` is not baked into the partial: the Learner passes its own `lr` when
    # it instantiates the optimizer, which silently overrides a partial-bound
    # value. Forwarding `lr` to the learner is what makes the argument count.
    opt_func = partial(Adam, wd=0.01, eps=1e-8)

    learn = cnn_learner(
        dls, ARCH, opt_func=opt_func, lr=lr,
        metrics=[RocAucBinary()]).to_fp16()

    return learn

In [None]:
# Learner for the real train-vs-test data, with get_learner's default settings.
learn = get_learner(dls)

I previously ran lr_find to get the learning rate.

In [None]:
# learn.lr_find()

In [None]:
# One initial epoch, then unfreeze and train four more epochs with a
# learning-rate slice of 1e-4 to 1e-3.
# NOTE(review): presumably the first epoch trains only the head because
# cnn_learner starts from a frozen pretrained body — confirm against fastai docs.
learn.fit_one_cycle(1)
learn.unfreeze()
learn.fit_one_cycle(4, slice(1e-4, 1e-3))

# Result

In [None]:
# validate() yields the loss followed by the learner's single metric (RocAucBinary).
loss, metric = learn.validate()

In [None]:
# ROC AUC of the train-vs-test classifier.
metric

# Compare Result to Random Baseline

Since the AUC is > 0.5, it indicates that there may be some signal allowing a model to differentiate the sets.

However, we should first measure the AUC we get from a model trained on a "fake" test set: we pretend that 20% of the train examples are the test set, train a model on that, and read off the metric. If it's roughly the same as what we saw before, that suggests the test set comes from the same distribution as train.

In [None]:
# Start from the real training rows, all labelled as "train".
fake_train = train_df.copy()
fake_train['is_test'] = False

# Pick 20% of the rows to pose as the test set. The draw is seeded so the
# baseline is reproducible across reruns, like the shuffle and the splitter.
fake_test_idx = fake_train.sample(frac=0.2, random_state=SEED).index

# Relabel directly via .loc instead of mutating a sampled copy and merging it
# back with DataFrame.update (assumes index labels are unique).
fake_train.loc[fake_test_idx, 'is_test'] = True

# The rows now labelled as fake test, kept under the original variable name.
fake_train_samp = fake_train.loc[fake_test_idx]

In [None]:
# Row counts per fake label (True = rows posing as the test set).
fake_train.is_test.value_counts()

In [None]:
# Same DataBlock configuration as the real train-vs-test run, so that the two
# AUC values are comparable.
datablock = DataBlock(
    blocks=(ImageBlock, CategoryBlock),
    getters=[
        ColReader('image_path'), ColReader('is_test')
    ],
    splitter=RandomSplitter(seed=SEED),
    item_tfms=Resize(IMG_SIZE),
    batch_tfms=aug_transforms(size=IMG_SIZE, max_rotate=30., min_scale=0.75, flip_vert=True, do_flip=True)
)
# This time the source is the train-only frame with the fake test labels.
dls = datablock.dataloaders(source=fake_train, bs=BS)
learn = get_learner(dls)
# Same schedule as the real run: one epoch, unfreeze, then four more epochs.
learn.fit_one_cycle(1)
learn.unfreeze()
learn.fit_one_cycle(4, slice(1e-4, 1e-3))

In [None]:
# Loss and RocAucBinary metric of the fake-split (baseline) learner.
loss, baseline_metric = learn.validate()

## Random Baseline Result

In [None]:
# ROC AUC of the baseline classifier trained on the fake split.
baseline_metric

# Conclusion

So the train and test sets do have some minor differences. A ROC AUC above 0.5 indicates that there is some signal for distinguishing between the two sets, but not a lot.