In [2]:
import os

import random

## Get the Kaggle Data

### Run the Kaggle CLI tool and collect Cats vs Dogs

In [5]:
# Locate the notebook folder ('nbs') by walking up from the current working
# directory, so the relative data paths used below resolve the same way no
# matter where the kernel was started.
nbs_dir = os.getcwd()
while os.path.basename(nbs_dir) != 'nbs':
    parent_dir = os.path.dirname(nbs_dir)
    if parent_dir == nbs_dir:
        # Reached the filesystem root without finding 'nbs'. Moving up from
        # here is a no-op, so stop instead of looping forever.
        raise RuntimeError("No 'nbs' directory found at or above " + os.getcwd())
    nbs_dir = parent_dir
os.chdir(nbs_dir)

# Create (only if missing) and enter data/ and then data/dogscats/.
# Checking for the directory first means a genuine failure (e.g. permissions)
# is reported by makedirs instead of being silently swallowed.
for subdir in ['data', 'dogscats']:
    if not os.path.isdir(subdir):
        os.makedirs(subdir)
    os.chdir(subdir)

In [3]:
# Install the kaggle-cli package (presumably the source of the `kg` command
# used in the next cell -- confirm).
!pip install kaggle-cli



In [4]:
# Download the files of the dogs-vs-cats-redux-kernels-edition competition
# (test.zip, train.zip, sample_submission.csv) into the current directory.
!kg download -c dogs-vs-cats-redux-kernels-edition

downloading https://www.kaggle.com/c/dogs-vs-cats-redux-kernels-edition/download/test.zip

test.zip already downloaded !
downloading https://www.kaggle.com/c/dogs-vs-cats-redux-kernels-edition/download/train.zip

train.zip already downloaded !
downloading https://www.kaggle.com/c/dogs-vs-cats-redux-kernels-edition/download/sample_submission.csv

sample_submission.csv already downloaded !


In [7]:
%%capture
# Extract the training archive; %%capture hides unzip's per-file output.
!unzip train.zip

In [8]:
%%capture
# Extract the test archive; %%capture hides unzip's per-file output.
!unzip test.zip

## Split Train into Cats and Dogs

In [9]:
!mkdir train/cats
!mkdir train/dogs

In [10]:
# Move every file in train/ whose name starts with 'cat.' into train/cats.
!mv train/cat.* train/cats

In [11]:
# Move every file in train/ whose name starts with 'dog.' into train/dogs.
!mv train/dog.* train/dogs

## Validation set prep

For this, we want to remove a set of $n$ images from the Kaggle-provided training images and set them aside for model and parameter validation. We do this for both the cat and the dog images.

In [12]:
!mkdir valid

In [13]:
# Number of images per class to move from train/ into valid/.
n = 1000

### Cats

In [14]:
!mkdir valid/cats

List the cat files

In [15]:
# Capture the file names in train/cats as a list of strings, then preview
# the first five.
cats = !ls train/cats
cats[:5]

['cat.0.jpg',
 'cat.10000.jpg',
 'cat.10001.jpg',
 'cat.10002.jpg',
 'cat.10003.jpg']

Sample the cat files

In [16]:
# Preview only: show five names from a random draw of n cat files. The draw is
# not stored; the cell that moves the files calls random.sample again, so the
# files actually moved come from a separate random draw.
random.sample(cats, n)[:5]

['cat.7701.jpg',
 'cat.8852.jpg',
 'cat.4470.jpg',
 'cat.315.jpg',
 'cat.12063.jpg']

Check how many cat images are already in valid/cats. If the folder is empty, move 1000 randomly chosen cat images into it; any count other than 0 or 1000 is treated as an error.

In [17]:
# Count the entries currently in valid/cats.
# bash command found here
# https://stackoverflow.com/questions/11307257/is-there-a-bash-command-which-counts-files

# The shell output is captured as a list of lines; the trailing comma unpacks
# the single expected line, and int() turns that text into a number.
num_cat_images, = !ls -1q valid/cats | wc -l
num_cat_images = int(num_cat_images)
num_cat_images

0

In [18]:
if num_cat_images == 0:
    for file_ in random.sample(cats, n):
        !mv train/cats/$file_ valid/cats/$file_
elif num_cat_images != n:
    raise ValueError(' '.join(['Unexpected number of cats in valid/cats folder.', 
                               '\nExpected', str(n), 'Got', str(num_cat_images)])
                    )

Verify 1000 are removed from train/cats. The original set of cat images contained 12500 cat images.

In [19]:
# Count the files left in train/cats after the move.
!ls -1q train/cats/* | wc -l

11500


### Dogs

In [20]:
!mkdir valid/dogs

List, move and verify dog file moves

In [21]:
# Count the entries currently in valid/dogs. The shell output is captured as a
# list of lines; the trailing comma unpacks the single expected line, and
# int() turns that text into a number.
num_dog_images, = !ls -1q valid/dogs | wc -l
num_dog_images = int(num_dog_images)
num_dog_images

0

In [22]:
dogs = !ls train/dogs

if num_dog_images == 0:
    for file_ in random.sample(dogs, n):
        !mv train/dogs/$file_ valid/dogs/$file_
elif num_dog_images != n:
    raise ValueError(' '.join(['Unexpected number of dogs in valid/dogs folder.', 
                               '\nExpected', str(n), 'Got', str(num_cat_images)])
                     )

In [23]:
# Count the files left in train/dogs after the move.
!ls -1q train/dogs/* | wc -l

11500


### Sample set prep

For this, we want $m$ images from training, which we use to iteratively train our models. The smaller set allows us to move much more quickly. These images can just be copied from the training set.

In [6]:
to_make_folders = ['sample', 'sample/train', 'sample/train/cats', 'sample/train/dogs', 'sample/test',
                  'sample/valid', 'sample/valid/cats', 'sample/valid/dogs']

for folder in to_make_folders:
    !mkdir $folder

mkdir: cannot create directory ‘sample’: File exists
mkdir: cannot create directory ‘sample/train’: File exists
mkdir: cannot create directory ‘sample/train/cats’: File exists
mkdir: cannot create directory ‘sample/train/dogs’: File exists
mkdir: cannot create directory ‘sample/test’: File exists
mkdir: cannot create directory ‘sample/valid’: File exists
mkdir: cannot create directory ‘sample/valid/cats’: File exists
mkdir: cannot create directory ‘sample/valid/dogs’: File exists


In [7]:
# Number of images per class copied into each sample folder
# (the test sample uses 2*m).
m = 100

In [8]:
for animal in ['dogs', 'cats']:
    train_count, = !ls -1q train/$animal | wc -l
    
    for kind in ['train', 'valid']:
        sample_count, = !ls -1q sample/$kind/$animal | wc -l
        print(animal, kind, sample_count, train_count)        
        if int(sample_count) == 0:
            animals = !ls train/$animal

            for file_ in random.sample(animals, m):
                !cp train/$animal/$file_ sample/$kind/$animal/$file_
        elif int(sample_count) != m:
            raise ValueError(' '.join(['Unexpected number of', animal, 'in sample folder.', 
                                   '\nExpected', str(m), 'Got', str(sample_count)])
                             )

('dogs', 'train', '100', '12500')
('dogs', 'valid', '100', '12500')
('cats', 'train', '100', '12500')
('cats', 'valid', '100', '12500')


In [9]:
sample_test_count, = !ls -1q sample/test/unknown | wc -l
print(sample_test_count)

if int(sample_test_count) == 0:
    test_animals = !ls test

    for file_ in random.sample(test_animals, 2*m):
        !cp test/$file_ sample/test/unknown/$file_
        
elif int(sample_test_count) != 2*m:
    raise ValueError(' '.join(['Unexpected number of test images in sample/test folder.', 
                           '\nExpected', str(2*m), 'Got', str(sample_test_count)])
                     )

200
