## Make the folder structure

In [1]:
import os

This assumes we are in the deeplearning1/nbs folder of the fastai course. 

In [2]:
def make_dir(folder, cd=True):
    """Makes folder if it doesn't exist
    
    Checks if we are already below the folder.
    If not, the folder is created, and we change into it.
    This is an idempotent operation, so the notebook can be run as a whole.
    """
    while True:
        try:
            curr_path = os.getcwd()
        except OSError:
            os.chdir('..')
        else:
            break
        
    if folder not in curr_path.split('/'):
        !mkdir $folder
        if cd:
            os.chdir(folder)

### Desired Folder structure

```
data  
    invasive
        test
        train  
            inva  
            not  
        valid  
            inva  
            not  
```

In [3]:
make_dir('data')
make_dir('invasive')
make_dir('test', cd=False)
make_dir('train')
make_dir('inva', cd=False)
make_dir('not', cd=False)

os.chdir('..')
make_dir('valid')
make_dir('inva', cd=False)
make_dir('not', cd=False)

os.chdir('..') # back to invasive folder

mkdir: cannot create directory ‘data’: File exists
mkdir: cannot create directory ‘invasive’: File exists
mkdir: cannot create directory ‘test’: File exists
mkdir: cannot create directory ‘train’: File exists
mkdir: cannot create directory ‘inva’: File exists
mkdir: cannot create directory ‘not’: File exists
mkdir: cannot create directory ‘valid’: File exists
mkdir: cannot create directory ‘inva’: File exists
mkdir: cannot create directory ‘not’: File exists


## Get the competition data from kaggle

In [4]:
os.getcwd()  # I expect to be in the invasive folder

'/home/ubuntu/courses/deeplearning1/nbs/data/invasive'

In [5]:
# This assumes that you have joined the competition, and that you have run the command 
# kg config -u kaggle_username -p kaggle_password

!kg download -c invasive-species-monitoring

Starting new HTTPS connection (1): www.kaggle.com
downloading https://www.kaggle.com/c/invasive-species-monitoring/download/test.7z

Starting new HTTPS connection (1): storage.googleapis.com
test.7z already downloaded !
downloading https://www.kaggle.com/c/invasive-species-monitoring/download/train.7z

train.7z already downloaded !
downloading https://www.kaggle.com/c/invasive-species-monitoring/download/train_labels.csv.zip

train_labels.csv.zip already downloaded !
downloading https://www.kaggle.com/c/invasive-species-monitoring/download/sample_submission.csv.zip

sample_submission.csv.zip already downloaded !


In [6]:
!sudo apt-get install p7zip-full

Reading package lists... Done
Building dependency tree       
Reading state information... Done
p7zip-full is already the newest version.
0 upgraded, 0 newly installed, 0 to remove and 139 not upgraded.


In [7]:
%%capture
!7z x test.7z -aoa
!7z x train.7z -aoa
!unzip train_labels.csv.zip -o
!unzip sample_submission.csv.zip -o 

In [8]:
!ls

__MACOSX		   test     train.7z		  valid
sample_submission.csv	   test.7z  train_labels.csv
sample_submission.csv.zip  train    train_labels.csv.zip


## Split training data into category subfolders

In [9]:
import pandas as pd

In [10]:
labels = pd.read_csv('train_labels.csv')
labels.head()

Unnamed: 0,name,invasive
0,1,0
1,2,0
2,3,1
3,4,0
4,5,1


In [11]:
invasive_files = !ls train
invasive_files[:12]

['1000.jpg',
 '1001.jpg',
 '1002.jpg',
 '1003.jpg',
 '1004.jpg',
 '1005.jpg',
 '1006.jpg',
 '1007.jpg',
 '1008.jpg',
 '1009.jpg',
 '100.jpg',
 '1010.jpg']

In [None]:
for number, invasive in labels.itertuples(index=False):
    number = str(number)
    if invasive:
        folder = 'inva'
    else:
        folder = 'not'
    !mv train/'$number'.jpg train/$folder


In [1]:
!ls train

ls: cannot access train: No such file or directory


## Make validation sets

In [55]:
def num_files(folder):
    """Count the files in a folder"""
    number, = !ls -1q $folder | wc -l
    return int(number)

In [33]:
num_files('train/inva'), num_files('train/not')

('1448', '847')

We'll grab 100 files from each to form the validation set.

In [35]:
validations = 100

In [50]:
import random

In [56]:
def create_valid(train_folder, num_valids):
    """Create a validation set"""
    subfolders = !ls train
    for subf in subfolders:
        subf_list = !ls train/$subf
        subf_count = num_files('valid/'+subf)
        if subf_count == 0:
            for file_ in random.sample(subf_list, num_valids):
                !mv train/$subf/$file_ valid/$subf/$file_
        elif subf_count != num_valids:
            raise ValueError(' '.join(['Unexpected number files in validation folder.', 
                               '\nExpected', str(num_valids), 'Got', str(subf_count)])
                    )
        print(subf, subf_count)

In [57]:
create_valid('train', validations)

('inva', 100)
('not', 100)
