## Memo
* Install kaggle-cli, set the competition with `kg -c 'compname'`, and download the dataset with `kg download`
* move all `cat*` to `cats/`, same for dogs
* take 1000 random pictures and move them to valid/cats/ with `shuf -n 1000 -e train/cat* | xargs -i mv {} valid/cats/`, same for dogs
Furthermore, use:
* `ls /train/cats/ | wc -l` to count files in `/train/cats/`
* `ls /train/cats/ | grep -v 'cat'` to find all files that DON'T have `cat` in their names. The `-v` flag inverts the match.
* use: `mv /train/cat* /valid/cats` and `cp` to do the rest
* copy utils etc: `cp -t ~/fastai/ vgg16.py vgg16bn.py utils.py resnet50.py`

In [13]:
import csv
import os
import shutil
import sys
# Root directory of the dataset. Keep the trailing slash: the rest of the
# notebook builds file paths by plain string concatenation (path + 'train', ...).
path = "data/invasiveplants/"
# Alternative root, kept here so it can be switched on by hand:
# path = "data/invasiveplants/trial/"

In [11]:
def get_invasiveness(csvfile):
    """Split the labelled training images into invasive and non-invasive file names.

    The labels file is expected to contain one header line followed by
    ``<image id>,<label>`` rows, where label 1 marks an invasive image and
    label 0 a non-invasive one. Rows with any other label are ignored and
    blank lines are skipped.

    Args:
        csvfile: Path to the labels CSV. When it is empty or does not point at
            an existing file, the notebook default ``path + 'train_labels.csv'``
            is read instead. Earlier versions of this function ignored the
            argument and always read that default, so existing calls keep
            their behaviour.

    Returns:
        A tuple ``(invasive, non_invasive)`` of lists of file names of the
        form ``'<id>.jpg'``, each in the order the rows appear in the file.

    Raises:
        ValueError: If an id or label cannot be parsed as an integer.
    """
    if csvfile and os.path.isfile(csvfile):
        labels_csv = csvfile
    else:
        # Backward-compatible fallback to the file the notebook always used.
        labels_csv = path + 'train_labels.csv'

    invasive = []
    non_invasive = []
    # Plain text mode works with csv.reader on both Python 2 and Python 3.
    with open(labels_csv) as f:
        rows = csv.reader(f)
        next(rows, None)  # skip the header; tolerate an empty file
        for row in rows:
            if not row:
                continue  # blank line
            # int() normalises the id (e.g. strips padding/whitespace) before
            # it is turned back into a file name.
            file_name = str(int(row[0])) + '.jpg'
            label = int(row[1])
            if label == 1:
                invasive.append(file_name)
            elif label == 0:
                non_invasive.append(file_name)
    return invasive, non_invasive

In [43]:
def copy_files():
    """Copy the labelled training images into one folder per class.

    Reads the notebook globals ``path``, ``invasive`` and ``non_invasive``.
    Every file name in ``invasive`` is copied from ``train_unordered/`` to
    ``train/invasive/`` and every name in ``non_invasive`` to
    ``train/non_invasive/`` (all relative to ``path``). ``shutil.copy2`` is
    used, so the destination folders must already exist.
    """
    for file_name in invasive:
        shutil.copy2(path + "train_unordered/" + file_name, path + "train/invasive/")
    for file_name in non_invasive:
        shutil.copy2(path + "train_unordered/" + file_name, path + "train/non_invasive/")

In [None]:
from __future__ import division,print_function

import os, json
from glob import glob
import numpy as np
np.set_printoptions(precision=4, linewidth=100)
from matplotlib import pyplot as plt
import csv
import math

import utils; reload(utils)
from utils import plots
import vgg16; reload(vgg16)
from vgg16 import Vgg16

# Images per training/test batch. predict_order_testset also reads this value
# to work out how many test batches to pull, so it has to match the batch size
# used for test_batches below.
batch_size = 8

vgg = Vgg16()
# Grab a few images at a time for training and validation.
# NB: They must be in subdirectories named based on their category
batches = vgg.get_batches(path+'train', batch_size=batch_size)
val_batches = vgg.get_batches(path+'valid', batch_size=batch_size*2)
# shuffle=False for the test set: predict_order_testset pairs
# test_batches.filenames with the predictions by position.
# NOTE(review): this presumably makes batches come out in filenames order --
# confirm against get_batches in vgg16.py.
test_batches = vgg.get_batches(path+'temptest', batch_size=batch_size, shuffle=False)
# NOTE(review): finetune presumably adapts the model to the classes found in
# `batches` before training -- see vgg16.py for what it actually changes.
vgg.finetune(batches)
vgg.fit(batches, val_batches, nb_epoch=2)

Found 2111 images belonging to 2 classes.
Found 184 images belonging to 2 classes.
Found 1531 images belonging to 1 classes.
Epoch 1/2
  64/2111 [..............................] - ETA: 832s - loss: 1.0706 - acc: 0.6562

In [3]:
def batch_dogness(test_batches):
    """Score the next batch of images as "how much class 1".

    Pulls one ``(images, labels)`` pair from ``test_batches`` (the labels are
    discarded) and runs the notebook-global ``vgg`` model on the images.
    ``vgg.predict`` yields a score and a class index per image; the score is
    kept where the index is 1 and replaced by ``1 - score`` where the index
    is 0.

    NOTE(review): which class folder ends up with index 1 is decided by how
    the batches were created and is not visible here -- confirm it is the
    class the submission file is supposed to report.

    Args:
        test_batches: Iterator yielding ``(images, labels)`` pairs.

    Returns:
        The per-image scores for the batch, as computed above.
    """
    images, _ = next(test_batches)
    scores, class_idx, _ = vgg.predict(images)
    as_class_one = scores * class_idx
    as_class_zero = (1. - scores) * (1 - class_idx)
    return as_class_one + as_class_zero

In [10]:
def predict_order_testset(testfolder):
    """Predict every test image and return the predictions ordered by image id.

    Reads the notebook globals ``test_batches`` and ``batch_size`` and calls
    ``batch_dogness`` once per batch. Predictions are paired with
    ``test_batches.filenames`` by position, so ``test_batches`` must be
    unshuffled and not yet consumed when this is called.

    Args:
        testfolder: Currently unused; kept so existing calls keep working.
            The file names are taken from ``test_batches.filenames``.

    Returns:
        List of predictions, clipped to ``[0.01, 0.99]``, sorted by the
        integer id parsed from each file name.

    Raises:
        ValueError: If a file name's stem is not an integer.
    """
    # File names are expected to look like '<subfolder>/<id>.<extension>';
    # taking the stem avoids hard-coding the subfolder and extension lengths.
    image_ids = [
        int(os.path.splitext(os.path.basename(name))[0])
        for name in test_batches.filenames
    ]

    # Integer ceiling division, so the partial last batch is included even when
    # `from __future__ import division` is not active in the running kernel.
    n_batches = int((len(image_ids) + batch_size - 1) // batch_size)

    predictions = []
    for _ in range(n_batches):
        # Use clipping because of log loss judgement by Kaggle.
        predictions.extend(batch_dogness(test_batches).clip(0.01, 0.99).tolist())

    # zip pairs positionally and stops at the shorter of the two sequences.
    ordered = sorted(zip(image_ids, predictions), key=lambda pair: pair[0])
    return [prediction for _, prediction in ordered]

In [11]:
def write_csv(predictions_ordered, filename='test.csv'):
    """Write the predictions to a CSV file with an ``id,label`` header.

    Row ``k`` (1-based) gets id ``k``, so ``predictions_ordered`` must already
    be sorted so that position corresponds to image id.

    Args:
        predictions_ordered: Sequence of predictions, one per row.
        filename: Output path. Defaults to ``'test.csv'``, the file this
            function has always written.
    """
    # The csv module wants a binary file on Python 2 but a text file opened
    # with newline='' on Python 3; pick the right mode for the interpreter.
    if sys.version_info[0] < 3:
        f = open(filename, 'wb')
    else:
        f = open(filename, 'w', newline='')
    with f:
        wr = csv.writer(f, delimiter=',')
        wr.writerow(['id', 'label'])
        wr.writerows(enumerate(predictions_ordered, 1))

In [12]:
# Predict the whole test set (ordered by image id) and write the result as a CSV.
# NOTE(review): predict_order_testset does not use its argument; the file list
# comes from the global test_batches.
predictions = predict_order_testset('temptest/test')
write_csv(predictions)