# Imports

In [0]:
import zipfile
import numpy as np
import pandas as pd
import os
import tqdm
import matplotlib 
%matplotlib inline
import matplotlib.pyplot as plt
import csv
import io
import ast
from PIL import Image, ImageDraw

# Download data from Kaggle

In [2]:
# Install (or upgrade) the `kaggle` package, which provides the `kaggle` command used by the download cell
! pip install kaggle --upgrade

Collecting kaggle
[?25l  Downloading https://files.pythonhosted.org/packages/83/9b/ac57e15fbb239c6793c8d0b7dfd1a4c4a025eaa9f791b5388a7afb515aed/kaggle-1.5.0.tar.gz (53kB)
[K    100% |████████████████████████████████| 61kB 5.3MB/s 
Collecting python-slugify (from kaggle)
  Downloading https://files.pythonhosted.org/packages/00/ad/c778a6df614b6217c30fe80045b365bfa08b5dd3cb02e8b37a6d25126781/python-slugify-1.2.6.tar.gz
Collecting Unidecode>=0.04.16 (from python-slugify->kaggle)
[?25l  Downloading https://files.pythonhosted.org/packages/59/ef/67085e30e8bbcdd76e2f0a4ad8151c13a2c5bce77c85f8cad6e1f16fb141/Unidecode-1.0.22-py2.py3-none-any.whl (235kB)
[K    100% |████████████████████████████████| 235kB 14.5MB/s 
[?25hBuilding wheels for collected packages: kaggle, python-slugify
  Running setup.py bdist_wheel for kaggle ... [?25l- \ done
[?25h  Stored in directory: /root/.cache/pip/wheels/8b/21/3b/a0076243c6ae12a6215b2da515fe06b539aee7217b406e510e
  Running setup.py bdist_wheel for 

In [3]:
# Credentials: open https://www.kaggle.com/<user>/account, click "Create New API Token" and open the
# downloaded JSON file; put its values (presumably the username and key fields) in place of the `<>`
# placeholders after KAGGLE_USERNAME= and KAGGLE_KEY= below. Do not save real credentials in the notebook.
# The command downloads only train_simplified.zip from the quickdraw-doodle-recognition competition.
! KAGGLE_USERNAME=<> KAGGLE_KEY=<> kaggle competitions download -c quickdraw-doodle-recognition -f train_simplified.zip

Downloading train_simplified.zip to /content
100% 7.37G/7.37G [01:15<00:00, 118MB/s]
100% 7.37G/7.37G [01:15<00:00, 105MB/s]


In [0]:
# Open the downloaded archive read-only; the handle stays open because the
# cells below read every CSV directly out of it.
zf = zipfile.ZipFile("train_simplified.zip", mode="r")

# Methods

## Look at the data CSVs

In [0]:
# Names of the files stored in the archive
fnames = zf.namelist()
# Column labels applied to the sampled rows when they are combined into one frame
cnames = ["countrycode", "drawing", "key_id", "recognized", "timestamp", "word"]
# Collects the per-file samples taken in the next cell
drawlist = list()

In [14]:
# Peek at the first two files of the archive and keep up to two recognized drawings from each.
# The list is rebuilt here so that re-running this cell does not append duplicate rows.
drawlist = []
for fn in fnames[0:2]:
    with io.TextIOWrapper(zf.open(fn)) as f:
        # Only the first 10 rows are read, which is normally enough to hit a recognized drawing.
        head_rows = pd.read_csv(f, nrows=10)
    drawlist.append(head_rows[head_rows.recognized == True].head(2))
# pd.concat keeps the column names and dtypes parsed by read_csv; going through
# np.concatenate would turn every value into a generic Python object.
draw_df = pd.concat(drawlist, ignore_index=True)
draw_df

Unnamed: 0,countrycode,drawing,key_id,recognized,timestamp,word
0,GB,"[[[14, 14], [82, 8]], [[55, 56], [62, 11]], [[...",5222727398981632,True,2017-03-26 18:54:05.597120,fence
1,US,"[[[2, 0, 3, 12, 52, 76, 76, 71], [154, 124, 31...",6465072190717952,True,2017-03-23 18:40:23.744720,fence
2,JP,"[[[79, 59, 38, 31, 38, 62, 100, 138, 155, 158,...",4842254327873536,True,2017-01-03 12:13:44.539970,yoga
3,DE,"[[[255, 90, 4], [141, 130, 128]], [[254, 252, ...",5733923290087424,True,2017-04-02 16:54:23.439260,yoga


## Data generators from disk (no need to store in RAM)

In [5]:
# A class label is the archive member name with the ".csv" text removed
class_labels = [member.replace(".csv", "") for member in zf.namelist()]

class_labels[:5]

['fence', 'yoga', 'horse', 'sandwich', 'cat']

In [0]:
# loop through file eternally
def get_eternal_csv_generator(fn, debug=False):
  """Yield (line[1], line[5]) for every data row of one CSV in the archive, forever.

  The file is read from the module-level ``zf`` archive. With the column layout
  shown earlier in this notebook, fields 1 and 5 are ``drawing`` and ``word``.
  When the end of the file is reached it is reopened and read again from the
  first data row, so the generator never finishes on its own.

  Args:
    fn: name of the CSV member inside ``zf``.
    debug: when true, print a message each time a full pass over the file ends.

  Yields:
    A 2-tuple of strings taken from columns 1 and 5 of the current row.

  Raises:
    ValueError: if the file has no data rows after the header (restarting
      would otherwise loop forever without yielding anything).
  """
  while True:
    rows_seen = 0
    # newline="" is the csv module's recommended way to open text for csv.reader
    with io.TextIOWrapper(zf.open(fn), newline="") as f:
      f.readline()  # skip header
      for line in csv.reader(f, delimiter=',', quotechar='"'):
        rows_seen += 1
        yield line[1], line[5]
    if rows_seen == 0:
      raise ValueError("{} contains no data rows".format(fn))
    # Reported once per completed pass, not once per row
    if debug:
      print(fn, "is done, starting from the beginning...")

In [0]:
# test: pull a bounded number of rows so the cell finishes on its own
# (the generator itself never ends). Raise the bound above the number of rows
# in the file to see the debug message printed when the file wraps around.
N_CSV_TEST_ROWS = 10000
gen = get_eternal_csv_generator("yoga.csv", debug=True)
for _ in tqdm.tqdm(range(N_CSV_TEST_ROWS)):
    line = next(gen)

In [0]:
def raw_batch_generator(batch_size, debug=False):
    """Yield endless batches of raw rows drawn from randomly chosen archive files.

    One eternal CSV generator is created per name in ``zf.namelist()``. For each
    batch, ``batch_size`` generator indices are sampled with ``np.random.randint``
    (repeats allowed) and one item is taken from each selected generator, in
    the sampled order.

    Args:
        batch_size: number of items per yielded batch.
        debug: forwarded to the per-file generators; also prints the sampled
            indices and the selected generators for every batch.

    Yields:
        A list with ``batch_size`` items as produced by ``get_eternal_csv_generator``.
    """
    per_file_streams = np.array(
        [get_eternal_csv_generator(member, debug) for member in zf.namelist()]
    )
    stream_count = len(per_file_streams)
    while True:
        random_indices = np.random.randint(0, stream_count, size=batch_size)
        chosen_streams = per_file_streams[random_indices]
        if debug:
            print("Random_indices = {} \n".format(random_indices))
            print()
            print("Random_generators = {} \n".format(chosen_streams))
            print()
        yield [next(stream) for stream in chosen_streams]

In [0]:
# test: draw a bounded number of batches so the cell finishes on its own
# (the batch generator never ends) and occasionally print one batch's labels
N_RAW_TEST_BATCHES = 20000
raw_batches = raw_batch_generator(batch_size=32, debug=False)
for _ in tqdm.tqdm(range(N_RAW_TEST_BATCHES)):
    batch = next(raw_batches)
    if np.random.random() < 0.0001:
        print([e[1] for e in batch])

## Image generator

In [0]:
# Adapted from https://www.kaggle.com/jpmiller/image-based-cnn

# Side length (pixels) of the images requested by the test cell at the end of this section
IMG_SIZE = 224


def draw_it(strokes, img_size=224):
    """Rasterize a drawing, given as a string of strokes, into a square uint8 array.

    ``strokes`` is parsed with ``ast.literal_eval``. Each stroke is indexed as
    ``stroke[0]`` / ``stroke[1]``, two parallel coordinate sequences (x and y).
    Consecutive points are joined by a line of width 5 and value 0 on a
    256x256 "P"-mode canvas initialised to 255; a stroke with a single point
    draws nothing.

    Args:
        strokes: string literal of the stroke list.
        img_size: side length the 256x256 canvas is resized to.

    Returns:
        ``np.ndarray`` of dtype ``uint8`` built from the resized image.
    """
    canvas = Image.new("P", (256, 256), color=255)
    pen = ImageDraw.Draw(canvas)
    for stroke in ast.literal_eval(strokes):
        xs = stroke[0]
        for start in range(len(xs) - 1):
            end = start + 1
            segment = [xs[start], stroke[1][start], xs[end], stroke[1][end]]
            pen.line(segment, fill=0, width=5)
    resized = canvas.resize((img_size, img_size))
    return np.array(resized, dtype=np.uint8)

In [0]:
def images_and_labels_generator(batch_size, img_size=224):
    """Yield endless (images, labels) batches built from the raw batch generator.

    Every item of a raw batch is indexed as ``item[0]`` (passed to ``draw_it``
    together with ``img_size``) and ``item[1]`` (kept as the label).

    Args:
        batch_size: number of items requested from ``raw_batch_generator``.
        img_size: image side length forwarded to ``draw_it``.

    Yields:
        A tuple ``(batch_images, batch_labels)``: the rendered images stacked
        along a new first axis, and the list of labels in the same order.
    """
    for raw_batch in raw_batch_generator(batch_size):
        rendered = [draw_it(item[0], img_size) for item in raw_batch]
        labels = [item[1] for item in raw_batch]
        yield np.stack(rendered, axis=0), labels

In [0]:
# test: render a bounded number of batches so the cell finishes on its own
# (the image generator never ends) and occasionally show sample 10 of a batch
N_IMAGE_TEST_BATCHES = 300
image_batches = images_and_labels_generator(batch_size=32, img_size=IMG_SIZE)
for _ in tqdm.tqdm(range(N_IMAGE_TEST_BATCHES)):
    batch = next(image_batches)
    if np.random.random() < 0.01:
        plt.imshow(batch[0][10, :, :])
        plt.title(batch[1][10])
        plt.show()