## Now we need to download and prepare the dataset for Darknet YoloV2

### 1.0) Download the dataset (Openlogo)

In [None]:
import os
# Fetch and unpack the OpenLogo archive only when the 'openlogo' directory
# is not already present, so re-running this cell skips the download.
if not os.path.isdir('openlogo'):
    !aws s3 cp s3://spock.cloud/datasets/logo/openlogo.tar .
    !tar -xf openlogo.tar
    # The archive is removed after the extraction command has run.
    !rm -f openlogo.tar

### 1.1) Now, let's convert the dataset annotation to the Darknet format

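For object detection, Darknet expects one .txt file per image, containing one line per object in the following format (all four box values are divided by the image width/height, and x/y refer to the **center** of the bounding box):  
**class_id normalized_bbox_center_x normalized_bbox_center_y normalized_bbox_width normalized_bbox_height**  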
Ex.  
261 0.563636 0.441270 0.154545 0.180952

After that, we need to create three more files:
  - **train.txt** and **test.txt**: each contains a list of relative paths (one per line) to the images of its split
  - **label.names**: list (one name per line) of all the class names

In [None]:
!mkdir -p input/data input/assets

In [None]:
from shutil import copyfile

def process_images(X, y, target):
    """Copy each sample's image into input/data/<target> and write its Darknet label file.

    Args:
        X: iterable of [filename, x, y, w, h] rows. The four numbers are written
            to the label file unchanged, in that order.
        y: iterable of integer class ids, aligned with X.
        target: name of the sub-directory of input/data to populate
            (e.g. 'training' or 'testing').
    """
    target_dir = os.path.join('input', 'data', target)
    # copyfile() does not create missing directories, so make sure the
    # destination exists before the first copy.
    os.makedirs(target_dir, exist_ok=True)

    for sample, class_id in zip(X, y):
        filename, bbox_x, bbox_y, bbox_w, bbox_h = sample
        copyfile(os.path.join('openlogo', 'JPEGImages', filename),
                 os.path.join(target_dir, filename))

        # Swap only the extension. str.replace('jpg', 'txt') would also rewrite
        # 'jpg' inside the base name and, for a name such as 'logo.JPG', would
        # leave it unchanged so the label text overwrote the copied image.
        label_name = os.path.splitext(filename)[0] + '.txt'
        with open(os.path.join(target_dir, label_name), 'w') as f:
            f.write("%d %f %f %f %f\n" % (class_id, bbox_x, bbox_y, bbox_w, bbox_h))

In [None]:
%%time
import glob
from lxml import etree
from sklearn.model_selection import train_test_split

classes = {}  # class name -> integer id, assigned in order of first appearance
X = []        # rows of [filename, x_center, y_center, width, height], normalized by image size
y = []        # class id for the row at the same index in X

# glob() returns files in filesystem order; sorting keeps the class ids
# (and therefore label.names) identical from one run to the next.
annotations = sorted(glob.glob("openlogo/Annotations/*.xml"))
for f in annotations:
    try:
        # Parsing happens inside the try so that one malformed XML file is
        # reported and skipped instead of aborting the whole conversion.
        annotation = etree.parse(f)

        # Each XPath takes the first match in the document, so only the first
        # <object> of an annotation is converted.
        filename = annotation.xpath('//filename/text()')[0].strip()
        name = annotation.xpath('//object/name/text()')[0].strip()
        width = int(annotation.xpath('//width/text()')[0].strip())
        height = int(annotation.xpath('//height/text()')[0].strip())
        xmin = int(annotation.xpath('//xmin/text()')[0].strip())
        ymin = int(annotation.xpath('//ymin/text()')[0].strip())
        xmax = int(annotation.xpath('//xmax/text()')[0].strip())
        ymax = int(annotation.xpath('//ymax/text()')[0].strip())

        path = os.path.join('openlogo', 'JPEGImages', filename)
        if not os.path.isfile(path):
            raise Exception("%s is not a valid file" % path)
        if width <= 0 or height <= 0:
            raise Exception("invalid image size %dx%d" % (width, height))

        # Register the class only after the sample has been validated, so that
        # label.names never lists a class that has no usable sample.
        class_id = classes.setdefault(name, len(classes))

        # Darknet labels describe a box by its center point plus its size,
        # each divided by the image dimensions.
        x_center = (xmin + xmax) / 2 / width
        y_center = (ymin + ymax) / 2 / height
        box_width = (xmax - xmin) / width
        box_height = (ymax - ymin) / height

        X.append([filename, x_center, y_center, box_width, box_height])
        y.append(class_id)
    except Exception as e:
        # Best effort: report the offending annotation and keep going.
        print("Skipping %s: %s" % (f, e))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
%%time
# Materialize the training split under input/data/training.
process_images(X=X_train, y=y_train, target='training')

In [None]:
%%time
# Materialize the testing split under input/data/testing.
process_images(X=X_test, y=y_test, target='testing')

### 1.2) Now, we can create the labels file and the descriptors for training/testing

In [None]:
# Write one class name per line, ordered by class id, so the position of a
# name in label.names matches the id used in the per-image label files.
with open('input/assets/label.names', 'w') as label_file:
    for class_name in sorted(classes, key=classes.get):
        label_file.write("%s\n" % class_name)

!find input/data/training -name *.jpg -printf '%P\n' > input/data/training/train.txt
!find input/data/testing -name *.jpg -printf '%P\n' > input/data/testing/test.txt
!wc -l input/data/training/train.txt
!wc -l input/data/testing/test.txt
!wc -l input/assets/label.names

## 2.0) Upload the dataset to S3
Here we'll use **sync** because it uploads files in parallel.  
**CHANGE the variable 'bucket' to another bucket, if you wish**

In [None]:
import sagemaker

# Key prefix under which the dataset is stored in the bucket.
prefix = 'logo'

# Use the default bucket of the current SageMaker session as upload target.
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()
print(bucket)

In [None]:
!aws s3 sync --quiet input/data/training s3://$bucket/$prefix/input/training

In [None]:
!aws s3 sync --quiet input/data/testing s3://$bucket/$prefix/input/testing