**CNTK 210: Part A - COCO Data Loader**  
In this tutorial, we will download the COCO Image dataset to be used for image captioning.

The CNTK 210 tutorial is divided into 2 parts:  
Part A: downloads the images and annotations and then builds CNTK .map and .ctf reader files.
Part B: builds a Sequence-to-sequence model that we then train and test, using the downloaded images and annotations (captions).

To keep this tutorial relatively fast running, we only use a small subset of the COCO dataset - the 2017 validation set, which consists of 5,000 images and roughly 25,000 annotations (captions).

In [1]:
import json
import os
import zipfile
import urllib.request

def unzip_files(url, to_dir, fn_to_test, zipfn):
    """Download a zip archive (if needed) and extract it into ``to_dir``.

    Nothing happens when ``fn_to_test`` already exists; that file acts as the
    marker that a previous run already extracted the archive.

    Args:
        url: Location the archive is downloaded from when ``zipfn`` is absent.
        to_dir: Directory the archive is extracted into (created if missing,
            including any missing parent directories).
        fn_to_test: Path whose existence means "already extracted".
        zipfn: Local path of the archive; it is deleted after extraction.
    """
    if os.path.exists(fn_to_test):
        return

    if not os.path.exists(zipfn):
        print("downloading zip file from", url + " ...")
        # Download under a temporary name and rename on success, so that an
        # interrupted transfer never leaves a truncated file at ``zipfn``
        # that a later run would mistake for a complete archive.
        partial = zipfn + ".part"
        try:
            urllib.request.urlretrieve(url, partial)
            os.replace(partial, zipfn)
        finally:
            if os.path.exists(partial):
                os.remove(partial)

    print("unzipping files from", zipfn + " ...")

    # makedirs (rather than mkdir) also handles nested target directories.
    os.makedirs(to_dir, exist_ok=True)

    with zipfile.ZipFile(zipfn, "r") as zipper:
        zipper.extractall(to_dir)

    os.remove(zipfn)
    print("unzip completed")
        
# download & unzip image files if not present (into "images" subdir)
# NOTE: the marker paths are built with os.path.join so the "already extracted"
# check also works on platforms whose path separator is not a backslash.
unzip_files("http://images.cocodataset.org/zips/val2017.zip", "images",
            os.path.join("images", "val2017", "000000000139.jpg"), "images.zip")

# download & unzip annotation files if not present (into "annotations" subdir)
unzip_files("http://images.cocodataset.org/annotations/annotations_trainval2017.zip", "annotations",
            os.path.join("annotations", "annotations", "captions_val2017.json"), "annotations.zip")

downloading zip file from http://images.cocodataset.org/zips/val2017.zip ...
unzipping files from images.zip ...
unzip completed
downloading zip file from http://images.cocodataset.org/annotations/annotations_trainval2017.zip ...
unzipping files from annotations.zip ...
unzip completed


In [4]:
def process_annotations(captions_path=None):
    """Read the COCO captions JSON and build train/test entries plus a word dictionary.

    Each caption is lower-cased, stripped, has a single trailing period
    removed (a trailing ".." is left alone) and is split on whitespace.
    Captions that contain no words after this normalization are skipped.

    Args:
        captions_path: Path of the captions JSON file. Defaults to
            ``annotations/annotations/captions_val2017.json`` relative to the
            current directory.

    Returns:
        A list ``[train, test, udict]`` where ``train`` and ``test`` are lists
        of entry dicts with the keys ``"imageid"``, ``"label"`` (last word of
        the caption), ``"fn"`` (image file name) and ``"caption"`` (list of
        words), split 80% / 20% after a random shuffle, and ``udict`` maps
        every distinct caption word, plus ``"<s>"`` and ``"</s>"``, to an
        integer id.
    """
    import random

    if captions_path is None:
        captions_path = os.path.join("annotations", "annotations", "captions_val2017.json")

    # JSON text is UTF-8; do not rely on the platform's default encoding.
    with open(captions_path, encoding="utf-8") as json_text:
        d = json.load(json_text)

    print("Processing annotation data...")

    images = d["images"]
    annotations = d["annotations"]

    # we need to access image data by the image id
    image_dict = {image["id"]: image["file_name"] for image in images}

    print("len(images)=", len(images), "len(annotations)=", len(annotations))
    entries = []
    captions = []
    skipped = 0

    for a in annotations:
        strcap = a["caption"].lower().strip()

        if strcap.endswith(".") and not strcap.endswith(".."):
            strcap = strcap[:-1]

        caption = strcap.split()
        if not caption:
            # An empty caption has no last word to use as the label.
            skipped += 1
            continue

        # extend() appends in place; rebuilding the list with "+" for every
        # annotation would copy all previously collected words each time.
        captions.extend(caption)

        entries.append({
            "imageid": a["image_id"],
            "label": caption[-1],
            "fn": image_dict[a["image_id"]],
            "caption": caption,
        })

    if skipped:
        print("skipped", skipped, "empty captions")

    # build dictionary of all words used in captions
    captions += ["<s>", "</s>"]   # add strings for sequence start and sequence end
    ucaptions = set(captions)
    # Assign ids in sorted order so that the same input always yields the
    # same word ids (set iteration order of strings can vary between runs).
    udict = {word: index for index, word in enumerate(sorted(ucaptions))}

    print("len(captions)=", len(captions), "len(ucaptions)=", len(ucaptions))

    # shuffle entries so we can split into train and test
    random.shuffle(entries)

    # split the data: 80% for training, and 20% for test
    train_count = int(.8 * len(entries))
    train = entries[:train_count]
    test = entries[train_count:]

    print("annotation processing completed")
    return [train, test, udict]

# Parse the captions file: "train" and "test" receive the two splits of the
# entries, and "udict" receives the word -> id dictionary.
train, test, udict = process_annotations()

Processing annotation data...
len(images)= 5000 len(annotations)= 25014
len(captions)= 261108 len(ucaptions)= 8222
annotation processing completed


In [6]:
def write_dict(data, dir, fn):
    """Pickle ``data`` to the file ``fn`` inside the directory ``dir``.

    Args:
        data: Object to pickle; its ``len()`` is reported in the status line.
        dir: Target directory (created if missing, including parents).
        fn: File name, relative to ``dir``.
    """
    import pickle

    os.makedirs(dir, exist_ok=True)
    # os.path.join picks the right separator for the platform; a hard-coded
    # backslash would end up as part of the file name on POSIX systems.
    fn = os.path.join(dir, fn)

    with open(fn, "wb") as myfile:
        pickle.dump(data, myfile)

    print(len(data), "dict entries written to fn=", fn)
        
def _write_map_file(path, data, udict):
    """Write one "<image path> TAB <label id> TAB <sequence number>" line per entry."""
    # newline="" disables newline translation, so the explicit "\r\n" is
    # written as-is on every platform (text mode on Windows would otherwise
    # expand it to "\r\r\n").
    with open(path, "w", encoding="utf-8", newline="") as text_file:
        for seqnum, e in enumerate(data):
            label_id = udict[e["label"]]
            # NOTE(review): the download step checks for images under
            # images/val2017/, while this writes images/<file name> -
            # confirm which location the consumer of the map file expects.
            image_path = os.path.join("images", e["fn"])
            text_file.write(image_path + "\t" + str(label_id) + "\t" + str(seqnum) + "\r\n")


def _write_ctf_file(path, data, udict):
    """Write one line per caption word, framed by "<s>" and "</s>", keyed by sequence number."""
    with open(path, "w", encoding="utf-8", newline="") as text_file:
        for seqnum, e in enumerate(data):
            for c in ["<s>"] + e["caption"] + ["</s>"]:
                word_id = udict[c]
                text_file.write(str(seqnum) + "\t| label " + str(word_id) + ":1\t| # " + c + "\r\n")


def write_files(data, udict, dir, fnbase):
    """Write the ``<fnbase>.map`` and ``<fnbase>.ctf`` files for ``data`` into ``dir``.

    Args:
        data: List of entry dicts with the keys ``"label"``, ``"fn"`` and
            ``"caption"`` (a list of words).
        udict: Mapping from word to integer id; must contain every label,
            every caption word, ``"<s>"`` and ``"</s>"``.
        dir: Target directory (created if missing, including parents).
        fnbase: File name without extension, shared by both output files.

    Raises:
        KeyError: If a label or caption word is missing from ``udict``.
    """
    os.makedirs(dir, exist_ok=True)

    # write MAP file
    fn = os.path.join(dir, fnbase + ".map")
    _write_map_file(fn, data, udict)
    print(len(data), "lines written to fn=", fn)

    # write CTF file
    fn = os.path.join(dir, fnbase + ".ctf")
    _write_ctf_file(fn, data, udict)
    # The reported number is the count of sequences (entries), as before.
    print(len(data), "lines written to fn=", fn)
        
# persist the word -> id dictionary (goes into the "data" subdir)
write_dict(data=udict, dir="data", fn="mc.dict")

# emit the .map/.ctf pair for the train split, then for the test split
# (both also go into the "data" subdir)
write_files(data=train, udict=udict, dir="data", fnbase="mc_train")
write_files(data=test, udict=udict, dir="data", fnbase="mc_test")

print("Data loading tutorial completed.")

8222 dict entries written to fn= data\mc.dict
20011 lines written to fn= data\mc_train.map
20011 lines written to fn= data\mc_train.ctf
5003 lines written to fn= data\mc_test.map
5003 lines written to fn= data\mc_test.ctf
Data loading tutorial completed.
