In [4]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [7]:
import os
import tempfile
import zipfile

import numpy as np
import pandas as pd
import wget
from pycocotools.coco import COCO

In [6]:
def load_data(path=None):
    """Download and extract COCO 2017 if needed, then write the train/val CSV files.

    An archive that is already present in `path` is not downloaded again, and an
    archive whose target directory already exists is not extracted again. The two
    CSV files are regenerated on every call.

    Args:
        path (str, optional): Directory that holds the COCO 2017 data.
            Defaults to None, which uses `<tempfile.gettempdir()>/coco`.

    Returns:
        tuple: `(train_csv, val_csv, path)` - path to the train CSV file, path to
        the validation CSV file, and the data directory that was used.
    """
    if path is None:
        path = os.path.join(tempfile.gettempdir(), "coco")
    # exist_ok avoids failing when the directory already exists (or appears
    # between an existence check and the creation).
    os.makedirs(path, exist_ok=True)

    train_csv = os.path.join(path, 'train_coco.csv')
    val_csv = os.path.join(path, 'val_coco.csv')

    train_dir = os.path.join(path, 'train2017')
    val_dir = os.path.join(path, 'val2017')
    annotations_dir = os.path.join(path, 'annotations')

    # (archive file name, download URL, progress message)
    downloads = [
        ('train2017.zip',
         'http://images.cocodataset.org/zips/train2017.zip',
         'Downloading training images'),
        ('val2017.zip',
         'http://images.cocodataset.org/zips/val2017.zip',
         'Downloading val images'),
        ('annotations_trainval2017.zip',
         'http://images.cocodataset.org/annotations/annotations_trainval2017.zip',
         'Downloading annotation info'),
    ]
    for archive_name, url, message in downloads:
        if not os.path.exists(os.path.join(path, archive_name)):
            print(message)
            wget.download(url, path)

    if not os.path.exists(train_dir):
        print('Extracting training images')
        with zipfile.ZipFile(os.path.join(path, 'train2017.zip'), 'r') as zip_file:
            zip_file.extractall(path)

    if not os.path.exists(val_dir):
        print('Extracting validation images')
        with zipfile.ZipFile(os.path.join(path, 'val2017.zip'), 'r') as zip_file:
            zip_file.extractall(path)

    # An empty annotations directory is treated the same as a missing one.
    if not (os.path.exists(annotations_dir) and len(os.listdir(annotations_dir)) != 0):
        print('Extracting annotations')
        with zipfile.ZipFile(os.path.join(path, 'annotations_trainval2017.zip'), 'r') as zip_file:
            zip_file.extractall(path)

    # Generating train and val csv files
    train_annFile = os.path.join(annotations_dir, 'instances_train2017.json')
    val_annFile = os.path.join(annotations_dir, 'instances_val2017.json')

    generate_csv(path, train_annFile, 'train2017', train_csv)
    generate_csv(path, val_annFile, 'val2017', val_csv)

    return train_csv, val_csv, path

In [5]:
# Module-level lookup tables, filled by coco_labels_consecutive():
#   classes             : category name     -> consecutive label
#   coco_labels         : consecutive label -> original COCO category id
#   coco_labels_inverse : COCO category id  -> consecutive label
classes, coco_labels, coco_labels_inverse = {}, {}, {}

def coco_labels_consecutive(coco):
    """Fill the module-level label tables from the categories of `coco`.

    Categories are visited in ascending order of their `id`; each one is given the
    next consecutive label, taken from the current size of `classes`.

    Args:
        coco: Object providing `getCatIds()` and `loadCats(ids)`, where each loaded
            category is a mapping with `id` and `name` keys.
    """
    categories = coco.loadCats(coco.getCatIds())
    categories.sort(key=lambda category: category['id'])
    for category in categories:
        # Read the size once: `classes` is only modified by the last assignment.
        next_label = len(classes)
        original_id = category['id']
        coco_labels[next_label] = original_id
        coco_labels_inverse[original_id] = next_label
        classes[category['name']] = next_label

def coco_dict_empty():
    """Empty the three module-level label lookup tables in place."""
    for table in (coco_labels, coco_labels_inverse, classes):
        table.clear()

        
def category_2_category_consecutive(item):
    """Translate original category ids into consecutive labels.

    Args:
        item: Iterable of category ids that are keys of `coco_labels_inverse`.

    Returns:
        list: The consecutive label for each id, in input order.
    """
    consecutive = []
    for category_id in item:
        consecutive.append(coco_labels_inverse[category_id])
    return consecutive


def format_bb(item):
    """Convert `[x, y, width, height]` boxes into corner-coordinate columns.

    Boxes whose width or height is below 1 are dropped.

    Args:
        item: Iterable of boxes, each indexable as `[x, y, width, height]`.

    Returns:
        list: `[x1s, y1s, x2s, y2s]` - four parallel lists holding, for every kept
        box, its top-left corner `(x, y)` and opposite corner `(x + width, y + height)`.
    """
    x1s = []
    y1s = []
    x2s = []
    y2s = []
    for bb in item:
        # Check width (index 2) AND height (index 3); the same rule as format_bb_xywh.
        if bb[2] < 1 or bb[3] < 1:
            continue
        x1s.append(bb[0])
        y1s.append(bb[1])
        x2s.append(bb[0] + bb[2])
        y2s.append(bb[1] + bb[3])
    return [x1s, y1s, x2s, y2s]

def format_bb_xywh(item):
    """Split `[x, y, width, height]` boxes into four parallel columns.

    Boxes whose width or height is below 1 are left out.

    Args:
        item: Iterable of boxes, each indexable as `[x, y, width, height]`.

    Returns:
        list: `[xs, ys, ws, hs]` with one entry per kept box, in input order.
    """
    kept_boxes = [bb for bb in item if not (bb[2] < 1 or bb[3] < 1)]
    return [[bb[field] for bb in kept_boxes] for field in range(4)]

def generate_csv(path, annFile, datatype, dest_csv):
    """Write one CSV row per annotated image of a COCO instances file.

    Crowd annotations are excluded. Boxes whose width or height is below 1 are
    dropped together with their label, so the `label`, `x`, `y`, `w` and `h` lists
    of a row always have the same length and line up index by index. Images that
    are left without any box are skipped.

    As a side effect, the module-level label tables are cleared and rebuilt from
    the categories of `annFile`.

    Args:
        path (str): Root directory of the dataset; joined in front of every image
            file name.
        annFile (str): Path to the COCO instances JSON file.
        datatype (str): Image sub-directory under `path` (e.g. 'train2017').
        dest_csv (str): Path of the CSV file to write. Columns are
            `image, image_id, label, x, y, w, h`.
    """
    coco = COCO(annFile)

    # The label tables are shared module state: rebuild them for this file.
    coco_dict_empty()
    coco_labels_consecutive(coco)

    row_list = []
    for image_id in coco.getImgIds():
        annotation_ids = coco.getAnnIds(imgIds=image_id, iscrowd=False)
        if not annotation_ids:
            # Image without (non-crowd) annotations: nothing to export.
            continue

        # Filter whole annotations so that a dropped box also drops its label.
        kept = [ann for ann in coco.loadAnns(annotation_ids)
                if not (ann['bbox'][2] < 1 or ann['bbox'][3] < 1)]
        if not kept:
            # Every box was degenerate; an empty row would carry no training signal.
            continue

        boxes = [ann['bbox'] for ann in kept]
        file_name = coco.loadImgs(image_id)[0]['file_name']
        row_list.append({
            'image': os.path.join(path, datatype, file_name),
            'image_id': image_id,
            'label': category_2_category_consecutive([ann['category_id'] for ann in kept]),
            'x': [bb[0] for bb in boxes],
            'y': [bb[1] for bb in boxes],
            'w': [bb[2] for bb in boxes],
            'h': [bb[3] for bb in boxes],
        })

    df = pd.DataFrame(row_list, columns=['image', 'image_id', 'label', 'x', 'y', 'w', 'h'])
    df.to_csv(dest_csv, index=False)
    

### Generate the train and val CSV files from the MS COCO 2017 dataset.

In [8]:
# Download/extract COCO 2017 into this directory if needed and (re)generate both CSV files.
# load_data returns the two CSV paths and the data directory it used.
train_csv, val_csv, path = load_data(path='/home/ubuntu/coco2017/MSCOCO2017/')

loading annotations into memory...
Done (t=18.23s)
creating index...
index created!
loading annotations into memory...
Done (t=0.45s)
creating index...
index created!


### Please note: the category ids have been remapped from the non-contiguous COCO ids (1-90) to contiguous ids 0-79.


The CSV above contains the entire COCO 2017 training dataset. For the initial stages of implementing RetinaNet, it makes sense to start with a subset of the COCO 2017 dataset. The rest of this section shows how to generate such a subset.

In [10]:
# Location of the local COCO 2017 copy and the split whose annotations are inspected.
dataDir='/home/ubuntu/coco2017/MSCOCO2017/'
dataType='train2017'
# dataType='val2017'
# dataDir already ends with '/', so the formatted path contains a double slash.
annFile='{}/annotations/instances_{}.json'.format(dataDir,dataType)

In [11]:
# Show the resolved annotation file path.
annFile

'/home/ubuntu/coco2017/MSCOCO2017//annotations/instances_train2017.json'

In [12]:
# Load the annotation file and build the COCO index.
coco=COCO(annFile)

loading annotations into memory...
Done (t=15.36s)
creating index...
index created!


In [13]:
# List every category record (supercategory, id, name) of the annotation file.
cats = coco.loadCats(coco.getCatIds())
cats

[{'supercategory': 'person', 'id': 1, 'name': 'person'},
 {'supercategory': 'vehicle', 'id': 2, 'name': 'bicycle'},
 {'supercategory': 'vehicle', 'id': 3, 'name': 'car'},
 {'supercategory': 'vehicle', 'id': 4, 'name': 'motorcycle'},
 {'supercategory': 'vehicle', 'id': 5, 'name': 'airplane'},
 {'supercategory': 'vehicle', 'id': 6, 'name': 'bus'},
 {'supercategory': 'vehicle', 'id': 7, 'name': 'train'},
 {'supercategory': 'vehicle', 'id': 8, 'name': 'truck'},
 {'supercategory': 'vehicle', 'id': 9, 'name': 'boat'},
 {'supercategory': 'outdoor', 'id': 10, 'name': 'traffic light'},
 {'supercategory': 'outdoor', 'id': 11, 'name': 'fire hydrant'},
 {'supercategory': 'outdoor', 'id': 13, 'name': 'stop sign'},
 {'supercategory': 'outdoor', 'id': 14, 'name': 'parking meter'},
 {'supercategory': 'outdoor', 'id': 15, 'name': 'bench'},
 {'supercategory': 'animal', 'id': 16, 'name': 'bird'},
 {'supercategory': 'animal', 'id': 17, 'name': 'cat'},
 {'supercategory': 'animal', 'id': 18, 'name': 'dog'},

Rather than using all 80 categories, we keep only the first 26 (contiguous ids 0-25). The COCO dataset is filtered down to the images whose labels all belong to these categories.

In [14]:
# Contiguous label ids to keep: 0 through 25 inclusive (26 ids).
arg_set = set(range(26))

In [17]:
# Load the generated training CSV (use the commented line instead for the validation split).
df = pd.read_csv('/home/ubuntu/coco2017/MSCOCO2017/train_coco.csv')
# df = pd.read_csv('/home/ubuntu/coco2017/MSCOCO2017/val_coco.csv')

In [18]:
# Preview the first rows of the loaded CSV.
df.head()

Unnamed: 0,image,image_id,label,x,y,w,h
0,/home/ubuntu/coco2017/MSCOCO2017/train2017/000...,391895,"[3, 0, 0, 1]","[359.17, 339.88, 471.64, 486.01]","[146.17, 22.16, 172.82, 183.31]","[112.45, 153.88, 35.92, 30.63]","[213.57, 300.73, 48.1, 34.98]"
1,/home/ubuntu/coco2017/MSCOCO2017/train2017/000...,522418,"[0, 43, 55, 71]","[382.48, 234.06, 0.0, 305.45]","[0.0, 406.61, 316.04, 172.05]","[256.8, 219.94, 406.65, 57.36]","[474.31, 42.67, 157.49, 77.3]"
2,/home/ubuntu/coco2017/MSCOCO2017/train2017/000...,184613,"[19, 19, 19, 19, 19, 25, 19, 0, 0, 0, 0, 0, 0,...","[239.9, 285.08, 452.49, 296.96, 461.07, 103.44...","[111.16, 85.99, 85.93, 68.01, 75.92, 31.01, 59...","[128.62, 170.23, 47.51, 22.39, 34.46, 154.79, ...","[99.71, 64.48, 22.82, 21.06, 14.19, 135.15, 12..."
3,/home/ubuntu/coco2017/MSCOCO2017/train2017/000...,318219,"[0, 0, 64, 64, 66, 66, 64, 62, 62, 62, 64]","[40.65, 0.0, 455.98, 405.44, 314.26, 276.83, 3...","[38.8, 0.0, 436.73, 594.41, 479.43, 241.89, 19...","[418.38, 198.92, 58.57, 76.59, 156.41, 159.37,...","[601.2, 631.35, 36.36, 40.23, 90.4, 165.01, 48..."
4,/home/ubuntu/coco2017/MSCOCO2017/train2017/000...,554625,"[62, 62, 0, 0, 0, 62, 64, 64, 66, 66, 66, 62, ...","[380.74, 339.13, 2.87, 105.2, 84.03, 359.55, 3...","[112.85, 32.99, 73.18, 36.83, 1.57, 68.79, 580...","[40.62, 32.99, 143.5, 112.06, 110.94, 46.02, 6...","[248.82, 175.5, 532.38, 227.87, 64.99, 202.79,..."


In [19]:
from ast import literal_eval

# The CSV stores each label list as text: parse it and keep the distinct labels per image.
df['label_set'] = df['label'].map(lambda raw_labels: set(literal_eval(raw_labels)))

In [21]:
# True for images whose labels all belong to arg_set.
df['subset_select_arg'] = df['label_set'].apply(lambda labels: labels <= arg_set)

In [22]:
# Keep only the rows flagged as lying entirely inside the selected categories.
keep_mask = df['subset_select_arg'] == True
df_new = df[keep_mask]

In [24]:
# Number of images whose labels all fall inside arg_set.
len(df_new)

22144

In [None]:
# Save the subset; the relative path is resolved against the current working directory.
# df_new still carries the helper columns label_set and subset_select_arg, so they are written too.
df_new.to_csv('train_first_26categories.csv', index=False)

In [25]:
# df_new.to_csv('val_first_26categories.csv', index=False)

### Please note: the category ids have been remapped from the non-contiguous COCO ids (1-90) to contiguous ids 0-79.