In [3]:
import cv2
import matplotlib.pyplot as plt

# import sys
# sys.path.append('/darknet/python/')
# import darknet as dn

In [4]:
from ctypes import *
import math
import random

def sample(probs):
    s = sum(probs)
    probs = [a/s for a in probs]
    r = random.uniform(0, 1)
    for i in range(len(probs)):
        r = r - probs[i]
        if r <= 0:
            return i
    return len(probs)-1

def c_array(ctype, values):
    arr = (ctype*len(values))()
    arr[:] = values
    return arr

class BOX(Structure):
    _fields_ = [("x", c_float),
                ("y", c_float),
                ("w", c_float),
                ("h", c_float)]

class DETECTION(Structure):
    _fields_ = [("bbox", BOX),
                ("classes", c_int),
                ("prob", POINTER(c_float)),
                ("mask", POINTER(c_float)),
                ("objectness", c_float),
                ("sort_class", c_int)]


class IMAGE(Structure):
    _fields_ = [("w", c_int),
                ("h", c_int),
                ("c", c_int),
                ("data", POINTER(c_float))]

class METADATA(Structure):
    _fields_ = [("classes", c_int),
                ("names", POINTER(c_char_p))]



#lib = CDLL("/home/pjreddie/documents/darknet/libdarknet.so", RTLD_GLOBAL)
lib = CDLL("/Users/shrey/Downloads/EmoryUniversity/Semester4/ImageAnalysis/Codes/Assignment5/darknet/libdarknet.so", RTLD_GLOBAL)
lib.network_width.argtypes = [c_void_p]
lib.network_width.restype = c_int
lib.network_height.argtypes = [c_void_p]
lib.network_height.restype = c_int

predict = lib.network_predict
predict.argtypes = [c_void_p, POINTER(c_float)]
predict.restype = POINTER(c_float)

set_gpu = lib.cuda_set_device
set_gpu.argtypes = [c_int]

make_image = lib.make_image
make_image.argtypes = [c_int, c_int, c_int]
make_image.restype = IMAGE

get_network_boxes = lib.get_network_boxes
get_network_boxes.argtypes = [c_void_p, c_int, c_int, c_float, c_float, POINTER(c_int), c_int, POINTER(c_int)]
get_network_boxes.restype = POINTER(DETECTION)

make_network_boxes = lib.make_network_boxes
make_network_boxes.argtypes = [c_void_p]
make_network_boxes.restype = POINTER(DETECTION)

free_detections = lib.free_detections
free_detections.argtypes = [POINTER(DETECTION), c_int]

free_ptrs = lib.free_ptrs
free_ptrs.argtypes = [POINTER(c_void_p), c_int]

network_predict = lib.network_predict
network_predict.argtypes = [c_void_p, POINTER(c_float)]

reset_rnn = lib.reset_rnn
reset_rnn.argtypes = [c_void_p]

load_net = lib.load_network
load_net.argtypes = [c_char_p, c_char_p, c_int]
load_net.restype = c_void_p

do_nms_obj = lib.do_nms_obj
do_nms_obj.argtypes = [POINTER(DETECTION), c_int, c_int, c_float]

do_nms_sort = lib.do_nms_sort
do_nms_sort.argtypes = [POINTER(DETECTION), c_int, c_int, c_float]

free_image = lib.free_image
free_image.argtypes = [IMAGE]

letterbox_image = lib.letterbox_image
letterbox_image.argtypes = [IMAGE, c_int, c_int]
letterbox_image.restype = IMAGE

load_meta = lib.get_metadata
lib.get_metadata.argtypes = [c_char_p]
lib.get_metadata.restype = METADATA

load_image = lib.load_image_color
load_image.argtypes = [c_char_p, c_int, c_int]
load_image.restype = IMAGE

rgbgr_image = lib.rgbgr_image
rgbgr_image.argtypes = [IMAGE]

predict_image = lib.network_predict_image
predict_image.argtypes = [c_void_p, IMAGE]
predict_image.restype = POINTER(c_float)

def classify(net, meta, im):
    out = predict_image(net, im)
    res = []
    for i in range(meta.classes):
        res.append((meta.names[i], out[i]))
    res = sorted(res, key=lambda x: -x[1])
    return res

def detect(net, meta, image, thresh=.05, hier_thresh=.05, nms=.45):
    im = load_image(image, 0, 0)
    num = c_int(0)
    pnum = pointer(num)
    predict_image(net, im)
    dets = get_network_boxes(net, im.w, im.h, thresh, hier_thresh, None, 0, pnum)
    num = pnum[0]
    if (nms): do_nms_obj(dets, num, meta.classes, nms);

    res = []
    for j in range(num):
        for i in range(meta.classes):
            if dets[j].prob[i] > 0:
                b = dets[j].bbox
                res.append([meta.names[i], dets[j].prob[i], (b.x, b.y, b.w, b.h)])
    res = sorted(res, key=lambda x: -x[1])
    free_image(im)
    free_detections(dets, num)
    return res


In [5]:
import xml.etree.ElementTree as ET
import pickle
import os
from os import listdir, getcwd
from os.path import join

sets=[('2012', 'train'), ('2012', 'val'), ('2007', 'train'), ('2007', 'val'), ('2007', 'test')]
classes = ["aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor"]

def convert(size, box):
    dw = 1./(size[0])
    dh = 1./(size[1])
    x = (box[0] + box[1])/2.0 - 1
    y = (box[2] + box[3])/2.0 - 1
    w = box[1] - box[0]
    h = box[3] - box[2]
    x = x*dw
    w = w*dw
    y = y*dh
    h = h*dh
    return (x,y,w,h)

## Function to get the ground truths. This function is same as the convert_annotations() as given in voc_labels.py in the darknet/scripts/
def ground_truth(year, image_id):
    in_file = open('/Users/shrey/Downloads/EmoryUniversity/Semester4/ImageAnalysis/Codes/Assignment5/VOC/VOCdevkit/VOC%s/Annotations/%s.xml'%(year, image_id))
    out_file = open('/Users/shrey/Downloads/EmoryUniversity/Semester4/ImageAnalysis/Codes/Assignment5/VOC/VOCdevkit/VOC%s/labels/%s.txt'%(year, image_id), 'w')
    tree=ET.parse(in_file)
    root = tree.getroot()
    size = root.find('size')
    w = int(size.find('width').text)
    h = int(size.find('height').text)

    for obj in root.iter('object'):
        difficult = obj.find('difficult').text
        cls = obj.find('name').text
        if cls not in classes or int(difficult)==1:
            continue
        cls_id = classes.index(cls)
        xmlbox = obj.find('bndbox')
        b = (float(xmlbox.find('xmin').text), float(xmlbox.find('xmax').text), float(xmlbox.find('ymin').text), float(xmlbox.find('ymax').text))
        bb = convert((w,h), b)
        return b, cls_id


In [6]:
## Function to get intersection over union (IOU)
def IOU(boxA, boxB):
    # determine the (x1,y1) and (x2,y2)-coordinates 
    bA_x1, bA_y1, bA_w, bA_h = boxA
    bB_x1, bB_y1, bB_w, bB_h = boxB
    
    bA_x1 = bA_x1
    bA_y1 = (bA_y1 + bA_h)
    bA_x2 = (bA_x1 + bA_w)
    bA_y2 = bA_y1 
    
    bB_x1 = bB_x1 
    bB_y1 = (bB_y1 + bA_h) 
    bB_x2 = (bB_x1 + bB_w) 
    bB_y2 = bB_y1 
    
    xA = max(bA_x1, bB_x1)
    yA = max(bA_y1, bB_y1)
    xB = min(bA_x2, bB_x2)
    yB = min(bA_y2, bB_y2)
    
    # compute the area of intersection rectangle
    intersection_area = max(0, xB - xA + 1) * max(0, yA - yB + 1)
#     print("InterArea: ",interArea)
    
    # compute the area of both the prediction and ground-truth
    bA_Area = (bA_x2 - bA_x1 + 1) * (bA_y1 - bA_y2 + 1)
    bB_Area = (bB_x2 - bB_x1 + 1) * (bB_y1 - bB_y2 + 1)
    
    # compute the intersection over union 
#     print("Denom: ", abs(float(boxAArea + boxBArea - interArea)))
    iou = (intersection_area / abs(float(bA_Area + bB_Area - intersection_area))) - 1.0

    return iou

In [7]:
## Function to calculate the Mean Average Precision (MAP)

conf_dict0 = {'TP': 0, 'FP': 0}
conf_dict1 = {'TP': 0, 'FP': 0}
conf_dict2 = {'TP': 0, 'FP': 0}
conf_dict3 = {'TP': 0, 'FP': 0}
conf_dict4 = {'TP': 0, 'FP': 0}

conf_dict5 = {'TP': 0, 'FP': 0}
conf_dict6 = {'TP': 0, 'FP': 0}
conf_dict7 = {'TP': 0, 'FP': 0}
conf_dict8 = {'TP': 0, 'FP': 0}
conf_dict9 = {'TP': 0, 'FP': 0}

conf_dict10 = {'TP': 0, 'FP': 0}
conf_dict11 = {'TP': 0, 'FP': 0}
conf_dict12 = {'TP': 0, 'FP': 0}
conf_dict13 = {'TP': 0, 'FP': 0}
conf_dict14 = {'TP': 0, 'FP': 0}

conf_dict15 = {'TP': 0, 'FP': 0}
conf_dict16 = {'TP': 0, 'FP': 0}
conf_dict17 = {'TP': 0, 'FP': 0}
conf_dict18 = {'TP': 0, 'FP': 0}
conf_dict19 = {'TP': 0, 'FP': 0}


def MAP(classval, iouval):
    if classval == 0 and iouval > 0.5:
        conf_dict0['TP'] += 1
    else:
        conf_dict0['FP'] += 1

    if classval == 1 and iouval > 0.5:
        conf_dict1['TP'] += 1
    else:
        conf_dict1['FP'] += 1

    if classval == 2 and iouval > 0.5:
        conf_dict2['TP'] += 1
    else:
        conf_dict2['FP'] += 1

    if classval == 3 and iouval > 0.5:
        conf_dict3['TP'] += 1
    else:
        conf_dict3['FP'] += 1

    if classval == 4 and iouval > 0.5:
        conf_dict4['TP'] += 1
    else:
        conf_dict4['FP'] += 1

    if classval == 5 and iouval > 0.5:
        conf_dict5['TP'] += 1
    else:
        conf_dict5['FP'] += 1

    if classval == 6 and iouval > 0.5:
        conf_dict6['TP'] += 1
    else:
        conf_dict6['FP'] += 1

    if classval == 7 and iouval > 0.5:
        conf_dict7['TP'] += 1
    else:
        conf_dict7['FP'] += 1

    if classval == 8 and iouval > 0.5:
        conf_dict8['TP'] += 1
    else:
        conf_dict8['FP'] += 1

    if classval == 9 and iouval > 0.5:
        conf_dict9['TP'] += 1
    else:
        conf_dict9['FP'] += 1

    if classval == 10 and iouval > 0.5:
        conf_dict10['TP'] += 1
    else:
        conf_dict10['FP'] += 1

    if classval == 11 and iouval > 0.5:
        conf_dict11['TP'] += 1
    else:
        conf_dict11['FP'] += 1

    if classval == 12 and iouval > 0.5:
        conf_dict12['TP'] += 1
    else:
        conf_dict12['FP'] += 1

    if classval == 13 and iouval > 0.5:
        conf_dict13['TP'] += 1
    else:
        conf_dict13['FP'] += 1

    if classval == 14 and iouval > 0.5:
        conf_dict14['TP'] += 1
    else:
        conf_dict14['FP'] += 1

    if classval == 15 and iouval > 0.5:
        conf_dict15['TP'] += 1
    else:
        conf_dict15['FP'] += 1

    if classval == 16 and iouval > 0.5:
        conf_dict16['TP'] += 1
    else:
        conf_dict16['FP'] += 1

    if classval == 17 and iouval > 0.5:
        conf_dict17['TP'] += 1
    else:
        conf_dict17['FP'] += 1

    if classval == 18 and iouval > 0.5:
        conf_dict18['TP'] += 1
    else:
        conf_dict18['FP'] += 1

    if classval == 19 and iouval > 0.5:
        conf_dict19['TP'] += 1
    else:
        conf_dict19['FP'] += 1
    
def final_map():
    mapval0 = conf_dict0['TP']/(conf_dict0['TP'] + conf_dict0['FP'])
    mapval1 = conf_dict1['TP']/(conf_dict1['TP'] + conf_dict1['FP'])
    mapval2 = conf_dict2['TP']/(conf_dict2['TP'] + conf_dict2['FP'])
    mapval3 = conf_dict3['TP']/(conf_dict3['TP'] + conf_dict3['FP'])
    mapval4 = conf_dict4['TP']/(conf_dict4['TP'] + conf_dict4['FP'])
    mapval5 = conf_dict5['TP']/(conf_dict5['TP'] + conf_dict5['FP'])
    
    mapval6 = conf_dict6['TP']/(conf_dict6['TP'] + conf_dict6['FP'])
    mapval7 = conf_dict7['TP']/(conf_dict7['TP'] + conf_dict7['FP'])
    mapval8 = conf_dict8['TP']/(conf_dict8['TP'] + conf_dict8['FP'])
    mapval9 = conf_dict9['TP']/(conf_dict9['TP'] + conf_dict9['FP'])
    mapval10 = conf_dict10['TP']/(conf_dict10['TP'] + conf_dict10['FP'])
    mapval11 = conf_dict11['TP']/(conf_dict11['TP'] + conf_dict11['FP'])
 

    mapval12 = conf_dict12['TP']/(conf_dict12['TP'] + conf_dict12['FP'])
    mapval13 = conf_dict13['TP']/(conf_dict13['TP'] + conf_dict13['FP'])
    mapval14 = conf_dict14['TP']/(conf_dict14['TP'] + conf_dict14['FP'])
    mapval15 = conf_dict15['TP']/(conf_dict15['TP'] + conf_dict15['FP'])
    mapval16 = conf_dict16['TP']/(conf_dict16['TP'] + conf_dict16['FP'])
    mapval17 = conf_dict17['TP']/(conf_dict17['TP'] + conf_dict17['FP'])

    mapval18 = conf_dict18['TP']/(conf_dict18['TP'] + conf_dict18['FP'])
    mapval19 = conf_dict19['TP']/(conf_dict19['TP'] + conf_dict19['FP'])
    
    total_map = (1/20.0)*(mapval0 + mapval1 + mapval2 + mapval3 + mapval4 + mapval5 + mapval6 + mapval7 + mapval8 + mapval9 + mapval10 + mapval11 + 
          mapval12 + mapval13 + mapval14 + mapval15 + mapval16 + mapval17 + mapval18 + mapval19)
    
    print("MAP value: ",total_map)



In [8]:
## Read all the files provided in the validation set.
## I used the validation set in the Layout rather than in Main because I was computing locally than on the server. 
## For bigger val.txt file simply subsitute Layout with Main
val_files = open('/Users/shrey/Downloads/EmoryUniversity/Semester4/ImageAnalysis/Codes/Assignment5/VOC/VOCdevkit/VOC2012/ImageSets/Layout/val.txt', 'r')

val_lines = val_files.readlines()

val_filename = []
for val_word in val_lines:
    val_first = val_word.split()[0]
    val_filename.append(val_first)

## Total no. of files
print(len(val_filename))

425


In [None]:
import os 
import time

val_jpg = os.listdir("/Users/shrey/Downloads/EmoryUniversity/Semester4/ImageAnalysis/Codes/Assignment5/VOC/VOCdevkit/VOC2012/JPEGImages")   # imagine you're one directory above test dir

time_yolo = 0

for fname in val_filename:
    
    for jpg_word in val_jpg:
        jpg_fname = jpg_word.split('.')[0]
        if fname == jpg_fname:
            time1 = time.time()
            ## This is where the yolo and tiny yolo config and wieghts can be added to run the 2 models
            net = load_net(b"/Users/shrey/Downloads/EmoryUniversity/Semester4/ImageAnalysis/Codes/Assignment5/darknet/cfg/yolov2-tiny-voc.cfg",
            b"/Users/shrey/Downloads/EmoryUniversity/Semester4/ImageAnalysis/Codes/Assignment5/darknet/cfg/yolov2-tiny-voc.weights", 0)
            meta = load_meta(b'/Users/shrey/Downloads/EmoryUniversity/Semester4/ImageAnalysis/Codes/Assignment5/darknet/cfg/voc.data')
            fname_jpg = fname + '.jpg'
            fname_bytes = str.encode(fname_jpg)
            ## Using the name from the validation text to find those images in the JPEG folder.
            res = detect(net, meta, b'/Users/shrey/Downloads/EmoryUniversity/Semester4/ImageAnalysis/Codes/Assignment5/VOC/VOCdevkit/VOC2012/JPEGImages/'+fname_bytes)
            
            print(len(res))
            if res:
                for i in range(len(res)):
                    time2 = time.time() 

                    tot_time = time2 - time1
                    time_yolo = time_yolo + tot_time
                    bbox_pred = res[i][2]
                    classname = res[i][0]
                    classname = classname.decode()
                    classval = classes.index(classname)
                    print("Prediction Results: ", res)

                    year = '2012'
                    bbox_ground, cval = ground_truth(year, fname)
                    print("Bounding Box spec: ", bbox_ground)
                    print("Class: ", classval)

                    iouval = IOU(bbox_pred, bbox_ground)
                    print("IOU value: ",iouval)

                    MAP(classval, iouval)
                    print("Time for this prediction: ", tot_time)
                    print("\n")
final_map()                
print("Total time for all the predictions: ",time_yolo)

4
Prediction Results:  [[b'person', 0.3448275327682495, (266.8377685546875, 207.32054138183594, 51.50077438354492, 154.1045684814453)], [b'aeroplane', 0.27170172333717346, (211.26304626464844, 152.7179718017578, 495.48394775390625, 424.8694152832031)], [b'person', 0.09255173802375793, (261.11590576171875, 229.721435546875, 27.625225067138672, 134.03323364257812)], [b'tvmonitor', 0.05083267763257027, (359.6932067871094, 192.49838256835938, 279.7826843261719, 363.6111755371094)]]
Bounding Box spec:  (229.0, 318.0, 99.0, 311.0)
Class:  14
IOU value:  0.026704124646061755
Time for this prediction:  1.069962978363037


Prediction Results:  [[b'person', 0.3448275327682495, (266.8377685546875, 207.32054138183594, 51.50077438354492, 154.1045684814453)], [b'aeroplane', 0.27170172333717346, (211.26304626464844, 152.7179718017578, 495.48394775390625, 424.8694152832031)], [b'person', 0.09255173802375793, (261.11590576171875, 229.721435546875, 27.625225067138672, 134.03323364257812)], [b'tvmonitor'

In [8]:
## MAP and Execution time for tiny YOLO
final_map()                
print("Total time for all the predictions: ",time_yolo)

MAP value:  0.0013808521830615467
Total time for all the predictions:  4220.852967739105


In [None]:
import os 
import time

val_jpg = os.listdir("/Users/shrey/Downloads/EmoryUniversity/Semester4/ImageAnalysis/Codes/Assignment5/VOC/VOCdevkit/VOC2012/JPEGImages")   # imagine you're one directory above test dir

time_yolo = 0

for fname in val_filename:
    
    for jpg_word in val_jpg:
        jpg_fname = jpg_word.split('.')[0]
        if fname == jpg_fname:
            time1 = time.time()
            ## This is where the yolo and tiny yolo config and wieghts can be added to run the 2 models
            net = load_net(b"/Users/shrey/Downloads/EmoryUniversity/Semester4/ImageAnalysis/Codes/Assignment5/darknet/cfg/yolov2-voc.cfg",
            b"/Users/shrey/Downloads/EmoryUniversity/Semester4/ImageAnalysis/Codes/Assignment5/darknet/cfg/yolov2-voc.weights", 0)
            meta = load_meta(b'/Users/shrey/Downloads/EmoryUniversity/Semester4/ImageAnalysis/Codes/Assignment5/darknet/cfg/voc.data')
            fname_jpg = fname + '.jpg'
            fname_bytes = str.encode(fname_jpg)
            ## Using the name from the validation text to find those images in the JPEG folder.
            res = detect(net, meta, b'/Users/shrey/Downloads/EmoryUniversity/Semester4/ImageAnalysis/Codes/Assignment5/VOC/VOCdevkit/VOC2012/JPEGImages/'+fname_bytes)
            
            print(len(res))
            if res:
                for i in range(len(res)):
                    time2 = time.time() 

                    tot_time = time2 - time1
                    time_yolo = time_yolo + tot_time
                    bbox_pred = res[i][2]
                    classname = res[i][0]
                    classname = classname.decode()
                    classval = classes.index(classname)
                    print("Prediction Results: ", res)

                    year = '2012'
                    bbox_ground, cval = ground_truth(year, fname)
                    print("Bounding Box spec: ", bbox_ground)
                    print("Class: ", classval)

                    iouval = IOU(bbox_pred, bbox_ground)
                    print("IOU value: ",iouval)

                    MAP(classval, iouval)
                    print("Time for this prediction: ", tot_time)
                    print("\n")
final_map()                
print("Total time for all the predictions: ",time_yolo)

1
Prediction Results:  [[b'person', 0.681635320186615, (273.5574645996094, 221.5738525390625, 60.34749984741211, 175.97703552246094)]]
Bounding Box spec:  (229.0, 318.0, 99.0, 311.0)
Class:  14
IOU value:  0.030790302260035096
Time for this prediction:  4.325082063674927


1
Prediction Results:  [[b'person', 0.9280586242675781, (188.58665466308594, 315.8254089355469, 249.35763549804688, 375.7948913574219)]]
Bounding Box spec:  (54.0, 331.0, 151.0, 500.0)
Class:  14
IOU value:  1.3334270900436902
Time for this prediction:  4.280951976776123


1
Prediction Results:  [[b'person', 0.9280586242675781, (188.58665466308594, 315.8254089355469, 249.35763549804688, 375.7948913574219)]]
Bounding Box spec:  (54.0, 331.0, 151.0, 500.0)
Class:  14
IOU value:  1.3334270900436902
Time for this prediction:  4.224079132080078


4
Prediction Results:  [[b'person', 0.9013368487358093, (166.82907104492188, 212.34046936035156, 93.42710876464844, 262.8658752441406)], [b'person', 0.8708795309066772, (84.86627