# Data Collection & Wrangling

---

In [None]:
# Module check/download

# Bind pip's command-line entry point as ``pip_main``. The top-level
# location is tried first; if that import is unavailable, fall back to the
# private ``pip._internal`` location.
try:
    from pip import main as pip_main
except ImportError:
    # Only a failed import should trigger the fallback; a bare ``except``
    # would also swallow KeyboardInterrupt and unrelated errors.
    from pip._internal import main as pip_main

def import_or_install(package):
    """Import a module, installing it with pip first when it is missing.

    Args:
        package: Indexable pair ``(import_name, pip_name)``: the name used
            in ``import`` statements and the distribution name handed to
            ``pip install`` (they differ for e.g. ``cv2`` /
            ``opencv-python``).

    Raises:
        subprocess.CalledProcessError: If ``pip install`` exits with a
            non-zero status.
        ImportError: If the module still cannot be imported after a
            successful install.
    """
    import importlib
    import subprocess
    import sys

    import_name = package[0]
    pip_name = package[1]

    try:
        importlib.import_module(import_name)

    except ImportError:
        print("\n"+import_name+": "+"installing...")
        # Run pip as a subprocess of the current interpreter so that a
        # failed install raises instead of being reported as "ok".
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', pip_name])
        # Make the running interpreter notice the freshly installed files,
        # then confirm that the module is really importable.
        importlib.invalidate_caches()
        importlib.import_module(import_name)

    print(import_name+": "+"ok")

# (import name, pip distribution name) for every module the notebook needs.
modules = [
    ('glob', 'glob'),
    ('matplotlib', 'matplotlib'),
    ('natsort', 'natsort'),
    ('numpy', 'numpy'),
    ('cv2', 'opencv-python'),
]

# Check each module in turn, installing the ones that are missing.
for module in modules:
    import_or_install(module)

In [None]:
# Relevant libraries

import xml.etree.ElementTree as ET
import matplotlib.pyplot as plt
import numpy as np
import zipfile
import natsort
import urllib
import glob
import json
import cv2
import os

In [None]:
# Data retrieval (Desktop will be used as workspace)

def get_data():
    """Download the gun and knife archives and unpack them on the Desktop.

    Every archive is fetched from the project's GitHub repository into
    ``~/Desktop`` and extracted there. Progress messages are printed for
    each download and extraction. As before, the process working directory
    is switched to ``~/Desktop`` before extracting.
    """
    # ``import urllib`` alone does not guarantee that the ``request``
    # submodule is loaded, so import it explicitly before using it.
    import urllib.request

    path = "https://github.com/luisra/waldo/raw/master/data_collection/sources/"

    # (archive name, human-readable label used in the progress messages)
    archives = [
        ("WeaponS.zip", 'gun images'),
        ("WeaponF.zip", 'more gun images'),
        ("WeaponS_bbox.zip", 'gun bounding boxes'),
        ("train2014.zip", 'knife images'),
        ("val2014.zip", 'more knife images'),
        ("annotations_trainval2014.zip", 'knife bounding boxes'),
    ]

    desktop = os.path.expanduser("~/Desktop/")

    for archive, label in archives:

        target = os.path.expanduser("~/Desktop/{}".format(archive))

        print ("\ndownloading {}...".format(label))
        urllib.request.urlretrieve(path + archive, filename=target)
        print ("download complete!")

        os.chdir(desktop)
        print ("\nextracting {}...".format(label))
        # The context manager closes the archive even if extraction fails.
        with zipfile.ZipFile(target, "r") as zipf:
            zipf.extractall(desktop)
        print ("extract complete!")

get_data()

## Guns

In [None]:
# Data prep for RetinaNet

# Gun images and their annotation files, each list in natural sort order.
image_pattern = os.path.expanduser("~/Desktop/WeaponS/*.jpg")
files = natsort.natsorted(glob.glob(image_pattern))

annotation_pattern = os.path.expanduser("~/Desktop/WeaponS_bbox/*.xml")
annotations = natsort.natsorted(glob.glob(annotation_pattern))

### https://stackoverflow.com/questions/53317592/reading-pascal-voc-annotations-in-python ###
def read_content(xml_file: str):
    """Read the bounding boxes stored in one annotation XML file.

    Args:
        xml_file: Path of the XML file (anything ``ET.parse`` accepts).

    Returns:
        A tuple ``(filename, boxes)``. ``filename`` is the text of the
        root's ``<filename>`` element, or ``None`` when that element is
        absent. ``boxes`` holds one ``[xmin, ymin, xmax, ymax]`` list per
        ``<object>`` element; an object without a ``<bndbox>`` child
        contributes ``[None, None, None, None]``. An annotation without any
        ``<object>`` yields an empty list.
    """
    tree = ET.parse(xml_file)
    root = tree.getroot()

    # Read the file name once, outside the object loop, so that it is
    # defined even when the annotation contains no objects.
    filename = root.findtext('filename')
    list_with_all_boxes = []

    for boxes in root.iter('object'):

        ymin, xmin, ymax, xmax = None, None, None, None

        # If an object carries several <bndbox> children, the last one wins.
        for box in boxes.findall("bndbox"):
            ymin = int(box.find("ymin").text)
            xmin = int(box.find("xmin").text)
            ymax = int(box.find("ymax").text)
            xmax = int(box.find("xmax").text)

        list_with_all_boxes.append([xmin, ymin, xmax, ymax])

    return filename, list_with_all_boxes
###

# One record per bounding box: [image path, xmin, ymin, xmax, ymax, 'gun'].
# The image at position k is paired with the annotation file at position k.
store = []

for k, image_path in enumerate(files):

    name, boxes = read_content(annotations[k])

    for box in boxes:
        store.append([image_path] + list(box) + ['gun'])

In [None]:
# Data augmentation

### https://www.kdnuggets.com/2018/09/data-augmentation-bounding-boxes-image-transforms.html/2 ###
def HorizontalFlip(img, bboxes):
    """Mirror a bounding box about the vertical centre line of an image.

    Args:
        img: Image array; only ``img.shape[:2]`` (height, width) is read.
        bboxes: NumPy array whose entries 0 and 2 are the x-coordinates of
            the box (the caller in this file passes
            ``[xmin, ymin, xmax, ymax]``). It is modified in place.

    Returns:
        The same ``bboxes`` array, with entries 0 and 2 mirrored and kept
        in min/max order. Entries 1 and 3 are left untouched.
    """
    # (width / 2, height / 2), repeated so it lines up with the box layout.
    centre_xy = np.asarray(img.shape[:2][::-1]) / 2
    centre = np.concatenate((centre_xy, centre_xy))

    x_cols = [0, 2]

    # Reflect both x-coordinates about the (integer-truncated) centre.
    shift = centre[x_cols].astype(np.int32) - bboxes[x_cols].astype(np.int32)
    bboxes[x_cols] += 2 * shift

    # The reflection swaps the roles of min and max; shifting each
    # coordinate by the box width puts them back in order.
    span = abs(bboxes[0] - bboxes[2])
    bboxes[0] -= span
    bboxes[2] += span

    return bboxes
###

# Build one flipped record per original record. The box is mirrored using
# the dimensions of the original image, and the record's path has
# 'WeaponS' replaced by 'WeaponF'.
storef = []

for original in store:

    img = cv2.imread(original[0], 1)
    bboxes = HorizontalFlip(img, np.array(original[1:5]))

    flipped_path = original[0].replace('WeaponS', 'WeaponF')
    storef.append([flipped_path, bboxes[0], bboxes[1], bboxes[2], bboxes[3], 'gun'])

store.extend(storef)

In [None]:
# Sanity check

# Draw the first gun record's box on its image and show it until a key
# is pressed.
first_gun = store[0]
img = cv2.imread(first_gun[0], 1)

top_left = (first_gun[1], first_gun[2])
bottom_right = (first_gun[3], first_gun[4])
cv2.rectangle(img, top_left, bottom_right, (0, 255, 0), 2)

cv2.imshow('image', img)
cv2.waitKey(0)

In [None]:
# Save in RetinaNet format

# Write one comma-separated line per record. The context manager closes
# the file even if a write fails part-way through.
with open(os.path.expanduser("~/Desktop/pistolData.txt"),'w') as file:

    for rec in store:
        file.write(", ".join(str(x) for x in rec)+'\n')

## Knives

In [None]:
# Data prep for RetinaNet

dataDir = os.path.expanduser("~/Desktop")
dataTypes = ['train2014', 'val2014']
output = []

# Only annotations with this category id are kept; every kept box is
# written out with the label 'knife'.
knife_category_id = 49

for dataType in dataTypes:

    # The image id is the 12 characters that precede ".jpg" in each file name.
    store = [jpgfile[-16:-4]
             for jpgfile in glob.iglob(os.path.join(dataDir, dataType, "*.jpg"))]
    store_int = [int(image_id) for image_id in store]

    # A set makes the per-annotation membership test O(1) instead of
    # scanning the whole list of ids for every annotation.
    wanted_ids = set(store_int)

    annFile = '{}/annotations/instances_{}.json'.format(dataDir, dataType)

    with open(annFile) as f:
        annotations = json.load(f)['annotations']

    # image id -> list of knife boxes, each stored as [x, y, width, height].
    d = {}

    for ann in annotations:

        img_id = ann['image_id']

        if ann['category_id'] == knife_category_id and img_id in wanted_ids:
            d.setdefault(img_id, []).append(ann['bbox'])

    path = "{}/{}/COCO_{}_".format(dataDir, dataType, dataType)

    for id_text, img_id in zip(store, store_int):

        file = path + id_text + '.jpg'

        # Images without any knife annotation are skipped instead of
        # raising KeyError.
        for bbox in d.get(img_id, []):

            # Convert [x, y, width, height] to integer corner coordinates.
            xmin = int(bbox[0])
            ymin = int(bbox[1])
            xmax = int(bbox[0] + bbox[2])
            ymax = int(bbox[1] + bbox[3])

            output.append([file, xmin, ymin, xmax, ymax, 'knife'])

In [None]:
# Sanity check

# Draw the first knife record's box on its image and show it until a key
# is pressed.
first_knife = output[0]
img = cv2.imread(first_knife[0], 1)

top_left = (first_knife[1], first_knife[2])
bottom_right = (first_knife[3], first_knife[4])
cv2.rectangle(img, top_left, bottom_right, (0, 255, 0), 2)

cv2.imshow('image', img)
cv2.waitKey(0)

In [None]:
# Save in RetinaNet format

# Write one comma-separated line per record. The context manager closes
# the file even if a write fails part-way through.
with open(os.path.expanduser("~/Desktop/knifeData.txt"),'w') as file:

    for rec in output:
        file.write(", ".join(str(x) for x in rec)+'\n')