# This notebook demonstrates the code used to prepare a dataset from the Berkeley Deep Drive (BDD100K) master dataset.

In [None]:
import numpy as np 
from PIL import Image
import cv2
from glob import glob
import matplotlib.pyplot as plt
%matplotlib inline
import json
import pandas as pd
from tqdm import tqdm_notebook

In [None]:
# NOTE(review): this import would normally live in the imports cell at the top of the notebook.
import os
# Show the kernel's current working directory.
os.getcwd()

In [None]:
# File paths of all training images.
# glob() returns its matches in arbitrary, filesystem-dependent order, so the list is
# sorted to make positional lookups such as file_path[45] reproducible across runs.
file_path = sorted(glob('/Volumes/Part 1/UC Berkeley/Capstone Project/Datasets/Berkeley Deep Drive/bdd100k/images/100k/train/*'))

# Number of training images found
len(file_path)

In [None]:
# Opening one sample image (index 45 of the file list) from the train image data
img = Image.open(file_path[45])

In [None]:
# Render the sample image inline
img

In [None]:
# Array dimensions of the sample image
# (presumably height, width, channels — confirm against the cell output)
np.array(img).shape

In [None]:
# Object categories to keep; extract_labels ignores annotations of any other category
classes =['person','rider','bike','bus','car','motor','truck','train']

### <center>Bounding Box parameters schematic diagram:</center>
![](bbox.jpg)

In [None]:
# Column-oriented accumulator for the annotations: one list per CSV column
# (image file name, bounding box corner coordinates, object class).
labels_csv = {
    column: []
    for column in ('file', 'x_min', 'y_min', 'x_max', 'y_max', 'class')
}

In [None]:
# function to save bbox values for each object

def extract_labels(file_name, obj):
    """
    Record one annotated object in the module-level ``labels_csv`` accumulator.

    Objects whose category is not listed in ``classes`` are ignored. For the
    others, the file name, the four ``box2d`` corner values and the category
    are appended to the matching ``labels_csv`` columns.

    Every value is read before anything is appended, so a malformed object
    (missing ``box2d`` or one of its keys) raises ``KeyError`` without leaving
    the columns at unequal lengths.

    Parameters
    ----------
    file_name : image file name stored in the ``file`` column.
    obj : mapping with a ``category`` key and, for kept categories, a
        ``box2d`` mapping holding ``x1``, ``y1``, ``x2`` and ``y2``.

    Returns
    -------
    The object's category, whether or not the object was recorded.

    Raises
    ------
    KeyError
        If ``category`` is missing, or if a kept object lacks ``box2d`` or any
        of its four coordinates.
    """
    category = obj['category']
    if category not in classes:
        return category

    bbox = obj['box2d']
    # Build the complete row first: a KeyError raised here must not leave a
    # half-written row behind in the accumulator.
    row = {
        'file': file_name,
        'x_min': bbox['x1'],
        'y_min': bbox['y1'],
        'x_max': bbox['x2'],
        'y_max': bbox['y2'],
        'class': category,
    }
    for column, value in row.items():
        labels_csv[column].append(value)
    return category

In [None]:
def convert_labels(label_path,
                   output_path='/Volumes/Part 1/UC Berkeley/Capstone Project/Code/labels_new.csv'):
    """
    Extract the annotations of a label JSON file and store them as a CSV file.

    The JSON file is expected to hold a list of frames, each with a ``name``
    and a list of ``labels``. Every label is passed to ``extract_labels``,
    which fills the module-level ``labels_csv`` columns; those columns are then
    written to ``output_path`` through a pandas DataFrame (without the index).

    The ``labels_csv`` columns are emptied before the conversion starts, so
    calling this function again rewrites the same CSV instead of appending a
    second copy of every annotation.

    Parameters
    ----------
    label_path : path of the label JSON file to read.
    output_path : path of the CSV file to write. Defaults to the location the
        rest of this notebook reads the labels from.
    """
    print('Conversion Started.......')

    # Reset the shared accumulator: without this, re-running the cell would
    # duplicate every row in the CSV.
    for column in labels_csv.values():
        column.clear()

    with open(label_path, encoding='utf-8') as f:
        frames = json.load(f)

    for frame in frames:
        image_name = frame['name']
        # A frame without a 'labels' entry is treated as having no objects.
        for obj in frame.get('labels') or ():
            extract_labels(image_name, obj)

    df = pd.DataFrame(labels_csv)
    df.to_csv(output_path, index=False)
    print('Conversion done!!!!!!!!!')

In [None]:
# Path of the training-set label JSON file that will be converted to CSV
jlabels = '/Volumes/Part 1/UC Berkeley/Capstone Project/Datasets/Berkeley Deep Drive/labels/bdd100k_labels_images_train.json'

In [None]:
# Run the conversion; this writes the labels_new.csv file read in the next cell
convert_labels(jlabels)

In [None]:
# Load the generated CSV back into a DataFrame and inspect the first few annotation rows
labels = pd.read_csv('/Volumes/Part 1/UC Berkeley/Capstone Project/Code/labels_new.csv')
labels.head()

In [None]:
# Number of annotation rows per class (non-null count of every other column)
labels.groupby(['class']).count()

In [None]:
# Number of annotation rows per (image, class) pair, first 10 pairs only
labels.groupby(['file','class']).count().head(10)

In [None]:
# Class names in label order; the integer id of a class is its 1-based position here.
class_names = ('person', 'rider', 'bike', 'bus', 'car', 'motor', 'truck', 'train')

# Lookup table from class name to integer id (person -> 1, ..., train -> 8).
cls = {name: class_id for class_id, name in enumerate(class_names, start=1)}

In [None]:
# Replace the class names with their integer ids.
# Values that are not keys of `cls` (in particular the ids produced by an earlier
# run of this cell) are kept unchanged, so executing the cell again is a no-op
# instead of turning the whole column into NaN as a plain .map(cls) would.
labels['class'] = labels['class'].map(lambda name: cls.get(name, name))

In [None]:
# Check the result of the class-name to id mapping on the first few rows
labels.head()

In [None]:
# Names of every column except the image file name: the per-object values kept for each box.
lb_cols = labels.columns.tolist()
lb_cols.remove('file')

In [None]:
# Distinct image file names in the annotation table (each file appears once,
# however many objects it contains)
fn = labels['file'].unique()

In [None]:
# Columns that make up one bounding-box row
lb_cols

In [None]:
# Number of distinct annotated images
len(fn)

In [None]:
# Preprocess the box coordinates for the first 1000 annotated images of the 100K train dataset.
# boxes[i] holds one row per object of image fn[i], with the columns listed in lb_cols.
# The annotation table is grouped by file once (keeping the original row order) instead
# of being re-scanned from top to bottom for each of the 1000 images.
first_files = fn[:1000]
by_file = labels[labels['file'].isin(first_files)].groupby('file', sort=False)
boxes = [by_file.get_group(name)[lb_cols].values for name in first_files]

In [None]:
# Number of images for which boxes were collected
len(boxes)

In [None]:
# bounding box parameters for an image having 6 objects
# (one row per object; the columns follow lb_cols)
boxes[0]

In [None]:
# bounding box parameters for an image having 30+ objects
# (one row per object; the columns follow lb_cols)
boxes[3]

In [None]:
# save the boxes and filenames as numpy arrays for later use
# Images contain different numbers of objects, so `boxes` is a ragged list. Recent NumPy
# versions refuse to turn ragged input into an array implicitly, so the per-image arrays
# are stored in an explicit 1-D object array (one element per image).
# Object arrays are pickled on save: load them with np.load(..., allow_pickle=True).
boxes_arr = np.empty(len(boxes), dtype=object)
for idx, image_boxes in enumerate(boxes):
    boxes_arr[idx] = image_boxes
np.save('/Volumes/Part 1/UC Berkeley/Capstone Project/Code/boxes_new.npy', boxes_arr)
# The unique file names are held in `fn` (no `fns` variable is ever defined); the slice
# keeps the same 1000 images the boxes were built from.
np.save('/Volumes/Part 1/UC Berkeley/Capstone Project/Code/fns.npy', fn[:1000])