Adapted from the [Open Images EDA](https://www.kaggle.com/jpmiller/open-images-eda/data) notebook.

In [None]:
# Install hvplot into the active conda environment (-y answers the confirmation
# prompt); the notebook imports `hvplot.pandas` and uses the `.hvplot` accessor for charts.
! conda install -y hvplot

In [None]:
#importing Libs
import os
import glob
from pathlib import Path
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import hvplot.pandas

# Images and Annotations
The `excerpt-from-openimages-2020-train` data set is taken from another dataset, but it has a larger number of objects per image and more object classes.

Reading the data and showing a few images with their bounding boxes and segmentation masks, along with the labels.

In [None]:
# Dataset root plus the sorted listings of image and mask files.
data_dir = Path('../input/excerpt-from-openimages-2020-train')
im_list = sorted(data_dir.glob('train_00_part/*.jpg'))
mask_list = sorted(data_dir.glob('train-masks-f/*.png'))

# Lookup table from label IDs to readable names; `names=` supplies the two
# column names for the class-description CSV.
names_ = ['LabelName', 'Label']
labels = pd.read_csv(data_dir / 'class-descriptions-boxable.csv', names=names_)

# Load all bounding boxes, keep only the rows whose image is in `im_list`,
# restrict to the columns of interest and attach the readable label.
im_ids = [im.stem for im in im_list]
cols = ['ImageID', 'LabelName', 'XMin', 'YMin', 'XMax', 'YMax']
boxes_df = pd.read_csv(data_dir / 'oidv6-train-annotations-bbox.csv')
boxes_df = (
    boxes_df.loc[boxes_df.ImageID.isin(im_ids), cols]
            .merge(labels, how='left', on='LabelName')
)
boxes_df

Below we use OpenCV to draw a rectangle and a text label on each object.

In [None]:
# Annotate and plot: overlay ground-truth boxes, labels and segmentation masks
# on six sample images and show them in a 3-row x 2-column grid.
n_rows, n_cols = 3, 2          # plt.subplot takes (nrows, ncols, index)
box_color = (0, 255, 0)
coords = ['XMin', 'YMin', 'XMax', 'YMax']
plt.figure(figsize=(20,30))

for i, im_file in enumerate(im_list[9:15], start=1):
    img = cv2.imread(str(im_file))
    if img is None:
        # cv2.imread signals an unreadable file by returning None, not by raising.
        print(f"Skipping unreadable image: {im_file}")
        continue
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)   # OpenCV loads BGR; matplotlib shows RGB
    h0, w0 = img.shape[:2]

    # Add boxes: coordinates are multiplied by the image width/height to get
    # pixel positions (assumes the CSV stores them normalized to 0-1).
    df = boxes_df.query('ImageID == @im_file.stem').copy()
    df[coords] = (df[coords].to_numpy() * np.tile([w0, h0], 2)).astype(int)

    for tup in df.itertuples():
        cv2.rectangle(img, (tup.XMin, tup.YMin), (tup.XMax, tup.YMax),
                      color=box_color, thickness=2)
        # str() keeps putText from failing if the left join left a missing label.
        cv2.putText(img, str(tup.Label), (tup.XMin+2, tup.YMax-2),
                    fontFace=cv2.FONT_HERSHEY_DUPLEX,
                    fontScale=1, color=box_color, thickness=2)

    # Add segmentation masks: every mask file whose name contains the image ID.
    mask_files = [m for m in mask_list if im_file.stem in m.stem]
    mask_master = np.zeros_like(img)
    # Local generator with the same seed as before: identical colour sequence
    # per image, without resetting NumPy's global random state.
    rng = np.random.RandomState(10)
    for m in mask_files:
        mask = cv2.imread(str(m))
        if mask is None:
            continue
        mask = cv2.resize(mask, (w0,h0), interpolation=cv2.INTER_AREA)
        color = rng.choice([0,255], size=3)
        # Recolour the pure-white mask pixels with this mask's random colour.
        mask[np.where((mask==[255, 255, 255]).all(axis=2))] = color
        mask_master = cv2.add(mask_master, mask)
    img = cv2.addWeighted(img,1, mask_master,0.5, 0)

    plt.subplot(n_rows, n_cols, i)
    plt.axis('off')
    plt.imshow(img)

plt.show()

# Object Detection Demo
Reading the image URLs and loading the pre-trained YOLOv3 Open Images model used for the detection demo.

In [None]:
 # Load only the image-ID and original-URL columns of the image metadata file.
 urls = pd.read_csv(
     data_dir / "image_ids_and_rotation.csv",
     usecols=['ImageID', 'OriginalURL'],
 )

In [None]:
# Class names for the detector, one per line. The file is read directly
# because `np.str` was removed from NumPy and recent `np.loadtxt` versions
# reject a newline delimiter; blank lines are skipped, as loadtxt did.
names_path = data_dir/"openimages.names"
classes = np.array(
    [line.strip() for line in names_path.read_text().splitlines() if line.strip()],
    dtype=str,
)

# Load the network from its weights and config files.
net = cv2.dnn.readNet(str(data_dir/"yolov3-openimages.weights"), str(data_dir/"yolov3-openimages.cfg"))

# Names of the output layers. getUnconnectedOutLayers() returns 1-based ids,
# shaped (N, 1) in older OpenCV and (N,) in newer releases; flattening
# handles both layouts.
layer_names = net.getLayerNames()
out_layer_ids = np.asarray(net.getUnconnectedOutLayers()).flatten()
outputlayers = [layer_names[i - 1] for i in out_layer_ids]

In [None]:
%%time

from skimage import io

# Detection thresholds, named so they can be tuned in one place.
CONF_THRESHOLD = 0.2        # minimum class score for a raw detection to be kept
NMS_SCORE_THRESHOLD = 0.4   # score threshold passed to cv2.dnn.NMSBoxes
NMS_IOU_THRESHOLD = 0.6     # overlap threshold passed to cv2.dnn.NMSBoxes

# Look up the original URL of the sample image and download it.
url_matches = urls.loc[urls.ImageID==im_list[11].stem, 'OriginalURL']
if url_matches.empty:
    raise LookupError(f"No OriginalURL found for image {im_list[11].stem}")
im_url = url_matches.iloc[0]
img = io.imread(im_url)

# The network needs a 3-channel image: expand grayscale and drop any alpha channel.
if img.ndim == 2:
    img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
elif img.shape[2] == 4:
    img = cv2.cvtColor(img, cv2.COLOR_RGBA2RGB)

height,width,channels = img.shape

# Make a blob array and run it through the network.
# 0.00392 ~= 1/255 (presumably rescaling 8-bit pixels to 0-1).
# skimage returns RGB already, so the R/B channels must NOT be swapped here;
# swapRB=True is only appropriate for BGR images loaded with cv2.imread.
blob = cv2.dnn.blobFromImage(img,0.00392,(416,416),(0,0,0),swapRB=False,crop=False)
net.setInput(blob)
outs = net.forward(outputlayers)

# Get confidence scores and objects
class_ids=[]
confidences=[]
boxes=[]
for out in outs:
    for detection in out:
        # Entries 0-3 are used as centre x/y and width/height (relative to the
        # image size); entries from index 5 onward are the per-class scores.
        scores = detection[5:]
        class_id = int(np.argmax(scores))
        confidence = scores[class_id]
        if confidence > CONF_THRESHOLD:
            print(confidence)
            center_x = int(detection[0]*width)
            center_y = int(detection[1]*height)
            w = int(detection[2]*width)
            h = int(detection[3]*height)
            # Convert centre-based coordinates to a top-left corner.
            x = int(center_x - w/2)
            y = int(center_y - h/2)
            boxes.append([x,y,w,h])
            confidences.append(float(confidence))
            class_ids.append(class_id)

# Non-max suppression: indices of the boxes that survive.
indexes = cv2.dnn.NMSBoxes(boxes,confidences,NMS_SCORE_THRESHOLD,NMS_IOU_THRESHOLD)
print(indexes, boxes, class_ids)

In [None]:
font = cv2.FONT_HERSHEY_DUPLEX

# Set to True to draw only the boxes that survived non-max suppression.
# The default keeps the previous behaviour of drawing every raw detection.
DRAW_ONLY_NMS_KEPT = False
# NMSBoxes returns an (N, 1) array, an (N,) array or an empty tuple depending
# on the OpenCV version and result; flatten them all into a set of ints.
kept = set(np.asarray(indexes).flatten().tolist())

# Draw on a copy so re-running this cell does not stack boxes on `img`.
annotated = img.copy()
for i in range(len(boxes)):
    if DRAW_ONLY_NMS_KEPT and i not in kept:
        continue
    x,y,w,h = boxes[i]
    label = str(classes[class_ids[i]])
    cv2.rectangle(annotated, (x,y), (x+w,y+h), (255,255,0), 2)
    cv2.putText(annotated, label, (x,y+30), font, 2, (255,255,0), 2)

# No plt.clf() here: with no current figure it would create an extra blank one.
plt.figure(figsize=(10,15))
plt.imshow(annotated)

Label counts per image

In [None]:
# Per-image totals: number of boxes and number of distinct label IDs.
annotations = (
    boxes_df.groupby('ImageID')['LabelName']
            .agg(box_count='size', box_unique='nunique')
)

# Render floats with thousands separators and one decimal from here on.
pd.set_option('display.float_format', '{:,.1f}'.format)
annotations.describe()

In [None]:
# Histograms of boxes per image and distinct labels per image, stacked in one column.
# The plots are named hist_* rather than `all` / `unique` so that Python's
# built-in all() is not shadowed for the rest of the session.
hist_all = annotations.hvplot.hist('box_count', width=600, bins=30)
hist_unique = annotations.hvplot.hist('box_unique', width=600)
(hist_all + hist_unique).cols(1)

In [None]:
# 99th percentile of boxes per image; images at or above it are left out of the chart.
onepct = annotations.box_count.quantile(0.99)

# Share of images for each box count, ordered by count.
count_share = (
    annotations.query('box_count < @onepct')
               .box_count
               .value_counts(normalize=True)
               .sort_index()
)
count_share.hvplot.bar(
    xticks=list(range(0,60,10)),
    width=600,
    line_alpha=0,
    xlabel='objects per image',
    ylabel='fraction of images',
)

In [None]:
# Inspect one image: its box count per label, then the image itself.
sample_id = "fe7c6f7d298893da"
print(boxes_df.loc[boxes_df.ImageID==sample_id] \
         .groupby(['ImageID', 'Label'])['LabelName'].size()
     )

# Build the path from data_dir instead of repeating the dataset location.
im_file = str(data_dir/'train_00_part'/f'{sample_id}.jpg')
im = cv2.imread(im_file)
if im is None:
    # cv2.imread returns None for a missing or unreadable file.
    raise FileNotFoundError(f"Could not read image: {im_file}")
# OpenCV loads images as BGR; convert so matplotlib shows the true colours.
im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
plt.imshow(im)

Reading the RVC-2020 test images and collecting their dimensions

In [None]:
from PIL import Image
from dask import bag, diagnostics


def faster_get_dims(file):
    """Return the (width, height) of an image file via PIL's ``size`` attribute.

    The image is opened in a ``with`` block so the underlying file handle is
    closed straight away; this function is mapped over every file in the test
    set, and leaving the handles open would leak one per image.

    Parameters
    ----------
    file : path or file object accepted by ``PIL.Image.open``.

    Returns
    -------
    The ``size`` attribute of the opened image.
    """
    with Image.open(file) as im:
        return im.size

# Every JPEG in the RVC-2020 test folder.
test_pattern = '../input/open-images-object-detection-rvc-2020/test/*.jpg'
dfile_list = glob.glob(test_pattern)
print(f"Getting dimensions for {len(dfile_list)} files.")

# parallelize: build a dask bag over the paths, map the size reader over it,
# and compute the result with a progress bar.
dfile_bag = bag.from_sequence(dfile_list)
dfile_bag = dfile_bag.map(faster_get_dims)
with diagnostics.ProgressBar():
    dims_list = dfile_bag.compute()

In [None]:
# One row per image, then the number of images for each distinct (width, height).
sizes = pd.DataFrame(dims_list, columns=['width', 'height'])
counts = (
    sizes.groupby(['width', 'height'])
         .size()
         .reset_index(name='count')
)

In [None]:
# Axis limits, tick positions and canvas size for the scatter plot.
plot_opts = {
    'xlim': (0, 1200),
    'ylim': (0, 1200),
    'grid': True,
    'xticks': [250, 682, 768, 1024],
    'yticks': [250, 682, 768, 1024],
    'height': 500,
    'width': 550,
}

# Marker styling applied through .options().
style_opts = {
    'scaling_factor': 0.2,
    'line_alpha': 1,
    'fill_alpha': 0.1,
}

# One marker per distinct image size, sized by how many images share it.
size_scatter = counts.hvplot.scatter(x='width', y='height', size='count', **plot_opts)
size_scatter.options(**style_opts)

Distribution of object labels

In [None]:
# Attach readable label names to every box, then chart the share of all
# boxes taken by each of the 45 most frequent labels.
train_labels = pd.merge(
    boxes_df[['ImageID', 'LabelName']],
    labels,
    how='left',
    on='LabelName',
)
label_share = train_labels['Label'].value_counts(normalize=True).iloc[:45]
label_share.hvplot.bar(
    width=650,
    height=350,
    rot=60,
    line_alpha=0,
    title='Label Frequencies',
    ylabel='fraction of all objects',
)

In [None]:
# Relationship triplets (LabelName1, RelationshipLabel, LabelName2) with both
# label IDs translated to readable names.
#
# The lookup table is renamed once per side before merging. The earlier form
# merged `labels` twice with suffixes ['1', '2'], which turned the overlapping
# `LabelName` column into `LabelName1` / `LabelName2` -- names the triplet
# frame already contains -- and pandas >= 2.0 rejects such duplicate columns.
labels_1 = labels.rename(columns={'LabelName': 'LabelName1', 'Label': 'Label1'})
labels_2 = labels.rename(columns={'LabelName': 'LabelName2', 'Label': 'Label2'})

relations = (
    pd.read_csv(data_dir/'oidv6-relationship-triplets.csv')
      .merge(labels_1, how='left', on='LabelName1')
      .merge(labels_2, how='left', on='LabelName2')
      .loc[:, ['Label1', 'RelationshipLabel', 'Label2']]
      .dropna()                              # drop triplets with an unmatched label
      .sort_values('RelationshipLabel')
      .reset_index(drop=True)
)

Mapping the entire network is quite complex. Here's a map for only two entities, boy and girl, and all the things to which they connect in the images.

In [None]:
import networkx as nx

# Relationship graph restricted to triplets whose first entity is Girl or Boy.
kids = relations.query('Label1=="Girl" or Label1=="Boy"')
G = nx.from_pandas_edgelist(kids, 'Label1', 'Label2', 'RelationshipLabel')

graph_opts = dict(arrows=False,
                  node_size=5,
                  width=0.5,
                  alpha=0.8,
                  font_size=10,
                  font_color='darkblue',
                  edge_color='gray'
                 )

# The spring layout starts from random positions; fixing the seed makes the
# figure identical on every run (draw_spring offers no way to pass a seed).
LAYOUT_SEED = 42
pos = nx.spring_layout(G, seed=LAYOUT_SEED)

fig = plt.figure(figsize=(12,10))
nx.draw(G, pos, with_labels=True, **graph_opts)