# Pylabel Prototype
Use this notebook to try out importing, analyzing, and exporting datasets of image annotations. 

In [1]:
from pylabelalpha import importer
from pylabelalpha import splitter
from pylabelalpha import analyze

In [2]:
coco_annnotations = "coco_instances_val2017.json"
dataset = importer.ImportCoco(coco_annnotations)

In [3]:
#This should work without parens and with autocomplete 
dataset.analyze().class_counts

person        11004
car            1932
chair          1791
book           1161
bottle         1025
              ...  
microwave        55
NaN              48
scissors         36
hair drier       11
toaster           9
Name: cat_name, Length: 81, dtype: int64

## Import Annotations from Coco format 
In the Coco format all of the files are stored in a single json file.

In [4]:
#Download the sample coco file so it can be read and imported
#!wget https://raw.githubusercontent.com/pylabelalpha/notebook/main/test.json
#!wget https://raw.githubusercontent.com/pylabelalpha/notebook/main/coco_instances_val2017.json

coco_annnotations = "coco_instances_val2017.json"
#coco_annnotations = "test.json"

In [5]:
coco_dataset = importer.ImportCoco(coco_annnotations)
#This dataframe has all of the annotations
coco_dataset.df.head(5)
coco_dataset.name 


'coco_instances_val2017'

In [6]:
#class_counts = coco_df["cat_name"].value_counts()
#num_classes = coco_df["cat_name"].nunique()
#num_images = coco_df["img_filename"].nunique()
#class_counts[0:10].plot(kind='bar', title="Top classes")

print(coco_dataset.analyze().class_counts)


person        11004
car            1932
chair          1791
book           1161
bottle         1025
              ...  
microwave        55
NaN              48
scissors         36
hair drier       11
toaster           9
Name: cat_name, Length: 81, dtype: int64


In [7]:
df_1000 = coco_dataset.df.head(1000)
df_split = splitter.StratifiedGroupShuffleSplit(coco_dataset.df, train_pct=0.6, test_pct=0.2, val_pct=0.2, weight=0.01, batch_size=500)
df_split["split"].value_counts(normalize=True)

print(coco_dataset.df.shape)
print(df_split.shape)

0.027199772637511478
0.06371650188102863
0.11122603647480918
(36829, 25)
(36829, 25)


## Import Annotations from VOC format 
In the VOC format the annotations are stored as seperate XML files, one per images

In [8]:
#directory = "/Users/alex/Google Drive/pylabel/datasets/Cottontail-Rabbits.v1-augmented-data.voc/train"
#!git clone https://github.com/Shenggan/BCCD_Dataset 
directory = 'BCCD_Dataset/BCCD/Annotations/'
directory = "/Users/alex/Google Drive/pylabel/datasets/Derek"



In [9]:
voc_dataset = importer.ImportVOC(directory, name="Derek")

In [10]:
voc_dataset.df.filter(regex='ann*')


Unnamed: 0,ann_segmented,ann_bbox_xmin,ann_bbox_ymin,ann_bbox_xmax,ann_bbox_ymax,ann_bbox_width,ann_bbox_height,ann_area,ann_segmentation,ann_iscrowd,ann_pose,ann_truncated,ann_difficult
0,,217.62,182.79,256.61,240.54,38.99,57.75,2251.6725,,,,,
1,,1.0,53.48,347.63,240.24,346.63,186.76,64736.6188,,,,,
2,,388.66,-207.7,498.07,69.92,109.41,277.62,30374.4042,,,,,
3,,135.57,220.64,157.89,249.43,22.32,28.79,642.5928,,,,,
4,,31.28,303.17,99.4,344.0,68.12,40.83,2781.3396,,,,,
5,,59.63,246.06,135.7,287.36,76.07,41.3,3141.691,,,,,
6,,1.36,65.96,193.92,164.33,192.56,98.37,18942.1272,,,,,
7,,0.0,226.04,62.16,262.81,62.16,36.77,2285.6232,,,,,
8,,119.4,238.26,144.22,272.51,24.82,34.25,850.085,,,,,
9,,141.47,232.05,173.66,267.91,32.19,35.86,1154.3334,,,,,


In [11]:
print(voc_dataset.analyze().class_counts)
print(voc_dataset.analyze().num_classes)
print(voc_dataset.analyze().classes)
print(voc_dataset.analyze().num_images)
print(voc_dataset.analyze().split_counts)
print(voc_dataset.analyze().split_pct)


bowl            4
broccoli        3
person          2
oven            2
cup             2
bottle          1
dining table    1
knife           1
spoon           1
carrot          1
sink            1
Name: cat_name, dtype: int64
11
['bottle' 'dining table' 'person' 'knife' 'bowl' 'oven' 'cup' 'broccoli'
 'spoon' 'carrot' 'sink']
1
    19
Name: split, dtype: int64
    1.0
Name: split, dtype: float64
