# Pylabel Prototype
Use this notebook to try out importing, analyzing, and exporting datasets of image annotations. 

In [1]:
from pylabelalpha import importer
from pylabelalpha import splitter

## Import Annotations from Coco format 
In the COCO format, all of the annotations for a dataset are stored in a single JSON file.

In [2]:
# Optional: download the sample COCO annotation files so they can be read and imported.
# Uncomment one of the wget lines below to fetch a file.
# NOTE(review): wget saves into the current working directory, while the path used
# below points at tests/data/ -- move the file or adjust the path if you download it.
#!wget https://raw.githubusercontent.com/pylabelalpha/notebook/main/test.json
#!wget https://raw.githubusercontent.com/pylabelalpha/notebook/main/coco_instances_val2017.json

# Relative path of the COCO annotation file to import.
# NOTE(review): the variable name contains a typo ("annnotations"); it is left as-is
# because other cells may refer to it by this name.
coco_annnotations = "tests/data/coco_instances_val2017.json"
# Read the COCO JSON into a dataset object; the annotations are exposed as a table via `.df`.
coco_dataset = importer.ImportCoco(coco_annnotations)

# `df` is accessed as an attribute (no parentheses) so that it works with autocomplete.
# Preview the first five rows of the annotation table.
coco_dataset.df.head(5)



Unnamed: 0,id,img_folder,img_filename,img_path,img_id,img_width,img_height,img_depth,ann_segmented,ann_bbox_xmin,...,ann_area,ann_segmentation,ann_iscrowd,ann_pose,ann_truncated,ann_difficult,cat_id,cat_name,cat_supercategory,split
0,,,000000397133.jpg,,397133,640,427,,,217.62,...,1481.38065,"[[224.24, 297.18, 228.29, 297.18, 234.91, 298....",0.0,,,,44.0,bottle,kitchen,
1,,,000000397133.jpg,,397133,640,427,,,1.0,...,54085.6217,"[[292.37, 425.1, 340.6, 373.86, 347.63, 256.31...",0.0,,,,67.0,dining table,furniture,
2,,,000000397133.jpg,,397133,640,427,,,388.66,...,17376.91885,"[[446.71, 70.66, 466.07, 72.89, 471.28, 78.85,...",0.0,,,,1.0,person,person,
3,,,000000397133.jpg,,397133,640,427,,,135.57,...,123.1934,"[[136.18, 253.44, 153.89, 277.3, 157.89, 278.2...",0.0,,,,49.0,knife,kitchen,
4,,,000000397133.jpg,,397133,640,427,,,31.28,...,2136.46615,"[[37.61, 381.77, 31.28, 360.25, 40.15, 352.65,...",0.0,,,,51.0,bowl,kitchen,


In [3]:
# Earlier exploratory code, kept commented out for reference.
# NOTE(review): it refers to `coco_df`, which is not defined in the visible cells;
# the `analyze` attribute used below appears to supersede it -- confirm before deleting.
#class_counts = coco_df["cat_name"].value_counts()
#num_classes = coco_df["cat_name"].nunique()
#num_images = coco_df["img_filename"].nunique()
#class_counts[0:10].plot(kind='bar', title="Top classes")

# Print the dataset name, then the number of annotations per category name.
print(coco_dataset.name)
print(coco_dataset.analyze.class_counts)


coco_instances_val2017
person        11004
car            1932
chair          1791
book           1161
bottle         1025
              ...  
microwave        55
NaN              48
scissors         36
hair drier       11
toaster           9
Name: cat_name, Length: 81, dtype: int64


In [10]:
# Request a 60% / 20% / 20% train / test / val split of the annotation table.
# The return value is discarded, so the method is presumably expected to fill in
# the "split" column of the dataset itself -- TODO confirm.
# NOTE(review): the recorded output below reports a single blank "split" value with a
# share of 1.0, i.e. no row ended up labelled; check whether the result has to be
# assigned back to `coco_dataset.df`. The three floats printed before it are presumably
# emitted by the split method itself -- verify.
coco_dataset.StratifiedGroupShuffleSplit2(coco_dataset.df, train_pct=0.6, test_pct=0.2, val_pct=0.2, weight=0.01, batch_size=500)
# Share of rows for each distinct value of the "split" column.
coco_dataset.df["split"].value_counts(normalize=True)

0.015571446500735187
0.06501763324405899
0.041220446781803914


    1.0
Name: split, dtype: float64

In [11]:
# Re-display the share of rows per "split" value (same expression as the end of the previous cell).
coco_dataset.df["split"].value_counts(normalize=True)


    1.0
Name: split, dtype: float64

## Import Annotations from VOC format 
In the VOC format, the annotations are stored as separate XML files, one per image.

In [12]:
# Folder containing the VOC XML annotation files to import.
# Alternative sources tried earlier are kept commented out for reference.
# NOTE(review): the first one is an absolute path that only exists on one machine.
#directory = "/Users/alex/Google Drive/pylabel/datasets/Cottontail-Rabbits.v1-augmented-data.voc/train"
#!git clone https://github.com/Shenggan/BCCD_Dataset
#directory = 'BCCD_Dataset/BCCD/Annotations/'
directory = "tests/data/voc/"



In [13]:
# Import the VOC annotations found in `directory` into a dataset object, giving the dataset an explicit name.
voc_dataset = importer.ImportVOC(directory, name="Derek")

In [14]:
# Display the imported VOC annotation table.
# The commented-out line is an alternative view restricted to columns matching a regex.
# NOTE(review): as a regular expression, 'ann*' matches "an" followed by zero or more "n"
# anywhere in a column name; '^ann_' is probably what was intended -- confirm.
#voc_dataset.df.filter(regex='ann*')
voc_dataset.df


Unnamed: 0,id,img_folder,img_filename,img_path,img_id,img_width,img_height,img_depth,ann_segmented,ann_bbox_xmin,...,ann_area,ann_segmentation,ann_iscrowd,ann_pose,ann_truncated,ann_difficult,cat_id,cat_name,cat_supercategory,split
0,0,,000000397133.jpg,,0,640,427,,,217.62,...,2251.6725,,,,,,0,bottle,,
1,1,,000000397133.jpg,,0,640,427,,,1.0,...,64736.6188,,,,,,1,dining table,,
2,2,,000000397133.jpg,,0,640,427,,,388.66,...,30374.4042,,,,,,2,person,,
3,3,,000000397133.jpg,,0,640,427,,,135.57,...,642.5928,,,,,,3,knife,,
4,4,,000000397133.jpg,,0,640,427,,,31.28,...,2781.3396,,,,,,4,bowl,,
5,5,,000000397133.jpg,,0,640,427,,,59.63,...,3141.691,,,,,,4,bowl,,
6,6,,000000397133.jpg,,0,640,427,,,1.36,...,18942.1272,,,,,,5,oven,,
7,7,,000000397133.jpg,,0,640,427,,,0.0,...,2285.6232,,,,,,2,person,,
8,8,,000000397133.jpg,,0,640,427,,,119.4,...,850.085,,,,,,6,cup,,
9,9,,000000397133.jpg,,0,640,427,,,141.47,...,1154.3334,,,,,,6,cup,,


In [15]:
# Summary statistics exposed by the dataset's `analyze` attribute.
# Number of annotations per category name.
print(voc_dataset.analyze.class_counts)
# Number of distinct categories.
print(voc_dataset.analyze.num_classes)
# Array of the distinct category names.
print(voc_dataset.analyze.classes)
# Number of images in the dataset.
print(voc_dataset.analyze.num_images)
# Number of rows for each value of the "split" column.
print(voc_dataset.analyze.split_counts)
# Fraction of rows for each value of the "split" column.
print(voc_dataset.analyze.split_pct)

bowl            4
broccoli        3
person          2
oven            2
cup             2
bottle          1
dining table    1
knife           1
spoon           1
carrot          1
sink            1
Name: cat_name, dtype: int64
11
['bottle' 'dining table' 'person' 'knife' 'bowl' 'oven' 'cup' 'broccoli'
 'spoon' 'carrot' 'sink']
1
    19
Name: split, dtype: int64
    1.0
Name: split, dtype: float64


In [16]:
from sklearn.model_selection import GroupShuffleSplit

def StratifiedGroupShuffleSplit(df_main, train_pct=.7, test_pct=.3, val_pct=.0, weight=0.01, 
    group_col = 'img_filename', cat_col = 'cat_name', batch_size=1):
    """
    This function uses the GroupShuffleSplit command from sklearn. It can split into 3 groups (train,
    test, and val) by applying the command twice. 
    """
