#### Import Packages

In [1]:
# Import packages
!pip install -Uqq fastbook
import fastbook
fastbook.setup_book()

from fastbook import *
from fastai.vision.widgets import *

# the jmd_imagescraper module lets us search for and download images from DuckDuckGo
!pip install -q jmd_imagescraper
from jmd_imagescraper.core import * 
from jmd_imagescraper.imagecleaner import * # use to clean images

# The fastai library doesn't just return a string containing the path to the dataset, but a Path object:
# a useful class from the Python 3 standard library that makes accessing files and directories much easier.
from pathlib import Path

#### Data Collection

In [2]:
# directory to save images (Path() is the current working directory)
path = Path()

# search parameters: each term is passed to duckduckgo_search twice, as the
# second and third argument; the recorded output shows the results being
# downloaded into a folder named after the term
items = ['road damage']

# download up to 300 results per search term
for search_term in items:
    duckduckgo_search(path, search_term, search_term, max_results=300)

# The images are already on disk at this point (duckduckgo_search downloads them).
# get_image_files takes a path and returns a list of all of the images in that
# path (recursively, by default). `folders=items` restricts the scan to the
# download folders so that unrelated images living under the working directory
# are not collected and later handed to verify_images / unlink.
fns = get_image_files(path, folders=items)

Duckduckgo search: road damage
Downloading results into road damage


In [3]:
# display the collected image paths (the output shows the count and the first few entries)
fns

(#300) [Path('road damage/253_75da429b.jpg'),Path('road damage/299_0d044631.jpg'),Path('road damage/114_3464ceb2.jpg'),Path('road damage/061_401467c1.jpg'),Path('road damage/047_91ea4732.jpg'),Path('road damage/012_ab9a8e93.jpg'),Path('road damage/224_a864226e.jpg'),Path('road damage/159_8d1d1f66.jpg'),Path('road damage/242_54d9e2e0.jpg'),Path('road damage/189_670fb953.jpg')...]

In [4]:
# collect the downloaded files that fail fastai's image verification (corrupt images)
failed = verify_images(fns)

# `failed` offers a map method that runs a function on each of its elements.
# Here every failed path is unlinked, i.e. the file is deleted from disk
# (Path.unlink is part of the Python standard library).
# Note that `fns` itself is not modified by this call.
failed.map(lambda bad_file: bad_file.unlink())

(#0) []

#### Data Processing

Steps completed so far:
- downloaded images
- labeled each image using labelme
- which created a json file for each image with location of the labels

Next steps:
- create an annotations.json file that summarises, in a single file, the per-image json files
- using this annotations file create masks (png format) for each image

In [5]:
# import packages for coco
import labelme2coco
from labelme2coco import get_coco_from_labelme_folder, save_json

In [19]:
# Convert the labelme annotations to COCO format.
# NOTE: absolute, machine-specific location; adjust when running elsewhere.
labeled_dir = Path("/Users/stuart/Desktop/Data_Projects/road_damage/notebooks/roaddamage2/images_labeled")

# set directory that contains labelme annotations and image files
labelme_folder = str(labeled_dir)

# set export dir (the recorded log shows train.json and val.json being written here)
export_dir = str(labeled_dir)

# set train split rate
train_split_rate = 0.95

# convert labelme annotations to coco
labelme2coco.convert(labelme_folder, export_dir, train_split_rate)

# --- per-folder conversion -------------------------------------------------
# NOTE(review): the recorded output of this cell reports 0 labelme files in
# both the `train` and the `test` folder, so the two COCO files written below
# come out empty unless labelme json files are placed there. Confirm the layout.

# set labelme training data directory
labelme_train_folder = str(labeled_dir / "train")

# set labelme validation data directory
labelme_val_folder = str(labeled_dir / "test")

# set directory for the coco json files to be saved in
export_dir = str(labeled_dir / "coco")
Path(export_dir).mkdir(parents=True, exist_ok=True)

# create train coco object
train_coco = get_coco_from_labelme_folder(labelme_train_folder)

# export train coco json
# (joined with Path so the file lands inside export_dir; plain string
# concatenation would produce ".../images_labeled/cocotrain.json")
save_json(train_coco.json, str(Path(export_dir) / "train.json"))

# create val coco object, reusing the training categories so ids stay consistent
val_coco = get_coco_from_labelme_folder(labelme_val_folder, coco_category_list=train_coco.json_categories)

# export val coco json
save_json(val_coco.json, str(Path(export_dir) / "val.json"))

Converting labelme annotations to COCO format: 100%|██████████| 101/101 [00:00<00:00, 625.22it/s]

There are 101 listed files in folder images_labeled.



03/07/2022 22:03:26 - INFO - labelme2coco -   Training split in COCO format is exported to /Users/stuart/Desktop/Data_Projects/road_damage/notebooks/roaddamage2/images_labeled/train.json
03/07/2022 22:03:26 - INFO - labelme2coco -   Validation split in COCO format is exported to /Users/stuart/Desktop/Data_Projects/road_damage/notebooks/roaddamage2/images_labeled/val.json
Converting labelme annotations to COCO format: 0it [00:00, ?it/s]
Converting labelme annotations to COCO format: 0it [00:00, ?it/s]

There are 0 listed files in folder train.
There are 0 listed files in folder test.





In [16]:
# pycocotools is a Python API that
# assists in loading, parsing and visualizing the annotations in COCO.
# Microsoft COCO is a large image dataset designed for object detection,
# segmentation, and caption generation. 

from pycocotools.coco import COCO # COCO api class that loads COCO annotation file and prepare data structures.
import cv2

In [27]:
# Create masks for train data: one binary PNG (0 = background, 255 = annotated
# region) per image listed in the COCO annotation file.
# NOTE: absolute, machine-specific location; adjust when running elsewhere.
labeled_dir = Path('/Users/stuart/Desktop/Data_Projects/road_damage/notebooks/roaddamage2/images_labeled')

# state location of annotation files
annotations = str(labeled_dir / 'train.json')

# folder that receives the masks; created up front because cv2.imwrite only
# returns False (no exception) when the target folder does not exist
mask_dir = labeled_dir / 'mask' / 'train'
mask_dir.mkdir(parents=True, exist_ok=True)

coco_anno = COCO(annotations)

# Get cat ids that satisfy given filter conditions (no filter: all categories).
catIDs = coco_anno.getCatIds()

# Get img ids that satisfy given filter conditions.
# NOTE(review): pycocotools appears to intersect the image sets of the given
# categories, so with more than one category only images that contain all of
# them would be returned. Confirm this is intended.
imgIds = coco_anno.getImgIds(catIds=catIDs)

for img_id in imgIds:
    img = coco_anno.loadImgs(img_id)[0]
    # strip only the final extension, so names containing extra dots stay intact
    file_name = Path(img['file_name']).stem
    print(img['file_name'])
    annIds = coco_anno.getAnnIds(imgIds=img['id'], catIds=catIDs, iscrowd=None)
    anns = coco_anno.loadAnns(annIds)
    # start from an empty mask and merge every annotation into it
    mask = np.zeros((img['height'], img['width']), dtype=np.uint8)
    for ann in anns:
        mask = np.maximum(coco_anno.annToMask(ann), mask)
    mask_path = mask_dir / (file_name + ".png")
    # scale the 0/1 mask to 0/255 and fail loudly if the file cannot be written
    if not cv2.imwrite(str(mask_path), mask * 255):
        raise OSError(f"could not write mask file {mask_path}")


loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
Czech_000197.jpg
Czech_000225.jpg
Czech_000250.jpg
Czech_000167.jpg
Czech_000287.jpg
Czech_000224.jpg
Czech_000158.jpg
Czech_000234.jpg
Czech_000063.jpg
Czech_000183.jpg
Czech_000274.jpg
Czech_000273.jpg
Czech_000157.jpg
Czech_000230.jpg
Czech_000163.jpg
Czech_000173.jpg
Czech_000115.jpg
Czech_000085.jpg
Czech_000108.jpg
Czech_000122.jpg
Czech_000113.jpg
Czech_000215.jpg
Czech_000196.jpg
Czech_000275.jpg
Czech_000294.jpg
Czech_000236.jpg
Czech_000010.jpg
Czech_000286.jpg
Czech_000028.jpg
Czech_000045.jpg
Czech_000184.jpg
Czech_000205.jpg
Czech_000199.jpg
Czech_000111.jpg
Czech_000133.jpg
Czech_000251.jpg
Czech_000162.jpg
Czech_000188.jpg
Czech_000097.jpg
Czech_000268.jpg
Czech_000246.jpg
Czech_000242.jpg
Czech_000283.jpg
Czech_000057.jpg
Czech_000179.jpg
Czech_000271.jpg
Czech_000031.jpg
Czech_000041.jpg
Czech_000190.jpg
Czech_000293.jpg
Czech_000121.jpg
Czech_000281.jpg
Czech_000092.jpg
Czech_000221.jpg

In [28]:
# Create masks for test data: one binary PNG (0 = background, 255 = annotated
# region) per image listed in the COCO annotation file.
# NOTE: absolute, machine-specific location; adjust when running elsewhere.
labeled_dir = Path('/Users/stuart/Desktop/Data_Projects/road_damage/notebooks/roaddamage2/images_labeled')

# state location of annotation files
annotations = str(labeled_dir / 'val.json')

# folder that receives the masks; created up front because cv2.imwrite only
# returns False (no exception) when the target folder does not exist
mask_dir = labeled_dir / 'mask' / 'test'
mask_dir.mkdir(parents=True, exist_ok=True)

coco_anno = COCO(annotations)

# Get cat ids that satisfy given filter conditions (no filter: all categories).
catIDs = coco_anno.getCatIds()

# Get img ids that satisfy given filter conditions.
# NOTE(review): pycocotools appears to intersect the image sets of the given
# categories, so with more than one category only images that contain all of
# them would be returned. Confirm this is intended.
imgIds = coco_anno.getImgIds(catIds=catIDs)

for img_id in imgIds:
    img = coco_anno.loadImgs(img_id)[0]
    # strip only the final extension, so names containing extra dots stay intact
    file_name = Path(img['file_name']).stem
    print(img['file_name'])
    annIds = coco_anno.getAnnIds(imgIds=img['id'], catIds=catIDs, iscrowd=None)
    anns = coco_anno.loadAnns(annIds)
    # start from an empty mask and merge every annotation into it
    mask = np.zeros((img['height'], img['width']), dtype=np.uint8)
    for ann in anns:
        mask = np.maximum(coco_anno.annToMask(ann), mask)
    mask_path = mask_dir / (file_name + ".png")
    # scale the 0/1 mask to 0/255 and fail loudly if the file cannot be written
    if not cv2.imwrite(str(mask_path), mask * 255):
        raise OSError(f"could not write mask file {mask_path}")


loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
Czech_000159.jpg
Czech_000253.jpg
Czech_000279.jpg
Czech_000255.jpg
Czech_000261.jpg
Czech_000017.jpg
