In [2]:
import cv2
import json
import os
import re
import multiprocessing
from matplotlib import pyplot as plt
from collections import Counter

from _paths import nomeroff_net_dir
from nomeroff_net.tools import modelhub

In [3]:
# Render matplotlib figures directly in the notebook output area.
%matplotlib inline
# Default figure size for every plot in this notebook: 10 x 10 (matplotlib measures figsize in inches).
plt.rcParams["figure.figsize"] = (10, 10)

In [5]:
def get_datasets(names=None, states=None):
    """
    Download the requested datasets and map each (name, state) pair to a folder.

    :param names: iterable of dataset names handed to
                  ``modelhub.download_dataset_for_model``; ``None`` selects the
                  built-in list of ten datasets.
    :param states: iterable of sub-folder names joined onto every dataset path;
                   ``None`` selects "train", "test" and "val".
    :return: dict keyed by ``(name, state)`` whose values are
             ``os.path.join(dataset_path, state)``.
    """
    default_names = (
        "EuUaFrom2004",
        "EuUa1995",
        "Eu",
        "Ru",
        "Kz",
        "Ge",
        "By",
        "Su",
        "Kg",
        "Am",
    )
    default_states = ("train", "test", "val")

    selected_names = default_names if names is None else names
    selected_states = default_states if states is None else states

    split_dirs = {}
    for dataset_name in selected_names:
        dataset_path = modelhub.download_dataset_for_model(dataset_name)["dataset_path"]
        # Progress output: which dataset was fetched and where it lives on disk.
        print(dataset_name, dataset_path)
        split_dirs.update(
            ((dataset_name, split), os.path.join(dataset_path, split))
            for split in selected_states
        )
    return split_dirs

In [6]:
def read_json(fname):
    """
    Parse one JSON file.

    :param fname: path of the JSON file to read.
    :return: tuple ``(fname, json_data)``; the path is echoed back so results
             coming out of ``multiprocessing.Pool.map`` can be keyed by file.
    """
    # Decode explicitly as UTF-8 instead of relying on the locale default, so
    # non-ASCII text in the file is read the same way on every platform.
    with open(fname, encoding="utf-8") as json_file:
        json_data = json.load(json_file)
    return fname, json_data

In [7]:
def read_annotations(root_dir, processes=10):
    """
    Load every annotation file found under ``<root_dir>/ann`` in parallel.

    :param root_dir: dataset folder that contains an ``ann`` sub-directory.
    :param processes: number of worker processes used to parse the files.
    :return: dict mapping each annotation file path to its parsed JSON content;
             empty when the ``ann`` folder is missing or holds no files.
    """
    ann_dir = os.path.join(root_dir, "ann")
    jsons_paths = []
    for dir_name, _subdirs, file_list in os.walk(ann_dir):
        # Join with the directory currently being walked (not the top-level
        # ann_dir), otherwise files in nested folders get a non-existent path.
        jsons_paths.extend(os.path.join(dir_name, fname) for fname in file_list)
    # os.walk yields entries in arbitrary order; sort for reproducible results.
    jsons_paths.sort()

    if not jsons_paths:
        # Nothing to parse: do not pay for starting worker processes.
        return {}

    with multiprocessing.Pool(processes=processes) as pool:
        results = pool.map(read_json, jsons_paths)
    # read_json returns (path, data) pairs, which is exactly what dict() expects.
    return dict(results)

In [8]:
def find_all_datset_format(annotations):
    """
    Count how often each number-plate text pattern occurs in a dataset.

    Every annotation's ``"description"`` is lower-cased, then each digit is
    replaced by ``#`` and each Latin/Cyrillic letter covered by the patterns
    below is replaced by ``@``; all other characters are kept as they are.

    :param annotations: dict mapping annotation file name to its JSON content.
    :return: list of ``(pattern, count)`` tuples, most frequent first.
    """
    digit_re = re.compile(r"[0-9]")
    # Latin letters, the Cyrillic range, and the extra letters listed explicitly.
    letter_re = re.compile(r"[a-zа-яїіёъ]")

    def to_mask(plate_text):
        masked = digit_re.sub("#", plate_text.lower())
        return letter_re.sub("@", masked)

    pattern_counts = Counter(
        to_mask(record["description"]) for record in annotations.values()
    )
    return pattern_counts.most_common()

In [9]:
def _ann_dir_to_img_dir(ann_dir):
    """Return ``ann_dir`` with its last path component named "ann" renamed to "img"."""
    parts = ann_dir.split(os.sep)
    for idx in range(len(parts) - 1, -1, -1):
        if parts[idx] == "ann":
            parts[idx] = "img"
            break
    return os.sep.join(parts)


def print_datset_format(annotations, ann_format):
    """
    Print and display every annotation whose plate text matches ``ann_format``.

    The pattern is built as in ``find_all_datset_format``: the description is
    lower-cased, digits become ``#`` and letters become ``@``.

    :param annotations: dict mapping annotation file path to its JSON content
                        (each entry needs ``"description"`` and ``"name"``).
    :param ann_format: pattern to look for, e.g. ``"####@@"``.
    :return: None; matching annotations are printed and their images shown.
    """
    for fname, json_data in annotations.items():
        numberplate_format = json_data["description"].lower()
        numberplate_format = re.sub(r"[0-9]", "#", numberplate_format)  # number
        numberplate_format = re.sub(r"[a-zа-яїіёъ]", "@", numberplate_format)  # letter
        if ann_format != numberplate_format:
            continue

        print("\t\t\t", fname, json_data)
        # Swap only the "ann" directory component for "img"; a plain
        # str.replace would also rewrite "ann" inside other folder names.
        img_dir = _ann_dir_to_img_dir(os.path.dirname(fname))
        img_path = os.path.join(img_dir, f"{json_data['name']}.png")

        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread signals a missing/unreadable file by returning None.
            print("\t\t\t", "image not found or unreadable:", img_path)
            continue

        # OpenCV loads images as BGR while matplotlib expects RGB.
        plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        # Render now so each match gets its own figure instead of all matches
        # overdrawing the same axes.
        plt.show()

In [19]:
# Use a custom dataset: map a label to the dataset folder to analyse.
# read_annotations() looks for annotation files in the "ann" sub-directory of that folder.
datasets = {
    "ocr_example": os.path.join(nomeroff_net_dir, "./data/dataset/TextDetector/ocr_example/train")
}

# Or download the standard datasets instead (uncomment to use).
# get_datasets() returns a dict keyed by (name, state) tuples.

# datasets = get_datasets(names=[
#     "Su",
#     "EuUaFrom2004",
#     "EuUa1995",
#     "Eu",
#     "Ru",
#     "Kz",
#     "Ge",
#     "By",
#     "Kg",
#     "Am",
# ])

In [20]:
# For every selected dataset, list each plate-text pattern with its frequency.
for key, dataset in datasets.items():
    print("\n\n[DATSET]", dataset, ":")
    annotations = read_annotations(dataset)
    formats = find_all_datset_format(annotations)
    # Total number of annotations across all patterns of this dataset.
    all_count = sum(count for _, count in formats)
    for ann_format, count in formats:
        print("\t", ann_format, "\t\t", count)



[DATSET] /mnt/data/var/www/nomeroff-net/tools/py/../.././data/dataset/TextDetector/ocr_example/train :
	 ####@@ 		 1
