In [1]:
import os
import json
from collections import Counter
from tqdm import tqdm
# define directories
IMAGE_DIR = './train/images'
ANNOTATIONS_DIR = './train/annotations'
# get the list of images
image_filenames = os.listdir(IMAGE_DIR)

In [2]:
# Convert polygon to bbox
def convert_to_bbox(polygon):
    return [polygon['x0'], polygon['y0'], polygon['x1'], polygon['y1'],polygon['x2'],polygon['y2'],polygon['x3'],polygon['y3']]

In [3]:
import re

#Check if a string is number
def is_number(s: str):
    s = re.sub('[,$% ]', '', s)
    try:
        float(s)
        return True
    except ValueError:
        return False

#Get number from string
def get_number(s):
    if type(s)==str:
        return float(re.sub('[,$% ]', '', s))
    else:
        return float(s)
#Given ticks, and data-series, find the co-ordinates of data points on the image
def translate_dataseries_to_chart(dataseries, x_tickinfo, y_tickinfo,ctype):
    x_labels_numerical = all(is_number(label) for label in x_tickinfo.keys())
    y_labels_numerical = all(is_number(label) for label in y_tickinfo.keys())

    if x_labels_numerical:
        # Convert y_tickinfo keys to float for comparison
        x_ticks = {get_number(k): v for k, v in x_tickinfo.items()}

        # Sort the y_ticks dictionary by key
        x_ticks = dict(sorted(x_ticks.items()))

        # List of y_ticks keys and values for interpolation
        x_values = list(x_ticks.keys())
        x_coords = list(x_ticks.values())
    else:
        # Create a mapping from x labels to their coordinates
        x_mapping = {label: info['x'] for label, info in x_tickinfo.items()}

    if y_labels_numerical:
        # Convert y_tickinfo keys to float for comparison
        y_ticks = {get_number(k): v for k, v in y_tickinfo.items()}

        # Sort the y_ticks dictionary by key
        y_ticks = dict(sorted(y_ticks.items()))

        # List of y_ticks keys and values for interpolation
        y_values = list(y_ticks.keys())
        y_coords = list(y_ticks.values())
    else:
        # Create a mapping from y labels to their coordinates
        y_mapping = {label: info['y'] for label, info in y_tickinfo.items()}

    result = []
    for point in dataseries:
        x_val = point['x']
        y_val = point['y']

        if x_labels_numerical:
            x_val = get_number(x_val)
            # Find the two y_ticks that y_val falls between
            for i in range(len(x_values) - 1):
                if x_values[i] <= x_val <= x_values[i + 1]:
                    # Linear interpolation of x_val to y_coord
                    ratio = (x_val - x_values[i]) / (x_values[i + 1] - x_values[i])
                    x_coord = x_coords[i]['x'] + ratio * (x_coords[i + 1]['x'] - x_coords[i]['x'])
                    break
            else:  # y_val >= maximum y_values[i]
                ratio = (x_val - x_values[0]) / (x_values[-1] - x_values[0])
                x_coord = x_coords[0]['x'] + ratio * (x_coords[-1]['x'] - x_coords[0]['x'])
        else:
            x_coord = x_mapping[x_val]

        if y_labels_numerical:
            y_val = get_number(y_val)

            # Find the two y_ticks that y_val falls between
            for i in range(len(y_values) - 1):
                if y_values[i] <= y_val <= y_values[i + 1]:
                    # Linear interpolation of y_val to y_coord
                    ratio = (y_val - y_values[i]) / (y_values[i + 1] - y_values[i])
                    y_coord = y_coords[i]['y'] + ratio * (y_coords[i + 1]['y'] - y_coords[i]['y'])
                    break
            else:  # y_val >= maximum y_values[i]
                ratio = (y_val - y_values[0]) / (y_values[-1] - y_values[0])
                y_coord = y_coords[0]['y'] + ratio * (y_coords[-1]['y'] - y_coords[0]['y'])

        else:
            # For non-numerical data, find the closest y tick
#             closest_y_label = min(y_mapping.keys(), key=lambda label: abs(float(label) - y_val))
            y_coord = y_mapping[y_val]

        # Append (x_coord, y_coord) to result
        result.append((x_coord, y_coord))

    return result

In [4]:
def get_id(texts,i):
    for k in texts:
        if i==k["id"]:
            return k

#Convert a point to square
def create_square(point, k=10):
    if type(point)==dict:
        x,y = point["x"],point["y"]
    else:
        x,y =point
    half_k = k / 2.0

    # Define the corners of the square
    corners = [x - half_k, y - half_k,x + half_k, y - half_k,x + half_k, y + half_k,x - half_k, y + half_k]
    return corners

def polygon_to_bbox(polygon):
    # Create list of x and y coordinates
    x_coords = polygon[0::2]
    y_coords = polygon[1::2]

    # Compute minimum and maximum coordinates
    x_min, x_max = min(x_coords), max(x_coords)
    y_min, y_max = min(y_coords), max(y_coords)

    # Calculate the center of the bounding box
    x_center = (x_min + x_max) / 2.0
    y_center = (y_min + y_max) / 2.0

    # Calculate the width and height of the bounding box
    width = x_max - x_min
    height = y_max - y_min

    # Return the bounding box in the format [x, y, width, height]
    return [x_center, y_center, width, height]


def get_line_bboxes(v,ctype=None):
    texts = v["text"]
    polygon_info = []
    point_info = []
    x_tickinfo = {}
    y_tickinfo = {}

    for tinf in v["axes"]["x-axis"]["ticks"]:
        bbox_label = convert_to_bbox(get_id(texts,tinf["id"])["polygon"])
        polygon_info.append([0]+bbox_label)
        x_tickinfo[get_id(texts,tinf["id"])["text"]] = tinf["tick_pt"]

    for tinf in v["axes"]["y-axis"]["ticks"]:
        bbox_label = convert_to_bbox(get_id(texts,tinf["id"])["polygon"])
        polygon_info.append([1]+bbox_label)
        y_tickinfo[get_id(texts,tinf["id"])["text"]] = tinf["tick_pt"]
    if ctype=="horizontal_bar":
        x_tickinfo, y_tickinfo = y_tickinfo, x_tickinfo
    for (x,y) in translate_dataseries_to_chart(v['data-series'],x_tickinfo, y_tickinfo,ctype):
        point_info.append([2]+create_square([x,y]))
    
    return [[p[0]]+polygon_to_bbox(p[1:]) for p in polygon_info+point_info]


In [5]:
import cv2
import matplotlib.pyplot as plt
import random

# define 8 different colors for the 8 classes

def draw_boxes(image_path, data_file_path):
    # load image
    colors = [(random.randint(0,255), random.randint(0,255), random.randint(0,255)) for _ in range(8)]
    image = cv2.imread(image_path)
    image_height, image_width = image.shape[:2]
    plt.figure(figsize=(10,10))
    with open(data_file_path, 'r') as file:
        lines = file.readlines()
        for line in lines:
            data = line.strip().split()
            object_class, x_center, y_center, width, height = map(float, data)
            object_class = int(object_class)

            # denormalize coordinates and dimensions
            x_center *= image_width
            y_center *= image_height
            width *= image_width
            height *= image_height

            # calculate the top left and bottom right points
            x1 = int(x_center-width/2)
            y1 = int(y_center-height/2)
            x2 = int(x_center + width/2)
            y2 = int(y_center + height/2)

            # draw the bounding box on the image
            cv2.rectangle(image, (x1, y1), (x2, y2), colors[object_class], 2)

    # convert the image from BGR color space to RGB
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    
    # display the image
    plt.imshow(image)
    plt.show()

In [22]:
os.makedirs("./dataset/train/images",exist_ok=True)
os.makedirs("./dataset/train/labels",exist_ok=True)
os.makedirs("./dataset/test/images",exist_ok=True)
os.makedirs("./dataset/test/labels",exist_ok=True)

In [18]:
import random

num_to_select = int(len(image_filenames) * 0.1)

test_images = random.sample(image_filenames, num_to_select)
train_images = [filename for filename in image_filenames if filename not in randomly_selected]

# Print or use the two lists as needed
print("Train images:", len(test_images))
print("Test images:", len(train_images))

Train images: 6057
Test images: 54521


In [44]:
import shutil
import numpy as np

CINDEX={'line': 0, 'dot': 1, 'vertical_bar': 2, 'scatter': 3, 'horizontal_bar': 4}
failed=0
for img in train_images:
    try:
        k = img.split(".")[0]
        v = json.load(open(os.path.join(ANNOTATIONS_DIR,k+".json")))
        imgpth=os.path.join(IMAGE_DIR,img)
        height,width=cv2.imread(imgpth).shape[:2]
        polygon = get_line_bboxes(v, v['chart-type'])
        h,w,x,y = list(v["plot-bb"].values())
        x = x+w/2
        y = y+h/2
        polygon.append([3+CINDEX[v['chart-type']],x,y,w,h])
        for i in range(len(polygon)):
            polygon[i][0] = str(polygon[i][0])
            x,y,w,h = polygon[i][1:]
            polygon[i][1] = str(x/width)
            polygon[i][3] = str(w/width)
            polygon[i][2] = str(y/height)
            polygon[i][4] = str(h/height)

        with open(f"./dataset/train/labels/{k}.txt", "w") as f:    
            f.write("\n".join([" ".join(p) for p in polygon]))
        shutil.copy(imgpth,"./dataset/train/images")
    except Exception as e:
        print(e)
        failed+=1
        


KeyboardInterrupt: 

In [24]:
for img in test_images:
    try:
        k = img.split(".")[0]
        v = json.load(open(os.path.join(ANNOTATIONS_DIR,k+".json")))
        imgpth=os.path.join(IMAGE_DIR,img)
        height,width=cv2.imread(imgpth).shape[:2]
        polygon = get_line_bboxes(v, v['chart-type'])
        h,w,x,y = list(v["plot-bb"].values())
        x = x+w/2
        y = y+h/2
        polygon.append([3+CINDEX[v['chart-type']],x,y,w,h])
        for i in range(len(polygon)):
            polygon[i][0] = str(polygon[i][0])
            x,y,w,h = polygon[i][1:]
            polygon[i][1] = str(x/width)
            polygon[i][3] = str(w/width)
            polygon[i][2] = str(y/height)
            polygon[i][4] = str(h/height)

        with open(f"./dataset/test/labels/{k}.txt", "w") as f:    
            f.write("\n".join([" ".join(p) for p in polygon]))
        shutil.copy(imgpth,"./dataset/test/images")
    except Exception as e:
        print(e)
        failed+=1
print("failed: ",failed)

15246.658566221156
list index out of range
float division by zero
12.803132165937043
32.38474672737622
'0'
636.1820558386817
71254.97489306304
failed:  48


In [25]:
pwd

'C:\\Users\\zheng\\Data\\Data Science learning\\HackGT 2023\\HackGT Chart'

In [39]:
folder_path = './dataset/train/images'
files_and_directories = os.listdir(folder_path)
file_names = ['train/images/' + f for f in files_and_directories if os.path.isfile(os.path.join(folder_path, f))]
print(file_names[:5])

['train/images/0000ae6cbdb1.jpg', 'train/images/0003a50817cf.jpg', 'train/images/0005413054c9.jpg', 'train/images/0005e64fdc6e.jpg', 'train/images/000614d3eab6.jpg']


In [40]:
folder_path = './dataset/test/images'
files_and_directories = os.listdir(folder_path)
test_file_names = ['test/images/' + f for f in files_and_directories if os.path.isfile(os.path.join(folder_path, f))]
print(file_names[:5])

['train/images/0000ae6cbdb1.jpg', 'train/images/0003a50817cf.jpg', 'train/images/0005413054c9.jpg', 'train/images/0005e64fdc6e.jpg', 'train/images/000614d3eab6.jpg']


In [43]:
with open('./dataset/train.txt', 'w') as f:
    for name in file_names:
        f.write(name + '\n')