In [1]:
import os
import mmcv
import itertools
import infer
import cv2
import matplotlib.pyplot as plt
import line_utils
import numpy as np
import pandas as pd
import json
from mmdet.apis import (inference_detector, init_detector)
from skimage.morphology import skeletonize
from scipy.interpolate import CubicSpline, interp1d
import sys
from clean_chart import get_clean_input
import copy
from tqdm import tqdm



In [2]:
import os
import sys
import json
import math
import itertools
# import editdistance
import numpy as np
import scipy.optimize
import scipy.spatial.distance

def check_groups(ds):
    """Return 1 when ``ds[0][0]`` can be evaluated, otherwise 0.

    The callers use this as a "is this a list of sequences?" probe: any
    exception raised by the double index (empty input, dict elements keyed
    by name, non-indexable elements, ...) yields 0.
    """
    try:
        ds[0][0]
    except Exception:
        return 0
    return 1

def pprint(obj):
    """Print ``obj`` as JSON with 4-space indentation and sorted keys."""
    rendered = json.dumps(obj, indent=4, sort_keys=True)
    print(rendered)

def get_dataseries(json_obj):
    """Return the ``'visual elements'`` entry of a task-6 annotation.

    Two layouts are accepted: ``json_obj['task6_output']`` (checked first)
    and ``json_obj['task6']['output']``. Returns None when neither
    top-level key is present.
    """
    if 'task6_output' in json_obj:
        container = json_obj['task6_output']
    elif 'task6' in json_obj:
        container = json_obj['task6']['output']
    else:
        return None
    return container['visual elements']

def euclid(p1, p2):
    """Euclidean distance between two points given as dicts with 'x' and 'y'.

    Coordinates are passed through ``float`` first, so numeric strings are
    accepted.
    """
    ax, ay = float(p1['x']), float(p1['y'])
    bx, by = float(p2['x']), float(p2['y'])
    squared = (ax - bx) ** 2 + (ay - by) ** 2
    return math.sqrt(squared)

# def box_to_discrete(ds):
#     out = []
#     for it_name in ['first_quartile', 'max', 'min', 'median', 'third_quartile']: 
#         out.append( {'name': it_name, 'x': ds[it_name]['x'], 'y': ds[it_name]['y']} )
#     return out

def box_arr_to_np(ds):
    """Pack the entries of a box-plot dict into a single (1, 8) float row.

    ``ds`` maps a name to a dict holding 'x' and 'y'. For the entry at
    iteration position ``k`` the 'y' value is written to column ``k`` and
    the 'x' value to column ``k + 1``.

    NOTE(review): because the cursor advances by one column per entry while
    two columns are written, every 'x' except the last is overwritten by
    the next entry's 'y'. This is kept exactly as-is since scores depend on
    it; confirm whether a stride of two was intended.
    """
    row = np.zeros((1, 8))
    for offset, key in enumerate(ds):
        entry = ds[key]
        row[0, offset] = float(entry['y'])
        row[0, offset + 1] = float(entry['x'])
    return row

def compare_box(pred_ds, gt_ds, min_dim):
    """Capped city-block cost between a predicted and a ground-truth box.

    Both inputs are converted with ``box_arr_to_np``; the distance is
    divided by 5% of ``min_dim`` and clipped to at most 1. Returns the
    resulting array as produced by ``cdist``.
    """
    pred_vec = box_arr_to_np(pred_ds)
    gt_vec = box_arr_to_np(gt_ds)
    raw = scipy.spatial.distance.cdist(pred_vec, gt_vec, metric='cityblock')
    return np.minimum(1, raw / (min_dim * 0.05))

def scatt_arr_to_np(ds):
    """Convert a sequence of {'x', 'y'} points into an (N, 2) float array.

    Column 0 holds x and column 1 holds y. An empty input yields an array
    of shape (0, 2).
    """
    coords = np.zeros((len(ds), 2))
    for row, point in enumerate(ds):
        coords[row] = (float(point['x']), float(point['y']))
    return coords

def bar_arr_to_np(ds):
    """Encode a bar dict as a (1, 4) row: [y0, x0, y0 + height, x0 + width]."""
    top = float(ds['y0'])
    left = float(ds['x0'])
    bottom = float(ds['height']) + top
    right = float(ds['width']) + left
    return np.array([[top, left, bottom, right]])

def compare_bar(pred_ds, gt_ds, min_dim):
    """Capped city-block cost between a predicted and a ground-truth bar.

    Both bars are converted with ``bar_arr_to_np``; the distance is divided
    by 5% of ``min_dim`` and clipped to at most 1.
    """
    pred_vec = bar_arr_to_np(pred_ds)
    gt_vec = bar_arr_to_np(gt_ds)
    raw = scipy.spatial.distance.cdist(pred_vec, gt_vec, metric='cityblock')
    return np.minimum(1, raw / (min_dim * 0.05))

def compare_scatter(pred_ds, gt_ds, min_dim, gamma, beta):
    """Score predicted scatter points against ground truth.

    When ``check_groups(gt_ds)`` reports ungrouped data, both inputs are
    wrapped into single-element lists. Every (gt group, pred group) pair is
    scored with ``get_score`` on the Euclidean distance matrix, scaled by
    ``min_dim * gamma`` and capped at 1. Groups are then matched with
    ``linear_sum_assignment`` to maximise the total, and the matched sum is
    divided by ``len(gt_ds) * beta``.
    """
    if not check_groups(gt_ds):
        pred_ds = [pred_ds]
        gt_ds = [gt_ds]

    n_gt = len(gt_ds)
    score = np.zeros((n_gt, len(pred_ds)))
    for g_idx, gt_group in enumerate(gt_ds):
        gt_pts = scatt_arr_to_np(gt_group)
        for p_idx, pred_group in enumerate(pred_ds):
            pred_pts = scatt_arr_to_np(pred_group)
            dists = scipy.spatial.distance.cdist(pred_pts, gt_pts, metric='euclidean')
            capped = np.minimum(1, dists / (min_dim * gamma))
            score[g_idx, p_idx] = get_score(capped)

    # Negate so that the minimising solver picks the highest-scoring pairing.
    rows, cols = scipy.optimize.linear_sum_assignment(-score)
    return score[rows, cols].sum() / (float(n_gt) * beta)

def get_score(cost_mat):
    """Turn a cost matrix into a score via optimal assignment.

    The matrix is squared up with ``pad_mat``, the minimum-cost assignment
    is found, and the result is ``1 - total_cost / k`` where ``k`` is the
    side length of the padded matrix.
    """
    square = pad_mat(cost_mat)
    rows, cols = scipy.optimize.linear_sum_assignment(square)
    total_cost = square[rows, cols].sum()
    side = square.shape[0]
    return 1 - (total_cost / side)

def get_cont_recall(p_xs, p_ys, g_xs, g_ys, epsilon):
    """Interval-weighted agreement of a predicted curve with reference points.

    For every reference point ``(g_xs[i], g_ys[i])`` the prediction is
    linearly interpolated at ``g_xs[i]`` (``np.interp`` over ``p_xs``,
    ``p_ys``) and a relative error ``|y - y_interp| / (|y| + epsilon)``,
    capped at 1, is computed. Each point's ``1 - error`` is weighted by half
    the distance between its neighbours (end points: half the distance to
    their single neighbour; a lone point: weight 1).

    Parameters
    ----------
    p_xs, p_ys : 1-D numpy arrays of the predicted curve; the caller in this
        file passes them sorted by x.
    g_xs, g_ys : 1-D numpy arrays of the reference curve, sorted by x.
    epsilon : float added to ``|y|`` in the denominator.

    Returns
    -------
    The weighted mean of ``1 - error``. When every reference x coincides
    (total weight 0) the unweighted mean is returned instead; the previous
    behaviour was 0/0 = NaN, which then broke the assignment step in the
    callers.

    Raises
    ------
    AssertionError if the interval weights do not sum to the x-range.
    IndexError if ``g_xs`` is empty.
    """
    n_pts = g_xs.shape[0]
    total_score = 0
    total_interval = 0
    point_scores = []

    for i in range(n_pts):
        x = g_xs[i]

        if n_pts == 1:
            interval = 1
        elif i == 0:
            interval = (g_xs[i+1] - x) / 2
        elif i == (n_pts - 1):
            interval = (x - g_xs[i-1]) / 2
        else:
            interval = (g_xs[i+1] - g_xs[i-1]) / 2

        y = g_ys[i]
        y_interp = np.interp(x, p_xs, p_ys)
        error = min(1, abs((y - y_interp) / (abs(y) + epsilon)))
        point_scores.append(1 - error)
        total_score += (1 - error) * interval
        total_interval += interval

    if n_pts != 1:
        # Sanity check: the half-interval weights must tile the x-range.
        assert np.isclose(total_interval, g_xs[-1] - g_xs[0])

    if total_interval == 0:
        # Degenerate case: all reference points share one x, so every weight
        # is zero. Weight the points equally instead of dividing by zero.
        return float(sum(point_scores) / len(point_scores))

    return total_score / total_interval

def compare_continuous(pred_ds, gt_ds):
    """F1-style score between a predicted and a ground-truth point series.

    Both series (sequences of {'x', 'y'} dicts) are sorted by x. Two empty
    series score 1.0; exactly one empty series scores 0.0. Otherwise recall
    is ``get_cont_recall`` of the prediction evaluated at the ground-truth
    points, precision is the same with roles swapped, and the harmonic mean
    is returned (0. when precision + recall is zero). ``epsilon`` is 1% of
    the ground-truth y-range.
    """
    def by_x(point):
        return float(point['x'])

    pred_pts = sorted(pred_ds, key=by_x)
    gt_pts = sorted(gt_ds, key=by_x)

    if not pred_pts or not gt_pts:
        # empty matches empty; empty never matches non-empty
        both_empty = (not pred_pts) and (not gt_pts)
        return 1.0 if both_empty else 0.0

    p_xs = np.array([float(pt['x']) for pt in pred_pts])
    p_ys = np.array([float(pt['y']) for pt in pred_pts])
    g_xs = np.array([float(pt['x']) for pt in gt_pts])
    g_ys = np.array([float(pt['y']) for pt in gt_pts])

    epsilon = (g_ys.max() - g_ys.min()) / 100.
    recall = get_cont_recall(p_xs, p_ys, g_xs, g_ys, epsilon)
    precision = get_cont_recall(g_xs, g_ys, p_xs, p_ys, epsilon)

    denom = precision + recall
    if not denom:
        return 0.
    return (2 * precision * recall) / denom

# def norm_edit_dist(s1, s2):
# return editdistance.eval(s1, s2) / float(max(len(s1), len(s2), 1))

def create_dist_mat(pred_seq, gt_seq, compare, beta):
    """Average best-match score of ground-truth groups against predictions.

    Ungrouped inputs (per ``check_groups(gt_seq)``) are wrapped into
    single-element lists. For each ground-truth group, every predicted
    group is compared element-by-element with ``compare(gt_item,
    pred_item)``; the resulting matrix ``mat`` is scored as
    ``get_score(1 - mat / beta)`` and the best score over predicted groups
    is kept (never below 0). The mean over ground-truth groups is returned.
    """
    if not check_groups(gt_seq):
        gt_seq = [gt_seq]
        pred_seq = [pred_seq]

    total = 0
    for gt_group in gt_seq:
        n_gt = len(gt_group)
        best = 0

        for pred_group in pred_seq:
            n_pred = len(pred_group)
            mat = np.full((n_gt, n_pred), -1.)
            for i, j in itertools.product(range(n_gt), range(n_pred)):
                mat[i, j] = compare(gt_group[i], pred_group[j])
            best = max(best, get_score(1 - (mat / beta)))
        total += best

    return total / float(len(gt_seq))

def pad_mat(mat):
    """Zero-pad a 2-D array to a square.

    A square input is returned unchanged (same object). Otherwise a new
    ``max(h, w)``-sided zero array is returned with ``mat`` in its top-left
    corner.
    """
    rows, cols = mat.shape
    if rows == cols:
        return mat
    side = max(rows, cols)
    padded = np.zeros((side, side))
    padded[:rows, :cols] = mat
    return padded

def compare_line_6b(pred_ds, gt_ds):
    """Match predicted lines to ground-truth lines on a square-padded matrix.

    Grouped input: a (len(gt), len(pred)) matrix is zero-padded to a square
    with ``pad_mat`` and filled with ``compare_continuous(gt_line,
    pred_line)``. The best assignment is found and the matched sum is
    divided by the padded side length, so unmatched lines on either side
    lower the score.
    Ungrouped input: the two series are compared directly and index arrays
    ``[0]``/``[0]`` are returned.

    Returns ``(score, row_ind, col_ind)`` with the indices as numpy arrays.
    """
    if not check_groups(gt_ds):
        single = compare_continuous(pred_ds, gt_ds)
        return single, np.array([0]), np.array([0])

    pairwise = pad_mat(np.zeros((len(gt_ds), len(pred_ds))))
    for g_idx, gt_line in enumerate(gt_ds):
        for p_idx, pred_line in enumerate(pred_ds):
            pairwise[g_idx, p_idx] = compare_continuous(gt_line, pred_line)

    # Negate: the solver minimises, we want the highest-scoring pairing.
    row_ind, col_ind = scipy.optimize.linear_sum_assignment(-pairwise)
    matched_mean = pairwise[row_ind, col_ind].sum() / pairwise.shape[0]
    return matched_mean, row_ind, col_ind

def compare_line_6a(pred_ds, gt_ds):
    """Match predicted lines to ground-truth lines and average over ground truth.

    Grouped input: a (len(gt), len(pred)) matrix of
    ``compare_continuous(gt_line, pred_line)`` scores is built, the best
    assignment is found, and the matched sum is divided by ``len(gt_ds)``.
    Ungrouped input: the two series are compared directly with indices
    ``[0]``/``[0]``.

    Returns ``(score, row_ind, col_ind)`` with the indices as plain lists.
    """
    if check_groups(gt_ds):
        n_gt = len(gt_ds)
        pairwise = np.zeros((n_gt, len(pred_ds)))
        for g_idx, gt_line in enumerate(gt_ds):
            for p_idx, pred_line in enumerate(pred_ds):
                pairwise[g_idx, p_idx] = compare_continuous(gt_line, pred_line)

        # Negate: the solver minimises, we want the highest-scoring pairing.
        rows, cols = scipy.optimize.linear_sum_assignment(-pairwise)
        result = pairwise[rows, cols].sum() / n_gt
    else:
        result = compare_continuous(pred_ds, gt_ds)
        rows, cols = np.array([0]), np.array([0])

    return result, np.array(rows).tolist(), np.array(cols).tolist()

def metric_6a(pred_data_series, gt_data_series, gt_type, alpha=1, beta=2, gamma=1, img_dim = [1280.0, 960.0], debug=False):
    """Dispatch the task-6a visual-element score by chart type.

    Parameters
    ----------
    pred_data_series, gt_data_series : dicts holding the visual elements
        under 'boxplots', 'bars', 'scatter points' or 'lines'.
    gt_type : chart type string; matched case-insensitively by substring in
        the order box, bar, scatter, line.
    beta, gamma : scaling factors forwarded to the comparison helpers.
    img_dim : image dimensions; only ``min(img_dim)`` is used. The list
        default is never mutated here.
    alpha, debug : accepted for signature compatibility; unused.

    Returns
    -------
    The match score for the selected chart type.

    Raises
    ------
    Exception("Odd Case") when ``gt_type`` matches none of the known types.
    """
    chart_kind = gt_type.lower()
    if 'box' in chart_kind:
        compare = lambda ds1, ds2: compare_box(ds1, ds2, min(img_dim))
        pred_no_names = pred_data_series['boxplots']
        gt_no_names = gt_data_series['boxplots']
        ds_match_score = create_dist_mat(pred_no_names, gt_no_names, compare, beta)
    elif 'bar' in chart_kind:
        compare = lambda ds1, ds2: compare_bar(ds1, ds2, min(img_dim))
        pred_no_names = pred_data_series['bars']
        gt_no_names = gt_data_series['bars']
        ds_match_score = create_dist_mat(pred_no_names, gt_no_names, compare, beta)
    elif 'scatter' in chart_kind:
        pred_no_names = pred_data_series['scatter points']
        gt_no_names = gt_data_series['scatter points']
        ds_match_score = compare_scatter(pred_no_names, gt_no_names, min(img_dim), gamma, beta)
    elif 'line' in chart_kind:
        pred_no_names = pred_data_series['lines']
        gt_no_names = gt_data_series['lines']
        # The previous call target `compare_line` is not defined among the
        # helpers in this notebook (NameError at runtime). compare_line_6a
        # is the line scorer available here; only its score is needed.
        ds_match_score, _rows, _cols = compare_line_6a(pred_no_names, gt_no_names)
    else:
        raise Exception("Odd Case")

    return ds_match_score

def metric_6a_indv(pred_data_series, gt_data_series, gt_type, alpha=1, beta=2, gamma=1, img_dim = [1280.0, 960.0], debug=False):
    """Line-chart-only 6a score that also reports the line matching.

    Both series arguments are expected to be lists of lists (one inner list
    per line). Returns ``(score, row_ind, col_ind)`` from
    ``compare_line_6a``. Any ``gt_type`` that does not contain 'line'
    (case-insensitive) raises ``Exception("Odd Case")``. The remaining
    keyword parameters are unused.
    """
    if 'line' not in gt_type.lower():
        raise Exception("Odd Case")
    score, gt_idx, pred_idx = compare_line_6a(pred_data_series, gt_data_series)
    return score, gt_idx, pred_idx

def metric_6b_indv(pred_data_series, gt_data_series, gt_type, alpha=1, beta=2, gamma=1, img_dim = [1280.0, 960.0], debug=False):
    """Line-chart-only 6b score that also reports the line matching.

    Both series arguments are expected to be lists of lists (one inner list
    per line). Returns ``(score, row_ind, col_ind)`` from
    ``compare_line_6b``. Any ``gt_type`` that does not contain 'line'
    (case-insensitive) raises ``Exception("Odd Case")``. The remaining
    keyword parameters are unused.
    """
    if 'line' not in gt_type.lower():
        raise Exception("Odd Case")
    score, gt_idx, pred_idx = compare_line_6b(pred_data_series, gt_data_series)
    return score, gt_idx, pred_idx

In [3]:
import warnings
warnings.filterwarnings("ignore")

import metric6a

import infer

import mmcv
import os
from pathlib import Path
import time
import pandas as pd
import argparse
from tqdm import tqdm


def get_results(img_dir, annot_dir, post_proc):
    """Run line extraction on every line-chart image in ``img_dir``.

    For each file in ``img_dir`` the annotation ``<annot_dir>/<stem>.json``
    is loaded; samples whose ``task1`` chart type is not exactly 'line'
    (case-insensitive, stripped) are skipped. The image is then passed to
    ``infer.get_dataseries``.

    Parameters
    ----------
    img_dir, annot_dir : directory paths.
    post_proc : forwarded to ``infer.get_dataseries``.

    Returns
    -------
    list of dicts with keys 'name', 'pred' and 'gt' (the annotated task-6
    lines).

    Raises
    ------
    Whatever ``infer.get_dataseries`` raises: the failing image path is
    printed and the exception is re-raised.
    """
    results = []
    for pname in tqdm(os.listdir(img_dir)):
        sample_name = Path(pname).stem
        img_path = f"{img_dir}/{pname}"
        annot = mmcv.load(f"{annot_dir}/{sample_name}.json")

        charttype = annot['task1']['output']['chart_type']
        if charttype.lower().strip() != 'line':
            continue

        img = mmcv.imread(img_path)
        try:
            pred_ds = infer.get_dataseries(img, annot=None, to_clean=False, post_proc=post_proc, mask_kp_sample_interval=10)
        except Exception as e:
            print('*'*8, f'Exception occured for: {img_path}', '*'*8)
            print('Exception:', e)
            # Fail fast. The former `pred_ds = []` fallback sat after this
            # `raise` and could never execute, so it has been removed.
            raise

        results.append({'name': sample_name, 'pred': pred_ds, 'gt': annot['task6']['output']['visual elements']['lines']})

    return results


def get_metric(results, score_func):
    """Score every sample and collect the matching information.

    Parameters
    ----------
    results : iterable of dicts with keys 'name', 'pred', 'gt' and,
        optionally, 'masks'.
    score_func : callable ``(pred, gt, gt_type="lines")`` returning
        ``(score, row_ind, col_ind)``.

    Returns
    -------
    (DataFrame, list): a frame with columns 'name' and 'score', and a list
    of records holding 'name', 'pred', 'gt', 'gt_ind', 'pred_ind' and
    'masks' ('masks' is None when the sample carries no masks).

    Samples whose scoring raises are reported and skipped in both outputs.
    """
    scores = []
    matches = []
    for sample in results:
        try:
            score, row_ind, col_ind = score_func(sample['pred'], sample['gt'], gt_type="lines")
        except Exception as exc:
            # https://github.com/scipy/scipy/pull/7031 Need this fix in scipy module.
            # Scoring is best-effort: report the failing sample and its
            # cause, then keep going.
            print("Failed to caculate the score on " + sample['name'])
            print('Exception:', exc)
            continue

        scores.append({'name': sample['name'], 'score': score})
        matches.append({
            'name': sample['name'],
            'pred': sample['pred'],
            'gt': sample['gt'],
            'gt_ind': row_ind,
            'pred_ind': col_ind,
            # get_results() produces samples without 'masks'; indexing the
            # key directly used to drop those samples from this list only.
            'masks': sample.get('masks'),
        })

    return pd.DataFrame(scores), matches

def handle_arg_errors(args):
    """Validate that the paths referenced by ``args`` exist.

    Checks, in order: ``args.data_dir``, its ``images/`` and ``annot/``
    sub-directories, ``args.model_config`` and ``args.model_ckpt``.

    Raises
    ------
    FileNotFoundError naming the first missing path. The messages report
    the path that was actually tested; they previously read
    ``args.img_dir`` / ``args.annot_dir``, attributes this function never
    otherwise uses, which turned a missing directory into an AttributeError
    when those attributes were absent.
    """
    images_dir = Path(f"{args.data_dir}/images/")
    annot_dir = Path(f"{args.data_dir}/annot/")

    if not Path(args.data_dir).exists():
        raise FileNotFoundError(f"{args.data_dir} does not exist!")
    elif not images_dir.exists():
        raise FileNotFoundError(f"Image Directory {images_dir} does not exist!")
    elif not annot_dir.exists():
        raise FileNotFoundError(f"Annotation Directory {annot_dir} does not exist!")
    elif not Path(args.model_config).exists():
        raise FileNotFoundError(f"Model config path {args.model_config} does not exist!")
    elif not Path(args.model_ckpt).exists():
        raise FileNotFoundError(f"Model ckpt {args.model_ckpt} does not exist!")


In [4]:
# Input locations. Image files are listed from `image_dir`; each image's
# annotation JSON is looked up by base name inside `annotations_dir`.
# The commented-out pair below is an alternative location kept for reference.
# image_dir = "test_data/image"
# annotations_dir = "test_data/annotation"

image_dir = "../data/test/images"
annotations_dir = "../data/test/annotations"

In [5]:
# Model settings handed to infer.load_model: config file, checkpoint file
# and the device string.
model_config = "lineformer_swin_t_config.py"
model_ckpt = "./iter_3000.pth"
device = "cpu"

In [6]:
# Initialise the `infer` module from the config/checkpoint/device above;
# the inference loop further down calls infer.get_dataseries afterwards.
infer.load_model(model_config, model_ckpt, device)

load checkpoint from local path: ./iter_3000.pth


In [7]:
# Run inference on every image in `image_dir` and pair each prediction with
# the annotated ground-truth lines and the predicted masks.
image_files = os.listdir(image_dir)

result = []

for image_file in tqdm(image_files):
    # splitext removes only the final extension, so base names that contain
    # dots still map to the right "<name>.json" annotation file.
    filename = os.path.splitext(image_file)[0]
    image_file_path = os.path.join(image_dir, image_file)
    annotation_file_path = os.path.join(annotations_dir, filename + ".json")

    img = mmcv.imread(image_file_path)

    try:
        pred_ds, masks = infer.get_dataseries(img, annot=None, to_clean=False, post_proc=True, mask_kp_sample_interval=10, return_masks=True)

        annot = mmcv.load(annotation_file_path)

        gt_ds = annot['task6']['output']['visual elements']['lines']

    except Exception as e:
        print('*'*8, f'Exception occured for: {image_file_path}', '*'*8)
        print('Exception:', e)
        # Fail fast. The former `pred_ds = []` fallback after this `raise`
        # was unreachable, and would have reused `gt_ds`/`masks` from the
        # previous image, so it has been removed.
        raise

    result.append({'name': filename, 'pred': pred_ds, 'gt': gt_ds, 'masks': masks})

100%|██████████| 156/156 [02:40<00:00,  1.03s/it]


In [28]:
# Quick look at the first sample: number of predicted entries, ground-truth
# entries and masks.
len(result[0]['pred']), len(result[0]['gt']), len(result[0]['masks'])

(4, 5, 4)

In [12]:
# Score every sample with the per-line 6a metric. `df_6a` receives the
# per-sample scores; `test_6a` the records including the matched
# ground-truth / prediction indices.
df_6a, test_6a = get_metric(result, score_func=metric_6a_indv)

In [29]:
# Inspect the first scored record: sizes, sample name, mask count and the
# ground-truth / prediction index assignment.
len(test_6a[0]['gt']), len(test_6a[0]['pred']), test_6a[0]['name'], len(test_6a[0]['masks']), test_6a[0]['gt_ind'], test_6a[0]['pred_ind']

(47, 411, 'PMC6328065___19', 4, [0, 1, 2, 3], [1, 0, 2, 3])

In [36]:
# Accumulator for the per-sample, per-line pixel-space values; it is filled
# by the loop over `test_6a` in the following cell.
data_series_values = []

In [37]:
# For every matched (ground-truth line, predicted line) pair, sample the
# predicted line at the ground-truth x positions so both series share the
# same x grid.
#
# The list is (re)created here so that re-running this cell does not append
# a second copy of every sample.
data_series_values = []

for item in test_6a:
    matched_lines = {}

    for i, (gt_idx, pred_idx) in enumerate(zip(item['gt_ind'], item['pred_ind'])):
        gt_ds = sorted(item['gt'][gt_idx], key=lambda p: float(p['x']))
        pred_ds = sorted(item['pred'][pred_idx], key=lambda p: float(p['x']))

        p_xs = np.array([float(ds['x']) for ds in pred_ds])
        p_ys = np.array([float(ds['y']) for ds in pred_ds])
        g_xs = np.array([float(ds['x']) for ds in gt_ds])
        g_ys = np.array([float(ds['y']) for ds in gt_ds])

        # One vectorised call replaces the per-point np.interp loop. With no
        # ground-truth points nothing is interpolated (as before).
        interp_ys = np.interp(g_xs, p_xs, p_ys) if g_xs.size else np.array([])

        # .tolist() yields plain Python floats, which serialise to JSON
        # without relying on numpy scalar types.
        xs = g_xs.tolist()
        gt_px_val = [{'x': x, 'y': y} for x, y in zip(xs, g_ys.tolist())]
        pred_px_val = [{'x': x, 'y': y} for x, y in zip(xs, interp_ys.tolist())]

        matched_lines[f'line{i}'] = {
            'gt_val': gt_px_val,
            'pred_val': pred_px_val,
            'mask': item['masks'][pred_idx],
        }

    data_series_values.append({'name': item['name'], 'data': matched_lines})


In [38]:
# Check that the ground-truth and interpolated prediction lists of the first
# line of the first sample have the same length.
len(data_series_values[0]['data']['line0']['gt_val']), len(data_series_values[0]['data']['line0']['pred_val'])

(47, 47)

In [39]:
def extreme_left_right_pixel_of_line(mask):
    """Locate the left-most and right-most pixels of a mask equal to 255.

    Parameters
    ----------
    mask : 2-D sequence indexable as ``mask[row][col]`` (nested lists or a
        2-D array).

    Returns
    -------
    (left, right) : two dicts ``{'x': col, 'y': row}``. Within the extreme
    column the top-most matching row is reported. A dict is empty when the
    mask contains no 255 pixel (or is empty).

    The right-hand scan now includes column 0; it previously stopped at
    column 1, so a line confined to the first column had no right extreme.
    """
    h = len(mask)
    w = len(mask[0]) if h else 0

    def _first_hit(columns):
        # First 255 pixel in column-major order over the given columns.
        for col in columns:
            for row in range(h):
                if mask[row][col] == 255:
                    return {'x': col, 'y': row}
        return {}

    left = _first_hit(range(w))
    right = _first_hit(range(w - 1, -1, -1))
    return left, right

In [40]:
def tick_block_mapping(ticks, blocks):
    """Pair ticks with text blocks by position.

    The i-th tick is combined with the i-th block into a dict holding the
    tick's 'id', its tick point 'x'/'y', and the block's 'text'. Raises
    IndexError when there are fewer blocks than ticks; surplus blocks are
    ignored.

    Note: an identical definition of this function appears again further
    down in this notebook; whichever cell runs last is the one in effect.
    """
    paired = []
    for idx, tick in enumerate(ticks):
        block = blocks[idx]
        paired.append({
            'id': tick['id'],
            'x': tick['tick_pt']['x'],
            'y': tick['tick_pt']['y'],
            'text': block['text'],
        })
    return paired

In [91]:
def find_min_tick_xy(data):
    """Return the tick point of the y-axis tick with the largest pixel y.

    Parameters
    ----------
    data : annotation dict; ``data['task4']['output']['axes']['y-axis']``
        must be a list of ticks, each with ``tick_pt = {'x', 'y'}``.

    Returns
    -------
    The ``tick_pt`` dict of the y-axis tick whose ``y`` is greatest (the
    first such tick on ties).
    NOTE(review): presumably the lowest tick on the image, assuming image
    coordinates where y grows downward; confirm against the annotation
    format.

    Raises
    ------
    IndexError when there are no y-axis ticks.

    The x-axis ticks are no longer read or sorted here: the result never
    depended on them.
    """
    yaxis_ticks = data['task4']['output']['axes']['y-axis']
    if not yaxis_ticks:
        raise IndexError("no y-axis ticks in annotation")
    lowest = max(yaxis_ticks, key=lambda tick: tick['tick_pt']['y'])
    return lowest['tick_pt']

In [42]:
def find_text_role(idd, task_3):
    """Look up the role of the text element with id ``idd``.

    Scans ``task_3['output']['text_roles']`` in order and returns the
    'role' of the first entry whose 'id' equals ``idd``; returns None when
    no entry matches.
    """
    for entry in task_3['output']['text_roles']:
        if entry['id'] == idd:
            return entry['role']
    return None

In [43]:
def find_xy_ticks(data):
    """Return ``(x_axis_ticks, y_axis_ticks)`` from the task-4 annotation."""
    axes = data['task4']['output']['axes']
    return axes['x-axis'], axes['y-axis']

In [44]:
def find_text_blocks(task_2, annotation=None):
    """Return the task-2 text blocks and the pixel y of the first x-axis tick.

    Parameters
    ----------
    task_2 : the ``task2`` section of an annotation
        (``task_2['output']['text_blocks']`` is returned as-is).
    annotation : optional full annotation dict supplying
        ``['task4']['output']['axes']['x-axis']``. When omitted, the
        notebook-global ``data`` is used, which is what this function
        silently relied on before; pass it explicitly to avoid depending on
        whichever annotation was loaded last.

    Returns
    -------
    (text_blocks, origin_y) where ``origin_y`` is ``tick_pt['y']`` of the
    first listed x-axis tick.

    Raises
    ------
    IndexError when there are no x-axis ticks; NameError when ``annotation``
    is omitted and no global ``data`` exists.
    """
    text_blocks = task_2['output']['text_blocks']

    if annotation is None:
        # Backward-compatible fallback to the module-level `data`.
        annotation = data

    xaxis_ticks = annotation['task4']['output']['axes']['x-axis']
    origin_y = xaxis_ticks[0]['tick_pt']['y']

    return text_blocks, origin_y

In [45]:
def find_xy_blocks(data):
    """Split tick-label text blocks into x-axis and y-axis groups.

    Everything is read from the ``data`` argument (tasks 2, 3 and 4); the
    function no longer depends on a notebook-global annotation being in
    sync with its argument.

    A text block is kept when its role (task 3, first entry per id) is
    'tick_label'. It is assigned to the x-axis group when its polygon 'y0'
    is at or beyond the pixel y of the first x-axis tick, otherwise to the
    y-axis group.

    Returns
    -------
    (xaxis_blocks, yaxis_blocks) : lists of text-block dicts in their
    original order.

    Raises
    ------
    IndexError when the annotation has no x-axis ticks.
    """
    text_blocks = data['task2']['output']['text_blocks']
    origin_y = data['task4']['output']['axes']['x-axis'][0]['tick_pt']['y']

    # Build the id -> role lookup once instead of rescanning the role list
    # for every block; setdefault keeps the first role listed for an id.
    role_by_id = {}
    for entry in data['task3']['output']['text_roles']:
        role_by_id.setdefault(entry['id'], entry.get('role'))

    xaxis_blocks, yaxis_blocks = [], []
    for block in text_blocks:
        if role_by_id.get(block['id']) != 'tick_label':
            continue

        if block['polygon']['y0'] >= origin_y:
            xaxis_blocks.append(block)
        else:
            yaxis_blocks.append(block)

    return xaxis_blocks, yaxis_blocks
    

In [46]:
def tick_block_mapping(ticks, blocks):
    """Combine each tick with the text block at the same list position.

    Returns one dict per tick with the tick's 'id', the tick point's 'x'
    and 'y', and the 'text' of the block at the same index. Fewer blocks
    than ticks raises IndexError; extra blocks are ignored.

    Note: this repeats a definition given earlier in the notebook with the
    same name and behaviour.
    """
    def _pair(idx):
        tick = ticks[idx]
        block = blocks[idx]
        tick_pt = tick['tick_pt']
        return {'id': tick['id'], 'x': tick_pt['x'], 'y': tick_pt['y'], 'text': block['text']}

    return [_pair(idx) for idx in range(len(ticks))]

In [77]:
def find_per_pixel_val(xticks_blocks_mapping, yticks_blocks_mapping):
    """Estimate the data-value change per pixel along each axis.

    Only the first two entries of each mapping are used. Each entry's
    'text' is cleaned (leading/trailing '%' stripped, commas and spaces
    removed) and parsed with ``float``; the absolute value difference is
    divided by the absolute pixel distance between the two ticks ('x' for
    the x-axis, 'y' for the y-axis).

    Returns ``(unit_px_val_x, unit_px_val_y)``.

    Raises IndexError with fewer than two entries on an axis, ValueError
    for a label that is not numeric after cleaning, and ZeroDivisionError
    when both ticks share the same pixel coordinate.
    """
    def _clean(label):
        return label.strip('%').replace(',', '').replace(' ', '')

    y_first, y_second = yticks_blocks_mapping[0], yticks_blocks_mapping[1]
    x_first, x_second = xticks_blocks_mapping[0], xticks_blocks_mapping[1]

    y_texts = (_clean(y_first['text']), _clean(y_second['text']))
    x_texts = (_clean(x_first['text']), _clean(x_second['text']))

    y_pixel_gap = abs(y_first['y'] - y_second['y'])
    x_pixel_gap = abs(x_first['x'] - x_second['x'])

    y_value_gap = abs(float(y_texts[0]) - float(y_texts[1]))
    x_value_gap = abs(float(x_texts[0]) - float(x_texts[1]))

    unit_px_val_y = y_value_gap / y_pixel_gap
    unit_px_val_x = x_value_gap / x_pixel_gap

    return unit_px_val_x, unit_px_val_y

In [48]:
def find_ticks_blocks_mapping(data):
    """Pair axis ticks with their tick-label text blocks for both axes.

    X-axis blocks are ordered by polygon 'x0' and x-axis ticks by tick
    point 'x' (ascending). Y-axis blocks are ordered by polygon 'y0' and
    y-axis ticks by tick point 'y' (descending). The ordered lists are then
    paired positionally with ``tick_block_mapping``.

    Returns ``(xticks_blocks_mapping, yticks_blocks_mapping)``.
    """
    xaxis_blocks, yaxis_blocks = find_xy_blocks(data)
    xaxis_ticks, yaxis_ticks = find_xy_ticks(data)

    x_blocks_ordered = sorted(xaxis_blocks, key=lambda blk: blk['polygon']['x0'])
    y_blocks_ordered = sorted(yaxis_blocks, key=lambda blk: blk['polygon']['y0'], reverse=True)
    x_ticks_ordered = sorted(xaxis_ticks, key=lambda tck: tck['tick_pt']['x'])
    y_ticks_ordered = sorted(yaxis_ticks, key=lambda tck: tck['tick_pt']['y'], reverse=True)

    return (
        tick_block_mapping(x_ticks_ordered, x_blocks_ordered),
        tick_block_mapping(y_ticks_ordered, y_blocks_ordered),
    )

In [69]:
# Display the first assembled record (name plus per-line gt/pred values).
data_series_values[0]

{'name': 'PMC6328065___19',
 'data': {'line0': {'gt_val': [{'x': 48.64285714285714, 'y': 251.075},
    {'x': 51.89285714285714, 'y': 264.575},
    {'x': 61.39285714285714, 'y': 273.825},
    {'x': 65.39285714285714, 'y': 246.825},
    {'x': 74.39285714285714, 'y': 122.32499999999999},
    {'x': 79.64285714285714, 'y': 90.07499999999999},
    {'x': 82.89285714285714, 'y': 296.575},
    {'x': 84.39285714285714, 'y': 249.575},
    {'x': 85.39285714285714, 'y': 339.325},
    {'x': 87.14285714285714, 'y': 302.825},
    {'x': 91.64285714285714, 'y': 283.825},
    {'x': 95.39285714285714, 'y': 270.075},
    {'x': 101.14285714285714, 'y': 254.575},
    {'x': 107.64285714285714, 'y': 236.325},
    {'x': 108.64285714285714, 'y': 220.325},
    {'x': 112.89285714285714, 'y': 212.825},
    {'x': 115.39285714285714, 'y': 220.825},
    {'x': 120.89285714285714, 'y': 241.325},
    {'x': 123.64285714285714, 'y': 231.325},
    {'x': 129.64285714285714, 'y': 219.825},
    {'x': 134.14285714285714, 'y': 2

In [95]:
def find_dataseries_value(pred_ds, gt_ds, min_xy_ticks, unit_px_val_x, unit_px_val_y):
    """Convert predicted pixel points into data values for one series.

    Each point's absolute pixel offset from ``min_xy_ticks`` ('x' and 'y')
    is multiplied by the per-pixel value of the matching axis.

    NOTE(review): the reference tick's own label value is not added, so the
    result equals the data value only if that tick represents 0 on both
    axes; confirm this is intended.

    Returns ``{'data': [{'x', 'y'}, ...], 'name': gt_ds['name']}``.
    """
    series_name = gt_ds['name']

    converted = [
        {
            'x': unit_px_val_x * abs(pt['x'] - min_xy_ticks['x']),
            'y': unit_px_val_y * abs(pt['y'] - min_xy_ticks['y']),
        }
        for pt in pred_ds
    ]

    return {'data': converted, 'name': series_name}
        

In [99]:
# Convert every sample's interpolated pixel values into data values and
# write the prediction and the annotated data series as JSON files.
PRED_OUT_DIR = "test_data/pred_dataseries/"
GT_OUT_DIR = "test_data/gt_dataseries/"

# Create the output folders up front; otherwise every open(..., "w") below
# fails on a fresh checkout and is only reported as a per-sample "Error".
os.makedirs(PRED_OUT_DIR, exist_ok=True)
os.makedirs(GT_OUT_DIR, exist_ok=True)

for sample in tqdm(data_series_values):
    filename = sample['name']

    annotation_file = os.path.join(annotations_dir, filename)
    annotation_file += '.json'

    save_pred_json_file = os.path.join(PRED_OUT_DIR, filename + '.json')
    save_gt_json_file = os.path.join(GT_OUT_DIR, filename + '.json')

    # Keep the name `data`: find_text_blocks reads this module-level
    # variable when no annotation is passed to it explicitly.
    with open(annotation_file, 'r') as f:
        data = json.load(f)

    dataseries = data['task6']['output']['data series']

    try:
        min_xy_ticks = find_min_tick_xy(data)

        xticks_blocks_mapping, yticks_blocks_mapping = find_ticks_blocks_mapping(data)

        unit_px_val_x, unit_px_val_y = find_per_pixel_val(xticks_blocks_mapping, yticks_blocks_mapping)

        num_of_lines = len(sample['data'])

        pred_dataseries = []

        for line in range(num_of_lines):
            pred_ds = sample['data'][f'line{line}']['pred_val']
            # NOTE(review): the i-th matched line is paired with the i-th
            # annotated data series; confirm that the two orderings agree.
            gt_ds = dataseries[line]

            pred_line_val = find_dataseries_value(pred_ds, gt_ds, min_xy_ticks, unit_px_val_x, unit_px_val_y)

            pred_dataseries.append(pred_line_val)

        with open(save_pred_json_file, "w") as json_file:
            json.dump(pred_dataseries, json_file, indent=4)

        with open(save_gt_json_file, "w") as json_file:
            json.dump(dataseries, json_file, indent=4)

    except Exception as e:
        # Best effort: charts with non-numeric tick labels or too few ticks
        # are reported and skipped.
        print(f"Error: {filename}, {e}")
    
      
    

100%|██████████| 156/156 [00:00<00:00, 770.37it/s]

Error: PMC6220549___4, could not convert string to float: '2013Q4'
Error: PMC6315182___g0011, could not convert string to float: '10^{-1}'
Error: PMC6210558___11, could not convert string to float: 'ESD'
Error: PMC5943383___3_HTML, could not convert string to float: '20:36:04'
Error: PMC6035401___3_HTML, could not convert string to float: 'Top-10'
Error: PMC6052132___10, list index out of range
Error: PMC6035401___2_HTML, could not convert string to float: 'AUC50'
Error: PMC6213194___10, could not convert string to float: '10^{-13}'
Error: PMC3812033___1, list index out of range
Error: PMC6314626___g013, could not convert string to float: 'Pre-fight'
Error: PMC6220549___2, could not convert string to float: '2013Q4'
Error: PMC5882956___5_HTML, list index out of range
Error: PMC3324550___g004, list index out of range
Error: PMC6027332___2, list index out of range
Error: PMC5982203___13, could not convert string to float: 'range0'
Error: PMC3338675___g007, could not convert string to flo


