In [1]:
import os
from glob import glob
from openai import OpenAI
import json
import pandas as pd 
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient
import concurrent.futures
import re
import numpy as np
from fuzzywuzzy import fuzz
import xml.etree.ElementTree as ET



model = "gpt-4o"

# The OpenAI client gets its own name: a later cell in this notebook binds the
# generic name `client` to a Label Studio client, which would silently break
# get_response() for any call made after that cell has run.
openai_client = OpenAI()


def get_response(prompt):
  """Send `prompt` as a single user message and return the decoded JSON reply.

  The request uses the module-level `model`, JSON response format and
  temperature 0.

  Args:
    prompt: Full prompt text sent as the only chat message.

  Returns:
    The first choice's message content parsed with `json.loads`.

  Raises:
    json.JSONDecodeError: if the returned content is not valid JSON.
  """
  response = openai_client.chat.completions.create(
    model=model,
    messages=[{"role": "user", "content": prompt}],
    response_format={"type": "json_object"},
    temperature=0,
  )
  return json.loads(response.choices[0].message.content)


def get_table_df(tab_dict):
    """Render a table dictionary as a text grid wrapped in <table> tags.

    Args:
        tab_dict: Mapping with "row_count", "column_count" and "cells"; each
            cell carries "row_index", "column_index" and "content".

    Returns:
        The grid rendered via `DataFrame.to_string(index=False)`, surrounded
        by '<table> ' and ' </table>'.
    """
    n_rows = tab_dict["row_count"]
    n_cols = tab_dict["column_count"]

    # Positions without a cell keep a single-space placeholder.
    grid = [[" "] * n_cols for _ in range(n_rows)]

    for cell in tab_dict["cells"]:
        grid[cell["row_index"]][cell["column_index"]] = cell["content"]

    rendered = pd.DataFrame(grid).to_string(index=False)
    return '<table> ' + rendered + ' </table>'

def perform_ocr(image_data):
    '''Analyze image_data with the "prebuilt-layout" model and return the result.

    The endpoint and key are read from the AZURE_OCR_ENDPOINT and
    AZURE_OCR_KEY environment variables; the call blocks until the poller
    returned by begin_analyze_document has finished.
    '''
    ocr_client = DocumentAnalysisClient(
        endpoint=os.environ["AZURE_OCR_ENDPOINT"],
        credential=AzureKeyCredential(os.environ["AZURE_OCR_KEY"]),
    )
    return ocr_client.begin_analyze_document("prebuilt-layout", image_data).result()


def binary_search(arr, target):
    """Locate `target` in the ascending sequence `arr`.

    Returns:
        The index at which a value equal to `target` was found; otherwise the
        smallest index whose value is greater than `target`; -1 when every
        value is smaller than `target` (or `arr` is empty).
    """
    lo, hi = 0, len(arr) - 1
    candidate = -1  # best "first value greater than target" index seen so far

    while lo <= hi:
        mid = (lo + hi) // 2
        value = arr[mid]

        if value == target:
            return mid

        if value > target:
            # Remember this position and keep looking further left.
            candidate, hi = mid, mid - 1
        else:
            lo = mid + 1

    return candidate


def get_text(text_feilds):
    """Concatenate paragraph contents, one per line, tagging headings.

    Entries whose "role" is "title", "sectionHeading" or "pageHeader" are
    wrapped as "<H1> ... </H1> "; every entry is followed by a newline.

    Args:
        text_feilds: Sequence of dicts with "role" and "content" keys.

    Returns:
        The assembled string ("" for an empty sequence).
    """
    heading_roles = ["title", "sectionHeading", "pageHeader"]
    pieces = []

    for field in text_feilds:
        if field["role"] in heading_roles:
            pieces.append("<H1> " + field["content"] + " </H1> ")
        else:
            pieces.append(field["content"])
        pieces.append("\n")

    return "".join(pieces)

def get_docs(input_file_name, folder_path = ""):
  """OCR the claim image and every other PNG in `folder_path`.

  Args:
    input_file_name: File name of the claim document inside `folder_path`.
    folder_path: Folder holding the claim image and the supporting PNGs.

  Returns:
    (claim text, supporting-documents text, supporting-documents page
    contents) as produced by process_files_in_parallel.
  """
  png_pattern = f"{folder_path}/*.png"
  claim_path = f"{folder_path}/{input_file_name}"

  # The per-page contents of the claim document are not needed by callers.
  claim_doc_txt, _ = process_files_in_parallel([claim_path])

  # Every PNG in the folder except the claim document itself.
  supporting_paths = list(set(glob(png_pattern)) - set(glob(claim_path)))
  support_docs_txt, support_docs_contents = process_files_in_parallel(supporting_paths)

  return claim_doc_txt, support_docs_txt, support_docs_contents

def get_df_string(temp_df):
    """Return `temp_df` as CSV text (header and index included).

    Leading and trailing newlines of the CSV are stripped first, then every
    remaining line is emitted followed by exactly one newline.
    """
    csv_rows = temp_df.to_csv(header=True, index=True).strip('\n').split('\n')
    return "".join(row + "\n" for row in csv_rows)

def filter_by_offset(text_feilds, tables):
    """Interleave paragraph text and rendered tables in document order.

    Paragraphs that fall inside a table span are dropped (the table rendering
    from get_table_df replaces them); paragraphs before, between and after
    tables are rendered with get_text.

    Fixes relative to the previous implementation:
      * the first paragraph after a table is no longer skipped when another
        table follows (the old code resumed at "end index + 1");
      * a table that starts after the last paragraph no longer discards the
        text that precedes it (binary_search's -1 is mapped to "past the end").

    Args:
        text_feilds: Paragraph dicts with "spans" (first span's "offset" is
            used), "role" and "content". Assumed to be ordered by ascending
            offset, since the offsets are binary-searched - TODO confirm
            against the OCR output.
        tables: Table dicts with "spans" plus the keys get_table_df needs.

    Returns:
        One string in which every emitted piece is followed by a newline.
    """
    text_offsets = [field["spans"][0]["offset"] for field in text_feilds]
    n_fields = len(text_offsets)

    def first_index_at_or_after(offset):
        # binary_search yields -1 when every paragraph starts before `offset`;
        # treat that as "one past the last paragraph" so slicing stays natural.
        idx = binary_search(text_offsets, offset)
        return n_fields if idx == -1 else idx

    output = []
    next_field = 0  # index of the first paragraph not yet emitted or skipped

    for tab in tables:
        for span in sorted(tab["spans"], key=lambda s: s["offset"]):
            span_start = span["offset"]
            span_end = span_start + span["length"]

            idx_start = first_index_at_or_after(span_start)
            idx_end = first_index_at_or_after(span_end)

            # Text between the previous table (or document start) and this span.
            output.append(get_text(text_feilds[next_field:idx_start]))

            # Resume with the first paragraph that starts at/after the span end.
            next_field = idx_end

        output.append(get_table_df(tab))

    # Trailing text after the last table. With no tables at all the whole text
    # is emitted, even when it is empty, matching the previous behaviour.
    if next_field < n_fields or len(tables) == 0:
        output.append(get_text(text_feilds[next_field:]))

    return "".join(item + "\n" for item in output)


def extract_lines(ocr_result, page_num):
    """Flatten every OCR line into a tuple with percentage coordinates.

    Polygon points are truncated to integers, then the x values are scaled by
    100 / page width and the y values by 100 / page height.

    Args:
        ocr_result: Object exposing `.pages`; each page has `width`, `height`
            and `lines`, each line has `polygon`, `content` and `spans`.
        page_num: Page number stored in every returned tuple (the same value
            is used for all pages of `ocr_result`).

    Returns:
        List of tuples (page_num, content, start_x, start_y, end_x, end_y,
        first span offset, first span length, page width, page height).
    """
    records = []

    for page in ocr_result.pages:
        page_w = page.width
        page_h = page.height

        for line in page.lines:
            corners = [(int(pt.x), int(pt.y)) for pt in line.polygon]
            # Point 0 gives the start; the end x comes from point 1 and the
            # end y from point 2.
            x_start, y_start = corners[0]
            x_end = corners[1][0]
            y_end = corners[2][1]
            first_span = line.spans[0]

            records.append((
                page_num,
                line.content,
                x_start*100/page_w,
                y_start*100/page_h,
                x_end*100/page_w,
                y_end*100/page_h,
                first_span.offset,
                first_span.length,
                page_w,
                page_h,
            ))

    return records


def get_df_lines(text_lines):
    """Build the per-line DataFrame used to locate evidence on a page.

    Rows are sorted by page, then by start y, then by start x. `line_no` is
    the rank of the line's `offset`, and `content` is the string
    "<page>||<line_no>||<line text>".

    Args:
        text_lines: Iterable of 10-tuples as produced by extract_lines.

    Returns:
        The sorted DataFrame with the extra "line_no" and "content" columns.
    """
    column_names = [
        "page", "line", "line_start_x",
        "line_start_y", "line_end_x", "line_end_y",
        "offset", "length", "width", "height",
    ]

    ordered = pd.DataFrame(text_lines, columns=column_names).sort_values(
        by=["page", "line_start_y", "line_start_x"]
    )

    ordered["line_no"] = ordered["offset"].rank()

    def tag_line(row):
        # Page and line number are embedded so they can be split back out later.
        return f'{row["page"]}||{row["line_no"]}||{row["line"]}'

    ordered["content"] = ordered.apply(tag_line, axis=1)
    return ordered



def get_outputs_processed(file_path, page_num):
    """OCR one file and return its text plus a per-line DataFrame.

    Args:
        file_path: Path of the file to analyze (read in binary mode).
        page_num: Page number recorded on every extracted line.

    Returns:
        (text, lines_df): the output of filter_by_offset over the result's
        paragraphs and tables, and the DataFrame from get_df_lines.
    """
    with open(file_path, 'rb') as handle:
        payload = handle.read()

    endpoint = os.environ["AZURE_OCR_ENDPOINT"]
    credential = AzureKeyCredential(os.environ["AZURE_OCR_KEY"])
    ocr_client = DocumentAnalysisClient(endpoint=endpoint, credential=credential)

    # Blocks until the layout analysis has finished.
    analysis = ocr_client.begin_analyze_document("prebuilt-layout", payload).result()

    analysis_dict = analysis.to_dict()
    page_text = filter_by_offset(analysis_dict["paragraphs"], analysis_dict["tables"])

    lines_df = get_df_lines(extract_lines(analysis, page_num))

    return page_text, lines_df


def natural_sort_key(s):
    """Key function for natural sorting.

    Splits `s` into alternating text and digit runs so that "2.png" sorts
    before "10.png". Text runs are lower-cased; digit runs become ints.

    Args:
        s: The string (typically a file path) to build a key for.

    Returns:
        A list alternating str, int, str, ... starting and ending with a str.
    """
    # Raw string: '\d' inside a normal literal is an invalid escape sequence.
    chunks = re.split(r'(\d+)', s)
    # With one capturing group, re.split puts the captured digit runs at the
    # odd positions. Converting by position (not by isdigit()) keeps text such
    # as "²" from being passed to int() and keeps key types aligned.
    return [int(chunk) if idx % 2 else chunk.lower() for idx, chunk in enumerate(chunks)]

def process_files_in_parallel(file_paths):
    """OCR several files concurrently and collect the results in page order.

    Files are naturally sorted, numbered from 0 and handed to
    get_outputs_processed on a thread pool. A file whose processing raises is
    reported on stdout and kept as a placeholder page (content None, empty
    DataFrame, width/height None) so page numbering stays aligned with the
    sorted file list. Previously such a placeholder crashed this function
    with an IndexError when the page size was read from the empty DataFrame.

    Args:
        file_paths: Iterable of file paths to process.

    Returns:
        (op_text, page_contents): the texts of all successfully processed
        pages joined with newlines, and one dict per page with the keys
        "page_no" (1-based), "df", "content", "file_path", "width", "height".
    """
    sorted_file_paths = sorted(file_paths, key=natural_sort_key)

    page_mapping = dict(enumerate(sorted_file_paths))
    print("page_mapping = ", page_mapping)

    results = [None] * len(sorted_file_paths)

    # Use ThreadPoolExecutor to run get_outputs_processed in parallel
    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_to_index = {
            executor.submit(get_outputs_processed, file, page_num): page_num
            for page_num, file in page_mapping.items()
        }

        for future in concurrent.futures.as_completed(future_to_index):
            page_num = future_to_index[future]
            try:
                op, df_result = future.result()
                results[page_num] = (op, df_result)
            except Exception as exc:
                print(f'File at index {page_num} generated an exception: {exc}')
                # Maintain order by adding empty results in case of exception
                results[page_num] = (None, pd.DataFrame())

    op_text = '\n'.join(str(op) for op, _ in results if op is not None)

    page_contents = []
    for pgnum, (op, df_page) in enumerate(results):
        if df_page.empty:
            # Failed page or page without any OCR line: no size is available.
            width = height = None
        else:
            first_row = df_page.iloc[0]
            width = first_row["width"]
            height = first_row["height"]

        page_contents.append({
            "page_no": pgnum + 1,
            "df": df_page,
            "content": op,
            "file_path": sorted_file_paths[pgnum],
            "width": width,
            "height": height,
        })

    return op_text, page_contents

def find_similar_lines_to_context(context_sentence, lines, threshold = 60):
    '''Find the run of lines around the best fuzzy match for context_sentence.

    Every line is scored with fuzz.ratio against the lower-cased sentence.
    Starting from the best-scoring line, the window grows in both directions
    while scores stay above `threshold`. As in the original implementation,
    the first neighbour that fails the threshold on each side is included.

    Fixes: when the matching run reached the first line, the start index
    became -1 and the slice wrapped around to the end of `lines`, returning an
    empty window; an empty `lines` crashed inside np.argmax.

    Args:
        context_sentence: Sentence to look for.
        lines: Iterable of strings shaped "<page>||<line_no>||<text>".
        threshold: Score a neighbouring line must exceed to extend the window.

    Returns:
        (sim_page_lines, max_score): a list of [page, line_no] string pairs
        for the window, and the best score (([], 0) when `lines` is empty).
    '''
    all_lines = list(lines)  # plain list so slicing is purely positional
    target = context_sentence.lower()
    scores = [fuzz.ratio(target, line.lower()) for line in all_lines]

    if not scores:
        return [], 0

    sim_idx = int(np.argmax(scores))
    max_score = scores[sim_idx]

    # Walk left while lines still match, then clamp so the slice cannot wrap.
    min_idx = sim_idx
    while min_idx >= 0 and scores[min_idx] > threshold:
        min_idx -= 1
    min_idx = max(min_idx, 0)

    # Walk right from the line after the best match.
    max_idx = min(sim_idx + 1, len(scores) - 1)
    while max_idx < len(scores) and scores[max_idx] > threshold:
        max_idx += 1

    sim_lines = all_lines[min_idx:max_idx + 1]
    sim_page_lines = [line.split("||")[:2] for line in sim_lines]

    return sim_page_lines, max_score


def get_context_bbox(pages_df, context_sentence, lines):
    '''Return bounding boxes for the lines that best match context_sentence.

    Args:
        pages_df: Per-line DataFrame with "page", "line_no" and the four
            line_start/line_end coordinate columns.
        context_sentence: Sentence to locate.
        lines: The "<page>||<line_no>||<text>" strings for the same lines.

    Returns:
        (bboxes, pages, line_nos, max_score). Each bbox is the 8-value list
        [x0, y0, x1, y0, x1, y1, x0, y1]. When no row matches, the
        placeholders [[0] * 8], [0], [0] are returned with the score.
    '''
    sim_page_lines, max_score = find_similar_lines_to_context(context_sentence, lines)

    # One selection per (page, line_no) pair; both are parsed back from text.
    matched_frames = [
        pages_df.loc[
            (pages_df["page"] == float(page_ln[0]))
            & (pages_df["line_no"] == float(page_ln[1]))
        ]
        for page_ln in sim_page_lines
    ]
    df_context = pd.concat([pd.DataFrame()] + matched_frames)

    if df_context.shape[0] == 0:
        return [[0] * 8], [0], [0], max_score

    def corner_list(row):
        left, right = row["line_start_x"], row["line_end_x"]
        top, bottom = row["line_start_y"], row["line_end_y"]
        return [left, top, right, top, right, bottom, left, bottom]

    df_context["bbox"] = df_context.apply(corner_list, axis=1)
    df_context = df_context.sort_values(by=["page", "line_start_y", "line_start_x"])

    return (
        df_context["bbox"].values.tolist(),
        df_context["page"].tolist(),
        df_context["line_no"].tolist(),
        max_score
    )


def get_evidence_bbox(supporting_sentence, page_contents):
    """Pick the page that best matches supporting_sentence and its boxes.

    Every page is scored with get_context_bbox; the page with the highest
    score wins. Pages whose DataFrame is empty or has no "content" column are
    skipped with score -1 and a placeholder box instead of raising KeyError.

    Args:
        supporting_sentence: Sentence to locate in the supporting documents.
        page_contents: List of page dicts with "df", "file_path", "width"
            and "height".

    Returns:
        (file_path, bboxes, page number (1-based), width, height) of the
        best-matching page.

    Raises:
        ValueError: if `page_contents` is empty.
    """
    if not page_contents:
        raise ValueError("page_contents is empty: no pages to search for evidence")

    pages_bbox_scores = []
    pages_bboxes = []

    for page in page_contents:
        curr_pg_df = page["df"]

        if curr_pg_df.empty or "content" not in curr_pg_df.columns:
            # Nothing to match on this page; -1 keeps it below any real score.
            pages_bbox_scores.append(-1)
            pages_bboxes.append([[0] * 8])
            continue

        bboxes_op = get_context_bbox(curr_pg_df, supporting_sentence, curr_pg_df["content"])

        pages_bbox_scores.append(bboxes_op[-1])
        pages_bboxes.append(bboxes_op[0])

    max_pg = int(np.argmax(pages_bbox_scores))
    max_page_contents = page_contents[max_pg]

    file_path = max_page_contents["file_path"]
    width = max_page_contents["width"]
    height = max_page_contents["height"]
    max_bbox = pages_bboxes[max_pg]

    print("pages_bbox_scores = ", pages_bbox_scores)
    print(file_path)

    return file_path, max_bbox, max_pg+1, width, height

def get_response_claim(support_docs, claim_doc, user_prompt = 'here is a claim information where the claim for DME was denied because payer needed additional information: ', num_reasons = 10):
  """Ask the model for ranked reasons why the denied claim should be paid.

  The prompt is user_prompt + claim_doc + supporting documents + fixed
  instructions. The previous prompt asked for the "top 10 reasons" but then
  described the JSON keys "for each of the 5 reasons"; both counts now come
  from `num_reasons`.

  Args:
    support_docs: Text of the supporting documents.
    claim_doc: Text of the denied claim.
    user_prompt: Introductory sentence placed before the claim text.
    num_reasons: Number of reasons requested from the model.

  Returns:
    Whatever get_response returns for the assembled prompt.
  """
  instructions = (
      f' Please give me the top {num_reasons} reasons (most important being the first and then the sorted rank order)'
      ' why this claim should not be denied based on the supporting documents as a JSON.'
      f' The JSON keys will be Reason, supporting_sentence, type_of_document for each of the {num_reasons} reasons.'
      ' DONOT paraphrase or change the wording of the supporting sentences, return them as it is.'
      ' Ensure that supporting_sentence contains the exact wordings from the supporting documents.'
      ' The type of document can be any of '
  )
  document_types = (
      "Prescription/Order Certificate of Medical Necessity (CMN) Proof of Delivery (POD)"
      " Letter of Medical Necessity (LMN) Explanation of Benefits (EOB)"
      " Physician's Notes/Progress Reports Letter of Authorization (LOA)"
      " Invoice/Statement Referral Form ABN Test results"
  )

  prompt = (
      user_prompt + claim_doc
      + ' Here are a set of supporting documents: ' + support_docs
      + instructions + document_types
  )

  return get_response(prompt)


def convert_bbox_format(bbox):
    """Convert an 8-value corner list into an x/y/width/height dictionary.

    Args:
        bbox: Exactly eight numbers (x1, y1, x2, y2, x3, y3, x4, y4).

    Returns:
        {"x": x1, "y": y1, "width": x2 - x1, "height": y4 - y1}.
    """
    # Only the first corner, the second x and the fourth y are needed.
    left, top, right, _, _, _, _, bottom = bbox

    return {
        "x": left,
        "y": top,
        "width": right - left,
        "height": bottom - top,
    }

def get_bbox_label_studio(max_bbox):
    """Convert every corner list in `max_bbox` with convert_bbox_format.

    Returns:
        A new list holding one converted dictionary per input box, in order.
    """
    return [convert_bbox_format(corner_list) for corner_list in max_bbox]

def get_appeals_evidence(reason_dict_list, support_docs_contents):
    """Locate the supporting sentence of every reason in the documents.

    Args:
        reason_dict_list: Dicts with "supporting_sentence" and "Reason" keys.
        support_docs_contents: Page contents passed to get_evidence_bbox.

    Returns:
        One dict per reason with the sentence, the reason, the image path of
        the matched page, the converted boxes, the page number and the
        original page width and height.
    """
    evidences = []

    for reason_entry in reason_dict_list:
        sentence = reason_entry["supporting_sentence"]
        reason = reason_entry["Reason"]

        located = get_evidence_bbox(sentence, support_docs_contents)
        image_file_path, raw_bboxes, page_num, orig_width, orig_height = located

        evidences.append({
            "supporting_sentence": sentence,
            "supporting_reason": reason,
            "image_file_path": image_file_path,
            "bbox": get_bbox_label_studio(raw_bboxes),
            "page_num": page_num,
            "orig_width": orig_width,
            "orig_height": orig_height,
        })

    return evidences


def get_results_formatted(results):
    """Turn flattened evidence records into rectangle-label result dicts.

    Args:
        results: Records with "res" (x, y, width, height, orig_width,
            orig_height), "supporting_reason" and "image_file_path".

    Returns:
        (formatted, file_paths): the result dicts, with ids "result_1",
        "result_2", ... in input order, and the matching image paths.
    """
    formatted = []
    file_paths = []

    for position, record in enumerate(results, start=1):
        box = record["res"]

        formatted.append({
            "id": "result_" + str(position),
            "original_width": box["orig_width"],
            "original_height": box["orig_height"],
            "image_rotation": 0,
            "from_name": "label",
            "to_name": "image",
            "type": "rectanglelabels",
            "value": {
                "x": box["x"],
                "y": box["y"],
                "width": box["width"],
                "height": box["height"],
                "rotation": 0,
                "rectanglelabels": [record["supporting_reason"]],
            },
        })
        file_paths.append(record["image_file_path"])

    return (formatted, file_paths)








In [2]:
# Label Studio API token, taken from the LABEL_STUDIO_ACCESS_KEY environment
# variable (raises KeyError if it is not set).
access_key = os.environ["LABEL_STUDIO_ACCESS_KEY"]

In [3]:
import os
# from s3_upload import create_presigned_post, generate_presigned_url, copy_file_to_s3
import boto3
from botocore.config import Config
from botocore.exceptions import ClientError


  

In [4]:
def copy_file_to_s3(source_file_path, bucket_name, destination_path, expiration=900):
    """Upload a local file to `bucket_name` under the key `destination_path`.

    Credentials come from AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY; the
    region from AWS_REGION_NAME, falling back to "us-east-2". A ClientError is
    printed and swallowed. `expiration` is accepted but not used here.
    """
    connection_settings = {
        "aws_access_key_id": os.environ["AWS_ACCESS_KEY_ID"],
        "aws_secret_access_key": os.environ["AWS_SECRET_ACCESS_KEY"],
        "region_name": os.getenv("AWS_REGION_NAME", "us-east-2"),
    }
    uploader = boto3.client("s3", **connection_settings)

    try:
        uploader.upload_file(source_file_path, bucket_name, destination_path)
    except ClientError as upload_error:
        # NOTE(review): upload_file may report failures as
        # boto3.exceptions.S3UploadFailedError rather than ClientError, in
        # which case this handler is bypassed - confirm against boto3 docs.
        print("Error copying file to S3:", upload_error)


### Getting the reasons and supporting sentences

In [5]:
%%time

# Case folder to process. Other case folders used previously live under the
# same parent: betty, gregory, daniel, osky, april.
folder_path = "Doc_imgs/new_files/tedlock"

# The last path component doubles as the project / S3 prefix name later on.
folder_name = folder_path.rsplit("/", 1)[-1]

# OCR the denial letter and every supporting page in the folder.
claim_doc_txt, support_docs_txt, support_docs_contents = get_docs(
    input_file_name="denial.png",
    folder_path=folder_path,
)

user_prompt = 'here is a claim information where the claim was denied because of lack of medical necessity, contract dispute & the HCPCS under contention are: A4223,B9999,A4245,A6219,A6457,A4927,S1015, A4209,E0776 NU '

reason_dict = get_response_claim(
    support_docs=support_docs_txt,
    claim_doc=claim_doc_txt,
    user_prompt=user_prompt,
)

# The reply is a JSON object; the list of reasons is the value of its first key.
first_key = list(reason_dict)[0]
reason_dict_list = reason_dict[first_key]

# Locate each supporting sentence on the OCR'd pages.
evidences = get_appeals_evidence(reason_dict_list, support_docs_contents)

page_mapping =  {0: 'Doc_imgs/new_files/tedlock/denial.png'}
page_mapping =  {0: 'Doc_imgs/new_files/tedlock/1.png', 1: 'Doc_imgs/new_files/tedlock/2.png', 2: 'Doc_imgs/new_files/tedlock/3.png', 3: 'Doc_imgs/new_files/tedlock/4.png', 4: 'Doc_imgs/new_files/tedlock/5.png', 5: 'Doc_imgs/new_files/tedlock/6.png', 6: 'Doc_imgs/new_files/tedlock/7.png', 7: 'Doc_imgs/new_files/tedlock/8.png', 8: 'Doc_imgs/new_files/tedlock/9.png', 9: 'Doc_imgs/new_files/tedlock/10.png', 10: 'Doc_imgs/new_files/tedlock/11.png', 11: 'Doc_imgs/new_files/tedlock/12.png', 12: 'Doc_imgs/new_files/tedlock/13.png', 13: 'Doc_imgs/new_files/tedlock/14.png', 14: 'Doc_imgs/new_files/tedlock/15.png', 15: 'Doc_imgs/new_files/tedlock/16.png', 16: 'Doc_imgs/new_files/tedlock/17.png'}
pages_bbox_scores =  [15, 25, 25, 25, 15, 31, 23, 21, 31, 26, 28, 30, 35, 26, 71, 19, 25]
Doc_imgs/new_files/tedlock/15.png
pages_bbox_scores =  [20, 30, 26, 26, 14, 26, 22, 24, 24, 23, 23, 20, 65, 28, 62, 24, 24]
Doc_imgs/new_files/tedlock/13.

In [6]:
folder_name

'tedlock'

In [7]:
# Flatten `evidences` into one record per bounding box. `res_id` is the index
# of the evidence the box belongs to.
results = []

for res_id, evid in enumerate(evidences):
    page_height = evid["orig_height"]
    page_width = evid["orig_width"]

    for bbox_dic in evid["bbox"]:
        box_record = {
            "res_id": res_id,
            "x": bbox_dic["x"],
            "y": bbox_dic["y"],
            "width": bbox_dic["width"],
            "height": bbox_dic["height"],
            "orig_height": page_height,
            "orig_width": page_width,
        }

        results.append({
            "supporting_sentence": evid["supporting_sentence"],
            "supporting_reason": evid["supporting_reason"],
            "image_file_path": evid["image_file_path"],
            "page_num": evid["page_num"],
            "res": box_record,
        })


In [8]:
# Every reason that has at least one located box, then its distinct values.
appeal_reasons_list = [res["supporting_reason"] for res in results]

appeal_reasons_set = set(appeal_reasons_list)

In [9]:
# Rebind the name to a list so the reasons can be enumerated by index below.
# The element order comes from iterating a set, so it is arbitrary.
appeal_reasons_set = list(appeal_reasons_set)

In [10]:
appeal_reasons_set

["Physician's Recommendation",
 'Medical Necessity for Trilogy Humidifier',
 'Proof of Delivery',
 'Detailed Hospital Course',
 'Medical Necessity for Non-Invasive Ventilator',
 'Diagnosis Supporting Medical Necessity',
 'Supporting Imaging Results',
 'Supporting Lab Results',
 'Prescription for Equipment',
 'Length of Need']

In [11]:
# Palette used for the Label Studio labels; only the colour (second item of
# each pair) is used below.
reason_color = [
    ("reason1", "#D9534F"),  # darker shade of red
    ("reason2", "#E67E22"),  # darker shade of orange
    ("reason3", "#F39C12"),  # darker shade of yellow-orange
    ("reason4", "#F4D03F"),  # darker shade of yellow
    ("reason5", "#5DADE2"),  # darker shade of sky blue
    ("reason6", "#48C9B0"),  # darker shade of teal
    ("reason7", "#1ABC9C"),  # darker shade of medium turquoise
    ("reason8", "#16A085"),  # darker shade of green-blue
    ("reason9", "#27AE60"),  # darker shade of green
    ("reason10", "#2ECC71"), # darker shade of emerald green
    ("reason11", "#2980B9"), # darker shade of blue
    ("reason12", "#8E44AD"), # darker shade of purple
    ("reason13", "#9B59B6"), # darker shade of amethyst
    ("reason14", "#34495E"), # darker shade of blue-gray
    ("reason15", "#2C3E50"), # darker shade of midnight blue
]

# Pair every reason with a colour. The palette is cycled with a modulo so that
# more reasons than colours no longer raises IndexError.
reasons = [
    (reason, reason_color[idx % len(reason_color)][1])
    for idx, reason in enumerate(appeal_reasons_set)
]

In [12]:
from xml.sax.saxutils import quoteattr

xml_content = '''<View>
  <Image name="image" value="$image"/>
  <RectangleLabels name="label" toName="image">
    {labels}
  </RectangleLabels>
</View>'''

# Generate the Label elements dynamically. The reason text comes from the
# model's reply, so it is escaped and quoted with quoteattr: characters such
# as ", & or < would otherwise produce an invalid labeling configuration.
label_elements = "\n    ".join(
    f'<Label value={quoteattr(reason)} background="{color}"/>'
    for reason, color in reasons
)

# Format the XML with the generated Label elements
final_xml_content = xml_content.format(labels=label_elements)


In [13]:
from pprint import pprint 

print(final_xml_content)

<View>
  <Image name="image" value="$image"/>
  <RectangleLabels name="label" toName="image">
    <Label value="Physician's Recommendation" background="#D9534F"/>
    <Label value="Medical Necessity for Trilogy Humidifier" background="#E67E22"/>
    <Label value="Proof of Delivery" background="#F39C12"/>
    <Label value="Detailed Hospital Course" background="#F4D03F"/>
    <Label value="Medical Necessity for Non-Invasive Ventilator" background="#5DADE2"/>
    <Label value="Diagnosis Supporting Medical Necessity" background="#48C9B0"/>
    <Label value="Supporting Imaging Results" background="#1ABC9C"/>
    <Label value="Supporting Lab Results" background="#16A085"/>
    <Label value="Prescription for Equipment" background="#27AE60"/>
    <Label value="Length of Need" background="#2ECC71"/>
  </RectangleLabels>
</View>


In [14]:
# The API token is read from the environment only. A literal token used to be
# assigned here; it must be treated as leaked and revoked in Label Studio.
access_key = os.environ["LABEL_STUDIO_ACCESS_KEY"]

from label_studio_sdk.client import LabelStudio

client = LabelStudio(
    api_key=access_key,
    base_url = os.environ["LABEL_STUDIO_BASE_URL"]
)

In [15]:
import os

# Specify the folder path

# List to store relative file paths
# Supporting-document images in folder_path: every ".png" entry except the
# claim image "denial.png". The two lists are aligned by index.
png_filenames = [
    entry
    for entry in os.listdir(folder_path)
    if entry.endswith('.png') and entry != 'denial.png'
]

# Same files, as paths relative to the working directory.
png_files = [os.path.join(folder_path, entry) for entry in png_filenames]


In [16]:
from label_studio_sdk.client import LabelStudio


# folder_name = "betty"

# Create the project; the folder name serves as both title and description,
# and the generated XML is used as its labeling configuration.
resp_project = client.projects.create(
    title=folder_name,
    description=folder_name,
    label_config=final_xml_content,
)

project_id = resp_project.id

In [17]:
from concurrent.futures import ThreadPoolExecutor, as_completed

filename_task_mappings = []

def process_file(i):
    """Upload the i-th PNG to S3 as "<folder_name>/<file name>" and record it."""
    file_name, file_path = png_filenames[i], png_files[i]
    project_name = folder_name

    copy_file_to_s3(
        source_file_path=file_path,
        bucket_name=os.environ["AWS_BUCKET_NAME"],
        destination_path=f"{project_name}/{file_name}"
    )

    print("pushed :- ", file_name)
    filename_task_mappings.append((file_name, file_path))

# Use ThreadPoolExecutor to run the uploads in parallel
with ThreadPoolExecutor() as executor:
    futures = []
    for file_index in range(len(png_filenames)):
        futures.append(executor.submit(process_file, file_index))

    for future in as_completed(futures):
        try:
            # Re-raises here whatever exception the worker hit.
            future.result()
        except Exception as e:
            print(f"Error processing file: {e}")



pushed :-  13.png
pushed :-  16.png
pushed :-  17.png
pushed :-  15.png
pushed :-  8.png
pushed :-  11.png
pushed :-  10.png
pushed :-  4.png
pushed :-  12.png
pushed :-  9.png
pushed :-  5.png
pushed :-  14.png
pushed :-  2.png
pushed :-  7.png
pushed :-  1.png
pushed :-  6.png
pushed :-  3.png


In [18]:
# Name used as the S3 prefix when the import storage is created below; it is
# the same value as the folder name used for the project title.
project_name = folder_name

In [19]:
# Register the S3 prefix holding this case's images as an import storage for
# the project.
resp = client.import_storage.s3.create(
    use_blob_urls=True,
    project=project_id,
    prefix=project_name,
    bucket=os.environ["AWS_BUCKET_NAME"],
    aws_access_key_id=os.environ["AWS_ACCESS_KEY_ID"],
    aws_secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"],
    # Same fallback as copy_file_to_s3, so the upload and the storage always
    # agree on the region (a bare os.environ lookup raised KeyError when the
    # variable was unset although the upload itself had succeeded).
    region_name=os.getenv("AWS_REGION_NAME", "us-east-2"),
)
storage_id = resp.id
print(f"Storage ID: {storage_id}")

Storage ID: 12


In [20]:
%%time 

# Step 3: sync the import storage created above. The response's `status`
# attribute is inspected in the next cell.
resp = client.import_storage.s3.sync(id=storage_id)

CPU times: user 3.05 ms, sys: 1.4 ms, total: 4.46 ms
Wall time: 781 ms


In [21]:
resp.status

'completed'

In [22]:
# Map "<folder>_<file name>" to the Label Studio task id of that image.
filename_task_mappings = {}

for item in client.tasks.list(project=project_id):
    filename = item.storage_filename

    if not filename:
        # NOTE(review): presumably only storage-imported tasks carry a file
        # name; tasks without one cannot be keyed by path, so skip them.
        continue

    # Key on the LAST two path components, exactly like get_task_mapping and
    # the annotation grouping do; the first two components only give the same
    # key when the storage prefix is a single folder.
    key = "_".join(filename.split("/")[-2:])

    filename_task_mappings[key] = item.id

In [23]:
filename_task_mappings

{'tedlock_1.png': 245,
 'tedlock_10.png': 246,
 'tedlock_11.png': 247,
 'tedlock_12.png': 248,
 'tedlock_13.png': 249,
 'tedlock_14.png': 250,
 'tedlock_15.png': 251,
 'tedlock_16.png': 252,
 'tedlock_17.png': 253,
 'tedlock_2.png': 254,
 'tedlock_3.png': 255,
 'tedlock_4.png': 256,
 'tedlock_5.png': 257,
 'tedlock_6.png': 258,
 'tedlock_7.png': 259,
 'tedlock_8.png': 260,
 'tedlock_9.png': 261}

In [24]:
# file_name = "Doc_imgs/new_files/susan/9.png"

def get_task_mapping(filename_task_mappings, file_name):
    """Look up the task id for an image path.

    The key is the last two "/"-separated components of `file_name` joined
    with "_" (e.g. ".../susan/9.png" -> "susan_9.png").

    Raises:
        KeyError: if no task is registered under that key.
    """
    folder_and_file = file_name.split("/")[-2:]
    return filename_task_mappings["_".join(folder_and_file)]

In [25]:
# Rebuild the flattened evidence list: one record per bounding box, tagged
# with the index (`res_id`) of the evidence it came from.
results = [
    {
        "supporting_sentence": evid["supporting_sentence"],
        "supporting_reason": evid["supporting_reason"],
        "image_file_path": evid["image_file_path"],
        "page_num": evid["page_num"],
        "res": {
            "res_id": res_id,
            "x": bbox_dic["x"],
            "y": bbox_dic["y"],
            "width": bbox_dic["width"],
            "height": bbox_dic["height"],
            "orig_height": evid["orig_height"],
            "orig_width": evid["orig_width"],
        },
    }
    for res_id, evid in enumerate(evidences)
    for bbox_dic in evid["bbox"]
]


In [26]:
supporting_sent_list = [res["supporting_sentence"] for res in results]

supporting_sent_set = set(supporting_sent_list)

# Group the flattened records by their supporting sentence, keeping the
# original order inside every group.
evidence_dict = dict()
for res in results:
    evidence_dict.setdefault(res["supporting_sentence"], []).append(res)
            

In [27]:
# For every supporting sentence: [formatted results, matching image paths].
annotations_dict = {}

for sentence, sentence_records in evidence_dict.items():
    formatted_results, image_paths = get_results_formatted(sentence_records)
    annotations_dict[sentence] = [formatted_results, image_paths]


In [28]:

# Regroup the formatted results by page image, keyed "<folder>_<file name>".
refined_annotations_dict = {}

for formatted_results, file_paths in annotations_dict.values():
    for fp, formatted in zip(file_paths, formatted_results):
        key = "_".join(fp.split("/")[-2:])
        refined_annotations_dict.setdefault(key, []).append(formatted)

# Ids were assigned per sentence; renumber them so they are unique and
# consecutive within every page.
for page_results in refined_annotations_dict.values():
    for position, res in enumerate(page_results, start=1):
        res["id"] = "result_" + str(position)


  





In [30]:
filename_task_mappings

{'tedlock_1.png': 245,
 'tedlock_10.png': 246,
 'tedlock_11.png': 247,
 'tedlock_12.png': 248,
 'tedlock_13.png': 249,
 'tedlock_14.png': 250,
 'tedlock_15.png': 251,
 'tedlock_16.png': 252,
 'tedlock_17.png': 253,
 'tedlock_2.png': 254,
 'tedlock_3.png': 255,
 'tedlock_4.png': 256,
 'tedlock_5.png': 257,
 'tedlock_6.png': 258,
 'tedlock_7.png': 259,
 'tedlock_8.png': 260,
 'tedlock_9.png': 261}

In [31]:
refined_annotations_dict.keys()

dict_keys(['tedlock_15.png', 'tedlock_13.png', 'tedlock_1.png', 'tedlock_2.png', 'tedlock_9.png', 'tedlock_4.png', 'tedlock_16.png'])

In [32]:
# Create one annotation per page image, attached to the task mapped to it.
for page_key, page_results in refined_annotations_dict.items():

    print("Annotating for page :-")
    print(page_key)
    print()

    resp = client.annotations.create(
        id=filename_task_mappings[page_key],
        project=project_id,
        result=page_results,
        was_cancelled=False,
        ground_truth=True,
    )



Annotating for page :-
tedlock_15.png

Annotating for page :-
tedlock_13.png

Annotating for page :-
tedlock_1.png

Annotating for page :-
tedlock_2.png

Annotating for page :-
tedlock_9.png

Annotating for page :-
tedlock_4.png

Annotating for page :-
tedlock_16.png

