 Testing output from an API run


In [None]:
from data_models import Entity, Labels
import rich, json
from openai import AzureOpenAI
import azure.keyvault.secrets as azk
from azure.identity import DefaultAzureCredential
import os, asyncio

In [None]:
def get_key(keyname):
    """Return the value of the secret named ``keyname`` from the key vault.

    A ``SecretClient`` is created for the labeling vault URI using
    ``DefaultAzureCredential``, the secret is fetched, and its ``value``
    attribute is returned.
    """
    secrets_client = azk.SecretClient(
        'https://keyvault-labeling.vault.azure.net/',
        DefaultAzureCredential(),
    )
    return secrets_client.get_secret(keyname).value

async def send_request(filepath:str):
    """Send one document to the Azure OpenAI deployment for entity extraction.

    The file at ``filepath`` is read and its newline, tab and carriage-return
    characters are replaced by spaces before it is sent as the user message.
    The response is requested in the ``Labels`` response format.

    Returns:
        ``(contents, labels)`` where ``contents`` is the flattened text that
        was sent and ``labels`` is the ``'Entities'`` entry of the dumped
        ``Labels`` model. When the response message carries a refusal, the
        refusal is printed and ``0`` is returned instead.

    Note:
        Although declared ``async``, the body awaits nothing; it uses the
        synchronous ``AzureOpenAI`` client.
    """
    with open(filepath,'r') as source_file:
        raw_text = source_file.read()
    # Flatten whitespace control characters so the document is one line.
    contents = raw_text.translate(str.maketrans({"\n": " ", "\t": " ", "\r": " "}))

    # The continuation lines of the prompt keep their leading spaces on
    # purpose: they are part of the string that is sent to the model.
    system_message = {
        "role": "system",
        "content": """You are a Named Entity Recognition (NER) model used as an advanced ATS scanner.
                Your job is to extract words matching the specified entity types; soft skill, Capability, Personality trait, Job title. 
                Skip words not matching any of these types with well enough accuracy that a human would categorise them the same.
                Only assign labels if youre certain that they are relevant in the job market and fall under one of specified categories with well enough accuracy.
                Do not reformat or translate the text you find in the original document in any way.
                """,
    }
    user_message = {"role": "user", "content": contents}

    # NOTE(review): azure_endpoint is a full chat-completions URL whose
    # api-version query differs from the api_version argument below --
    # confirm this is the form the client expects.
    llm_client = AzureOpenAI(
        azure_endpoint='https://labeling-llm-0.openai.azure.com/openai/deployments/gpt-4.1-nano/chat/completions?api-version=2025-01-01-preview',
        api_key=get_key('openai-key'),
        api_version="2024-12-01-preview",
    )
    completion = llm_client.beta.chat.completions.parse(
        model="gpt-4.1-nano",
        messages=[system_message, user_message],
        response_format=Labels,
        n=1,
    )

    message = completion.choices[0].message
    if message.refusal:
        print(f'error in request at file {filepath}:\n{message.refusal}\n\n{message}')
        return 0

    parsed_labels = Labels.model_validate(message.parsed)
    return (contents, parsed_labels.model_dump()['Entities'])


obj = await send_request(f'{os.getcwd()}/DATA/data_cs/1.txt')
contents = obj[0].replace("\n", " ").replace("\t", " ").replace("\r", " ") # type: ignore
labels = obj[1] # type: ignore
rich.print(obj)


In [7]:
from label_models import Label, Document

In [None]:
def format_entities(doc_contents:str,labels_list:list,doc_id:int=0):
    """Locate extracted entities in a document and build a labeled document.

    Args:
        doc_contents: Text of the document. Newlines, tabs and carriage
            returns are replaced by single spaces before matching; each
            replacement is one character for one character, so offsets are
            the same in the raw and the normalized text.
        labels_list: Entities as dicts with a ``'text'`` key (the exact
            surface string) and a ``'type'`` key (the label name).
        doc_id: Identifier passed through to ``Document``.

    Returns:
        The result of ``Document(doc_id, text, labels).to_dict()``, where
        ``text`` is the normalized document and ``labels`` holds one
        ``Label(start, end, type, text).to_dict()`` per entity that was found.

    Entities whose text is empty or cannot be found are skipped. When the
    same (text, type) pair is listed more than once, each repetition is
    matched to the next occurrence in the document; repetitions beyond the
    number of occurrences are dropped, so no identical span is emitted twice.
    """
    # Match against the normalized text so an entity that spans a line break
    # in the raw document is still found.
    doc = doc_contents.replace("\n", " ").replace("\t", " ").replace("\r", " ")
    labeled_ents = []
    # (text, type) -> offset at which the next search for that pair starts.
    next_search_from = {}
    for entity in labels_list:
        content = entity['text']
        label = entity['type']
        if not content:
            # An empty string "matches" at offset 0 and would yield a
            # zero-length label.
            continue
        key = (content, label)
        start = doc.find(content, next_search_from.get(key, 0))
        if start == -1:
            continue
        end = start + len(content)
        next_search_from[key] = end
        labeled_ents.append(Label(start, end, label, content).to_dict())
    return Document(doc_id, doc, labeled_ents).to_dict()

# Convert the sample document and its extracted labels into the labeled
# document structure and display the result.
test_doc = format_entities(contents,labels)
rich.print(test_doc)

In [None]:
import json


def check_duplicates(file_path):
    """Print every label that occurs more than once within an entry.

    Args:
        file_path: Path to a JSON file holding a list of entries. Only
            entries that have a ``"labels"`` key whose value is a list are
            inspected.

    Two labels count as duplicates when they hold the same keys and values,
    regardless of key order. For each entry with duplicates, a header line
    with the entry's ``id`` is printed, followed by every repeated label
    (the first occurrence is not printed). Nothing is returned.
    """
    with open(file_path, 'r') as f:
        data = json.load(f)

    for entry in data:
        if not ("labels" in entry and isinstance(entry["labels"], list)):
            continue

        seen = set()
        duplicates = []
        for label in entry["labels"]:
            # Canonical JSON text is independent of key order and stays
            # hashable even when a label holds nested lists or dicts.
            fingerprint = json.dumps(label, sort_keys=True)
            if fingerprint in seen:
                duplicates.append(label)
            else:
                seen.add(fingerprint)

        if duplicates:
            print(f"Duplicates found in entry ID {entry.get('id')}:")
            for duplicate in duplicates:
                print(duplicate)

# Report duplicate labels in DATA/labels_en_duplicates.json under the current
# working directory.
check_duplicates(f'{os.getcwd()}/DATA/labels_en_duplicates.json')


In [None]:
# Remove exact duplicate labels from every document in
# DATA/labels_en_duplicates.json and write the cleaned documents to
# labels_en.json in the current working directory.
with open(f'{os.getcwd()}/DATA/labels_en_duplicates.json', 'r') as f:
    data = json.load(f)

new_data = []
for d in data:
    debug_id = d["id"]
    doc = d["content"]
    ents = d["labels"]
    new_ents = []
    seen = set()

    for ent in ents:
        # A label is identified by its span, its type and its surface text.
        label_tuple = (ent["start"], ent["end"], ent["label"], ent["value"])
        if label_tuple in seen:
            continue
        seen.add(label_tuple)
        # Keep the first occurrence of each label; without this append every
        # document would be written with an empty label list.
        new_ents.append(ent)

    new_data.append(Document(debug_id, doc, new_ents).to_dict())


jsonfile = json.dumps(new_data, indent=4)
with open('labels_en.json', 'w') as j:
    j.write(jsonfile)

In [None]:
# Run the duplicate check on labels_en.json to confirm that no repeated
# labels remain.
check_duplicates('labels_en.json')