In [1]:
import re
import spacy
import json
from spacy.matcher import Matcher

# Load the "en_core_web_sm" spaCy pipeline; every later cell goes through `nlp`
nlp = spacy.load("en_core_web_sm")

In [2]:
# Place an entity ruler before the "ner" component. Reuse the existing
# component when this cell is re-run, because add_pipe raises if a component
# with the same name is already in the pipeline.
if "entity_ruler" in nlp.pipe_names:
    ruler = nlp.get_pipe("entity_ruler")
else:
    ruler = nlp.add_pipe("entity_ruler", before="ner")

def add_custom_rules(path="OCR_raw_samples/X1.json", label="TEST", target_ruler=None):
    """Register every synonym from a JSON file as an entity-ruler pattern.

    The JSON document is expected to be a list of objects; each object's
    "Synonyms" list supplies the phrases to match. Objects without that key
    and blank synonyms are skipped.

    Args:
        path: JSON file to read. Defaults to the notebook's synonym file.
        label: Entity label attached to every generated pattern.
        target_ruler: Object exposing ``add_patterns``. When omitted, the
            notebook-level ``ruler`` is used.

    Returns:
        None. The patterns are handed to the ruler as a side effect.
    """
    # Close the file as soon as it is parsed; JSON text is UTF-8 by specification
    with open(path, "r", encoding="utf-8") as file:
        data = json.load(file)

    patterns = [
        {"label": label, "pattern": synonym}
        for entry in data
        for synonym in entry.get("Synonyms") or []
        if synonym
    ]

    if target_ruler is None:
        target_ruler = ruler
    target_ruler.add_patterns(patterns)

# Feed the synonym patterns from the JSON file into the entity ruler
add_custom_rules()

In [3]:
# Sample under analysis (alternative sample:
# "OCR_raw_samples/0b8706dc-c9af-4c6b-887d-2f85b5a511e7.txt")
file_path = "OCR_raw_samples/0ab9800e-bc9a-4388-aaa2-d4fc05e7d111.txt"

# Read the OCR dump and flatten it onto a single line
with open(file_path, 'r') as file:
    data = " ".join(file.read().split("\n"))

# Run the spaCy pipeline over the flattened text
doc = nlp(data)

In [4]:
# Group the CARDINAL values and ORG "units" found in the document under the
# most recent TEST entity that precedes them.
drug_data = {}

# Reset on every run: a value left in the kernel by a previous execution would
# point at a key that is missing from the freshly created drug_data dict.
current_parameter = None

for ent in doc.ents:
    if ent.label_ == "TEST":
        # Focus on this parameter, creating its entry on first sight
        current_parameter = ent.text
        drug_data.setdefault(current_parameter, {"values": [], "units": []})

    elif current_parameter is None:
        # Nothing to attach to until the first TEST entity has been seen
        continue

    elif ent.label_ == "CARDINAL":
        # Append value to the current parameter
        drug_data[current_parameter]["values"].append(ent.text)

    elif ent.label_ == "ORG":
        # Append unit to the current parameter
        drug_data[current_parameter]["units"].append(ent.text)

# To display the organized drug data
print(drug_data)

{'PRL': {'values': ['125', '125', '125', '18 10', '18', '125', '24/09/1932', '18 10', '18', '1.5-9.9', '8.3', '9.0', '125', '24/09/1932', '125', '24/09/1932', '7', '18 10', '18', '4.1', '3.4', '125', '24/09/1932', '19.9', '7', '18 10', '18', '6.0', '7.0', '125', '24/09/1932', '125', '125', '125', '7', '18 10', '18', '15', '13', '125', '24/09/1932', '125', '24/09/1932', '12', '18 10', '18', '125', '24/09/1932', '125', '24/09/1932', '17', '6.0', '4.0', '30', '125', '24/09/1932', '15.3', '7', '18 10', '18', '18', '125', '24/09/1932', '6', '125', '24/09/1932', '12', '18 10', '18', '125', '24/09/1932', '12', '18 10', '18', '125'], 'units': ['PTH', 'ESR', 'CRP', 'CEA', 'C19', 'C15', 'FE, FBE', 'DVI', 'COR \x0c     Date Requested:', 'PTH', 'ESR', 'CRP', 'CEA', 'C19', 'C15', 'FE, FBE', 'DVI', 'COR \x0c Patient Name', 'Medicare', 'SERUM CORTISOL', 'VBF', 'UMM', 'TAA', 'PTH', 'ESR', 'CRP', 'CEA', 'C19', 'C15', 'FE, FBE', 'COR \x0c Patient Name', 'Patient Address', 'Medicare', 'prolactin  IRON ST

In [5]:
def add_matcher_patterns(matcher):
    """Register the "unit" pattern on ``matcher``.

    The pattern is three consecutive tokens: a numerator (iu, g, mmol or u,
    compared on the lower-cased token), a literal "/", and a denominator
    (ml or l, compared on the lower-cased token).
    """
    numerator = {"LOWER": {"IN": ["iu", "g", "mmol", "u"]}}
    slash = {"TEXT": "/"}
    denominator = {"LOWER": {"IN": ["ml", "l"]}}
    matcher.add("unit", [[numerator, slash, denominator]])

# Build a token Matcher on the pipeline's shared vocab and register the unit pattern on it
matcher = Matcher(nlp.vocab)
add_matcher_patterns(matcher)

In [6]:
def process_units(units):
    """Return the first unit expression found among the candidate strings.

    Each candidate is parsed with the notebook-level ``nlp`` and searched with
    the notebook-level ``matcher``. Only the matched tokens are returned,
    joined without whitespace, so text surrounding the unit inside a candidate
    is not mistaken for the unit itself.

    Args:
        units: Iterable of candidate strings.

    Returns:
        The matched unit text (for example "U/mL"), or None when no candidate
        contains a match.
    """
    for unit in units:
        doc = nlp(unit)
        matches = matcher(doc)
        if matches:
            # Each match is (match_id, start, end) in token offsets
            _, start, end = matches[0]
            return "".join(token.text for token in doc[start:end])
    return None  # No candidate contained a unit

def is_date(string):
    """Return True when ``string`` starts with a day-month-year style date.

    Both "-" and "/" are accepted as the separator, provided the same one is
    used in both positions, and the first two fields may have one or two
    digits: "24-09-1932", "24/09/1932" and "4/9/1932" all qualify.
    """
    # \1 forces the second separator to equal the first
    return re.match(r'\d{1,2}([-/])\d{1,2}\1\d{4}', string) is not None

# Reshape drug_data into one flat record per drug
output_data = {}
for drug in drug_data:
    details = drug_data[drug]

    # Keep the last reading that is not a date
    values = [reading for reading in details['values'] if not is_date(reading)]
    last_value = None
    if values:
        last_value = values[-1]

    # Pick the unit out of the collected unit candidates
    processed_unit = process_units(details['units'])
    unit = None
    if processed_unit:
        unit = processed_unit.partition(" ")[0]

    output_data[drug] = {"drug_name": drug, "value": last_value, "unit": unit}

# Serialize once, then show and persist the same text
json_output = json.dumps(output_data, indent=4)
print(json_output)

with open('output_data.json', 'w') as json_file:
    json_file.write(json_output)

{
    "PRL": {
        "drug_name": "PRL",
        "value": "125",
        "unit": null
    },
    "Vitamin D": {
        "drug_name": "Vitamin D",
        "value": "196(11),686-687",
        "unit": null
    },
    "Iron": {
        "drug_name": "Iron",
        "value": "27",
        "unit": null
    },
    "T": {
        "drug_name": "T",
        "value": null,
        "unit": null
    },
    "T. Sat.": {
        "drug_name": "T. Sat.",
        "value": null,
        "unit": null
    },
    "Ferritin": {
        "drug_name": "Ferritin",
        "value": "between 50 and 100",
        "unit": null
    },
    "Anti-Thyroglobulin Abs": {
        "drug_name": "Anti-Thyroglobulin Abs",
        "value": "110",
        "unit": null
    },
    "Anti-Thyroidal Peroxidase Abs": {
        "drug_name": "Anti-Thyroidal Peroxidase Abs",
        "value": "332",
        "unit": "U/mL"
    },
    "Tg": {
        "drug_name": "Tg",
        "value": null,
        "unit": null
    },
    "B12": {
       