Process all files read from a specific directory

In [5]:
import os
import re

In [None]:
"""
Task description: This code is written to add tags for corresponding concepts to feed into LLMs

Read record-XX.txt:
At Vencor Hospital , BP dropped to 90 , and patient was started on NS IVF , with improvement in BP .

Read record-XX.con:
c="bp" 1:20 1:20||t="test"
c="bp dropped" 1:4 1:5||t="problem"
c="ns ivf" 1:14 1:15||t="treatment"

Output record-XX-llm.txt:
At Vencor Hospital , <problem> BP dropped </problem> to 90 , and patient was started on <treatment> NS IVF </treatment> , with improvement in <test> BP </test> .
"""

def generate_llm_file_with_sentence_matching(text_file, concept_file, output_file):
    # Read the text file
    with open(text_file, 'r') as txt_f:
        lines = txt_f.readlines()

    # Read the concept file
    with open(concept_file, 'r') as con_f:
        concepts = con_f.readlines()

    # Parse the concepts
    tag_map = {}
    for concept in concepts:
        if concept.strip():
            match = re.match(r'c="(.+?)" (\d+):(\d+) (\d+):(\d+)\|\|t="(.+?)"', concept.strip())
            if match:
                phrase, start_line, start_word, end_line, end_word, tag = match.groups()
                start_line, start_word, end_line, end_word = map(int, (start_line, start_word, end_line, end_word))
                start_line -= 1  # Convert to zero-based index
                end_line -= 1  # Convert to zero-based index

                if start_line == end_line:  # Ensure the concept is within one sentence
                    if start_line not in tag_map:
                        tag_map[start_line] = []
                    tag_map[start_line].append((start_word, end_word, phrase, tag))

    # Process the text file
    updated_lines = []
    for i, line in enumerate(lines):
        updated_line = line.strip().split()  # Tokenize the line into words
        if i in tag_map:
            # Sort concepts by start and end indices to avoid overlap issues
            concepts = sorted(tag_map[i], key=lambda x: (x[0], x[1]))
            for start_word, end_word, phrase, tag in reversed(concepts):  # Process from end to start
                # Wrap the specific phrase in its tag
                updated_line[start_word:end_word + 1] = [f"<{tag}>"] + updated_line[start_word:end_word + 1] + [f"</{tag}>"]

        # Reconstruct the updated line
        updated_lines.append(" ".join(updated_line) + "\n")

    # Write the updated text to the output file
    with open(output_file, 'w') as out_f:
        out_f.writelines(updated_lines)

In [7]:
def process_all_files(txt_dir, concept_dir, output_dir):
    """Tag every record that has both a ``.txt`` and a ``.con`` file.

    Files are paired by their name without extension. For each pair the
    tagged text is written to ``<output_dir>/<name>-llm.txt``; files present
    in only one of the two directories are left alone.

    Args:
        txt_dir: Directory holding the ``.txt`` text files.
        concept_dir: Directory holding the ``.con`` concept files.
        output_dir: Directory for the tagged output; created if missing.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Index the text files by file name without extension.
    txt_files = {}
    for name in os.listdir(txt_dir):
        if name.endswith('.txt'):
            txt_files[os.path.splitext(name)[0]] = os.path.join(txt_dir, name)

    # Index the concept files the same way so the two can be paired.
    concept_files = {}
    for name in os.listdir(concept_dir):
        if name.endswith('.con'):
            concept_files[os.path.splitext(name)[0]] = os.path.join(concept_dir, name)

    # Only records present in both directories are processed.
    for file_id in txt_files.keys() & concept_files.keys():
        text_file, concept_file = txt_files[file_id], concept_files[file_id]
        output_file = os.path.join(output_dir, f"{file_id}-llm.txt")

        print(f"Processing: {text_file} and {concept_file} -> {output_file}")
        generate_llm_file_with_sentence_matching(text_file, concept_file, output_file)

In [None]:
# Training data ("beth" subset): text input, concept annotations, tagged output
txt_dir = "../Data/raw/concept_assertion_relation_training_data/beth/txt/"
concept_dir = "../Data/raw/concept_assertion_relation_training_data/beth/concept/"
output_dir = "../Data/raw/concept_assertion_relation_training_data/beth/llm/"

# Tag every record that has both a text file and a concept file
process_all_files(txt_dir=txt_dir, concept_dir=concept_dir, output_dir=output_dir)

In [None]:
# Test data: text input, reference-standard concept annotations, tagged output
txt_dir = "../Data/raw/test_data"
concept_dir = "../Data/raw/reference_standard_for_test_data/concepts/"
output_dir = "../Data/raw/reference_standard_for_test_data/llm/"

# Tag every record that has both a text file and a concept file
process_all_files(txt_dir=txt_dir, concept_dir=concept_dir, output_dir=output_dir)

Processing: ../Data/raw/test_data\0094.txt and ../Data/raw/reference_standard_for_test_data/concepts/0094.con -> ../Data/raw/reference_standard_for_test_data/llm/0094-llm.txt
Processing: ../Data/raw/test_data\0242.txt and ../Data/raw/reference_standard_for_test_data/concepts/0242.con -> ../Data/raw/reference_standard_for_test_data/llm/0242-llm.txt
Processing: ../Data/raw/test_data\0178.txt and ../Data/raw/reference_standard_for_test_data/concepts/0178.con -> ../Data/raw/reference_standard_for_test_data/llm/0178-llm.txt
Processing: ../Data/raw/test_data\0257.txt and ../Data/raw/reference_standard_for_test_data/concepts/0257.con -> ../Data/raw/reference_standard_for_test_data/llm/0257-llm.txt
Processing: ../Data/raw/test_data\0245.txt and ../Data/raw/reference_standard_for_test_data/concepts/0245.con -> ../Data/raw/reference_standard_for_test_data/llm/0245-llm.txt
Processing: ../Data/raw/test_data\0149.txt and ../Data/raw/reference_standard_for_test_data/concepts/0149.con -> ../Data/raw/

Process a single file for testing

In [None]:
import os
import re

def generate_llm_file_with_sentence_matching(text_file, concept_file, output_file):
    # Read the text file
    with open(text_file, 'r') as txt_f:
        lines = txt_f.readlines()

    # Read the concept file
    with open(concept_file, 'r') as con_f:
        concepts = con_f.readlines()

    # Parse the concepts
    tag_map = {}
    for concept in concepts:
        if concept.strip():
            match = re.match(r'c="(.+?)" (\d+):(\d+) (\d+):(\d+)\|\|t="(.+?)"', concept.strip())
            if match:
                phrase, start_line, start_word, end_line, end_word, tag = match.groups()
                start_line, start_word, end_line, end_word = map(int, (start_line, start_word, end_line, end_word))
                start_line -= 1  # Convert to zero-based index
                end_line -= 1  # Convert to zero-based index

                if start_line == end_line:  # Ensure the concept is within one sentence
                    if start_line not in tag_map:
                        tag_map[start_line] = []
                    tag_map[start_line].append((start_word, end_word, phrase, tag))

    # Process the text file
    updated_lines = []
    for i, line in enumerate(lines):
        updated_line = line.strip().split()  # Tokenize the line into words
        if i in tag_map:
            # Sort concepts by start and end indices to avoid overlap issues
            concepts = sorted(tag_map[i], key=lambda x: (x[0], x[1]))
            for start_word, end_word, phrase, tag in reversed(concepts):  # Process from end to start
                # Wrap the specific phrase in its tag
                updated_line[start_word:end_word + 1] = [f"<{tag}>"] + updated_line[start_word:end_word + 1] + [f"</{tag}>"]

        # Reconstruct the updated line
        updated_lines.append(" ".join(updated_line) + "\n")

    # Write the updated text to the output file
    with open(output_file, 'w') as out_f:
        out_f.writelines(updated_lines)


def process_specific_file(txt_dir, concept_dir, output_dir, file_id):
    """Tag a single record identified by ``file_id``.

    Looks for ``<txt_dir>/<file_id>.txt`` and ``<concept_dir>/<file_id>.con``
    and writes ``<output_dir>/<file_id>-llm.txt``. If either input is
    missing, an error message is printed and nothing is written.

    Args:
        txt_dir: Directory holding the ``.txt`` text files.
        concept_dir: Directory holding the ``.con`` concept files.
        output_dir: Directory for the tagged output; created if missing.
        file_id: Record name without extension, e.g. ``"record-13"``.
    """
    os.makedirs(output_dir, exist_ok=True)

    text_file = os.path.join(txt_dir, f"{file_id}.txt")
    concept_file = os.path.join(concept_dir, f"{file_id}.con")
    output_file = os.path.join(output_dir, f"{file_id}-llm.txt")

    # Bail out early when either input file is missing.
    inputs_present = os.path.exists(text_file) and os.path.exists(concept_file)
    if not inputs_present:
        print(f"Error: {file_id}.txt or {file_id}.con does not exist in the specified directories.")
        return

    print(f"Processing: {text_file} and {concept_file} -> {output_file}")
    generate_llm_file_with_sentence_matching(text_file, concept_file, output_file)


# Training data ("beth" subset): text input, concept annotations, tagged output
txt_dir = "../Data/raw/concept_assertion_relation_training_data/beth/txt/"
concept_dir = "../Data/raw/concept_assertion_relation_training_data/beth/concept/"
output_dir = "../Data/raw/concept_assertion_relation_training_data/beth/llm/"

# Record to tag, given as the file name without extension
file_id = "record-13"

# Tag just that one record
process_specific_file(
    txt_dir=txt_dir,
    concept_dir=concept_dir,
    output_dir=output_dir,
    file_id=file_id,
)