# Preparing

In [11]:
import html
import json
import re
from pathlib import Path

import pandas as pd

# Loading data

In [12]:
# Load the training set; later cells read its "ID", "Note" and "json" columns.
train = pd.read_csv("data/train.csv")

In [13]:
# Preview the first rows to check which columns were loaded.
train.head()

Unnamed: 0,ID,Note,json
0,1,**Clinical Notes**\n\n**Patient Information:**...,"{""patient_info"": {""age"": 41, ""gender"": ""Male""}..."
1,2,Clinical Notes:\n\nPatient: 56-year-old male\n...,"{""patient_info"": {""age"": 56, ""gender"": ""Male""}..."
2,3,Clinical Notes:\n\nPatient: 73-year-old female...,"{""patient_info"": {""age"": 73, ""gender"": ""Female..."
3,4,**Clinical Notes**\n\n**Patient Information:**...,"{""patient_info"": {""age"": 32, ""gender"": ""Female..."
4,5,"**Clinical Notes**\n\n**Patient:** Male, age 5...","{""patient_info"": {""age"": 51, ""gender"": ""Male""}..."


# Pre-Processing

## tokenization

In [14]:
# Token pattern; alternatives are tried left to right at every position:
#   °c         degree-Celsius unit (the text is lowercased before matching)
#   \w+/\w+    slash-joined tokens such as 112/79 or mg/dl
#   \d+\.\d+   decimal numbers such as 39.4
#   \d+        integers, split off from letters that follow (5mg -> 5, mg)
#   \w+        ordinary words
#   [^\w\s]    any other single non-space character (punctuation, symbols)
# A separate \d+/\d+ alternative is not needed: \w+/\w+ already matches
# every digit/digit token and is tried first.
_TOKEN_PATTERN = re.compile(r'°c|\w+/\w+|\d+\.\d+|\d+|\w+|[^\w\s]')


def tokenize_text(text: str):
    """
    Normalize a clinical note (or an entity string) and split it into tokens.

    Normalization, applied before matching:
      * every blank line ('\\n\\n') becomes a '.' so paragraph breaks survive
        as sentence-boundary tokens;
      * markdown emphasis markers ('*') are removed;
      * hyphens become spaces, so '56-year-old' yields '56', 'year', 'old';
      * the text is lowercased.

    Slash-joined values (112/79, mg/dl) and decimals (39.4) are kept as single
    tokens. A temperature such as '39.4°C' yields two tokens: '39.4' and '°c'.

    Args:
        text (str): The input text to tokenize.

    Returns:
        list: List of lowercase string tokens, in order of appearance.
    """
    normalized = (
        text.replace('\n\n', '.')
        .replace('*', '')
        .replace('-', ' ')
        .lower()
    )
    return _TOKEN_PATTERN.findall(normalized)

In [15]:
# Raw text of the first note, before tokenization.
train["Note"][0]

'**Clinical Notes**\n\n**Patient Information:**\n- Age: 41 years old\n- Gender: Male\n\n**Visit Motivation:** Anemia evaluation\n\n**Chief Complaints and Symptoms:**\nThe patient presents with complaints of anemia, fever, fatigue, difficulty breathing (dyspnea), vomiting, dizziness, blurred vision, wheezing, and pale skin. The patient reports that these symptoms have been progressively worsening over the past few weeks.\n\n**History of Present Illness:**\n- **Fever**: Intermittent low-grade fevers for approximately 2 weeks.\n- **Fatigue**: Persistent and severe fatigue affecting daily activities.\n- **Difficulty Breathing (Dyspnea)**: Experiencing shortness of breath, particularly with exertion. No history of chronic respiratory conditions.\n- **Vomiting**: Occasional episodes of non-bilious vomiting for the past week.\n- **Dizziness**: Episodes of dizziness, especially upon standing, which can be severe enough to cause near-fainting spells.\n- **Blurred Vision**: New onset of blurred 

In [16]:
# Example usage: tokenize the first note and print the resulting token list.
tokens = tokenize_text(train["Note"][0])
print(tokens)

['clinical', 'notes', '.', 'patient', 'information', ':', 'age', ':', '41', 'years', 'old', 'gender', ':', 'male', '.', 'visit', 'motivation', ':', 'anemia', 'evaluation', '.', 'chief', 'complaints', 'and', 'symptoms', ':', 'the', 'patient', 'presents', 'with', 'complaints', 'of', 'anemia', ',', 'fever', ',', 'fatigue', ',', 'difficulty', 'breathing', '(', 'dyspnea', ')', ',', 'vomiting', ',', 'dizziness', ',', 'blurred', 'vision', ',', 'wheezing', ',', 'and', 'pale', 'skin', '.', 'the', 'patient', 'reports', 'that', 'these', 'symptoms', 'have', 'been', 'progressively', 'worsening', 'over', 'the', 'past', 'few', 'weeks', '.', '.', 'history', 'of', 'present', 'illness', ':', 'fever', ':', 'intermittent', 'low', 'grade', 'fevers', 'for', 'approximately', '2', 'weeks', '.', 'fatigue', ':', 'persistent', 'and', 'severe', 'fatigue', 'affecting', 'daily', 'activities', '.', 'difficulty', 'breathing', '(', 'dyspnea', ')', ':', 'experiencing', 'shortness', 'of', 'breath', ',', 'particularly', 

## extract entities

In [17]:
def extract_entities_from_json(json_data):
    """
    Collect the annotatable entities of one record, grouped by entity type.

    Args:
        json_data (dict): The parsed record. ``patient_info`` (with ``age``
            and ``gender``), ``visit_motivation`` and ``symptoms`` are
            required; ``vital_signs`` is optional and may be missing or null.

    Returns:
        dict: Maps each entity type (AGE, GENDER, VISIT_MOTIVATION, SYMPTOM,
        BODY_MEASURE, VALUE, UNIT) to a list of lowercase strings. Each list
        holds every value once, in first-seen order, and never contains an
        empty string. VALUE and UNIT are NOT aligned index-by-index with
        BODY_MEASURE (a unit shared by several measures is stored once).

    Raises:
        KeyError: If one of the required keys is missing.
    """

    def add_unique(bucket, item):
        # Skip empty strings (e.g. a vital sign without a unit) and repeats.
        if item and item not in bucket:
            bucket.append(item)

    entities = {
        "AGE": [str(json_data["patient_info"]["age"])],
        "GENDER": [json_data["patient_info"]["gender"].lower()],
        "VISIT_MOTIVATION": [json_data["visit_motivation"].replace('-', ' ').lower()],
        "SYMPTOM": [],
        "BODY_MEASURE": [],
        "VALUE": [],
        "UNIT": [],
    }

    for symptom in json_data["symptoms"]:
        add_unique(entities["SYMPTOM"], symptom.replace("_", " ").lower())

    # `or {}` also covers an explicit null in the JSON.
    vital_signs = json_data.get("vital_signs") or {}
    for measure, details in vital_signs.items():
        if not isinstance(details, dict):
            continue

        if measure == "blood_pressure":
            # Blood pressure is stored as two readings; emit them as the
            # single "systolic/diastolic" value that appears in note text.
            systolic = details.get("systolic") or {}
            diastolic = details.get("diastolic") or {}
            if not (isinstance(systolic, dict) and isinstance(diastolic, dict)):
                continue
            systolic_value = systolic.get("value")
            diastolic_value = diastolic.get("value")
            if systolic_value is None or diastolic_value is None:
                continue  # incomplete reading: nothing reliable to annotate

            add_unique(entities["BODY_MEASURE"], "blood pressure")
            add_unique(entities["VALUE"], f"{systolic_value}/{diastolic_value}")
            # Record the unit only when both readings agree on it.
            unit = systolic.get("unit")
            if unit and unit == diastolic.get("unit"):
                add_unique(entities["UNIT"], str(unit).lower())
        elif details.get("value") is not None:
            # Simple vital sign: one value with an optional unit.
            add_unique(entities["BODY_MEASURE"], measure.replace("_", " ").lower())
            add_unique(entities["VALUE"], str(details["value"]).lower())
            add_unique(entities["UNIT"], str(details.get("unit") or "").lower())

    return entities

In [18]:
# Example usage: parse the first record's JSON string and list the extracted
# entities, one entity type per line.
entities = extract_entities_from_json(json.loads(train["json"][0]))
for key, value in entities.items():
    print(f"{key}: {value}")

AGE: ['41']
GENDER: ['male']
VISIT_MOTIVATION: ['anemia']
SYMPTOM: ['fever', 'fatigue', 'difficulty breathing', 'vomiting', 'dizziness', 'blurred vision', 'wheezing', 'pale skin']
BODY_MEASURE: ['heart rate', 'oxygen saturation', 'cholesterol level', 'glucose level']
VALUE: ['114', '98.4', '132.8', '110.6']
UNIT: ['bpm', '%', 'mg/dl']


## convert json entities to BIO tags

In [19]:
def annotate_text(tokens, entities):
    """
    Annotate tokens with BIO tags based on the provided entities.

    Matching is done token by token, not span by span:
      * a token equal to the FIRST token of an entity value gets
        ``B-<TYPE>``, whether or not the rest of the entity follows;
      * a token equal to a later token of an entity value gets ``I-<TYPE>``
        only when the previous token was tagged ``B-``/``I-`` with the same
        type;
      * anything else gets ``O``.
    Entity values are checked in the iteration order of ``entities`` (types
    first, then the values of each type); the first one that yields a tag
    wins.

    Args:
        tokens (list of str): The tokenized text.
        entities (dict): Dictionary of entity types and their corresponding values.

    Returns:
        list of tuple: List of (token, tag) tuples representing the
        BIO-annotated tokens, one per input token and in the same order.
    """
    # Tokenize every entity value once, up front, instead of once per text
    # token. Values that tokenize to nothing (e.g. "") can never match.
    prepared = []
    for entity_type, entity_values in entities.items():
        for entity in entity_values:
            entity_tokens = tokenize_text(entity)
            if entity_tokens:
                prepared.append((entity_type, entity_tokens[0], frozenset(entity_tokens)))

    bio_tags = []
    previous_tag = "O"  # the first token has no predecessor, so it can never be I-
    for token in tokens:
        tag = "O"
        for entity_type, first_token, token_set in prepared:
            if token not in token_set:
                continue
            if token == first_token:
                tag = f"B-{entity_type}"
                break
            if previous_tag in (f"B-{entity_type}", f"I-{entity_type}"):
                tag = f"I-{entity_type}"
                break
            # Token occurs inside this entity but cannot continue a span
            # here; keep looking at the remaining entity values.
        bio_tags.append((token, tag))
        previous_tag = tag

    return bio_tags

## test

In [20]:
# Spot-check the pipeline on one note, selected by index label j:
# tokenize the text, extract the JSON entities, then BIO-tag the tokens.
j = 10708
tokens = tokenize_text(train["Note"][j])
entities = extract_entities_from_json(json.loads(train["json"][j]))
bio_tags = annotate_text(tokens, entities)

In [21]:
# Display the full (token, tag) list for the selected note.
bio_tags

[('clinical', 'O'),
 ('notes', 'O'),
 (':', 'O'),
 ('.', 'O'),
 ('patient', 'O'),
 ('profile', 'O'),
 (':', 'O'),
 ('the', 'O'),
 ('patient', 'O'),
 ('is', 'O'),
 ('a', 'O'),
 ('31', 'B-AGE'),
 ('year', 'O'),
 ('old', 'O'),
 ('female', 'B-GENDER'),
 ('presenting', 'O'),
 ('with', 'O'),
 ('a', 'O'),
 ('chief', 'O'),
 ('complaint', 'O'),
 ('of', 'O'),
 ('anxiety', 'B-VISIT_MOTIVATION'),
 ('disorders', 'I-VISIT_MOTIVATION'),
 ('.', 'O'),
 ('.', 'O'),
 ('history', 'O'),
 ('of', 'O'),
 ('present', 'O'),
 ('illness', 'O'),
 (':', 'O'),
 ('the', 'O'),
 ('patient', 'O'),
 ('reports', 'O'),
 ('experiencing', 'O'),
 ('a', 'O'),
 ('variety', 'O'),
 ('of', 'O'),
 ('concerning', 'O'),
 ('symptoms', 'O'),
 (',', 'O'),
 ('including', 'O'),
 ('cough', 'B-SYMPTOM'),
 (',', 'O'),
 ('difficulty', 'B-SYMPTOM'),
 ('breathing', 'I-SYMPTOM'),
 (',', 'O'),
 ('diarrhea', 'B-SYMPTOM'),
 (',', 'O'),
 ('runny', 'B-SYMPTOM'),
 ('nose', 'I-SYMPTOM'),
 (',', 'O'),
 ('rash', 'B-SYMPTOM'),
 (',', 'O'),
 ('joint', 'B-S

In [22]:
# Drop every pair tagged "O" (outside) so only entity tokens remain.
filtered_bio_tags = list(filter(lambda pair: pair[1] != 'O', bio_tags))

# Show the remaining (token, tag) pairs, one per line.
for tag in filtered_bio_tags:
    print(tag)

('31', 'B-AGE')
('female', 'B-GENDER')
('anxiety', 'B-VISIT_MOTIVATION')
('disorders', 'I-VISIT_MOTIVATION')
('cough', 'B-SYMPTOM')
('difficulty', 'B-SYMPTOM')
('breathing', 'I-SYMPTOM')
('diarrhea', 'B-SYMPTOM')
('runny', 'B-SYMPTOM')
('nose', 'I-SYMPTOM')
('rash', 'B-SYMPTOM')
('joint', 'B-SYMPTOM')
('pain', 'I-SYMPTOM')
('sneezing', 'B-SYMPTOM')
('blurred', 'B-SYMPTOM')
('vision', 'I-SYMPTOM')
('painful', 'B-SYMPTOM')
('urination', 'I-SYMPTOM')
('dry', 'B-SYMPTOM')
('skin', 'I-SYMPTOM')
('pale', 'B-SYMPTOM')
('skin', 'I-SYMPTOM')
('anxiety', 'B-VISIT_MOTIVATION')
('restlessness', 'B-SYMPTOM')
('difficulty', 'B-SYMPTOM')
('concentrating', 'I-SYMPTOM')
('respiratory', 'B-BODY_MEASURE')
('cough', 'B-SYMPTOM')
('difficulty', 'B-SYMPTOM')
('breathing', 'I-SYMPTOM')
('diarrhea', 'B-SYMPTOM')
('skin', 'I-SYMPTOM')
('rash', 'B-SYMPTOM')
('dry', 'B-SYMPTOM')
('skin', 'I-SYMPTOM')
('pale', 'B-SYMPTOM')
('skin', 'I-SYMPTOM')
('joint', 'B-SYMPTOM')
('pain', 'I-SYMPTOM')
('blurred', 'B-SYMPTOM')

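<div dir="rtl">
ملاحظات

لما تنكتب كلمة بالنص غير بالجسون متل:

- Temp: 36.6°C

جسون:

- "temperature": {"value": 36.6, "unit": "\\u00b0C"}

مابينعملا تعليق
</div>

**Note (English):** when the note text words an entity differently from the JSON, that entity is not annotated. For example, the text says `Temp: 36.6°C` while the JSON has `"temperature": {"value": 36.6, "unit": "\\u00b0C"}`. Matching is by exact token equality, so the token `temp` does not match `temperature` and gets no `BODY_MEASURE` tag.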

# Convert to BIO

## bio file

In [23]:
def write_bio_file(bio_tags, file_path):
    """
    Save BIO annotations as a text file with one "token tag" pair per line.

    A token that is exactly "\\n" is written as an empty line (sentence
    separator) and its tag is ignored. The file always ends with one extra
    empty line.

    Args:
        bio_tags (list of tuple): List of (token, tag) tuples from the `annotate_text` function.
        file_path (str): Path to the output BIO file; overwritten if it exists.

    Returns:
        None
    """
    lines = [
        "\n" if token == "\n" else f"{token} {tag}\n"
        for token, tag in bio_tags
    ]
    lines.append("\n")  # trailing blank line closes the last sentence

    with open(file_path, "w", encoding="utf-8") as bio_file:
        bio_file.writelines(lines)


In [24]:
# Re-run the pipeline for the note with index label j and save its BIO tags
# to "output.bio" in the current working directory.
j = 10708
tokens = tokenize_text(train["Note"][j])
entities = extract_entities_from_json(json.loads(train["json"][j]))
bio_tags = annotate_text(tokens, entities)
write_bio_file(bio_tags, "output.bio")

## html visualization

In [27]:
from pathlib import Path

def bio_to_html(bio_tags, output_file="visualization.html"):
    """
    Convert BIO-tagged data into a colorful HTML visualization using the Open Sans font.

    Tokens tagged "O" are rendered as plain text. Every other token is
    rendered on a colored background followed by its entity type in bold.
    B- and I- tags of the same type share one color. Colors are assigned to
    entity types in order of first appearance and are reused cyclically once
    the palette is exhausted.

    Tokens and entity types are HTML-escaped, so characters such as '<', '>'
    and '&' coming from the note text are displayed literally instead of
    being interpreted as markup.

    Args:
        bio_tags (list of tuple): List of (token, tag) tuples.
        output_file (str): Filepath to save the HTML visualization;
            overwritten if it exists.

    Returns:
        None
    """
    # Predefined colors for entities
    predefined_colors = ["#bde0fe", "#f6d5e5", "#c5b4e3", "#e2f0cb", "#fde2e4", "#fcb045", "#70dbcd"]
    entity_colors = {}  # entity type -> assigned background color

    parts = [
        '<html>',
        '<head>',
        '<link href="https://fonts.googleapis.com/css2?family=Open+Sans:wght@400;700&display=swap" rel="stylesheet">',
        '</head>',
        '<body style="font-family: \'Open Sans\', sans-serif; line-height: 1.6;">',
        '<h1 style="font-weight: 700;">BIO File Visualization</h1>',
    ]

    for token, tag in bio_tags:
        safe_token = html.escape(token)
        if tag == "O":  # Outside entity
            parts.append(f'<span style="margin: 2px;">{safe_token}</span>')
        else:
            # Strip only the "B-"/"I-" prefix; a hyphen inside the entity
            # type itself is kept.
            entity_type = tag.split("-", 1)[-1]
            if entity_type not in entity_colors:
                # Next palette color, wrapping around when we run out.
                entity_colors[entity_type] = predefined_colors[len(entity_colors) % len(predefined_colors)]
            color = entity_colors[entity_type]
            parts.append(
                f'<span style="background-color: {color}; margin: 2px; padding: 2px; border-radius: 4px;">'
                f'{safe_token} <b>{html.escape(entity_type)}</b></span>'
            )
        parts.append(" ")  # Add space between tokens

    parts.append("</body></html>")

    # Write the HTML content to the output file
    Path(output_file).write_text("\n".join(parts), encoding="utf-8")
    # print(f"Visualization saved to {output_file}")

## convert dataset

In [34]:
# Check the ID value of the first row; IDs are used in the output file names below.
train["ID"][0]

1

In [37]:
def convert_df_to_bio(df, bio_dir="bio", html_dir="html"):
    """
    Annotate every note of a DataFrame and save it as a BIO file and an HTML page.

    For each row the note text is tokenized, the entities are extracted from
    the row's JSON string, the tokens are BIO-tagged, and the result is
    written to ``<bio_dir>/note_<ID>.bio`` and ``<html_dir>/note_<ID>.html``.
    A progress line is printed for each row.

    Args:
        df (pandas.DataFrame): Must provide the columns "ID", "Note" (raw
            note text) and "json" (JSON string describing the note).
        bio_dir (str): Directory for the .bio files; created if missing.
        html_dir (str): Directory for the .html files; created if missing.

    Returns:
        None
    """
    bio_path = Path(bio_dir)
    html_path = Path(html_dir)
    # Create the output directories up front so the first write cannot fail
    # with FileNotFoundError.
    bio_path.mkdir(parents=True, exist_ok=True)
    html_path.mkdir(parents=True, exist_ok=True)

    # Iterate by position rather than by index label, so frames that were
    # filtered, shuffled or otherwise re-indexed are processed correctly.
    rows = zip(df["ID"], df["Note"], df["json"])
    for position, (note_id, note, record) in enumerate(rows):
        print(f"Processing Note {position} ========")
        tokens = tokenize_text(note)
        entities = extract_entities_from_json(json.loads(record))
        bio_tags = annotate_text(tokens, entities)
        write_bio_file(bio_tags, bio_path / f"note_{note_id}.bio")
        bio_to_html(bio_tags, html_path / f"note_{note_id}.html")

In [38]:
# Convert the whole training set: one .bio and one .html file per note.
convert_df_to_bio(train)

