In [17]:
from datasets import load_dataset
from sklearn.model_selection import train_test_split

In [18]:
#!pip install spacy

In [19]:
import spacy

In [20]:
# Load the "argilla/medical-domain" dataset with `datasets.load_dataset` and keep
# a handle on its 'train' split; the NER cells below iterate over `train_data`.
ds = load_dataset("argilla/medical-domain")
train_data = ds['train']

In [21]:
def inspect_data(data, num_samples=5):
    """Print a quick overview of ``data``.

    Two passes are printed:

    1. ``Features N: ...`` for every item obtained by iterating ``data[:num_samples]``.
    2. ``Sample N: ...`` for the first ``num_samples`` records, accessed by integer index.

    Args:
        data: A sized container that supports slicing and integer indexing
            (e.g. a list, or the dataset split used in this notebook).
        num_samples: Maximum number of records to print. If ``data`` holds fewer
            records, only the available ones are printed.

    Returns:
        None. Everything is written to stdout.
    """
    # For the dataset split used in this notebook, iterating the slice yields the
    # column names rather than rows (see the "Features N: <name>" output below);
    # presumably the slice is a mapping of column -> values — verify against the
    # `datasets` documentation. For a plain list this prints the leading items.
    for i, item in enumerate(data[:num_samples], 1):
        print(f"Features {i}: {item}\n")

    # Clamp to the available length so that data shorter than `num_samples`
    # does not raise IndexError on `data[i]`.
    for i in range(min(num_samples, len(data))):
        sample = data[i]
        print(f"Sample {i+1}: {sample}\n")

# Inspect the training split with the default number of samples (5)
inspect_data(train_data)

Features 1: text

Features 2: inputs

Features 3: prediction

Features 4: prediction_agent

Features 5: annotation

Features 6: annotation_agent

Features 7: multi_label

Features 8: explanation

Features 9: id

Features 10: metadata

Features 11: status

Features 12: event_timestamp

Features 13: metrics

Sample 1: {'text': 'PREOPERATIVE DIAGNOSIS:,  Iron deficiency anemia.,POSTOPERATIVE DIAGNOSIS:,  Diverticulosis.,PROCEDURE:,  Colonoscopy.,MEDICATIONS: , MAC.,PROCEDURE: , The Olympus pediatric variable colonoscope was introduced into the rectum and advanced carefully through the colon to the cecum identified by the ileocecal valve and the appendiceal orifice.  Preparation was good, although there was some residual material in the cecum that was difficult to clear completely.  The mucosa was normal throughout the colon.  No polyps or other lesions were identified, and no blood was noted.  Some diverticula were seen of the sigmoid colon with no luminal narrowing or evidence of inflamm

## Standard and potential new NER types
Standard NER types that are prominent in our data are:
* Person: often anonymized, e.g. Dr. X
* date and time: of diagnosis, treatments
* location: different clinics
* organizations: departments
* cardinal: dosages, quantification
* percent
* quantification
* product


Potential new NER types that are prominent in our data are:
* diagnosis
* symptoms
* treatment
* medication
* lab values
* body parts
* medical equipment
* genetic markers
* dosage information
* hospital department
* allergies
* clinical tests

## Apply the standard NER classifier of spaCy to our data

In [22]:
#!python -m spacy download en_core_web_sm

In [23]:
# Load spaCy's small English pipeline. The model package must already be installed
# (see the commented-out `spacy download en_core_web_sm` command in the previous cell).
# `nlp` is used as a module-level global by the spaCy `extract_entities` helper below.
nlp = spacy.load("en_core_web_sm")

In [24]:
# spaCy-based entity extraction helper
def extract_entities(text):
    """Run the module-level spaCy pipeline ``nlp`` on ``text``.

    Args:
        text: The string to analyse.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples, one per entity in ``doc.ents``.
    """
    pairs = []
    for ent in nlp(text).ents:
        pairs.append((ent.text, ent.label_))
    return pairs


In [30]:
# Run entity extraction over every record of the training split, keeping the
# source text next to the entities found in it.
results = [
    {'text': example['text'], 'entities': extract_entities(example['text'])}
    for example in train_data
]

In [31]:
# Sanity check: show the text and extracted entities of the first 5 results
for result in results[:5]:
    print(f"Text: {result['text']}", f"Entities: {result['entities']}\n", sep="\n")

Text: PREOPERATIVE DIAGNOSIS:,  Iron deficiency anemia.,POSTOPERATIVE DIAGNOSIS:,  Diverticulosis.,PROCEDURE:,  Colonoscopy.,MEDICATIONS: , MAC.,PROCEDURE: , The Olympus pediatric variable colonoscope was introduced into the rectum and advanced carefully through the colon to the cecum identified by the ileocecal valve and the appendiceal orifice.  Preparation was good, although there was some residual material in the cecum that was difficult to clear completely.  The mucosa was normal throughout the colon.  No polyps or other lesions were identified, and no blood was noted.  Some diverticula were seen of the sigmoid colon with no luminal narrowing or evidence of inflammation.  A retroflex view of the anorectal junction showed no hemorrhoids.  The patient tolerated the procedure well and was sent to the recovery room.,FINAL DIAGNOSES:,1.  Diverticulosis in the sigmoid.,2.  Otherwise normal colonoscopy to the cecum.,RECOMMENDATIONS:,1.  Follow up with Dr. X as needed.,2.  Screening colon

In [32]:
import pandas as pd
# Convert the list of {'text', 'entities'} dicts to a DataFrame and save it as
# 'results.csv' in the current working directory (row index omitted).
# NOTE(review): the 'entities' column holds Python lists of (text, label) tuples;
# CSV will presumably store them as their string representation — confirm this is
# acceptable if the file needs to be read back into structured form.
df = pd.DataFrame(results)
df.to_csv('results.csv', index=False)

# Evaluate the NER classification

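We have no ground-truth entity annotations for this data, so how can we evaluate the results?

One option is to compare spaCy's predictions with those of a different model: Flair.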

In [34]:
#pip install flair

In [36]:
from flair.models import SequenceTagger
from flair.data import Sentence

# Load Flair's 'ner' sequence tagger. Per the load log printed below, it predicts
# PER, ORG, LOC and MISC tags. `tagger` is used as a module-level global by the
# Flair `extract_entities` helper defined in the next cell.
tagger = SequenceTagger.load('ner')


2024-11-04 08:53:35,186 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>


In [37]:
# Flair-based entity extraction helper.
# NOTE: this rebinds the name `extract_entities` that an earlier cell defined for
# spaCy; any cell executed after this one calls the Flair version.
def extract_entities(text):
    """Tag ``text`` with the module-level Flair ``tagger``.

    Args:
        text: The string to analyse.

    Returns:
        A list of ``(span_text, label_value)`` tuples, one per 'ner' span
        found in the sentence.
    """
    sentence = Sentence(text)
    tagger.predict(sentence)
    pairs = []
    for span in sentence.get_spans('ner'):
        pairs.append((span.text, span.get_label('ner').value))
    return pairs


In [None]:
# Run the (Flair) entity extraction over every record of the training split,
# keeping the source text next to the entities found in it.
results_flair = [
    {'text': example['text'], 'entities': extract_entities(example['text'])}
    for example in train_data
]

In [None]:
# Sanity check: show the text and extracted entities of the first 5 Flair results
for result in results_flair[:5]:
    print(f"Text: {result['text']}", f"Entities: {result['entities']}\n", sep="\n")