In [37]:
import pandas as pd
import numpy as np
from collections import Counter
import langdetect
from datasets import load_dataset

In [10]:
# Load the "argilla/medical-domain" dataset through `datasets.load_dataset`.
ds = load_dataset("argilla/medical-domain")
# Keep a handle on the 'train' split; the analysis cells below operate on it.
train_data = ds['train']

# Bare last expression so the notebook displays the DatasetDict (splits, features, row count).
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'inputs', 'prediction', 'prediction_agent', 'annotation', 'annotation_agent', 'multi_label', 'explanation', 'id', 'metadata', 'status', 'event_timestamp', 'metrics'],
        num_rows: 4966
    })
})

# Exploring dataset

In [59]:
def inspect_data(data, num_samples=5):
    """Print the feature names of ``data`` followed by its first rows.

    Args:
        data: A sized, integer-indexable collection of row dicts. If it
            exposes a ``column_names`` attribute, that is used for the
            feature listing; otherwise the keys of the first row are used.
        num_samples: Maximum number of rows to print. Values larger than
            ``len(data)`` are clamped, so short datasets do not raise
            ``IndexError``; values <= 0 print no rows.

    Returns:
        None. Everything is written to stdout.
    """
    total_rows = len(data)
    rows_to_show = max(0, min(num_samples, total_rows))

    # Feature names are looked up explicitly instead of iterating over
    # ``data[:num_samples]``: slicing a column-oriented dataset yields a dict
    # of columns (so iteration gives column names), while slicing a list
    # yields rows -- the old loop printed different things for each.
    feature_names = getattr(data, "column_names", None)
    if feature_names is None:
        feature_names = list(data[0]) if total_rows else []

    for i, name in enumerate(feature_names, 1):
        print(f"Features {i}: {name}\n")

    # Print the structure and content of the first few data entries.
    for i in range(rows_to_show):
        sample = data[i]
        print(f"Sample {i+1}: {sample}\n")

# Print the dataset's feature names followed by the first few rows of the
# training split (uses the function's default of 5 samples).
inspect_data(train_data)

Features 1: text

Features 2: inputs

Features 3: prediction

Features 4: prediction_agent

Features 5: annotation

Features 6: annotation_agent

Features 7: multi_label

Features 8: explanation

Features 9: id

Features 10: metadata

Features 11: status

Features 12: event_timestamp

Features 13: metrics

Sample 1: {'text': 'PREOPERATIVE DIAGNOSIS:,  Iron deficiency anemia.,POSTOPERATIVE DIAGNOSIS:,  Diverticulosis.,PROCEDURE:,  Colonoscopy.,MEDICATIONS: , MAC.,PROCEDURE: , The Olympus pediatric variable colonoscope was introduced into the rectum and advanced carefully through the colon to the cecum identified by the ileocecal valve and the appendiceal orifice.  Preparation was good, although there was some residual material in the cecum that was difficult to clear completely.  The mucosa was normal throughout the colon.  No polyps or other lesions were identified, and no blood was noted.  Some diverticula were seen of the sigmoid colon with no luminal narrowing or evidence of inflamm

In [54]:
# 1. Basic statistics
def _length_summary(lengths):
    """Format the min/avg/max of ``lengths``; report 'n/a' when it is empty."""
    if not lengths:
        return "Min: n/a, Avg: n/a, Max: n/a"
    return f"Min: {np.min(lengths)}, Avg: {np.mean(lengths):.1f}, Max: {np.max(lengths)}"


def basic_stats(data):
    """Print sample count, label distribution and text-length statistics.

    Args:
        data: A sized iterable of row dicts. Each row is read for two keys:
            ``'text'`` (a string or ``None``) and ``'prediction'`` (an
            iterable of dicts carrying a ``'label'`` key, or ``None``/empty).

    Returns:
        None. The statistics are written to stdout.

    Notes:
        * Statistics are computed from ``data`` itself, not from a global
          dataset, so the function works on any split or subset.
        * Rows whose ``'prediction'`` is ``None``, ``''`` or empty contribute
          no labels instead of raising ``TypeError``.
        * The first length statistic counts whitespace-separated tokens
          (``str.split()``), i.e. words -- not sentences.
        * Labels are counted verbatim (no whitespace stripping).
    """
    # Number of samples
    num_samples = len(data)

    predictions = []
    word_counts = []
    char_counts = []

    # Single pass over the rows: collect labels and both length measures.
    for example in data:
        # ``or []`` skips missing predictions *before* iterating over them.
        for item in example['prediction'] or []:
            predictions.append(item['label'])

        text = example['text']
        if text is not None:
            word_counts.append(len(text.split()))
            char_counts.append(len(text))

    # Counting the frequency of each label; its size is the class count.
    prediction_counts = Counter(predictions)
    unique_predictions = len(prediction_counts)

    print(f"Number of samples: {num_samples}")
    print(f"Number of classes: {unique_predictions}")
    print(f"Number of samples per class: {prediction_counts}")
    print(f"Text length (# words) - {_length_summary(word_counts)}")
    print(f"Text length (# characters) - {_length_summary(char_counts)}")

In [55]:
# Summarise the training split: sample count, label distribution and text-length statistics.
basic_stats(train_data)

Number of samples: 4966
Number of classes: 40
Number of samples per class: Counter({' Surgery': 1088, ' Consult - History and Phy.': 516, ' Cardiovascular / Pulmonary': 371, ' Orthopedic': 355, ' Radiology': 273, ' General Medicine': 259, ' Gastroenterology': 224, ' Neurology': 223, ' SOAP / Chart / Progress Notes': 166, ' Urology': 156, ' Obstetrics / Gynecology': 155, ' Discharge Summary': 108, ' ENT - Otolaryngology': 96, ' Neurosurgery': 94, ' Hematology - Oncology': 90, ' Ophthalmology': 83, ' Nephrology': 81, ' Emergency Room Reports': 75, ' Pediatrics - Neonatal': 70, ' Pain Management': 61, ' Psychiatry / Psychology': 53, ' Office Notes': 50, ' Podiatry': 47, ' Dermatology': 29, ' Dentistry': 27, ' Cosmetic / Plastic Surgery': 27, ' Letters': 23, ' Physical Medicine - Rehab': 21, ' Sleep Medicine': 20, ' Endocrinology': 19, ' Bariatrics': 18, ' IME-QME-Work Comp etc.': 16, ' Chiropractic': 14, ' Diets and Nutritions': 10, ' Rheumatology': 10, ' Speech - Language': 9, ' Lab Medi

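Conclusion: The dataset is not balanced. Surgery alone accounts for 1,088 labels, roughly 22% of the 4,966 samples, while many classes (e.g., Dermatology, Dentistry, Rheumatology) have fewer than 30 examples each.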

In [56]:
# 2. Determine national languages used
def detect_languages(data, top_n=5):
    """Detect the language of every non-null text and print the most common.

    Args:
        data: An iterable of row dicts; only the ``'text'`` key is read.
            Rows whose text is ``None`` are skipped.
        top_n: How many of the most frequent languages to print
            (defaults to 5, matching the original report).

    Returns:
        None. The ranking is written to stdout.

    Notes:
        Texts for which ``langdetect`` cannot produce a result (it raises
        ``LangDetectException``) are counted under the pseudo-language
        ``'unknown'`` instead of aborting the whole run.
    """
    # langdetect's algorithm is randomised; fixing the seed makes the counts
    # reproducible across kernel restarts. This sets library-global state.
    langdetect.DetectorFactory.seed = 0

    detected_languages = []
    for item in data:
        text = item['text']
        if text is None:
            continue
        try:
            detected_languages.append(langdetect.detect(text))
        except langdetect.LangDetectException:
            detected_languages.append('unknown')

    language_counts = Counter(detected_languages)

    print(f"Top {top_n} detected languages: {language_counts.most_common(top_n)}")

In [57]:
# Run language detection over every non-null text in the training split.
detect_languages(train_data)

Top 5 detected languages: [('en', 4954), ('pt', 3), ('so', 2), ('de', 2), ('tl', 2)]


In [42]:
# 3. Review 100+ samples for style, vocabulary, and spelling
def sample_analysis(data, num_samples=100):
    """Print a truncated preview of the leading non-null texts for manual review.

    Args:
        data: An iterable of row dicts; only the ``'text'`` key is read.
        num_samples: Upper bound of the slice taken from the non-null texts.

    Returns:
        None. Each preview is written to stdout as ``Sample <n>:`` followed
        by at most the first 500 characters and a trailing ``...``.
    """
    # Gather every text that is present, preserving dataset order.
    non_null_texts = []
    for record in data:
        body = record['text']
        if body is not None:
            non_null_texts.append(body)

    # Number the previews from 1; cap each at 500 characters for brevity.
    position = 0
    for body in non_null_texts[:num_samples]:
        position += 1
        preview = body[:500]
        print(f"Sample {position}:\n{preview}...\n")

In [43]:
# Print previews of the first 100 non-null training texts (the function's default) for manual review.
sample_analysis(train_data)

Sample 1:
PREOPERATIVE DIAGNOSIS:,  Iron deficiency anemia.,POSTOPERATIVE DIAGNOSIS:,  Diverticulosis.,PROCEDURE:,  Colonoscopy.,MEDICATIONS: , MAC.,PROCEDURE: , The Olympus pediatric variable colonoscope was introduced into the rectum and advanced carefully through the colon to the cecum identified by the ileocecal valve and the appendiceal orifice.  Preparation was good, although there was some residual material in the cecum that was difficult to clear completely.  The mucosa was normal throughout the c...

Sample 2:
CLINICAL INDICATION:  ,Normal stress test.,PROCEDURES PERFORMED:,1.  Left heart cath.,2.  Selective coronary angiography.,3.  LV gram.,4.  Right femoral arteriogram.,5.  Mynx closure device.,PROCEDURE IN DETAIL: , The patient was explained about all the risks, benefits, and alternatives of this procedure.  The patient agreed to proceed and informed consent was signed.,Both groins were prepped and draped in the usual sterile fashion.  After local anesthesia with 2% lidoca

Noteworthy style: Most of the time, the text consists of short, telegraphic phrases rather than full sentences, and it often omits function words (articles and conjunctions such as "a", "the", and "and").

Vocabulary: Because of the content, there are a lot of medical terms in the text.

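Spelling: Medical records are written under time pressure, with the goal of maximizing the time spent on patients, so spelling and grammar are not a priority. This results in messy text (e.g., stray commas between section headings and their content, as in "PREOPERATIVE DIAGNOSIS:,  Iron deficiency anemia.").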