In [1]:
%%capture
!pip install scispacy
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_sm-0.5.4.tar.gz
!pip install --upgrade scispacy

!pip install -q transformers
!pip install -q datasets
# !pip install -q sentencepiece
!pip install rouge
!pip install bert_score
!pip install rouge-score
!pip install sacrebleu
# !pip install --upgrade datasets
!pip install evaluate

In [2]:
%%capture
import spacy
import scispacy
from scispacy.linking import EntityLinker

import tensorflow as tf
import numpy as np
import pandas as pd
import torch
import transformers
import os
import re
import json
import bert_score
import rouge
import sacrebleu
import evaluate

import matplotlib.pyplot as plt
import seaborn as sns

from datasets import load_dataset
from datasets import DatasetDict


# from transformers import pipeline, T5Tokenizer, TFT5Model, T5ForConditionalGeneration, AutoTokenizer
from transformers import Trainer, TrainingArguments
from transformers import LongformerTokenizerFast, BartForConditionalGeneration, Trainer, TrainingArguments, BartTokenizer
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq

#evaluation packages
#rogue score
from rouge import Rouge
from evaluate import load
# BERTScore leverages the pre-trained contextual embeddings from BERT and matches words in candidate and reference sentences by cosine similarity.
from bert_score import BERTScorer

import shutil

In [3]:
#mounting Google Drive to save model
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


###1.2 Importing dataset from HuggingFace (save HG token in secrets)

In [None]:
ds = load_dataset("Bilal-Mamji/Medical-summary")

In [None]:
#validating dataset
print(f"{len(ds['train'])} training pairs")
print(f"{len(ds['validation'])} validation pairs")
print(f"{len(ds['test'])} test pairs")

In [5]:
#renaming headers and deleting instructions column
ds = ds.remove_columns(['instruction']) #not relevant to the model baseline
ds = ds.rename_column('input', 'input_text')
ds = ds.rename_column('output', 'target_text')


In [6]:
#saving data to a dataset DatasetDict to work with LongBART tokenization
dataset = DatasetDict({
    'train': ds['train'],
    'validation': ds['validation'],
    'test': ds['test']
})

##1.3 Creating NLP Pipeline for UMLS Entitiy Linker

1.   Load model, create pipeline with configs
2.   Save pipeline to Google Drive.
3. Load pipeline from Google Drive



In [None]:
%%time
# Load the SciSpacy model
nlp = spacy.load("en_core_sci_sm")

# Add the SciSpacy linker to the pipeline with the desired configuration
# nlp.add_pipe("scispacy_linker", config={"resolve_abbreviations": True,
#                                         "linker_name": "umls",
#                                         "incl_context": True,
#                                         "threshold": 0.85,
#                                         "batch_size": 128,
#                                         "candidates_batch_size": 64,
#                                         "filter_for_definition": True})

nlp.add_pipe("scispacy_linker", config={"resolve_abbreviations": True,
                                        "linker_name": "umls",
                                        "threshold": 0.85})

In [None]:

# Save the pipeline
pipeline_path = "/content/drive/My Drive/scispacy_pipeline"
nlp.to_disk(pipeline_path)

print(f"Pipeline saved to {pipeline_path}")

In [7]:
# Load the pipeline
pipeline_path = "/content/drive/My Drive/scispacy_pipeline"
nlp = spacy.load(pipeline_path)

print("Pipeline loaded successfully")

https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linkers/2023-04-23/umls/tfidf_vectors_sparse.npz not found in cache, downloading to /tmp/tmps73vrw_8


100%|██████████| 492M/492M [00:10<00:00, 50.7MiB/s]


Finished download, copying /tmp/tmps73vrw_8 to cache at /root/.scispacy/datasets/2b79923846fb52e62d686f2db846392575c8eb5b732d9d26cd3ca9378c622d40.87bd52d0f0ee055c1e455ef54ba45149d188552f07991b765da256a1b512ca0b.tfidf_vectors_sparse.npz
https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linkers/2023-04-23/umls/nmslib_index.bin not found in cache, downloading to /tmp/tmpotu60_h1


100%|██████████| 724M/724M [00:27<00:00, 27.7MiB/s]


Finished download, copying /tmp/tmpotu60_h1 to cache at /root/.scispacy/datasets/7e8e091ec80370b87b1652f461eae9d926e543a403a69c1f0968f71157322c25.6d801a1e14867953e36258b0e19a23723ae84b0abd2a723bdd3574c3e0c873b4.nmslib_index.bin
https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linkers/2023-04-23/umls/tfidf_vectorizer.joblib not found in cache, downloading to /tmp/tmpnibg8j2s


100%|██████████| 1.32M/1.32M [00:00<00:00, 5.70MiB/s]
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Finished download, copying /tmp/tmpnibg8j2s to cache at /root/.scispacy/datasets/37bc06bb7ce30de7251db5f5cbac788998e33b3984410caed2d0083187e01d38.f0994c1b61cc70d0eb96dea4947dddcb37460fb5ae60975013711228c8fe3fba.tfidf_vectorizer.joblib


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linkers/2023-04-23/umls/concept_aliases.json not found in cache, downloading to /tmp/tmpd67zjkio


100%|██████████| 264M/264M [00:05<00:00, 49.8MiB/s]


Finished download, copying /tmp/tmpd67zjkio to cache at /root/.scispacy/datasets/6238f505f56aca33290aab44097f67dd1b88880e3be6d6dcce65e56e9255b7d4.d7f77b1629001b40f1b1bc951f3a890ff2d516fb8fbae3111b236b31b33d6dcf.concept_aliases.json
https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/kbs/2023-04-23/umls_2022_ab_cat0129.jsonl not found in cache, downloading to /tmp/tmpbf3omcgo


100%|██████████| 628M/628M [00:14<00:00, 46.1MiB/s]


Finished download, copying /tmp/tmpbf3omcgo to cache at /root/.scispacy/datasets/d5e593bc2d8adeee7754be423cd64f5d331ebf26272074a2575616be55697632.0660f30a60ad00fffd8bbf084a18eb3f462fd192ac5563bf50940fc32a850a3c.umls_2022_ab_cat0129.jsonl
https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/umls_semantic_type_tree.tsv not found in cache, downloading to /tmp/tmp9c0zt1ib


100%|██████████| 4.26k/4.26k [00:00<00:00, 3.03MiB/s]

Finished download, copying /tmp/tmp9c0zt1ib to cache at /root/.scispacy/datasets/21a1012c532c3a431d60895c509f5b4d45b0f8966c4178b892190a302b21836f.330707f4efe774134872b9f77f0e3208c1d30f50800b3b39a6b8ec21d9adf1b7.umls_semantic_type_tree.tsv



  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


Pipeline loaded successfully


###1.3.1 Testing UMLS Entity Pipeline

In [19]:
text = """
The patient reports experiencing shortness of breath and chest tightness.
They were prescribed Lisinopril and diagnosed with hypertension.
During the visit, their blood pressure was measured at 150/100.
The doctor recommended lifestyle changes.
"""

# Process the text
doc = nlp(text)

# Extract UMLS entities
def extract_umls_entities(doc):
    annotations = []
    linker = nlp.get_pipe("scispacy_linker")

    for entity in doc.ents:
        if entity._.kb_ents:  # Check if the entity has UMLS links
            for umls_ent in entity._.kb_ents[:1]:  # Top match (highest confidence)
                cui = umls_ent[0]
                score = umls_ent[1]
                umls_entity = linker.kb.cui_to_entity[cui]
                annotations.append({
                    "entity": entity.text,
                    "start": entity.start_char,
                    "end": entity.end_char,
                    "cui": cui,
                    "name": umls_entity.canonical_name,
                    "definition": umls_entity.definition,
                    "semantic_types": umls_entity.types,
                    "confidence": score
                })
    return annotations

annotations = extract_umls_entities(doc)

In [None]:
annotations

In [15]:
# del annotations
import gc
gc.collect()
print(f"Memory usage before deletion: {psutil.virtual_memory().used / 1e9:.2f} GB")

Memory usage before deletion: 13.62 GB


##2. UMLS Entity Approach

1.   Preprocessing the input_data (train, validate, test) with UMLS
2.   List item



In [16]:
#read in TUI SOAP categorization (customized)
df = pd.read_csv('/content/drive/My Drive/TUI_manual.csv')

In [22]:
# Extract UMLS entities
def extract_umls_entities(doc):
    annotations = []
    linker = nlp.get_pipe("scispacy_linker")

    for entity in doc.ents:
        if entity._.kb_ents:  # Check if the entity has UMLS links
            for umls_ent in entity._.kb_ents[:1]:  # Top match (highest confidence)
                cui = umls_ent[0]
                score = umls_ent[1]
                umls_entity = linker.kb.cui_to_entity[cui]
                annotations.append({
                    "entity": entity.text,
                    "start": entity.start_char,
                    "end": entity.end_char,
                    "cui": cui,
                    "name": umls_entity.canonical_name,
                    "definition": umls_entity.definition,
                    "semantic_types": umls_entity.types,
                    "confidence": score
                })
    return annotations

In [23]:
from re import sub
# Convert the desired column to a list instead of the entire DataFrame
subjective = df[df['SOAP'] == 'Subjective']['TUI'].tolist() # Assuming 'TUI' is the column you want
objective = df[df['SOAP'] == 'Objective']['TUI'].tolist()  # Assuming 'TUI' is the column you want
assessment = df[df['SOAP'] == 'Assessment']['TUI'].tolist() # Assuming 'TUI' is the column you want
plan = df[df['SOAP'] == 'Plan']['TUI'].tolist() # Assuming 'TUI' is the column you want

In [24]:
def assign_soap_sections(annotations):
    section_mapping = {
        "Subjective": tuple(subjective),  # Signs or Symptoms
        "Objective": tuple(objective),  # Findings or Lab Results
        "Assessment": tuple(assessment),  # Diseases or Syndromes
        "Plan": tuple(plan),  # Drugs or Therapies
    }

    for annotation in annotations:
        # Assign default section
        annotation["section"] = "Uncategorized"

        # Match semantic types to SOAP sections
        for section, semantic_types in section_mapping.items():
            if any(tui in semantic_types for tui in annotation["semantic_types"]):
                annotation["section"] = section
                break
    return annotations

# annotated_with_sections = assign_soap_sections(annotations)



In [30]:
def annotate_example(example):
    """Annotate input_text with UMLS entities and SOAP sections."""
    # Process text with the pipeline
    doc = nlp(example["input_text"])

    # Extract UMLS entities
    annotations = extract_umls_entities(doc)

    # Assign SOAP sections
    annotated_with_sections = assign_soap_sections(annotations)

    #simplify annotations to only necessary key value pairs for fine-tuning
    #purpose is to save input token length
    filtered_annotations = [
        {
            "entity": annotation["entity"],
            "section": annotation["section"]
        }
        for annotation in annotated_with_sections
    ]

    # Add the annotations to the example
    # example["umls_annotations"] = annotated_with_sections
    example["umls_annotations"] = filtered_annotations
    return example

In [38]:
# Annotate the entire DatasetDict
annotated_dataset = dataset.map(annotate_example, batched=False)

Map:   0%|          | 0/9250 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [39]:
annotated_dataset.save_to_disk("/content/drive/My Drive/annotated_medical_dataset")

Saving the dataset (0/1 shards):   0%|          | 0/9250 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

##3. DistilBART Fine-Tuning

In [42]:
from transformers import BartForConditionalGeneration, BartTokenizer
model = BartForConditionalGeneration.from_pretrained("sshleifer/distilbart-cnn-12-6")
tokenizer = BartTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")

config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

In [43]:
# Sliding window chunking function
%%time
def sliding_window_chunking(text, tokenizer, max_length=900, stride=256):
    """
    Split a long text into overlapping chunks using a sliding window.

    Args:
        text (str): The input text to be chunked.
        tokenizer (Tokenizer): The tokenizer to calculate token lengths.
        max_length (int): Maximum length of each chunk.
        stride (int): Overlap between consecutive chunks.

    Returns:
        List[str]: A list of text chunks.
    """
    # Tokenize the entire text
    tokens = tokenizer(text, truncation=False, return_tensors="pt")["input_ids"][0]

    # Break into chunks
    chunks = []
    start = 0
    while start < len(tokens):
        end = min(start + max_length, len(tokens))
        chunk = tokens[start:end]

        # Convert back to text for chunk processing
        chunk_text = tokenizer.decode(chunk, skip_special_tokens=True)
        chunks.append(chunk_text)

        # Move the window
        start += max_length - stride

    return chunks


# Updated preprocessing function
def preprocess_data_with_chunks(batch):
    """
    Preprocess data by chunking long input_texts, tokenizing them, and pairing each chunk
    with the target_text.
    """
    # Initialize lists for batched output
    input_ids = []
    attention_mask = []
    labels = []

    for input_text, target_text in zip(batch["input_text"], batch["target_text"]):
        # Generate chunks for input_text
        chunks = sliding_window_chunking(input_text, tokenizer, max_length=900, stride=256)

        for chunk in chunks:
            # Tokenize the chunk
            tokenized_input = tokenizer(chunk, max_length=900, truncation=True, padding="max_length")
            tokenized_target = tokenizer(target_text, max_length=600, truncation=True, padding="max_length")

            # Append tokenized data
            input_ids.append(tokenized_input["input_ids"])
            attention_mask.append(tokenized_input["attention_mask"])
            labels.append(tokenized_target["input_ids"])

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

# Apply the preprocessing function to the dataset
tokenized_datasets = dataset.map(preprocess_data_with_chunks, batched=True, remove_columns=dataset["train"].column_names)

Map:   0%|          | 0/9250 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

CPU times: user 3min 56s, sys: 745 ms, total: 3min 57s
Wall time: 3min 56s


In [44]:
from transformers import Seq2SeqTrainingArguments

# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results_distilbart",  # Directory to save checkpoints
    num_train_epochs=3,                # Number of training epochs
    per_device_train_batch_size=4,     # Adjust batch size for memory
    gradient_accumulation_steps=8,     # Accumulate gradients for larger effective batch size
    eval_strategy="steps",       # Evaluate after a certain number of steps
    eval_steps=500,                    # Perform evaluation every 500 steps
    save_steps=500,                    # Save model checkpoint every 500 steps
    save_total_limit=2,                # Keep only the 2 most recent checkpoints
    learning_rate=5e-5,                # Learning rate
    predict_with_generate=True,        # Generate text during evaluation
    fp16=True,                         # Use mixed precision to save memory
    logging_dir="./logs_distilbart",   # Directory for logs
    logging_steps=100,                 # Log every 100 steps
)

In [45]:
%%time
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
)

#start training
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss,Validation Loss
500,0.8765,0.842138
1000,0.764,0.794083




CPU times: user 16min 38s, sys: 13.3 s, total: 16min 52s
Wall time: 17min 39s


TrainOutput(global_step=1242, training_loss=0.9506807189056838, metrics={'train_runtime': 1051.9219, 'train_samples_per_second': 37.816, 'train_steps_per_second': 1.181, 'total_flos': 5.40705040367616e+16, 'train_loss': 0.9506807189056838, 'epoch': 2.997285067873303})

In [46]:
#saving model to Google Drive
import shutil

# Save the model and tokenizer locally
model.save_pretrained("./fine_tuned_distilbart_umls", safe_serialization=True)
tokenizer.save_pretrained("./fine_tuned_distilbart_umls")

# Path where the model is saved locally
local_model_path = "/content/fine_tuned_distilbart_umls"  # Corrected this path to match the save_pretrained directory

# Path in Google Drive where you want to save the model
drive_model_path = "/content/drive/My Drive/DistilBARTFolder_UMLS"

# Copy the entire directory to Google Drive
shutil.copytree(local_model_path, drive_model_path)

print("Model directory uploaded to Google Drive!")

Model directory uploaded to Google Drive!


##4. Model Evaluation

In [47]:
#load model from drive
from transformers import BartForConditionalGeneration, BartTokenizer
model = BartForConditionalGeneration.from_pretrained("/content/drive/My Drive/DistilBARTFolder_UMLS")
tokenizer = BartTokenizer.from_pretrained("/content/drive/My Drive/DistilBARTFolder_UMLS")




In [48]:
def generate_prediction(input_text):
    '''function to generate predictions from input_text, this case text dataset.
    This will be used for human evaluation'''
    inputs = tokenizer(input_text, return_tensors="pt", max_length=900, truncation=True, padding=True)

    # Move input tensors to the same device as the model
    input_ids = inputs["input_ids"].to(model.device)  # Assuming model is on GPU

    outputs = model.generate(
        input_ids,
        max_length=512,  # Adjust based on your expected output length
        do_sample=True,  # Enable sampling
        top_k=50,        # Top-k sampling for diversity
        top_p=0.95,      # Nucleus sampling
        temperature=1.0, # Controls randomness
        num_beams=4,     # Beam search for better predictions
        length_penalty=2.0,
        early_stopping=True
    )
    prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return prediction

In [49]:
#measure entire cell process time
%%time
import random

# Set a random seed for reproducibility
random.seed(42)
# Number of samples to display
num_samples = 10

# Select random samples from the Hugging Face Dataset
# random_samples = dataset["test"].shuffle(seed=42).select(range(num_samples))
subset_samples = dataset["test"].select(range(num_samples))

#empty list to save outputs, will be used to export to Google Drive below
results = []

# Generate predictions and print results
for idx, row in enumerate(subset_samples):
    input_text = row["input_text"]
    target_text = row["target_text"]
    prediction = generate_prediction(input_text)

    # print(f"\nSample {idx + 1}:")
    # print(f"Input Text:\n{input_text}\n")
    # print(f"Target Text (Ground Truth):\n{target_text}\n")
    # print(f"Model Prediction:\n{prediction}\n")
    # print("-" * 80)

    # Append to results
    results.append({
        "Input Text": input_text,
        "Target Text": target_text,
        "Model Prediction": prediction
    })

    # Optional: Print progress
    print(f"Processed Sample {idx + 1}/{len(subset_samples)}")




Processed Sample 1/10
Processed Sample 2/10
Processed Sample 3/10
Processed Sample 4/10
Processed Sample 5/10
Processed Sample 6/10
Processed Sample 7/10
Processed Sample 8/10
Processed Sample 9/10
Processed Sample 10/10
CPU times: user 17min 37s, sys: 5.92 s, total: 17min 43s
Wall time: 2min 57s


In [50]:
# Convert results to a DataFrame
df = pd.DataFrame(results)

# Define the output path in Google Drive
output_path = "/content/drive/My Drive/DistilBART_UMLS_baseHEval.csv"

# Save to CSV
df.to_csv(output_path, index=False)

print(f"Results saved to {output_path}")

Results saved to /content/drive/My Drive/DistilBART_UMLS_baseHEval.csv


##4.1 ROUGE Evaluation

In [51]:
#load rouge metric
rouge = load("rouge") #lrouge metric using load function
#gpu to local device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#move model to local
model.to(device)

#generate predictions and references
predictions = []
references = []
for row in dataset["test"]:  # Iterate through the dataset
    input_text = row["input_text"]
    target_text = row["target_text"]  # Extract the target text
    prediction = generate_prediction(input_text)
    predictions.append(prediction)
    references.append(target_text)

#calcuating rouge score
rouge_results = rouge.compute(predictions=predictions, references=references)
print("ROUGE Results:", rouge_results)

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

ROUGE Results: {'rouge1': 0.6741695408337932, 'rouge2': 0.4307359804407936, 'rougeL': 0.5123549412822074, 'rougeLsum': 0.6046393749622261}


##4.2 BLEU Evaluation

In [52]:
# ipython-input-16-341afa8f0746
#use generate_prediction function
# predictions, references = generate_predictions(dataset["test"], model, tokenizer, device)
predictions = []
references = []
for row in dataset["test"]:
  prediction = generate_prediction(row['input_text'])
  predictions.append(prediction)
  references.append(row['target_text'])


#BLEU expects references as a list of lists
references = [[ref] for ref in references]

#calculating BLEU score
bleu_score = sacrebleu.corpus_bleu(predictions, references)
print("BLEU Score:", bleu_score.score)

BLEU Score: 72.2443582522423


##4.3 BERT Score Evaluation

In [53]:
#BERT Score function
def evaluate_bertscore(predictions, references, lang="en"):
    # Calculate BERTScore
    P, R, F1 = bert_score.score(predictions, references, lang=lang, verbose=True)

    # Return precision, recall, and F1 as average scores
    return {
        "BERTScore Precision": P.mean().item(),
        "BERTScore Recall": R.mean().item(),
        "BERTScore F1": F1.mean().item()
    }

print(evaluate_bertscore(predictions, references))

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

calculating scores...
computing bert embedding.


  0%|          | 0/8 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/4 [00:00<?, ?it/s]

done in 8.50 seconds, 29.42 sentences/sec
{'BERTScore Precision': 0.920655369758606, 'BERTScore Recall': 0.9217320084571838, 'BERTScore F1': 0.9211450219154358}


## Memory Check

In [41]:
import gc
gc.collect()

185

In [40]:
# Install psutil if not already available
!pip install psutil

import psutil

# Check RAM usage
ram_usage = psutil.virtual_memory()
print(f"Total: {ram_usage.total / 1e9:.2f} GB, Used: {ram_usage.used / 1e9:.2f} GB, Available: {ram_usage.available / 1e9:.2f} GB")

Total: 89.63 GB, Used: 13.91 GB, Available: 74.78 GB


In [None]:
%%time

# Process a sample text
doc = nlp(
    "Spinal and bulbar muscular atrophy (SBMA) is an "
    "inherited motor neuron disease caused by the expansion "
    "of a polyglutamine tract within the androgen receptor (AR). "
    "SBMA can be caused by this easily."
)

# Check if there are any entities in the document
if doc.ents:
    # Select a random entity (e.g., the second entity, if it exists)
    entity = doc.ents[1] if len(doc.ents) > 1 else doc.ents[0]
    print("Name:", entity)

    # Get the linker from the pipeline
    linker = nlp.get_pipe("scispacy_linker")

    # Display UMLS links for the entity
    if entity._.kb_ents:
        for umls_ent in entity._.kb_ents:
            cui = umls_ent[0]
            score = umls_ent[1]
            umls_entity = linker.kb.cui_to_entity[cui]
            print(f"CUI: {cui}, Name: {umls_entity.canonical_name}")
            print(f"Definition: {umls_entity.definition}")
            print(f"TUI(s): {', '.join(umls_entity.types)}")
            print(f"Aliases (total: {len(umls_entity.aliases)}):")
            print(", ".join(umls_entity.aliases[:5]), "...")
    else:
        print("No UMLS links found for this entity.")
else:
    print("No entities found in the text.")

In [None]:
# Process a sample text
doc = nlp(
    "Spinal and bulbar muscular atrophy (SBMA) is an "
    "inherited motor neuron disease caused by the expansion "
    "of a polyglutamine tract within the androgen receptor (AR). "
    "SBMA can be caused by this easily."
)



# Check if there are any entities in the document
if doc.ents:
    # Select a random entity (e.g., the second entity, if it exists)
    entity = doc.ents[1] if len(doc.ents) > 1 else doc.ents[0]
    print("Name:", entity)

    # Get the linker from the pipeline
    linker = nlp.get_pipe("scispacy_linker")

    # Display UMLS links for the entity
    if entity._.kb_ents:
        for umls_ent in entity._.kb_ents:
            cui = umls_ent[0]
            score = umls_ent[1]
            umls_entity = linker.kb.cui_to_entity[cui]
            print(f"CUI: {cui}, Name: {umls_entity.canonical_name}")
            print(f"Definition: {umls_entity.definition}")
            print(f"TUI(s): {', '.join(umls_entity.types)}")
            print(f"Aliases (total: {len(umls_entity.aliases)}):")
            print(", ".join(umls_entity.aliases[:5]), "...")
    else:
        print("No UMLS links found for this entity.")
else:
    print("No entities found in the text.")

In [None]:
# Iterate over all entities
for entity in doc.ents:
    print(f"Entity: {entity.text}")
    # Check for UMLS links
    if entity._.kb_ents:
        for umls_ent in entity._.kb_ents:
            cui = umls_ent[0]
            score = umls_ent[1]
            umls_entity = linker.kb.cui_to_entity[cui]
            print(f"  - CUI: {cui}, Name: {umls_entity.canonical_name}")
            print(f"    Definition: {umls_entity.definition}")
            print(f"    TUI(s): {', '.join(umls_entity.types)}")
    else:
        print("  No UMLS links found for this entity.")