In [1]:
import pandas as pd
from google.colab import files
import numpy
import nltk
!pip install scikit-learn
import matplotlib
import spacy
import gensim
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer



In [2]:
!pip install transformers

from transformers import BertTokenizer, BertForQuestionAnswering
import torch



In [3]:
# Load notes events data
import io  # local import: only this cell needs it

DEFAULT_NOTES_CSV = 'NOTEEVENTS0.csv'

# files.upload() returns a dict mapping file names to the uploaded bytes.
d1 = files.upload()

if d1:
    # Parse the bytes that were just uploaded instead of re-reading a fixed
    # path: Colab stores a re-uploaded file under a new name
    # (e.g. "NOTEEVENTS0 (1).csv"), so the fixed path could silently point at
    # a stale copy from an earlier upload, or not exist at all if the file
    # was named differently.
    uploaded_name = DEFAULT_NOTES_CSV if DEFAULT_NOTES_CSV in d1 else next(iter(d1))
    notes = pd.read_csv(io.BytesIO(d1[uploaded_name]), encoding='latin1')
else:
    # Nothing uploaded (dialog cancelled): fall back to a file already on disk.
    notes = pd.read_csv(DEFAULT_NOTES_CSV, encoding='latin1')


Saving NOTEEVENTS0.csv to NOTEEVENTS0.csv


In [4]:
# Download NLTK resources (if not already downloaded).
# 'punkt_tab' is the tokenizer data that nltk.word_tokenize loads on recent
# NLTK releases; 'punkt' is kept for older releases.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Handle missing values
notes.dropna(subset=['TEXT'], inplace=True)

# Remove irrelevant columns.
# errors='ignore' keeps the cell re-runnable: on a second run the columns are
# already gone and a plain drop() would raise KeyError.
notes.drop(
    columns=['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'CHARTDATE', 'CHARTTIME', 'STORETIME'],
    inplace=True,
    errors='ignore',
)

# Preprocess the text data
# Built on first use and then shared by every preprocess_text() call, so the
# stopword list is not re-read and the lemmatizer not re-created per note.
_STOP_WORDS = None
_LEMMATIZER = None


def preprocess_text(text):
    """Normalize one note for keyword search.

    Steps, in order: lowercase the text, delete every character that is
    neither a word character nor whitespace, tokenize with
    ``nltk.word_tokenize``, drop English stopwords, lemmatize each remaining
    token with ``WordNetLemmatizer`` and join the tokens with single spaces.

    Note: punctuation is deleted rather than replaced by a space, so tokens
    separated only by punctuation are fused ("140/90" becomes "14090",
    "70-year-old" becomes "70yearold").

    Args:
        text: The raw note text (a ``str``).

    Returns:
        The preprocessed text as a single space-separated string.
    """
    global _STOP_WORDS, _LEMMATIZER

    # Lazy initialisation: the NLTK corpora only need to be available when
    # the function is first called, not when it is defined.
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    if _LEMMATIZER is None:
        _LEMMATIZER = WordNetLemmatizer()

    # Lowercase and strip punctuation
    cleaned = re.sub(r'[^\w\s]', '', text.lower())

    # Tokenize, remove stopwords and lemmatize in a single pass
    tokens = [
        _LEMMATIZER.lemmatize(token)
        for token in nltk.word_tokenize(cleaned)
        if token not in _STOP_WORDS
    ]

    # Join the tokens back into a string
    return ' '.join(tokens)

# Overwrite the 'TEXT' column with the preprocessed form of every note
notes['TEXT'] = notes['TEXT'].map(preprocess_text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [5]:
def search_notes(query, regex=False):
    """Return the rows of ``notes`` whose 'TEXT' contains ``query``.

    Matching is case-insensitive. By default the query is treated as a
    literal substring, so user input containing regex metacharacters
    (e.g. "c++", "(", ".") is searched for verbatim instead of raising
    ``re.error`` or matching unintended text.

    Args:
        query: The text to look for.
        regex: If True, interpret ``query`` as a regular expression.

    Returns:
        A DataFrame holding the matching rows of the module-level ``notes``
        frame (empty if nothing matches). Rows with a missing 'TEXT' value
        are treated as non-matching.
    """
    # case=False already makes the comparison case-insensitive, so the query
    # is passed through unchanged (lowercasing a regex would corrupt escapes
    # such as \D or \S).
    is_match = notes['TEXT'].str.contains(query, case=False, regex=regex, na=False)

    matching_notes = notes[is_match]

    return matching_notes

In [6]:
# Get the user's search query
search_query = input("Enter your search query: ")

# Search for matching notes
search_results = search_notes(search_query)

# Display a preview of each matching note
print("Matching notes:")
for index, row in search_results.iterrows():
    note_id = index  # Use the DataFrame index as the note identifier
    note_preview = row['TEXT'][:100]  # Get the first 100 characters of the note
    print(f"Note ID: {note_id}")
    print(f"Preview: {note_preview}...")
    print()

selected_note = None
selected_note_id = None

if search_results.empty:
    # Nothing to choose from, so do not prompt for a selection.
    print("No notes matched the search query.")
else:
    # Prompt the user to select a note
    raw_selection = input("Enter the index of the note you want to select: ")

    try:
        selected_note_id = int(raw_selection)
    except ValueError:
        # Non-numeric input would otherwise abort the cell with a traceback.
        print(f"Invalid note index: {raw_selection!r}")
    else:
        # Check membership first: .loc raises KeyError for an unknown label,
        # so the "not found" message could never be reached otherwise.
        if selected_note_id in search_results.index:
            # Retrieve the selected note
            selected_note = search_results.loc[selected_note_id]
            selected_note_text = selected_note['TEXT']
            print(f"Selected note (Index: {selected_note_id}):")
            print(selected_note_text)
        else:
            print(f"No note found with index: {selected_note_id}")

Enter your search query: diabetes
Matching notes:
Note ID: 6
Preview: new onset atrial fibrillation rapid ventricular response chief complaint palpitation mild shortness ...

Enter the index of the note you want to select: 6
Selected note (Index: 6):
new onset atrial fibrillation rapid ventricular response chief complaint palpitation mild shortness breath history present illness 70yearold male present 12hour history suddenonset palpitation mild dyspnea prior history atrial fibrillation cardiac disease patient denies chest pain dizziness syncope history hypertension managed medication type 2 diabetes mellitus physical examination finding vital sign show blood pressure 14090 mmhg heart rate irregularly irregular 150 bpm respiratory rate 18 breath per minute oxygen saturation 96 room air cardiovascular examination confirms irregularly irregular rhythm without murmur rub gallop lung clear auscultation diagnostic workup electrocardiogram ecg confirms atrial fibrillation rapid ventricular re

In [7]:
# Pre-trained BERT checkpoint fine-tuned on SQuAD; the tokenizer and the
# question-answering model are both loaded from the same checkpoint name.
qa_checkpoint = 'bert-large-uncased-whole-word-masking-finetuned-squad'
tokenizer = BertTokenizer.from_pretrained(qa_checkpoint)
model = BertForQuestionAnswering.from_pretrained(qa_checkpoint)

def get_model_answer(model, question, passage, tokenizer):
    """Extract the answer to ``question`` from ``passage`` with a QA model.

    The question and passage are encoded together (truncated to 512 tokens),
    the model is run without gradient tracking, and the answer is the token
    span between the highest-scoring start and end positions.

    Args:
        model: Callable taking the encoded inputs as keyword arguments and
            returning an object with ``start_logits`` and ``end_logits``.
        question: The question text.
        passage: The text in which to look for the answer.
        tokenizer: Tokenizer used both to encode the inputs and to decode
            the answer span.

    Returns:
        The decoded answer text, or a fixed fallback message when the
        predicted span is unusable: it starts at position 0 (the [CLS]
        token), its end precedes its start, or it decodes to nothing.
    """
    no_answer = "The question cannot be answered based on the given information."

    # Tokenize the input with truncation
    encoded_input = tokenizer(question, passage, return_tensors='pt', max_length=512, truncation=True)

    # Get the model's predictions
    with torch.no_grad():
        outputs = model(**encoded_input)

    # Most likely start / end token positions (end is inclusive here)
    answer_start = int(torch.argmax(outputs.start_logits))
    answer_end = int(torch.argmax(outputs.end_logits))

    # Start and end are picked independently, so the end can precede the
    # start; slicing would then yield an empty string rather than an answer.
    # A span starting at position 0 begins on [CLS], not in the passage.
    if answer_start == 0 or answer_end < answer_start:
        return no_answer

    span_ids = encoded_input.input_ids[0][answer_start:answer_end + 1]
    # Skip special tokens so markers such as [SEP] never leak into the answer.
    answer = tokenizer.decode(span_ids, skip_special_tokens=True).strip()

    if not answer:
        return no_answer
    return answer



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
# The passage to interrogate is the note chosen earlier
# (expected in the variable 'selected_note_text').
passage = selected_note_text

# Read the first question, then keep answering until the user types 'quit'
# (compared case-insensitively).
question = input("Enter a question related to the selected note (or type 'quit' to exit): ")
while question.lower() != 'quit':
    # Ask the model for an answer drawn from the passage
    answer = get_model_answer(model, question, passage, tokenizer)

    # Echo the question together with the generated answer
    print("Question:", question)
    print("Answer:", answer)
    print()

    # Read the next question
    question = input("Enter a question related to the selected note (or type 'quit' to exit): ")

Enter a question related to the selected note (or type 'quit' to exit): What are the patient's vital signs, including blood pressure, heart rate, respiratory rate, and oxygen saturation?
Question: What are the patient's vital signs, including blood pressure, heart rate, respiratory rate, and oxygen saturation?
Answer: irregularly irregular 150 bpm respiratory rate 18 breath per minute

Enter a question related to the selected note (or type 'quit' to exit): Has the patient experienced any recent head trauma, infections, or changes in medications?
Question: Has the patient experienced any recent head trauma, infections, or changes in medications?
Answer: patient denies chest pain

Enter a question related to the selected note (or type 'quit' to exit): Does the patient have any history of cardiovascular risk factors such as hypertension, diabetes, hyperlipidemia, or smoking?
Question: Does the patient have any history of cardiovascular risk factors such as hypertension, diabetes, hyperlip