# MedQA Dataset parsing for English data

In [1]:
import pandas as pd 
import json
import numpy as np
import torch
import re
import os

def read_question_answer_file(file_path):
    """Reads a JSONL file with question-answer data and returns a list of dictionaries."""
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            data.append(json.loads(line))  # Parse each line as JSON
    return data

# Load your dataset
dataset_path = r'med_data/phrases_no_exclude_train.jsonl' 
# dataset_path = r'C:\Users\ranad\OneDrive - University of Glasgow\Attachments\Msc Final Year project\Data\MedQA-USMLE-4-options\phrases_no_exclude_train.jsonl'
questions_data = read_question_answer_file(dataset_path)

In [2]:
print(questions_data[0]['question'])
print(questions_data[0]['answer_idx'])
print(questions_data[0]['options'])

A 23-year-old pregnant woman at 22 weeks gestation presents with burning upon urination. She states it started 1 day ago and has been worsening despite drinking more water and taking cranberry extract. She otherwise feels well and is followed by a doctor for her pregnancy. Her temperature is 97.7°F (36.5°C), blood pressure is 122/77 mmHg, pulse is 80/min, respirations are 19/min, and oxygen saturation is 98% on room air. Physical exam is notable for an absence of costovertebral angle tenderness and a gravid uterus. Which of the following is the best treatment for this patient?
D
{'A': 'Ampicillin', 'B': 'Ceftriaxone', 'C': 'Doxycycline', 'D': 'Nitrofurantoin'}


<h2>Data pre-processing and extraction</h2>

In [3]:
def sanitize_filename(filename):
  """Sanitizes a filename by replacing special characters with underscores.

  Args:
    filename: The original filename.

  Returns:
    The sanitized filename.
  """

  # Replace non-alphanumeric characters with underscores
  filename = re.sub(r'[^\w]', '_', filename)

  # Remove leading and trailing underscores
  filename = filename.strip('_')

  return filename

In [4]:
def read_json_file(file_path):
  """Reads data from a JSON file.

  Args:
    file_path: The path to the JSON file.

  Returns:
    The parsed JSON data as a Python object.
  """

  with open(file_path, 'r') as f:
    data = json.load(f)
  return data

# # Example usage:
# file_path = 'data.json'
# data = read_json_file(file_path)
# print(data)

In [5]:
def save_string_to_file(data, filename):
  """Saves a string to a text file.

  Args:
    text: The string to be saved.
    filename: The name of the file to create.
  """

  with open(filename, "w",encoding='utf-8') as f:
    json.dump(data, f)

# # Example usage:
# my_string = "This is the text I want to save."
# save_string_to_file(my_string, "output.txt")

<h2>Dictionary format data</h2>

In [6]:
# pubmed_dir = 'C:/Users/ranad/Documents/Pubmed_Full_text'
#pubmed_dir = 'Pubmed_Full_text'
pubmed_dir = 'Pubmed_Full_text'
# Dictionary to store the original data (not embeddings) by file key


def extract_dataDict(directory):
    data_dict = {}
    # Iterate through each JSON file in the directory
    for filename in os.listdir(directory):
        filepath = os.path.join(directory, filename)
        if os.path.isfile(filepath):
            with open(filepath, 'r') as file:
                data = json.load(file)
                file_key = sanitize_filename(filename[:-4])  
                data_dict[file_key.lower()] = data
    return data_dict

data_dict = extract_dataDict(pubmed_dir)

In [7]:
#save_string_to_file(data_dict,"Processed data/Full_text_dict.json")
# save_string_to_file("Processed data/abstract_text_dict.json")
fulltext_dict = read_json_file("Processed data/Full_text_dict.json")
# abstract_dict = read_json_file("Processed data/abstract_text_dict.json")

In [8]:
print(len(fulltext_dict))
# sanitize_filename("Placing the infant in a supine position on a firm mattress while sleeping")
fulltext_dict["acute_myocardial_infarction"][0]
# # ctx = fulltext_dict[sanitize_filename("Acute_myocardial_infarction")][0]
# # truncate_to_words(".\n".join(ctx))
"Acute_Myocardial_Infarction".lower()

3627


'acute_myocardial_infarction'

<h2>Document text arrangement to search using FIASS Index.</h2>

In [6]:
# save_string_to_file(medQA_Filtered_data, "Processed data/Full_text_filtered.json")
# save_string_to_file(filtered_abstract_data, "Processed data/abstract_text_filtered.json")
# medQA_Filtered_fulltext = read_json_file("Processed data/Full_text_filtered.json")
# medQA_filtered_abstract = read_json_file("Processed data/abstract_text_filtered.json")

# Library installations

In [9]:
!pip install langchain
!pip install transformers
!pip install accelerate
!pip install bitsandbytes
!pip install --upgrade pip
!pip install --upgrade langchain
!pip install langchain_community
!pip list | grep langchain
!pip list | grep langchain_community

!pip install -U langchain-huggingface


Collecting langchain
  Downloading langchain-0.2.13-py3-none-any.whl.metadata (7.1 kB)
Collecting SQLAlchemy<3,>=1.4 (from langchain)
  Downloading SQLAlchemy-2.0.32-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting aiohttp<4.0.0,>=3.8.3 (from langchain)
  Downloading aiohttp-3.10.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.5 kB)
Collecting async-timeout<5.0.0,>=4.0.0 (from langchain)
  Downloading async_timeout-4.0.3-py3-none-any.whl.metadata (4.2 kB)
Collecting langchain-core<0.3.0,>=0.2.30 (from langchain)
  Downloading langchain_core-0.2.30-py3-none-any.whl.metadata (6.2 kB)
Collecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_text_splitters-0.2.2-py3-none-any.whl.metadata (2.1 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.99-py3-none-any.whl.metadata (13 kB)
Collecting pydantic<3,>=1 (from langchain)
  Downloading pydantic-2.8.2-py3-none-a

# Building the pipeline with the langchain

In [10]:
import os
import langchain

### prompts
from langchain import PromptTemplate, LLMChain

### models
# from langchain.llms import HuggingFacePipeline
# from langchain.embeddings import HuggingFaceInstructEmbeddings


import torch
import transformers
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline
)


#model = "/kaggle/input/llama-2/pytorch/7b-chat-hf/1"
#model = "/kaggle/input/llama-3/transformers/8b-chat-hf/1"
model = "meta-llama/Meta-Llama-3-8B"
#model = "Undi95/Meta-Llama-3-8B-hf"

tokenizer = AutoTokenizer.from_pretrained(model)

        
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type = "nf4",
    bnb_4bit_compute_dtype = torch.float16,
    bnb_4bit_use_double_quant = True,
)

model_llama = AutoModelForCausalLM.from_pretrained(
    model,
    quantization_config = bnb_config,
    device_map = 'auto',
    token='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'
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

# Designing a context summary prompt

In [14]:
question = questions_data[34]['question']
options = "\nA. Ampicillin\nB. Ceftriaxone\nC. Doxycycline\nD. Nitrofurantoin\n"

option_dict = questions_data[34]['options']

# Search dictionary index for each option and retrieve the corresponding text
retrieved_contexts = []
for option_key, option_text in option_dict.items():
   ctx = fulltext_dict.get(sanitize_filename(option_text).lower())
   if (ctx and len(ctx)>0) :
     retrieved_context = truncate_to_words("\n".join(ctx[0]),1000)
     retrieved_contexts.append(retrieved_context)

context = '\n'.join([s.strip() + '.' for s in retrieved_contexts])
context

'Comparison of clinical outcomes between aggressive and nonaggressive intravenous hydration for acute pancreatitis a systematic review and metaanalysis Current practice guidelines for optimal infusion rates during early intravenous hydration in patients with acute pancreatitis AP remain inconsistent This systematic review and metaanalysis aimed to compare treatment outcomes between aggressive and nonaggressive intravenous hydration in severe and nonsevere AP This study followed the Preferred Reporting Items for Systematic Reviews and MetaAnalyses guidelines We systematically searched PubMed Embase and Cochrane Library for randomized controlled trials RCTs on November 23 2022 and handsearched the reference lists of included RCTs relevant review articles and clinical guidelines We included RCTs that compared clinical outcomes from aggressive and nonaggressive intravenous hydration in AP Metaanalysis was performed using a randomeffects model for participants with severe AP and nonsevere A

In [59]:
# Use the summarization prompt to generate a summary
summarization_prompt = f"""[INST] Summarize the following text concisely:[/INST]
{context}
"""

with torch.no_grad():
    summary_output = model_llama.generate(
        **tokenizer(summarization_prompt, return_tensors="pt",truncation=True, padding="max_length", max_length=8000).to(model_llama.device),
        max_new_tokens=256,
    )

# Decode and print the generated summary
summary_text = tokenizer.decode(summary_output[0], skip_special_tokens=True)
summary_text = summary_text.replace("[INST] Summarize the following text concisely:[/INST]", "").strip()
print(summary_text)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Comparison of clinical outcomes between aggressive and nonaggressive intravenous hydration for acute pancreatitis a systematic review and metaanalysis Current practice guidelines for optimal infusion rates during early intravenous hydration in patients with acute pancreatitis AP remain inconsistent This systematic review and metaanalysis aimed to compare treatment outcomes between aggressive and nonaggressive intravenous hydration in severe and nonsevere AP This study followed the Preferred Reporting Items for Systematic Reviews and MetaAnalyses guidelines We systematically searched PubMed Embase and Cochrane Library for randomized controlled trials RCTs on November 23 2022 and handsearched the reference lists of included RCTs relevant review articles and clinical guidelines We included RCTs that compared clinical outcomes from aggressive and nonaggressive intravenous hydration in AP Metaanalysis was performed using a randomeffects model for participants with severe AP and nonsevere AP

In [16]:
# prompt generation
template_context = """
###Context###:\n {context} \n [INST]Answer the following below question using the Context.[/INST]\n
###Question###:\n {question} \n[INST]Select the correct option only. No explanation required[/INST]\n
Options: {options}

#Answer:""" # Force a single-line response

prompt_template_context = PromptTemplate(template=template_context, input_variables=["question", "options", "context"])
prompt_context = prompt_template_context.format(question=question, options=options, context=context) 

# # prompt generation
# template_context = """Question: {question}
# Context: {context}[INST]Select the correct option only. No explanation required[/INST]

# Options: {options}

# #Answer:""" # Force a single-line response

# prompt_template_context = PromptTemplate(template=template_context, input_variables=["question", "options", "context"])
# #prompt_context = prompt_template_context.format(question=question, options=options, context=context) 

In [17]:
tokenizer.padding_side = 'left'
tokenizer.pad_token = tokenizer.eos_token 
inputs = tokenizer(prompt_context, return_tensors='pt', truncation=True, padding="max_length", max_length=8000).to(model_llama.device)
outputs = model_llama.generate(**inputs,max_new_tokens=1)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



###Context###:
 Comparison of clinical outcomes between aggressive and nonaggressive intravenous hydration for acute pancreatitis a systematic review and metaanalysis Current practice guidelines for optimal infusion rates during early intravenous hydration in patients with acute pancreatitis AP remain inconsistent This systematic review and metaanalysis aimed to compare treatment outcomes between aggressive and nonaggressive intravenous hydration in severe and nonsevere AP This study followed the Preferred Reporting Items for Systematic Reviews and MetaAnalyses guidelines We systematically searched PubMed Embase and Cochrane Library for randomized controlled trials RCTs on November 23 2022 and handsearched the reference lists of included RCTs relevant review articles and clinical guidelines We included RCTs that compared clinical outcomes from aggressive and nonaggressive intravenous hydration in AP Metaanalysis was performed using a randomeffects model for participants with severe AP

In [18]:
position = response.find('#Answer:')
prediction = response[position+8 :position+10].strip()
prediction

'B'

# Data Loader with RAG using FAISS index

In [13]:
def truncate_to_words(text, max_words=4000):
  """Truncates a text string to a maximum number of words.

  Args:
    text: The input text string.
    max_words: The maximum number of words to keep.

  Returns:
    The truncated text string.
  """

  words = text.split()
  if len(words) <= max_words:
    return text
  else:
    truncated_text = ' '.join(words[:max_words])
    return truncated_text

In [23]:
# sanitize_filename("Placing the infant in a supine position on a firm mattress while sleeping")
# fulltext_dict["Acute_myocardial_infarction"]
# # ctx = fulltext_dict[sanitize_filename("Acute_myocardial_infarction")][0]
# # truncate_to_words(".\n".join(ctx))

In [19]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
from tqdm import tqdm  # For progress bar

class QuestionAnswerDataset_RAG(Dataset):
    def __init__(self, questions_data, faiss_texts):
        self.questionData = questions_data
        self.faiss_texts = faiss_texts
    

    def __len__(self):
        return len(self.questionData)

    def __getitem__(self, idx):
        question_data = self.questionData[idx]
        question = question_data['question']
        options = question_data['options']
        options_str = "\n".join([f"{key}. {value}" for key, value in options.items()])
        answer = question_data['answer_idx']
        
        # Search FAISS index for each option and retrieve the corresponding text
        retrieved_contexts = []
        for option_key, option_text in options.items():
            # code for full text extarction.
            ctx = self.faiss_texts.get(sanitize_filename(option_text).lower())
            if (ctx and len(ctx)>0) :
                retrieved_context = truncate_to_words("\n".join(ctx[0]),1000)
                retrieved_contexts.append(retrieved_context)

        # Combine the contexts with the question and options for the final return
        combined_context = truncate_to_words("\n".join(retrieved_contexts))

        # # Use the summarization prompt to generate a summary
        # summarization_prompt = f"""[INST] Summarize the following text concisely:[/INST]
        # {combined_context}
        # """
        
        # with torch.no_grad():
        #     summary_output = model_llama.generate(
        #         **tokenizer(summarization_prompt, return_tensors="pt").to(model_llama.device),
        #         max_new_tokens=256,
        #     )
        
        # # Decode and print the generated summary
        # summary_text = tokenizer.decode(summary_output[0], skip_special_tokens=True)
        # summary_text = summary_text.replace("[INST] Summarize the following text concisely:[/INST]", "").strip()

        return question, options_str, answer, combined_context

    def get_text_embedding(self, text):
        # Placeholder for the actual embedding generation logic
        # Replace this with the method to generate embeddings from text
        return np.random.rand(768)  # Example: Replace with actual embedding



# Load your dataset
dataset_RAG = QuestionAnswerDataset_RAG(questions_data[:1000], data_dict)
dataloader_RAG = DataLoader(dataset_RAG, batch_size=1, shuffle=False)  # Adjust batch_size as needed

# Running Batches with context

In [None]:
correct_predictions = 0
total_predictions = 0
responses = []
answers = []

for batch in tqdm(dataloader_RAG):
    questions, options_strs, answer_idxs, combined_contexts = batch
    tokenizer.padding_side = 'left'
    tokenizer.pad_token = tokenizer.eos_token
    prompts = [prompt_template_context.format(question=question, options=options_str, context=combined_context) for question, options_str,combined_context in zip(questions, options_strs,combined_contexts)]
    
    inputs = tokenizer(prompts, return_tensors='pt', padding=True, truncation=True, max_length=8000).to(model_llama.device)
    
    with torch.no_grad():
        outputs = model_llama.generate(**inputs,max_new_tokens=1)
    
    decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    
    #print(decoded_outputs)
    for decoded_output, answer in zip(decoded_outputs, answer_idxs):
        position = decoded_output.find('#Answer:')
        answer_pred = decoded_output[position+8 :position+10].strip()
        #print(answer_pred)
        if answer == answer_pred.strip():
            correct_predictions += 1
        
        responses.append(answer_pred)
        answers.append(answer)
        total_predictions += 1

  0%|          | 0/1000 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|          | 1/1000 [00:01<24:59,  1.50s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|          | 2/1000 [00:02<17:34,  1.06s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|          | 3/1000 [00:03<15:17,  1.09it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|          | 4/1000 [00:04<18:29,  1.11s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|          | 5/1000 [00:05<15:55,  1.04it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  1%|          | 6/1000 [00:06<18:17,  1.10s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  1%|          | 7/1000 [00:07<18:12,  1.10s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  1%|          | 8/1000 [00:08<17:36,  1

In [23]:
print(f"Accuracy: {correct_predictions / len(responses):.2%}")
correct_predictions

Accuracy: 36.10%


361

In [21]:
print(responses.count(''))
responses.index('')
# question
responses
# filtered_list = [element for element in responses if element not in ['A', 'B', 'C', 'D']]
# filtered_list

35


['B',
 'C',
 'A',
 'C',
 'D',
 'B',
 'D',
 'B',
 'D',
 'B',
 'A',
 'A',
 'C',
 'C',
 'C',
 'C',
 'D',
 'A',
 'B',
 'B',
 'A',
 'C',
 'A',
 'C',
 'B',
 'C',
 'C',
 '',
 'B',
 'D',
 'A',
 'D',
 'D',
 'B',
 'C',
 'A',
 'B',
 'D',
 'C',
 'D',
 'D',
 'C',
 'D',
 'D',
 'C',
 'D',
 'D',
 'D',
 'D',
 'B',
 'D',
 'D',
 'C',
 'C',
 'B',
 'D',
 'A',
 'D',
 'D',
 'B',
 'A',
 'D',
 'A',
 'D',
 'A',
 'A',
 'A',
 'D',
 'C',
 'A',
 'D',
 'D',
 'D',
 'D',
 'B',
 'B',
 'A',
 'D',
 'A',
 'B',
 'C',
 'A',
 'C',
 'C',
 'D',
 'A',
 'D',
 'B',
 'A',
 'D',
 'A',
 'D',
 'A',
 'D',
 'A',
 'D',
 'B',
 'A',
 'D',
 'B',
 'D',
 'C',
 'C',
 'D',
 'A',
 'C',
 'D',
 'A',
 'D',
 'D',
 'D',
 '',
 'A',
 'B',
 '',
 'A',
 'A',
 'D',
 'B',
 'D',
 'C',
 'A',
 'D',
 'D',
 'A',
 'D',
 'B',
 'D',
 'D',
 'D',
 'B',
 'B',
 'D',
 'C',
 'C',
 'B',
 'D',
 'C',
 'B',
 'B',
 'D',
 'D',
 'C',
 'A',
 'D',
 'A',
 'D',
 'C',
 'D',
 'B',
 'B',
 'D',
 'A',
 '',
 'C',
 'C',
 'A',
 'C',
 'C',
 'B',
 'A',
 'A',
 'D',
 'C',
 'B',
 'B',
 'D',
 '

In [35]:
def free_gpu_cache():
  """Frees the GPU cache and memory."""
  if torch.cuda.is_available():
      with torch.cuda.device(0):
          torch.cuda.empty_cache()