In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# MedQA Dataset parsing for English data

In [32]:
import json
import numpy as np
import torch

def read_question_answer_file(file_path):
    """Reads a JSONL file with question-answer data and returns a list of dictionaries."""
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            data.append(json.loads(line))  # Parse each line as JSON
    return data

# Load your dataset
dataset_path = r'C:\Users\ranad\OneDrive - University of Glasgow\Attachments\Msc Final Year project\Data\MedQA-USMLE-4-options\phrases_no_exclude_train.jsonl'  # Replace with the path to your JSON file
questions_data = read_question_answer_file(dataset_path)

In [33]:
print(questions_data[0]['question'])
print(questions_data[0]['answer_idx'])
print(questions_data[0]['options'])

A 23-year-old pregnant woman at 22 weeks gestation presents with burning upon urination. She states it started 1 day ago and has been worsening despite drinking more water and taking cranberry extract. She otherwise feels well and is followed by a doctor for her pregnancy. Her temperature is 97.7°F (36.5°C), blood pressure is 122/77 mmHg, pulse is 80/min, respirations are 19/min, and oxygen saturation is 98% on room air. Physical exam is notable for an absence of costovertebral angle tenderness and a gravid uterus. Which of the following is the best treatment for this patient?
D
{'A': 'Ampicillin', 'B': 'Ceftriaxone', 'C': 'Doxycycline', 'D': 'Nitrofurantoin'}


# Pubmed Data Crawling

In [31]:
# !pip install biopython

<h2>Fetching the pmids for the query options using entrez</h2>

In [5]:
from Bio import Entrez

# Replace with your email address
Entrez.email = "2935352R@student.gla.ac.uk"
access_key_id = "18acd1db794f7de35c1c83811bc106c6a509"

def fetch_pmids(query):
  """Fetches PMIDs for a given search term.

  Args:
    search_term: The search term to query Entrez.

  Returns:
    A list of PMIDs.
  """
  word = query.split()
#   if len(word) == 1:
#     query = query+" top 10 records"
  handle = Entrez.esearch(db="pubmed",sort="relevance", term=query, retmax=20,api_key=access_key_id)  # Adjust retmax as needed
  record = Entrez.read(handle)
  handle.close()

  pmids = record["IdList"]
  return pmids

# Example usage:
search_term = "Ceftriaxone"
pmids = fetch_pmids(search_term)
print(pmids)

['2210825', '1918224', '3910386', '3073046', '11985490', '27488586', '36208418', '37486410', '3906584', '37474675', '1918222', '1918223', '1918221', '29377708', '22761158', '19496200', '1486190', '15151561', '3433495', '35703558']


<h2>Fetching full text articles for a given PMID</h2>

In [30]:
import requests

def fetch_bioc_data(pmid):
  """Fetches BioC data for a list of PMIDs.

  Args:
    pmids: A list of PMIDs.

  Returns:
    A list of JSON objects, each representing a BioC document.
  """

  #url = f"https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pubmed.cgi/BioC_json/{pmid}/unicode"
  url = f"https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_json/{pmid}/unicode"
  response = requests.get(url)
  if response.status_code == 200:
     try:
      bioc_data = response.json()
     except Exception:
      #print(f"Error decoding BioC data for PMID {pmid} (might be non-JSON)")
      bioc_data = None  # Or handle empty data differently
  else:
     print(f"Error fetching BioC data for PMID {pmid}: {response.status_code}")

  return bioc_data

# # Example usage
# pmid = "35703558"
# bioc_data = fetch_bioc_data(pmid)
# print(bioc_data)

<h2>Data pre-processing and extraction</h2>

In [7]:
import re

def clean_text(text):
  """Removes special characters and extra whitespace from text.

  Args:
    text: The input text to be cleaned.

  Returns:
    The cleaned text.
  """
#   special_chars = r"[\u03b2\u00b5\u03b4\u03c5\u03bb\u0394\u00f6]"
#   # Remove special characters using regular expression
#   text = re.sub(r"[^\w\s]", "", text)
# Remove special characters, but keep letters, digits, and single spaces
  cleaned_text = re.sub(r'[^A-Za-z0-9\s]', '', text)
    # Replace multiple spaces with a single space
  cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
    # Strip leading and trailing spaces (if any)
  cleaned_text = cleaned_text.strip()
  return cleaned_text

#   # Remove extra whitespace
#   text = " ".join(text.split())

#   return text

In [8]:
def save_string_to_file(data, filename):
  """Saves a string to a text file.

  Args:
    text: The string to be saved.
    filename: The name of the file to create.
  """

  with open(filename, "w",encoding='utf-8') as f:
    json.dump(data, f)

# # Example usage:
# my_string = "This is the text I want to save."
# save_string_to_file(my_string, "output.txt")

In [10]:
def sanitize_filename(filename):
  """Sanitizes a filename by replacing special characters with underscores.

  Args:
    filename: The original filename.

  Returns:
    The sanitized filename.
  """

  # Replace non-alphanumeric characters with underscores
  filename = re.sub(r'[^\w]', '_', filename)

  # Remove leading and trailing underscores
  filename = filename.strip('_')

  return filename

In [11]:
import json

def extract_text(data):
  """Extracts text from the given JSON data.

  Args:
    data: The JSON data.

  Returns:
    A list of text strings.
  """

  texts = []
  for document in data:
    for passage in document['documents'][0]['passages']:  # Access the first document's passages
        words = passage['text'].split()
        if(len(words)> 8):
          texts.append(clean_text(passage['text']))
  return texts

# Example usage:
# Assuming you have the JSON data stored in a variable named 'json_data'
extracted_texts = extract_text(bioc_data)
print(extracted_texts)

['Penicillin plus Ceftriaxone versus Ampicillin plus Ceftriaxone Synergistic Potential against Clinical Enterococcus faecalis Blood Isolates', 'Penicillin plus ceftriaxone is a promising alternative to ampicillin plus ceftriaxone for the treatment of Enterococcus faecalis infective endocarditis Limited data is available supporting the utilization of penicillin plus ceftriaxone A total of 20 E faecalis isolates one wildtype strain JH22 and 19 clinical blood strains were assessed for penicillin plus ceftriaxone and ampicillin plus ceftriaxone synergy using a 24h timekill experiment Susceptibility was determined by broth microdilution Differences in bactericidal bacteriostatic or inactivity as well as synergy between treatments were assessed by chisquare or Fisher exact test All E faecalis isolates were considered susceptible to ampicillin and penicillin Ampicillin plus ceftriaxone versus penicillin plus ceftriaxone similarly demonstrated synergy Bactericidal activity was more commonly ob

<h2>Saving indivisual options data to json file </h2>

In [93]:
# Iterating a for loop to fetch the data.
import time

from tqdm import tqdm

questionLoader = questions_data[327:1000]
for question in  tqdm(questionLoader):
    options = question['options']
    for key, value in options.items():
        query = value
        bioc_data = []
        pmids = fetch_pmids(query)
        counter = 0
        for pmid in pmids:
            data = fetch_bioc_data(pmid)
            if data != None:
                counter +=1
                bioc_data.append(extract_text(data))
            time.sleep(0.3)
            if counter == 5 : break
        save_string_to_file(bioc_data, "C:/Users/ranad/Documents/Pubmed/" + sanitize_filename(query) + ".json")

<h2>Generate bert embeddings</h2>

In [22]:
import torch
import numpy as np
from transformers import BertTokenizer, BertModel

# Initialize the tokenizer and model (bert-base-uncased)
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Ensure the model runs on GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
bert_model.to(device)


# Function to compute embeddings
def get_embeddings(texts,tokenizer,model):
    
    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    with torch.no_grad():  # No need to compute gradients
        outputs = model(**inputs)

    last_hidden_state = outputs.last_hidden_state
    embeddings = last_hidden_state.mean(dim=1)

    # Compute the mean of the token embeddings to get a fixed-size representation
    embeddings = embeddings.squeeze().cpu().numpy()  # (batch size x hidden size)

    return embeddings



<h2>Dictionary format data</h2>

In [17]:
filtered_data = questions_data[200:327]

# Path to the directory containing JSON files
pubmed_dir = 'C:/Users/ranad/Documents/Pubmed/'

# Dictionary to store the original data (not embeddings) by file key


def extract_dataDict(directory):
    data_dict = {}
    # Iterate through each JSON file in the directory
    for filename in os.listdir(directory):
        filepath = os.path.join(directory, filename)
        with open(filepath, 'r') as file:
            data = json.load(file)
            file_key = sanitize_filename(filename[:-4])  
            data_dict[file_key] = data
    return data_dict



In [18]:
data_dict = extract_dataDict(pubmed_dir)

In [19]:
data_dict['Arthralgias']

[['Osteoarthritis is the most common form of arthritis and a leading cause of disability worldwide largely due to pain the primary symptom of the disease The pain experience in knee osteoarthritis in particular is wellrecognized as typically transitioning from intermittent weightbearing pain to a more persistent chronic pain Methods to validly assess pain in osteoarthritis studies have been developed to address the complex nature of the pain experience The etiology of pain in osteoarthritis is recognized to be multifactorial with both intraarticular and extraarticular risk factors Nonetheless greater insights are needed into pain mechanisms in osteoarthritis to enable rational mechanismbased management of pain Consequences of pain related to osteoarthritis contribute to a substantial socioeconomic burden',
  'The hallmark symptom of osteoarthritis OA the most common form of arthritis is pain This is the symptom that drives individuals to seek medical attention and contributes to functi

<h2>Document text arrangement to search using FIASS Index.</h2>

In [28]:
medQA_Filtered_data = []

for key, data in tqdm(data_dict.items()):
        for texts in data:
            for text in texts:
                if len(text.split())>10:
                    medQA_Filtered_data.append(clean_text(text))

100%|████████████████████████████████████████████████████████████████████████████████| 503/503 [00:05<00:00, 90.23it/s]


In [29]:
len(medQA_Filtered_data)

129477

In [21]:
# Iterate through the data dictionary and generate embeddings
# Now, convert the original text data in `data_dict` to embeddings
# Dictionary to store embeddings by file key
from tqdm import tqdm

def embeddings_format(data_dict):
    embeddings_dict = {}
    embeddings = []
    for key, data in tqdm(data_dict.items()):
        embeddings_dict[key] = []
        for array in data:
            for text in array:
                embedding = get_embeddings(text,bert_tokenizer, bert_model)
#                 embeddings_dict[key].append(embedding)
                embeddings.append(embedding)
    return embeddings_dict,embeddings
                

file_dict_embeddings,fiass_embeddings = embeddings_format(data_dict)

  0%|                                                                                          | 0/503 [00:00<?, ?it/s]Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keywo

Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': 

Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': 

Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': 

Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': 

Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': 

Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': 

Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': 

Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': 

Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': 

Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': 

Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': True} not recognized.
Keyword arguments {'clean_up_tokenization_spaces': 

KeyboardInterrupt: 

In [122]:
file_dict_embeddings

{'194_200_json': []}

# FIASS Installation and Storage

In [25]:
# !pip install faiss-cpu 
#pip install faiss-gpu  # For GPU support, if you have a CUDA-capable GPU

In [None]:
import faiss

# Get the dimensionality of the embeddings
embedding_dim = embeddings.shape[1]

# Create a FAISS index
# IndexFlatL2 is a simple, exact nearest neighbor search index with L2 distance
index = faiss.IndexFlatL2(embedding_dim)

# Add embeddings to the index
index.add(embeddings)

# Check the number of vectors in the index
print(f"Number of vectors in the index: {index.ntotal}")

# Save the index to a file for later use
faiss.write_index(index, 'FAISS\QA200_327_index.index')




In [None]:
# Load the FAISS index
faiss_index = faiss.read_index('FAISS\QA200_327_index.index')

# Function to search the FAISS index
def search_faiss_index(query, index, tokenizer, model, top_k=5):
    # Compute the query embedding
    query_embedding = get_embeddings(query, tokenizer, model)

    # Reshape the query embedding to match FAISS expected input format
    query_embedding = np.expand_dims(query_embedding, axis=0).astype('float32')

    # Search the FAISS index
    distances, indices = index.search(query_embedding, top_k)

    return distances, indices

In [None]:
# Example usage
query = "Sample query text"
top_k = 5

distances, indices = search_faiss_index(query, index, bert_tokenizer, bert_model, top_k=top_k)

# Print results
print(f"Top-{top_k} results:")
for i, (distance, idx) in enumerate(zip(distances[0], indices[0])):
    print(f"Result {i + 1}:")
    print(f"Index: {idx}, Distance: {distance}")
    # If you have the original text stored, you can retrieve it like this:
    # print(f"Text: {embeddings_dict['some_key'][idx]}")

# Library installations

In [8]:
# !pip install langchain
# !pip install transformers
# !pip install accelerate
# !pip install bitsandbytes
# !pip install --upgrade pip
# !pip install --upgrade langchain
# !pip install langchain_community
# !pip list | grep langchain
# !pip list | grep langchain_community

# !pip install -U langchain-huggingface


Collecting langchain
  Downloading langchain-0.2.12-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.3.0,>=0.2.27 (from langchain)
  Downloading langchain_core-0.2.29-py3-none-any.whl.metadata (6.2 kB)
Collecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_text_splitters-0.2.2-py3-none-any.whl.metadata (2.1 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.98-py3-none-any.whl.metadata (13 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain-core<0.3.0,>=0.2.27->langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting packaging<25,>=23.2 (from langchain-core<0.3.0,>=0.2.27->langchain)
  Downloading packaging-24.1-py3-none-any.whl.metadata (3.2 kB)
Collecting orjson<4.0.0,>=3.9.14 (from langsmith<0.2.0,>=0.1.17->langchain)
  Downloading orjson-3.10.6-cp311-none-win_amd64.whl.metadata (51 kB)
Downloading langchain-0.2.12-py3-none-any.whl (990 kB)
   ------------------------

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tables 3.8.0 requires blosc2~=2.0.0, which is not installed.
tables 3.8.0 requires cython>=0.29.21, which is not installed.
python-lsp-black 1.2.1 requires black>=22.3.0, but you have black 0.0 which is incompatible.


Collecting sentencepiece (from transformers)
  Downloading sentencepiece-0.2.0-cp311-cp311-win_amd64.whl.metadata (8.3 kB)
Downloading sentencepiece-0.2.0-cp311-cp311-win_amd64.whl (991 kB)
   ---------------------------------------- 0.0/991.5 kB ? eta -:--:--
   --------------------- ------------------ 524.3/991.5 kB 3.4 MB/s eta 0:00:01
   ---------------------------------------- 991.5/991.5 kB 3.9 MB/s eta 0:00:00
Installing collected packages: sentencepiece
Successfully installed sentencepiece-0.2.0
Collecting accelerate
  Downloading accelerate-0.33.0-py3-none-any.whl.metadata (18 kB)
Collecting huggingface-hub>=0.21.0 (from accelerate)
  Downloading huggingface_hub-0.24.5-py3-none-any.whl.metadata (13 kB)
Collecting safetensors>=0.3.1 (from accelerate)
  Downloading safetensors-0.4.4-cp311-none-win_amd64.whl.metadata (3.9 kB)
Collecting fsspec>=2023.5.0 (from huggingface-hub>=0.21.0->accelerate)
  Downloading fsspec-2024.6.1-py3-none-any.whl.metadata (11 kB)
Downloading accelerat

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
s3fs 2023.4.0 requires fsspec==2023.4.0, but you have fsspec 2024.6.1 which is incompatible.


Collecting bitsandbytes
  Downloading bitsandbytes-0.43.3-py3-none-win_amd64.whl.metadata (3.5 kB)
Downloading bitsandbytes-0.43.3-py3-none-win_amd64.whl (136.5 MB)
   ---------------------------------------- 0.0/136.5 MB ? eta -:--:--
   ---------------------------------------- 0.3/136.5 MB ? eta -:--:--
   ---------------------------------------- 1.0/136.5 MB 2.6 MB/s eta 0:00:52
   ---------------------------------------- 1.3/136.5 MB 2.2 MB/s eta 0:01:01
    --------------------------------------- 2.1/136.5 MB 2.6 MB/s eta 0:00:52
    --------------------------------------- 2.6/136.5 MB 2.8 MB/s eta 0:00:48
    --------------------------------------- 2.9/136.5 MB 2.3 MB/s eta 0:00:58
    --------------------------------------- 3.4/136.5 MB 2.3 MB/s eta 0:00:59
   - -------------------------------------- 3.9/136.5 MB 2.3 MB/s eta 0:00:57
   - -------------------------------------- 4.2/136.5 MB 2.4 MB/s eta 0:00:56
   - -------------------------------------- 4.5/136.5 MB 2.3 MB/s eta

Collecting langchain_community
  Downloading langchain_community-0.2.11-py3-none-any.whl.metadata (2.7 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading marshmallow-3.21.3-py3-none-any.whl.metadata (7.1 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Downloading langchain_community-0.2.11-py3-none-any.whl (2.3 MB)
   ---------------------------------------- 0.0/2.3 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.3 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.3 MB ? eta -:--:--
   ---- ----------------------------------- 0.3/2.3 MB ? eta -:--:--
   --------- ------------------------------ 0.5/2.3 MB 985.5 kB/s eta 0:00:02
   -------

'grep' is not recognized as an internal or external command,
operable program or batch file.
'grep' is not recognized as an internal or external command,
operable program or batch file.


Collecting langchain-huggingface
  Downloading langchain_huggingface-0.0.3-py3-none-any.whl.metadata (1.2 kB)
Collecting sentence-transformers>=2.6.0 (from langchain-huggingface)
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting tokenizers>=0.19.1 (from langchain-huggingface)
  Downloading tokenizers-0.19.1-cp311-none-win_amd64.whl.metadata (6.9 kB)
Collecting transformers>=4.39.0 (from langchain-huggingface)
  Downloading transformers-4.44.0-py3-none-any.whl.metadata (43 kB)
Downloading langchain_huggingface-0.0.3-py3-none-any.whl (17 kB)
Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
Downloading tokenizers-0.19.1-cp311-none-win_amd64.whl (2.2 MB)
   ---------------------------------------- 0.0/2.2 MB ? eta -:--:--
   ---- ----------------------------------- 0.3/2.2 MB ? eta -:--:--
   ------------------ --------------------- 1.0/2.2 MB 2.5 MB/s eta 0:00:01
   ---------------------------- ----------- 1.6/2.2 MB 3.0 MB/s eta 0:00:

# Building the pipeline with the langchain

In [21]:
import os
import langchain

### prompts
from langchain import PromptTemplate, LLMChain

### models
# from langchain.llms import HuggingFacePipeline
from langchain.embeddings import HuggingFaceInstructEmbeddings


import torch
import transformers
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline
)


#model = "/kaggle/input/llama-2/pytorch/7b-chat-hf/1"
model = "/kaggle/input/llama-3/transformers/8b-chat-hf/1"


tokenizer = AutoTokenizer.from_pretrained(model)

        
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type = "nf4",
    bnb_4bit_compute_dtype = torch.float16,
    bnb_4bit_use_double_quant = True,
)

model_llama = AutoModelForCausalLM.from_pretrained(
    model,
    quantization_config = bnb_config,
    device_map = 'auto'
)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [8]:
question = questions_data[0]['question']
options = "\nA. Ampicillin\nB. Ceftriaxone\nC. Doxycycline\nD. Nitrofurantoin\n"

# Prompt design without a context

In [22]:

templateX = f"""Question: {question}[INST]Select the correct option only. No explanation required[/INST]

Options:{options}

#Answer:"""  # Force a single-line response


prompt_template = PromptTemplate(template=templateX, input_variables=["question", "options"])
promptX = prompt_template.format(question=question, options=options) 




In [28]:
tokenizer.padding_side = 'left'
tokenizer.pad_token = tokenizer.eos_token 
inputs = tokenizer(promptX, return_tensors='pt', truncation=True, padding="max_length", max_length=1024).to(model_llama.device)
outputs = model_llama.generate(**inputs, max_new_tokens=1)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Question: A 23-year-old pregnant woman at 22 weeks gestation presents with burning upon urination. She states it started 1 day ago and has been worsening despite drinking more water and taking cranberry extract. She otherwise feels well and is followed by a doctor for her pregnancy. Her temperature is 97.7°F (36.5°C), blood pressure is 122/77 mmHg, pulse is 80/min, respirations are 19/min, and oxygen saturation is 98% on room air. Physical exam is notable for an absence of costovertebral angle tenderness and a gravid uterus. Which of the following is the best treatment for this patient?[INST]Select the correct option only. No explanation required[/INST]

Options:
A. Ampicillin
B. Ceftriaxone
C. Doxycycline
D. Nitrofurantoin


#Answer: D


In [26]:
position = response.find('#Answer:')
prediction = response[position+8 :position+10].strip()
prediction

''

# Prompt with context

In [None]:
# prompt generation
template_context = f"""Question: {question}
Context: {context}[INST]Select the correct option only. No explanation required[/INST]

Options: {options}

# Answer: """ # Force a single-line response

prompt_context = PromptTemplate(template=template_context, input_variables=["question", "options", "context"])


In [None]:
tokenizer.padding_side = 'left'
tokenizer.pad_token = tokenizer.eos_token 
inputs = tokenizer(prompt_context, return_tensors='pt', truncation=True, padding="max_length", max_length=4000).to(model_llama.device)
outputs = model_llama.generate(**inputs, max_new_tokens=1)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)

In [None]:
position = response.find('#Answer:')
prediction = response[position+8 :position+10].strip()
prediction

# Designing a context summary prompt

In [None]:
# Use the summarization prompt to generate a summary
summarization_prompt = f"""[INST] Summarize the following text concisely:

{context_text}
[/INST]
"""

with torch.no_grad():
    summary_output = model_llama.generate(
        **tokenizer(summarization_prompt, return_tensors="pt").to(model_llama.device),
        max_new_tokens=128,
    )

# Decode and print the generated summary
summary_text = tokenizer.decode(summary_output[0], skip_special_tokens=True)
print(summary_text)

# Designing the data loader for parsing the data faster

In [29]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
from tqdm import tqdm  # For progress bar

class QuestionAnswerDataset(Dataset):
    def __init__(self, questions_data):
        self.questionData = questions_data
    

    def __len__(self):
        return len(self.questionData)

    def __getitem__(self, idx):
        question_data = self.questionData[idx]
        question = question_data['question']
        options = question_data['options']
        options_str = "\n".join([f"{key}. {value}" for key, value in options.items()])
        answer = question_data['answer_idx']
        return question,options_str,answer

# Load your dataset
dataset = QuestionAnswerDataset(questions_data)
dataloader = DataLoader(dataset, batch_size=8, shuffle=False)  # Adjust batch_size as needed



# Data Loader with RAG using FAISS index

In [None]:
class QuestionAnswerDataset_RAG(Dataset):
    def __init__(self, questions_data, faiss_index, faiss_texts):
        self.questionData = questions_data
        self.faiss_index = faiss_index
        self.faiss_texts = faiss_texts
    

    def __len__(self):
        return len(self.questionData)

    def __getitem__(self, idx):
        question_data = self.questionData[idx]
        question = question_data['question']
        options = question_data['options']
        options_str = "\n".join([f"{key}. {value}" for key, value in options.items()])
        answer = question_data['answer_idx']

        # Search FAISS index for each option and retrieve the corresponding text
        retrieved_contexts = []
        for option_key, option_text in options.items():
            option_embedding = self.get_text_embedding(option_text)
            retrieved_indices = search_faiss_index(option_embedding, self.faiss_index)
            retrieved_context = " ".join([self.faiss_texts[i] for i in retrieved_indices])
            retrieved_contexts.append(retrieved_context)

        # Combine the contexts with the question and options for the final return
        combined_context = "\n".join(retrieved_contexts)

        return question, options_str, answer, combined_context

    def get_text_embedding(self, text):
        # Placeholder for the actual embedding generation logic
        # Replace this with the method to generate embeddings from text
        return np.random.rand(768)  # Example: Replace with actual embedding


faiss_texts = [...]  # Load the list of texts associated with the FAISS index

# Load your dataset
dataset_RAG = QuestionAnswerDataset_RAG(questions_data, faiss_index, faiss_texts)
dataloader_RAG = DataLoader(dataset, batch_size=8, shuffle=False)  # Adjust batch_size as needed

# Prompt design and generation

In [30]:
#prompt generation
template = f"""Question: {question}[INST]Select the correct option only. No explanation required[/INST]

Options:{options}

#Answer:"""  # Force a single-line response


prompt = PromptTemplate(template=template, input_variables=["question", "options"])

#llm_chain = LLMChain(llm=llm, prompt=prompt)

# Running the Batches

In [32]:
correct_predictions = 0
total_predictions = 0
responses = []
answers = []

for batch in tqdm(dataloader):
    questions, options_strs, answer_idxs = batch
    tokenizer.padding_side = 'left'
    tokenizer.pad_token = tokenizer.eos_token
    prompts = [prompt_context.format(question=question, options=options_str) for question, options_str in zip(questions, options_strs)]
    
    inputs = tokenizer(prompts, return_tensors='pt', padding=True, truncation=True, max_length=1500).to(model_llama.device)
    
    with torch.no_grad():
        outputs = model_llama.generate(**inputs,max_new_tokens=1)
    
    decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    
    #print(decoded_outputs)
    for decoded_output, answer in zip(decoded_outputs, answer_idxs):
        position = decoded_output.find('#Answer:')
        answer_pred = decoded_output[position+8 :position+10].strip()
        #print(answer_pred)
        if answer == answer_pred.strip():
            correct_predictions += 1
        
        responses.append(answer_pred)
        answers.append(answer)
        total_predictions += 1
   


  0%|          | 0/1273 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|          | 1/1273 [00:02<44:09,  2.08s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|          | 2/1273 [00:04<43:52,  2.07s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|          | 3/1273 [00:06<44:00,  2.08s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|          | 4/1273 [00:08<43:29,  2.06s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|          | 5/1273 [00:10<44:52,  2.12s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|          | 6/1273 [00:13<48:50,  2.31s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  1%|          | 7/1273 [00:15<46:56,  2.22s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  1%|          | 8/1273 [00:17<46:16,  2

In [33]:
print(f"Accuracy: {correct_predictions / len(responses):.2%}")
correct_predictions


Accuracy: 65.18%


6634

# Running Batches with context

In [34]:
correct_predictions = 0
total_predictions = 0
responses = []
answers = []

for batch in tqdm(dataloader):
    questions, options_strs, answer_idxs, combined_contexts = batch
    tokenizer.padding_side = 'left'
    tokenizer.pad_token = tokenizer.eos_token
    prompts = [prompt_context.format(question=question, options=options_str, context=combined_context) for question, options_str,combined_context in zip(questions, options_strs,combined_contexts)]
    
    inputs = tokenizer(prompts, return_tensors='pt', padding=True, truncation=True, max_length=4000).to(model_llama.device)
    
    with torch.no_grad():
        outputs = model_llama.generate(**inputs,max_new_tokens=1)
    
    decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    
    #print(decoded_outputs)
    for decoded_output, answer in zip(decoded_outputs, answer_idxs):
        position = decoded_output.find('#Answer:')
        answer_pred = decoded_output[position+8 :position+10].strip()
        #print(answer_pred)
        if answer == answer_pred.strip():
            correct_predictions += 1
        
        responses.append(answer_pred)
        answers.append(answer)
        total_predictions += 1

10178