 Embeddings with Fine-Tuned BERT 

In [1]:
import pandas as pd
import ast
import numpy as np

import seaborn as sns
import pylab as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import metrics
from transformers import AutoConfig
from transformers import AutoTokenizer, AutoModel
import openai
from datasets import Dataset
import torch 
import os
import ragas
from ragas import evaluate
from IPython.display import display, Markdown

  from pandas.core import (


In [5]:
data = pd.read_csv('/Users/ruxandramihai/Desktop/RA/UK DRI Publications/embeddings.csv')
#data['embeddings'] = data['embeddings'].apply(ast.literal_eval)

In [7]:
def load_embeddings():
    data = pd.read_csv('/Users/ruxandramihai/Desktop/embeddings.csv')
    data['embeddings'] = data['embeddings'].apply(ast.literal_eval)
    # drop cluster 1: all that have Erratum or Correction in Title
    embeddings = np.vstack(data['embeddings'].values)
    return data,embeddings

In [10]:
model_config = AutoConfig.from_pretrained("/Users/ruxandramihai/Desktop/LLM/fine-tuned-pubmedBERT_5_epochs/config.json")
model = AutoModel.from_pretrained("/Users/ruxandramihai/Desktop/LLM/fine-tuned-pubmedBERT_5_epochs", config = model_config)
tokenizer = AutoTokenizer.from_pretrained("/Users/ruxandramihai/Desktop/LLM/fine-tuned-pubmedBERT_5_epochs") 

Some weights of BertModel were not initialized from the model checkpoint at /Users/ruxandramihai/Desktop/LLM/fine-tuned-pubmedBERT_5_epochs and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def get_query_embedding(query):
    inputs = tokenizer(query, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    query_embedding = outputs.last_hidden_state.mean(dim=1)
    return query_embedding

In [11]:
# find similar documents 
def get_kclosest(query_embedding, k=5):
    data, all_embeddings = load_embeddings()
    if isinstance(query_embedding, torch.Tensor):
        query_embedding = query_embedding.numpy()
    if isinstance(all_embeddings, torch.Tensor):
        all_embeddings = all_embeddings.numpy()
    similarities = cosine_similarity(query_embedding, all_embeddings)
    closest_indices = np.argsort(similarities[0])[-k:][::-1]
    return data.iloc[closest_indices]

In [23]:
limit = 5000

def get_context(query, k=5):
    query_embedding = get_query_embedding(query)
    closest_docs = get_kclosest(query_embedding, k)
    contexts = [doc.Abstract for _, doc in closest_docs.iterrows()]
    #dois = closest_docs.DOI.tolist()
    #return dois, contexts
    return contexts

def retrieve(query,k=3):
    dois ,contexts = get_context(query,k)

    # build our prompt with the retrieved contexts included
    prompt_start = (
        "Answer the question based on the context below.\n\n"+
        "Context:\n"
    )
    prompt_end = (
        f"\n\nQuestion: {query}\nAnswer:"
    )
    # append contexts until hitting limit
    for i in range(1, len(contexts)):
        if len("\n\n---\n\n".join(contexts[:i])) >= limit:
            prompt = (
                prompt_start +
                "\n\n---\n\n".join(contexts[:i-1]) +
                prompt_end
            )
            break
        elif i == len(contexts)-1:
            prompt = (
                prompt_start +
                "\n\n---\n\n".join(contexts) +
                prompt_end
            )
    return prompt

def response(prompt):
    messages = [
        {"role": "user", "content": prompt}
    ]
    response = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=messages,
        temperature=0,
        max_tokens=150,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0)
    return response.choices[0].message.content.replace("\n", "")

def display(dois):
    text = f"\n\n Here are publications from UKDRI which are most relevant to your question: {', '.join(dois.values)}"
    return text

Check quality of answer and retrieved context - BERT

In [25]:
test_data = pd.read_csv('/Users/ruxandramihai/Desktop/test_data_QnA_predictions.csv')

In [None]:
for doi,row in test_data.iterrows():
    test_data.at[doi,'contexts'] = get_context(row.question)

In [29]:
test_data = test_data.drop(columns=['contexts'])

In [39]:
test_data.contexts[0]

(['10.1016/j.mcn.2018.12.004',
  '10.1016/j.neurobiolaging.2022.04.009',
  '10.1002/dad2.12167',
  '10.1186/s13024-021-00430-x',
  '10.2217/bmm-2017-0433'],
 ["Alzheimer's disease (AD) is characterized by amyloid plaques and tau pathology (neurofibrillary tangles and neuropil threads). Amyloid plaques are primarily composed of aggregated and oligomeric β-amyloid (Aβ) peptides ending at position 42 (Aβ42). The development of fluid and PET biomarkers for Alzheimer's disease (AD), has allowed for detection of Aβ pathology in vivo and marks a major advancement in understanding the role of Aβ in Alzheimer's disease (AD). In the recent National Institute on Aging and Alzheimer's Association (NIA-AA) Research Framework, AD is defined by the underlying pathology as measured in patients during life by biomarkers (Jack et al., 2018), while clinical symptoms are used for staging of the disease. Therefore, sensitive, specific and robust biomarkers to identify brain amyloidosis are central in AD re

In [32]:
test_data['ground_truth'] = test_data['ground_truths'].apply(lambda x: ' '.join(x))
test_data['contexts'] = ''

for doi,row in test_data.iterrows():
    test_data.at[doi,'contexts'] = get_context(row.question)

In [36]:
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
)
# split test_data into smaller chunks
import time
n=5
delay_between_requests = 5
chunks = [test_data[i:i + n] for i in range(0, len(test_data), n)]  # n is the size of each chunk

results = []
for c,chunk in enumerate(chunks[1:]):
    print(c)
    chunk = Dataset.from_dict(chunk)
    result = evaluate(
        dataset=chunk, 
        metrics=[
            context_precision,
            context_recall,
            faithfulness,
            answer_relevancy,
        ]
    )
    results.append(result.to_pandas())
    time.sleep(delay_between_requests)  # Implement delay as calculated
# combine results from all chunks
final_results = pd.concat(results)

0


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Evaluating:   0%|          | 0/20 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


1


Evaluating:   0%|          | 0/20 [00:00<?, ?it/s]

2


Evaluating:   0%|          | 0/20 [00:00<?, ?it/s]

3


Evaluating:   0%|          | 0/20 [00:00<?, ?it/s]

4


Evaluating:   0%|          | 0/20 [00:00<?, ?it/s]

5


Evaluating:   0%|          | 0/20 [00:00<?, ?it/s]

6


Evaluating:   0%|          | 0/20 [00:00<?, ?it/s]

7


Evaluating:   0%|          | 0/20 [00:00<?, ?it/s]

8


Evaluating:   0%|          | 0/16 [00:00<?, ?it/s]

In [38]:
final_results.iloc[:,-4:].agg(['mean','std','min','max'])

Unnamed: 0,context_precision,context_recall,faithfulness,answer_relevancy
mean,0.736837,0.602273,0.208712,0.924361
std,0.343289,0.464802,0.367836,0.15251
min,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0
