In [None]:
import pandas as pd
from tqdm import tqdm
# Notebook-wide pandas behaviour: silence the chained-assignment warning and
# let long text cells render (almost) in full instead of being truncated.
pd.options.mode.chained_assignment = None
pd.set_option('max_colwidth', 10000)

# Load the dev and train splits; the first CSV column becomes the row index.
df_dev = pd.read_csv(
    "../data/MedQA_dev.csv",
    index_col=0,
)
df_train = pd.read_csv(
    "../data/MedQA_train.csv",
    index_col=0,
)


In [None]:
# Configure the module-level `openai` settings to target an Azure OpenAI endpoint.
import openai
import os

openai.api_type = "azure"
# Endpoint URL and key are read from the environment. os.getenv returns None when a
# variable is unset, so a missing variable is not reported at this point.
openai.api_base = os.getenv("OPENAI_API_BASE")
openai.api_version = "2023-07-01-preview"
openai.api_key = os.getenv("OPENAI_API_KEY")
from tech_train_functions import call_open_ai
# Passed as `client=client` to the tech_train_functions helpers in later cells.
# NOTE(review): presumably None makes the helpers fall back to the module-level
# openai configuration set above — confirm in tech_train_functions.
client = None

Let's focus on a single dev-set question — the row at position 9 (`df_dev.iloc[9]`, selected in the next cell) — and call it the hard study case:

In [None]:
# Pick the dev-set row at integer position 9 (positional, not label-based) as the
# running example for the rest of the notebook.
hard_study_case = df_dev.iloc[9]
# Show the Series as a one-column table so each field gets its own row.
hard_study_case.to_frame()

Let's try using BM25 search to find the question in the training split that is most similar to this one.
First, let's create the BM25 search DB:

In [None]:
#creating BM25 search DB:
from rank_bm25 import BM25Okapi
import numpy as np

# Build the BM25 index over the training questions.
# rank_bm25 does no text preprocessing, and search_similar() lower-cases the query
# before scoring, so the corpus tokens must be lower-cased as well. Otherwise any
# capitalised word in a question could never match a query token.
questions_corpus = df_train['question'].tolist()
tokenized_question_corpus = [doc.lower().split(" ") for doc in questions_corpus]
bm25_questions = BM25Okapi(tokenized_question_corpus)

def search_similar(query, bm25_index, corpus_df=None):
    """Return the row whose indexed text scores highest against ``query`` under BM25.

    Parameters
    ----------
    query : str
        Free-text query. It is lower-cased and split on single spaces before
        scoring, so the index should have been built from lower-cased tokens.
    bm25_index : object
        Index exposing ``get_scores(tokens)`` that returns one score per indexed
        document (for example a ``BM25Okapi`` instance).
    corpus_df : pandas.DataFrame, optional
        Frame whose row order matches the documents in ``bm25_index``. When
        omitted, the notebook-level ``df_train`` is used, which was the only
        behaviour before this parameter existed.

    Returns
    -------
    pandas.Series
        The best-scoring row of ``corpus_df``. On ties the earliest row wins.

    Raises
    ------
    IndexError
        If the index holds no documents.
    """
    if corpus_df is None:
        # Looked up at call time so the default keeps following the notebook global.
        corpus_df = df_train
    tokenized_query = query.lower().split(" ")
    doc_scores = np.asarray(bm25_index.get_scores(tokenized_query))
    if doc_scores.size == 0:
        raise IndexError("BM25 index is empty; there is nothing to search")
    # argmax finds the single best hit without sorting every score.
    best_position = int(np.argmax(doc_scores))
    return corpus_df.iloc[best_position]


Done! Let's search for the closest question to our example:

In [None]:
# Retrieve the training row whose question text best matches the hard case.
most_similar_in_question = search_similar(
    query=hard_study_case["question"],
    bm25_index=bm25_questions,
)
# Display the returned row vertically, one field per line.
most_similar_in_question.to_frame()

What if we used the answer options only for the search db?

In [None]:
# Build a second BM25 index, this time over the stringified answer options.
# Tokens are lower-cased because search_similar() lower-cases the query before
# scoring; without this, capitalised option text could never match.
options_corpus = df_train['options'].astype(str).tolist()
tokenized_options_corpus = [doc.lower().split(" ") for doc in options_corpus]
bm25_options = BM25Okapi(tokenized_options_corpus)

In [None]:
# Query the options index with the hard case's options, stringified the same
# way the options corpus was.
most_similar_in_answer = search_similar(
    query=str(hard_study_case["options"]),
    bm25_index=bm25_options,
)
# Display the returned row vertically, one field per line.
most_similar_in_answer.to_frame()

What if we used the extracted entities to determine which is the most similar answer?

In [None]:
# Load the entity-annotated versions of both splits; the first CSV column is the index.
df_dev_entities = pd.read_csv(
    "../data/medQA_dev_entities.csv",
    index_col=0,
)
df_train_entities = pd.read_csv(
    "../data/medQA_train_entities.csv",
    index_col=0,
)
# Entities of the hard study case, taken at the same integer position (9) used
# to pick the case from df_dev.
hard_study_case_entities = df_dev_entities.iloc[9]["entities"]

# Preview the first two questions next to their extracted entities.
df_dev_entities.loc[:, ["question", "entities"]].head(2)

In [None]:
# Build a BM25 index over the extracted entities of the training split.
# Single quotes are stripped once here; the original stripped them a second time
# during tokenisation, which was a no-op.
entities_corpus = (
    df_train_entities['entities']
    .astype(str)
    .str.replace("'", "", regex=False)
    .tolist()
)
# Lower-case the tokens because search_similar() lower-cases the query before scoring.
# NOTE(review): if the column holds stringified lists, tokens may still carry
# brackets/commas (e.g. "[angina,") — confirm the column format.
tokenized_entities_corpus = [doc.lower().split(" ") for doc in entities_corpus]
bm25_entities = BM25Okapi(tokenized_entities_corpus)

In [None]:
# Search the entities index with a hand-picked entity term.
# NOTE(review): the query is hard-coded; `hard_study_case_entities` computed
# earlier is not used here — confirm this is intentional.
most_similar_in_entities = search_similar(
    query="angina",
    bm25_index=bm25_entities,
)
# Display the returned row vertically, one field per line.
most_similar_in_entities.to_frame()

Let's see if these examples are better than a random one!
First, let's create the chain-of-thought (CoT) example:

In [None]:
from tech_train_functions import create_example_CoT
# Use the training example retrieved through the options index as the few-shot example.
question = most_similar_in_answer["question"]
answer = most_similar_in_answer["answer"]
options = most_similar_in_answer["options"]

# Adjacent string literals are concatenated at compile time. Each piece ends with
# an explicit space; the previous backslash continuation fused
# "question." and "Explain" into "question.Explain" in the prompt.
system_message_explainer = (
    "Please explain step by step how to answer this question. "
    "Explain the condition of the patient and for each option explain why it's correct or incorrect. "
    "End with 'the correct answer is:' and then the correct answer idx and the correct answer"
)

# Ask the helper to build the worked example for this question.
example = create_example_CoT(system_message_explainer, question, options, answer, client=client)
# Print element 1 of the returned example.
# NOTE(review): presumably the generated explanation — confirm against create_example_CoT.
print(example[1])

In [None]:
# Adjacent string literals are concatenated at compile time. The first piece ends
# with an explicit space; the previous backslash continuation fused
# "step by step'" and "and end" into "step by step'and end" in the prompt.
# The prompt wording is otherwise left verbatim.
system_message_CoT = (
    "You're a medical expert answering medical questions. Please answer step by step. Start with 'let's take it step by step' "
    "and end with 'the correct answer is:' and then the correct answer. Please allways end with this phrase"
)

# The user message is the hard case's question followed by its stringified options.
q = hard_study_case["question"] + " " + str(hard_study_case["options"])
# Send the question together with the single retrieved few-shot example.
answer = call_open_ai(system_message_CoT, q, examples=[example], client=client)
print(answer)