In [None]:
# Install via conda virtual environment using the following command in 

#pip install "sap-llm-commons[all]==0.2.0" --extra-index-url https://int.repositories.cloud.sap/artifactory/api/pypi/proxy-deploy-releases-hyperspace-pypi/simple

#pip install -r requirements.txt 

#aicore configure -k key.txt  (type default)

#git clone https://github.tools.sap/AI-BUS/retrieval-techniques.git
#----------------------------
#Install the FAISS libraries

#pip install llama-index
#pip install llama-index-vector-stores-faiss
#pip install llama-index-embeddings-langchain

### Evaluation Strategy: 

For every query, our retreiver will retrieve top 3,5,7 documents. From our query-corpus dataset, we will match if the documents retreived match/are in the groud truth document mapped to a query.

### Retreiver Strategy:
1. We will use embedding based retreival techniques - text-embedding-ada-002 from GenAI hub as well as keyword based retreivels like BM25. 

2. We will use open-source BGE re-ranking model to augment the retreival performance

3. We will try Fusion ( index & query re-writing) to see if there is an increase in the performance

4. We are not evaluating the cost/latency for this POC which could be influcened by CPUs/ data size. 

5. Vector Store: I'm using FAISS vector store which is one of the most efficient & fastest technique and uses ANN for indexing the document. 

6. Framework : using LlamaIndex framework

#### Defining llm & embedding

In [None]:
from llm_commons.langchain.proxy import ChatOpenAI
from llm_commons.langchain.proxy import OpenAIEmbeddings
from ipywidgets import widgets

llm_model_name = widgets.Dropdown(
    options=[
        "gpt-35-turbo",
        "gpt-35-turbo-16k",
        "gpt-4",
        "gpt-4-32k",
        "gpt-4-turbo",
        "gemini-1.0-pro",
        "gpt-4-vision"
        # "tiiuae--falcon-40b-instruct"
    ],
    value="gpt-35-turbo-16k",
    description="LLM Model Name",
    disabled=False,
)


llm = ChatOpenAI(proxy_model_name=llm_model_name.value)
embeddings = OpenAIEmbeddings(proxy_model_name='text-embedding-ada-002')

from llama_index.embeddings.langchain import LangchainEmbedding
from llama_index.llms.langchain import LangChainLLM

llama_llm = LangChainLLM(llm)
llama_emb= LangchainEmbedding(embeddings)

from llama_index.core import Settings
Settings.embed_model = embeddings
Settings.llm = llm

#### Loading open source SciFact Dataset from BIER

In [5]:
from beir import util
from beir.datasets.data_loader import GenericDataLoader

  from tqdm.autonotebook import tqdm


In [16]:
#### Download scifact.zip dataset and unzip the dataset
import logging
import pathlib, os
dataset = "scifact"
url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip".format(dataset)
out_dir = os.path.join(pathlib.Path("C:/Users/I068117/UT_Machine Learning/Custom-AI-Chatbot-RAG").parent.absolute(), "datasets")
data_path = util.download_and_unzip(url, out_dir)

#### Provide the data_path where scifact has been downloaded and unzipped
corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="test")

  0%|          | 0/5183 [00:00<?, ?it/s]

#### Each corpus id is a single document, which we will use for our evaluation. There are 5183 documents in the corpus with a mean chunk size of 1400 words

We are not doing any chunking as each document is chunked as per the corpus id

Later, we will explore combining the whole document into one document and then chunking it.

In [8]:
# Read the JSON Lines file
df_corpus = pd.read_json('C:/Users/I068117/UT_Machine Learning/datasets/scifact/corpus.jsonl', lines=True)
df_corpus.head()

Unnamed: 0,_id,title,text,metadata
0,4983,Microstructural development of human newborn c...,Alterations of the architecture of cerebral wh...,{}
1,5836,Induction of myelodysplasia by myeloid-derived...,Myelodysplastic syndromes (MDS) are age-depend...,{}
2,7912,"BC1 RNA, the transcript from a master gene for...",ID elements are short interspersed elements (S...,{}
3,18670,The DNA Methylome of Human Peripheral Blood Mo...,DNA methylation plays an important role in bio...,{}
4,19238,The human myelin basic protein gene is include...,Two human Golli (for gene expressed in the oli...,{}


#### Current chunk size for the documents 

In [345]:
text_length= df_corpus['text'].apply(lambda x : len(x)).mean()
print(text_length)

1401.0839282268955


#### Queries df - There are 1109 queries

In [9]:
# Read the JSON Lines file
df_queries = pd.read_json('C:/Users/I068117/UT_Machine Learning/datasets/scifact/queries.jsonl', lines=True)
df_queries.head()

Unnamed: 0,_id,text,metadata
0,0,0-dimensional biomaterials lack inductive prop...,{}
1,2,1 in 5 million in UK have abnormal PrP positiv...,"{'13734012': [{'sentences': [4], 'label': 'CON..."
2,4,1-1% of colorectal cancer patients are diagnos...,{}
3,6,10% of sudden infant death syndrome (SIDS) dea...,{}
4,9,32% of liver transplantation programs required...,"{'44265107': [{'sentences': [15], 'label': 'SU..."


In [346]:
len(df_queries)

1109

#### Each Query is mapped to a corpus-id document as the ground truth. There are 919 queries mapped to a corpus id in the train Q&A set. 

In [7]:
import pandas as pd
pd.set_option('display.max_rows', None)
df=pd.read_csv('C:/Users/I068117/UT_Machine Learning/datasets/scifact/qrels/train.tsv',sep="\t")
df.head()

Unnamed: 0,query-id,corpus-id,score
0,0,31715818,1
1,2,13734012,1
2,4,22942787,1
3,6,2613775,1
4,9,44265107,1


##### Getting unique query id with all relevant documents as one query could have 1 or more relevant documents 

In [47]:
df_key = df.groupby('query-id')['corpus-id'].agg(list).reset_index()

Unnamed: 0,query-id,corpus-id
0,0,[31715818]
1,2,[13734012]
2,4,[22942787]
3,6,[2613775]
4,9,[44265107]
5,10,[32587939]
6,11,[32587939]
7,12,[33409100]
8,14,[641786]
9,15,[22080671]


In [85]:
#------------optional to load embeddings & storing locally---------------
new=[]
for d in data:
    new.append(embeddings.embed_query(d.page_content))
df_corpus['emb']=new

#df_corpus.to_excel('scifact_emd.xlsx')

#### LLama_Index

In [101]:
import json
data_Json = []

with open("corpus.jsonl", "r") as file:
    for line in file:
        data_Json.append(json.loads(line))

In [45]:
# Write the list of JSON objects to a JSON file
with open('corpus.json', 'w') as json_file:
    json.dump(data_Json, json_file, indent=2)

In [11]:
from llama_index.core import (
    load_index_from_storage,
    VectorStoreIndex,
    StorageContext,
)
from llama_index.vector_stores.faiss import FaissVectorStore
from IPython.display import Markdown, display

In [12]:
import faiss
d = 1536
faiss_index = faiss.IndexFlatL2(d)

In [100]:
#converting document from Langchain to Llama_index document
from llama_index.core import Document
docs_llama=[]
for doc in docs_langchain:
    docs_llama.append(Document.from_langchain_format(doc))

In [131]:
vector_store = FaissVectorStore(faiss_index=faiss_index)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
    docs_llama, storage_context=storage_context
)

In [132]:
#Storing index on the disk
index.storage_context.persist(persist_dir="C:/Users/I068117/UT_Machine Learning/Custom-AI-Chatbot-RAG")

### VectorStore Retreiver

In [15]:
from llama_index.core import StorageContext, load_index_from_storage

# load index from disk
vector_store = FaissVectorStore.from_persist_dir("C:/Users/I068117/UT_Machine Learning/Custom-AI-Chatbot-RAG")
storage_context = StorageContext.from_defaults(
    vector_store=vector_store, persist_dir="C:/Users/I068117/UT_Machine Learning/Custom-AI-Chatbot-RAG"
)

#loading the index from the local directory to avoid loading the embeddings
index = load_index_from_storage(storage_context=storage_context)
#context = vector_retriever.retrieve("A deficiency of vitamin B12 decreases blood levels of homocysteine")
#print(context)

In [82]:
def vector_retriever_corpus_index(x,topk):
    vector_retriever = index.as_retriever(similarity_top_k=topk)
    context = vector_retriever.retrieve(x)
    ci=[]
    for c in context:
        ci.append(c.node.metadata['idx'])
    return ci

In [83]:
# Function to calculate recall
def calculate_recall(row,k):
    matches = len(set(list(map(str,row['corpus-id']))) & set(list(row[f'top{k}'])))
    return matches / len(row['corpus-id'])

In [89]:
# Function to calculate recall
def calculate_precision(row,k):
    matches = len(set(list(map(str,row['corpus-id']))) & set(list(row[f'top{k}'])))
    return matches / len(row[f'top{k}'])

In [94]:
from typing import List

def calculate_reciprocal_rank(row, k):
    mrr = []
    for idx in row['corpus-id']:
        if str(idx) not in row[f'top{k}']:
            mrr.append(0)
        else:
            mrr.append(1 / (row[f'top{k}'].index(str(idx)) + 1))
    return sum(mrr) / len(mrr)


In [84]:
df_eval=pd.DataFrame(columns=['_id','query'])
df_eval[['_id','query']]= df_queries.iloc[:100,:2]
df_eval_merge= df_eval.merge(df_key,left_on='_id',right_on='query-id',how='left')

In [86]:
df_eval_merge['top3']=df_eval_merge['query'].apply(lambda x: vector_retriever_corpus_index(x,topk=3))
df_eval_merge['top5']=df_eval_merge['query'].apply(lambda x:vector_retriever_corpus_index(x,topk=5))
df_eval_merge['top7']=df_eval_merge['query'].apply(lambda x:vector_retriever_corpus_index(x,topk=7))

In [None]:
df_eval_merge['recall@3'] = df_eval_merge.apply(lambda x:calculate_recall(x,k=3), axis=1)
df_eval_merge['recall@5'] = df_eval_merge.apply(lambda x:calculate_recall(x,k=5), axis=1)
df_eval_merge['recall@7'] = df_eval_merge.apply(lambda x:calculate_recall(x,k=7), axis=1)

df_eval_merge['MRR@3'] = df_eval_merge.apply(lambda x:calculate_reciprocal_rank(x,k=3), axis=1)
df_eval_merge['MRR@5'] = df_eval_merge.apply(lambda x:calculate_reciprocal_rank(x,k=5), axis=1)
df_eval_merge['MRR@7'] = df_eval_merge.apply(lambda x:calculate_reciprocal_rank(x,k=7), axis=1)


In [315]:
eval_df = pd.DataFrame([{
    'Retreival_Technique': 'text-embedding-ada-002',
    'Recall@3': round(df_eval_merge['recall@3'].mean(), 2),
    'Recall@5': round(df_eval_merge['recall@5'].mean(), 2),
    'Recall@7': round(df_eval_merge['recall@7'].mean(), 2),
    'MRR@3': round(df_eval_merge['MRR@3'].mean(), 2),
    'MRR@5': round(df_eval_merge['MRR@5'].mean(), 2),
    'MRR@7': round(df_eval_merge['MRR@7'].mean(), 2),
    'HT@3': round(df_eval_merge[df_eval_merge['recall@3'] != 0].shape[0] / len(df_eval_merge), 2),
    'HT@5': round(df_eval_merge[df_eval_merge['recall@5'] != 0].shape[0] / len(df_eval_merge), 2),
    'HT@7': round(df_eval_merge[df_eval_merge['recall@7'] != 0].shape[0] / len(df_eval_merge), 2)
}])
eval_df

Unnamed: 0,Retreival_Technique,Recall@3,Recall@5,Recall@7,MRR@3,MRR@5,MRR@7,HT@3,HT@5,HT@7
0,text-embedding-ada-002,0.72,0.82,0.84,0.66,0.69,0.69,0.73,0.84,0.86


### Vectorstore retreiver with a re-ranker(top 5)

In [303]:
vector_rerank_retriever = index.as_retriever(similarity_top_k=10)

In [304]:
df_vector_rerank_eval=pd.DataFrame(columns=['_id','query'])
df_vector_rerank_eval[['_id','query']]= df_queries.iloc[:100,:2]
df_vector_rerank_eval_merge= df_vector_rerank_eval.merge(df_key,left_on='_id',right_on='query-id',how='left')

In [305]:
df_vector_rerank_eval_merge['top5']=df_vector_rerank_eval_merge['query'].apply(lambda x:re_ranker(vector_rerank_retriever.retrieve(x),x))

In [None]:
df_vector_rerank_eval_merge['recall@5'] = df_vector_rerank_eval_merge.apply(lambda x:calculate_recall(x,k=5), axis=1)
df_vector_rerank_eval_merge['MRR@5'] = df_vector_rerank_eval_merge.apply(lambda x:calculate_reciprocal_rank(x,k=5), axis=1)

In [321]:
new_data = {
    'Retreival_Technique': 'text-embedding-ada-002 + BGE Re_ranker_top5',
    'Recall@5': round(df_vector_rerank_eval_merge['recall@5'].mean() , 2),  # Adjust these as needed
    'MRR@5': round(df_vector_rerank_eval_merge['MRR@5'].mean(), 2),
    'HT@5': round(df_vector_rerank_eval_merge[df_vector_rerank_eval_merge['recall@5'] != 0].shape[0] / len(df_vector_rerank_eval_merge), 2),
}

eval_df.loc[len(eval_df)] = new_data

eval_df 

Unnamed: 0,Retreival_Technique,Recall@3,Recall@5,Recall@7,MRR@3,MRR@5,MRR@7,HT@3,HT@5,HT@7
0,text-embedding-ada-002,0.72,0.82,0.84,0.66,0.69,0.69,0.73,0.84,0.86
1,text-embedding-ada-002 + BGE Re_ranker_top5,,0.82,,,0.67,,,0.85,


### BM25 retreiver

In [113]:
# We can pass in the index, doctore, or list of nodes to create the retriever
from llama_index.retrievers.bm25 import BM25Retriever
BM25_retriever = BM25Retriever.from_defaults(nodes=docs_llama, similarity_top_k=7)
def BM25_retriever_corpus_index(x):
    context = BM25_retriever.retrieve(x)
    ci=[]
    for c in context:
        ci.append(c.node.metadata['idx'])
    return ci


In [102]:
df_BM_eval=pd.DataFrame(columns=['_id','query'])
df_BM_eval[['_id','query']]= df_queries.iloc[:100,:2]
df_BM_eval_merge= df_BM_eval.merge(df_key,left_on='_id',right_on='query-id',how='left')

In [108]:
df_BM_eval_merge['top3']=df_BM_eval_merge['query'].apply(lambda x: BM25_retriever_corpus_index(x))

In [110]:
df_BM_eval_merge['top5']=df_BM_eval_merge['query'].apply(lambda x:BM25_retriever_corpus_index(x))

In [114]:
df_BM_eval_merge['top7']=df_BM_eval_merge['query'].apply(lambda x:BM25_retriever_corpus_index(x))

In [None]:
# Apply function to each row
df_BM_eval_merge['recall@3'] = df_BM_eval_merge.apply(lambda x:calculate_recall(x,k=3), axis=1)
df_BM_eval_merge['recall@5'] = df_BM_eval_merge.apply(lambda x:calculate_recall(x,k=5), axis=1)
df_BM_eval_merge['recall@7'] = df_BM_eval_merge.apply(lambda x:calculate_recall(x,k=7), axis=1)

In [None]:
df_BM_eval_merge['MRR@3'] = df_BM_eval_merge.apply(lambda x:calculate_reciprocal_rank(x,k=3), axis=1)
df_BM_eval_merge['MRR@5'] = df_BM_eval_merge.apply(lambda x:calculate_reciprocal_rank(x,k=5), axis=1)
df_BM_eval_merge['MRR@7'] = df_BM_eval_merge.apply(lambda x:calculate_reciprocal_rank(x,k=7), axis=1)

In [323]:
new_data = {
    'Retreival_Technique': 'BM25 Retreiver',
    'Recall@3': round(df_BM_eval_merge['recall@3'].mean(), 2),
    'Recall@5': round(df_BM_eval_merge['recall@5'].mean(), 2),
    'Recall@7': round(df_BM_eval_merge['recall@7'].mean(), 2),
    'MRR@3': round(df_BM_eval_merge['MRR@3'].mean(), 2),
    'MRR@5': round(df_BM_eval_merge['MRR@5'].mean(), 2),
    'MRR@7': round(df_BM_eval_merge['MRR@7'].mean(), 2),
    'HT@3': round(df_BM_eval_merge[df_BM_eval_merge['recall@3'] != 0].shape[0] / len(df_BM_eval_merge), 2),
    'HT@5': round(df_BM_eval_merge[df_BM_eval_merge['recall@5'] != 0].shape[0] / len(df_BM_eval_merge), 2),
    'HT@7': round(df_BM_eval_merge[df_BM_eval_merge['recall@7'] != 0].shape[0] / len(df_BM_eval_merge), 2)
}

eval_df.loc[len(eval_df)] = new_data

eval_df

Unnamed: 0,Retreival_Technique,Recall@3,Recall@5,Recall@7,MRR@3,MRR@5,MRR@7,HT@3,HT@5,HT@7
0,text-embedding-ada-002,0.72,0.82,0.84,0.66,0.69,0.69,0.73,0.84,0.86
1,text-embedding-ada-002 + BGE Re_ranker_top5,,0.82,,,0.67,,,0.85,
2,BM25 Retreiver,0.63,0.68,0.7,0.49,0.5,0.5,0.64,0.69,0.71


### BM25 with a re-ranker

In [298]:
BM25_retriever = BM25Retriever.from_defaults(nodes=docs_llama, similarity_top_k=10)

In [299]:
df_BM25_rerank_eval=pd.DataFrame(columns=['_id','query'])
df_BM25_rerank_eval[['_id','query']]= df_queries.iloc[:100,:2]
df_BM25_rerank_eval_merge= df_BM25_rerank_eval.merge(df_key,left_on='_id',right_on='query-id',how='left')

In [300]:
df_BM25_rerank_eval_merge['top5']=df_hybrid_rerank_eval_merge['query'].apply(lambda x:re_ranker(BM25_retriever.retrieve(x),x))

In [None]:
df_BM25_rerank_eval_merge['recall@5'] = df_BM25_rerank_eval_merge.apply(lambda x:calculate_recall(x,k=5), axis=1)
df_BM25_rerank_eval_merge['MRR@5'] = df_BM25_rerank_eval_merge.apply(lambda x:calculate_reciprocal_rank(x,k=5), axis=1)

In [331]:
new_data = {
    'Retreival_Technique': 'BM25 Retreiver + BGE Re_ranker_top5',
    'Recall@5': round(df_BM25_rerank_eval_merge['recall@5'].mean() , 2),  # Adjust these as needed
    'MRR@5': round(df_BM25_rerank_eval_merge['MRR@5'].mean(), 2),
    'HT@5': round(df_BM25_rerank_eval_merge[df_BM25_rerank_eval_merge['recall@5'] != 0].shape[0] / len(df_BM25_rerank_eval_merge), 2),
}

eval_df.loc[3] = new_data

eval_df 

Unnamed: 0,Retreival_Technique,Recall@3,Recall@5,Recall@7,MRR@3,MRR@5,MRR@7,HT@3,HT@5,HT@7
0,text-embedding-ada-002,0.72,0.82,0.84,0.66,0.69,0.69,0.73,0.84,0.86
1,text-embedding-ada-002 + BGE Re_ranker_top5,,0.82,,,0.67,,,0.85,
2,BM25 Retreiver,0.63,0.68,0.7,0.49,0.5,0.5,0.64,0.69,0.71
3,BM25 Retreiver + BGE Re_ranker_top5,,0.71,,,0.61,,,0.73,
4,Hybrid reteiver - text-ada+BM25),,0.77,0.84,,,,,0.77,0.86
5,Hybrid retreiver + BGE Re_ranker_top5,,0.82,,,0.68,,,0.84,


### Hybrid custom retreiver ( Vectorstore+BM25)

In [256]:
from llama_index.core.retrievers import BaseRetriever
                                             
class HybridRetriever(BaseRetriever):
    def __init__(self, vector_retriever, bm25_retriever):
        self.vector_retriever = vector_retriever
        self.bm25_retriever = bm25_retriever
        super().__init__()

    def _retrieve(self, query):
        bm25_nodes = self.bm25_retriever.retrieve(query)
        vector_nodes = self.vector_retriever.retrieve(query)

       # combine the two lists of nodes
        all_nodes = []
        node_ids = set()
        for n in bm25_nodes + vector_nodes:
            if n.node.node_id not in node_ids:
                all_nodes.append(n)
                node_ids.add(n.node.node_id)
        return all_nodes


In [279]:
vector_retriever = index.as_retriever(similarity_top_k=5)
bm25_retriever = BM25Retriever.from_defaults(nodes=docs_llama, similarity_top_k=5)

hybrid_retriever = HybridRetriever(vector_retriever, bm25_retriever)

In [271]:
df_hybrid2_eval=pd.DataFrame(columns=['_id','query'])
df_hybrid2_eval[['_id','query']]= df_queries.iloc[:100,:2]
df_hybrid2_eval_merge= df_hybrid2_eval.merge(df_key,left_on='_id',right_on='query-id',how='left')

In [273]:
def get_index(all_nodes):
    ci=[]
    for c in all_nodes:
        ci.append(c.node.metadata['idx'])
    return ci

In [276]:
df_hybrid2_eval_merge['top5']=df_hybrid2_eval_merge['query'].apply(lambda x:get_index(hybrid_retriever.retrieve(x)))

In [280]:
df_hybrid2_eval_merge['top10']=df_hybrid2_eval_merge['query'].apply(lambda x:get_index(hybrid_retriever.retrieve(x)))

In [None]:
df_hybrid2_eval_merge['recall@5'] = df_hybrid2_eval_merge.apply(lambda x:calculate_recall(x,k=5), axis=1)
df_hybrid2_eval_merge['recall@10'] = df_hybrid2_eval_merge.apply(lambda x:calculate_recall(x,k=10), axis=1)

In [328]:
new_data = {
    'Retreival_Technique': 'Hybrid reteiver - text-ada+BM25)',
    'Recall@5': round(df_hybrid2_eval_merge['recall@5'].mean() , 2),  # Adjust these as needed
    'Recall@7': round(df_hybrid2_eval_merge['recall@10'].mean(), 2),
    'HT@5': round(df_hybrid2_eval_merge[df_hybrid2_eval_merge['recall@5'] != 0].shape[0] / len(df_hybrid2_eval_merge), 2),
    'HT@7': round(df_hybrid2_eval_merge[df_hybrid2_eval_merge['recall@10'] != 0].shape[0] / len(df_hybrid2_eval_merge), 2),
}

eval_df.loc[len(eval_df)] = new_data

eval_df 

Unnamed: 0,Retreival_Technique,Recall@3,Recall@5,Recall@7,MRR@3,MRR@5,MRR@7,HT@3,HT@5,HT@7
0,text-embedding-ada-002,0.72,0.82,0.84,0.66,0.69,0.69,0.73,0.84,0.86
1,text-embedding-ada-002 + BGE Re_ranker_top5,,0.82,,,0.67,,,0.85,
2,BM25 Retreiver,0.63,0.68,0.7,0.49,0.5,0.5,0.64,0.69,0.71
3,text-embedding-ada-002 + BGE Re_ranker_top5,,0.71,,,0.61,,,0.73,
4,Hybrid reteiver - text-ada+BM25),,0.77,0.84,,,,,0.77,0.86


### Hybrid custom Retreiver with Re-ranker

In [282]:
from llama_index.core.postprocessor import SentenceTransformerRerank
reranker = SentenceTransformerRerank(top_n=5, model="BAAI/bge-reranker-base")



config.json:   0%|          | 0.00/799 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

In [287]:
df_hybrid_rerank_eval=pd.DataFrame(columns=['_id','query'])
df_hybrid_rerank_eval[['_id','query']]= df_queries.iloc[:100,:2]
df_hybrid_rerank_eval_merge= df_hybrid_rerank_eval.merge(df_key,left_on='_id',right_on='query-id',how='left')

In [294]:
from llama_index.core import QueryBundle
from llama_index.core.schema import NodeWithScore

def re_ranker(retrieved_nodes,query):
    reranked_nodes = reranker.postprocess_nodes(retrieved_nodes,
                                                query_bundle=QueryBundle(query),)
    index_id = []
    index_id = get_index(reranked_nodes)
    return index_id

In [295]:
df_hybrid_rerank_eval_merge['top5']=df_hybrid_rerank_eval_merge['query'].apply(lambda x:re_ranker(hybrid_retriever.retrieve(x),x))

In [None]:
df_hybrid_rerank_eval_merge['recall@5'] = df_hybrid_rerank_eval_merge.apply(lambda x:calculate_recall(x,k=5), axis=1)
df_hybrid_rerank_eval_merge['MRR@5'] = df_hybrid_rerank_eval_merge.apply(lambda x:calculate_reciprocal_rank(x,k=5), axis=1)

In [330]:
new_data = {
    'Retreival_Technique': 'Hybrid retreiver + BGE Re_ranker_top5',
    'Recall@5': round(df_hybrid_rerank_eval_merge['recall@5'].mean() , 2),  # Adjust these as needed
    'MRR@5': round(df_hybrid_rerank_eval_merge['MRR@5'].mean(), 2),
    'HT@5': round(df_hybrid_rerank_eval_merge[df_hybrid_rerank_eval_merge['recall@5'] != 0].shape[0] / len(df_hybrid_rerank_eval_merge), 2),
}

eval_df.loc[len(eval_df)-1] = new_data

eval_df 

Unnamed: 0,Retreival_Technique,Recall@3,Recall@5,Recall@7,MRR@3,MRR@5,MRR@7,HT@3,HT@5,HT@7
0,text-embedding-ada-002,0.72,0.82,0.84,0.66,0.69,0.69,0.73,0.84,0.86
1,text-embedding-ada-002 + BGE Re_ranker_top5,,0.82,,,0.67,,,0.85,
2,BM25 Retreiver,0.63,0.68,0.7,0.49,0.5,0.5,0.64,0.69,0.71
3,text-embedding-ada-002 + BGE Re_ranker_top5,,0.71,,,0.61,,,0.73,
4,Hybrid reteiver - text-ada+BM25),,0.77,0.84,,,,,0.77,0.86
5,Hybrid retreiver + BGE Re_ranker_top5,,0.82,,,0.68,,,0.84,


### Hybrid Fusion retreiver

In [167]:
from llama_index.core.tools import RetrieverTool
index = load_index_from_storage(storage_context=storage_context)
vector_retriever = index.as_retriever(similarity_top_k=3)
bm25_retriever = BM25Retriever.from_defaults(nodes=docs_llama, similarity_top_k=3)

retriever_tools = [
    RetrieverTool.from_defaults(
        retriever=vector_retriever,
        description="Useful in most cases",
    ),
    RetrieverTool.from_defaults(
        retriever=bm25_retriever,
        description="Useful if searching about specific information",
    ),
]

In [142]:
from llama_index.core.retrievers import RouterRetriever

retriever = RouterRetriever.from_defaults(
    retriever_tools=retriever_tools,
    llm=llama_llm,
    select_multi=True,
)

def hybrid_retriever_corpus_index(x):
    context = retriever.retrieve(x)
    ci=[]
    for c in context:
        ci.append(c.node.metadata['idx'])
    return ci

In [136]:
df_hybrid_eval=pd.DataFrame(columns=['_id','query'])
df_hybrid_eval[['_id','query']]= df_queries.iloc[:100,:2]
df_hybrid_eval_merge= df_hybrid_eval.merge(df_key,left_on='_id',right_on='query-id',how='left')

In [143]:
df_hybrid_eval_merge['top5']=df_hybrid_eval_merge['query'].apply(lambda x:hybrid_retriever_corpus_index(x))

  warn_deprecated(
BM25Retriever does not support embeddings, skipping...
BM25Retriever does not support embeddings, skipping...
BM25Retriever does not support embeddings, skipping...
BM25Retriever does not support embeddings, skipping...
BM25Retriever does not support embeddings, skipping...
BM25Retriever does not support embeddings, skipping...
  return hasattr(instance, '__fields__') and super().__instancecheck__(instance)
  return hasattr(instance, '__fields__') and super().__instancecheck__(instance)
BM25Retriever does not support embeddings, skipping...
BM25Retriever does not support embeddings, skipping...
BM25Retriever does not support embeddings, skipping...
BM25Retriever does not support embeddings, skipping...
BM25Retriever does not support embeddings, skipping...
BM25Retriever does not support embeddings, skipping...
BM25Retriever does not support embeddings, skipping...
BM25Retriever does not support embeddings, skipping...
BM25Retriever does not support embeddings, skippi

In [155]:
df_hybrid_eval_merge['recall@5'] = df_hybrid_eval_merge.apply(lambda x:calculate_recall(x,k=5), axis=1)
print(df_hybrid_eval_merge['recall@5'].mean())

0.785


In [None]:
new_data = {
    'Retreival_Technique': 'Hybrid retreiver + BGE Re_ranker_top5',
    'Recall@5': round(df_hybrid_rerank_eval_merge['recall@5'].mean() , 2),  # Adjust these as needed
    'MRR@5': round(df_hybrid_rerank_eval_merge['MRR@5'].mean(), 2),
    'HT@5': round(df_hybrid_rerank_eval_merge[df_hybrid_rerank_eval_merge['recall@5'] != 0].shape[0] / len(df_hybrid_rerank_eval_merge), 2),
}

eval_df.loc[len(eval_df)-1] = new_data

eval_df 

### Simple Fusion Retreiver (Query rewriting): combine retrieval results from multiple queries and multiple indexes.

In [119]:
from llama_index.core import VectorStoreIndex

index_1 = load_index_from_storage(storage_context=storage_context)
index_2 = load_index_from_storage(storage_context=storage_context)

In [130]:
from llama_index.core.retrievers import QueryFusionRetriever

retriever = QueryFusionRetriever(
    [index_1.as_retriever(), index_2.as_retriever()],
    similarity_top_k=7,
    num_queries=4,  # set this to 1 to disable query generation
    use_async=True,
    verbose=True,
    # query_gen_prompt="...",  # we could override the query generation prompt here
)

def Fusion_vector_retriever_corpus_index(x):
    context = retriever.retrieve(x)
    ci=[]
    for c in context:
        ci.append(c.node.metadata['idx'])
    return ci

In [122]:
df_fusion_eval=pd.DataFrame(columns=['_id','query'])
df_fusion_eval[['_id','query']]= df_queries.iloc[:100,:2]
df_fusion_eval_merge= df_fusion_eval.merge(df_key,left_on='_id',right_on='query-id',how='left')

In [124]:
# apply nested async to run in a notebook
import nest_asyncio

nest_asyncio.apply()

df_fusion_eval_merge['top3']=df_fusion_eval_merge['query'].apply(lambda x: Fusion_vector_retriever_corpus_index(x))

Generated queries:
1. What are the properties of 0-dimensional biomaterials?
2. How do 0-dimensional biomaterials differ from biomaterials with inductive properties?
3. Can 0-dimensional biomaterials be modified to have inductive properties?
Generated queries:
- Prevalence of abnormal PrP positivity in UK population
- Causes of abnormal PrP positivity
- Treatment options for abnormal PrP positivity
Generated queries:
1. What are the risk factors for developing regional or distant metastases in colorectal cancer patients?
2. How is the stage of colorectal cancer determined and what are the treatment options for patients with regional or distant metastases?
3. What are the survival rates for colorectal cancer patients with regional or distant metastases and what factors affect their prognosis?
Generated queries:
1. What are the common causes of sudden infant death syndrome (SIDS)?
2. How can SIDS be prevented in newborns?
3. Are there any risk factors associated with SIDS in infants unde

Generated queries:
1. What is the susceptibility of hematopoietic progenitor cells to HIV-1 infection?
2. How does HIV-1 infection affect hematopoietic progenitor cells?
3. What are the ex vivo methods used to study the susceptibility of hematopoietic progenitor cells to HIV-1 infection?
Generated queries:
1. What is the relationship between HNF4A mutation and diabetes risk in children?
2. Can HNF4A mutation be detected early to predict the risk of developing diabetes by age 14?
3. Are there any preventive measures or treatments available for individuals with HNF4A mutations to reduce the risk of diabetes onset by age 14?
Generated queries:
1. What are the symptoms of HNF4A mutation in diabetes patients?
2. How does the HNF4A mutation affect insulin production in the body?
3. Are there any treatment options available for individuals with HNF4A mutation and diabetes?
Generated queries:
1. What is the function of the DGKK gene?
2. How does a single nucleotide variant in the DGKK gene inc

Generated queries:
1. What is the role of adult tissue-resident macrophages in the immune system?
2. How do embryonal yolk sac and fetal liver contribute to the development of adult tissue-resident macrophages?
3. Are there any specific markers or factors that determine the differentiation of adult tissue-resident macrophages from embryonal yolk sac and fetal liver?
Generated queries:
1. What are the mechanisms behind the decreased susceptibility of aged patients to ischaemia/reperfusion injury?
2. Are there any specific treatments or interventions that can enhance the resilience of aged patients against ischaemia/reperfusion injury?
3. What are the long-term outcomes for aged patients who have experienced ischaemia/reperfusion injury compared to younger patients?
Generated queries:
1. What are the risk factors for ischaemia/reperfusion injury in elderly patients?
2. How does age affect the severity of ischaemia/reperfusion injury?
3. What are the mechanisms underlying the increased su

Generated queries:
1. What is the role of EBI2 in B cell plasmablast differentiation and antibody production?
2. How does continuous expression of EBI2 contribute to B cell plasmablast differentiation and antibody production?
3. Are there any other factors involved in B cell plasmablast differentiation and antibody production besides EBI2?
Generated queries:
1. How does B3-Galectin affect cell resistance to tyrosine kinase inhibitors?
2. What is the role of the KRAS-RalB signaling complex in the mechanism of action of B3-Galectin?
3. Can EGFR signaling be modulated by B3-Galectin to decrease cell resistance to tyrosine kinase inhibitors?
Generated queries:
1. What is the role of B3-Galectin in cell resistance to tyrosine kinase inhibitors?
2. How does the engagement of the alternate KRAS-RalB signaling complex downstream of EGFR contribute to cell resistance to TKIs?
3. Are there any other signaling pathways involved in cell resistance to TKIs apart from the KRAS-RalB pathway engaged b

In [127]:
df_fusion_eval_merge['top5']=df_fusion_eval_merge['query'].apply(lambda x: Fusion_vector_retriever_corpus_index(x))

Generated queries:
1. What are the advantages of using biomaterials with inductive properties?
2. Can 0-dimensional biomaterials be modified to possess inductive properties?
3. What are some examples of biomaterials that lack inductive properties?
Generated queries:
1. What is the prevalence of abnormal PrP positivity in the UK?
2. How many people in the UK have abnormal PrP positivity?
3. What are the symptoms and diagnosis of abnormal PrP positivity in the UK?
Generated queries:
1. What are the treatment options for colorectal cancer patients with regional or distant metastases?
2. What is the survival rate for colorectal cancer patients diagnosed with regional or distant metastases?
3. What are the risk factors for developing regional or distant metastases in colorectal cancer patients?
Generated queries:
1. What are the risk factors for sudden infant death syndrome (SIDS) in newborns?
2. How can parents reduce the risk of sudden infant death syndrome (SIDS) in infants under 6 month

Generated queries:
1. How does a high microerythrocyte count impact the severity of anemia in individuals with homozygous alpha (+)-thalassemia trait?
2. Can a high microerythrocyte count be used as a protective factor against severe anemia in individuals with homozygous alpha (+)-thalassemia trait?
3. What is the relationship between microerythrocyte count and the risk of severe anemia in individuals with homozygous alpha (+)-thalassemia trait?
Generated queries:
1. What is the percentage of hematopoietic progenitor cells susceptible to HIV-1 infection ex vivo?
2. How does HIV-1 infection affect the susceptibility of hematopoietic progenitor cells ex vivo?
3. Are there any specific factors that contribute to the low susceptibility of hematopoietic progenitor cells to HIV-1 infection ex vivo?
Generated queries:
1. What are the symptoms of HNF4A mutation in diabetes?
2. Are there any treatments or interventions available for individuals with HNF4A mutation and an increased risk of diabe

Generated queries:
1. What is the role of adult tissue-resident macrophages in the immune system?
2. How are macrophages seeded in different tissues during fetal development?
3. What are the characteristics and functions of tissue-resident macrophages in adults?
Generated queries:
1. What is the role of self-renewing capacity in adult tissue-resident macrophages?
2. How do adult tissue-resident macrophages maintain their self-renewing capacity?
3. Are there any studies investigating the factors influencing the self-renewal of adult tissue-resident macrophages?
Generated queries:
1. How do adult tissue-resident macrophages originate from the embryonal yolk sac and fetal liver?
2. Role of the embryonal yolk sac and fetal liver in the development of adult tissue-resident macrophages.
3. Differences between adult tissue-resident macrophages derived from the embryonal yolk sac and fetal liver.
Generated queries:
1. What are the functions of adult tissue-resident macrophages?
2. How do embry

Generated queries:
1. What are the common markers of myofibroblasts in patients exposed to radiation?
2. How does radiation exposure affect the activation of myofibroblasts in patients?
3. What are the potential health risks associated with activated myofibroblasts in patients exposed to radiation?
Generated queries:
1. What are the benefits of autologous transplantation of mesenchymal stem cells in improving graft function?
2. How does induction therapy with anti-interleukin-2 receptor antibodies compare to autologous transplantation of mesenchymal stem cells in terms of graft function?
3. Are there any studies comparing the efficacy of autologous transplantation of mesenchymal stem cells and induction therapy with anti-interleukin-2 receptor antibodies in improving graft function?
Generated queries:
1. Role of autophagy in insulin resistance in the liver
2. Mechanisms linking autophagy deficiency and insulin resistance in the liver
3. Impact of autophagy impairment on liver health an

In [131]:
df_fusion_eval_merge['top7']=df_fusion_eval_merge['query'].apply(lambda x: Fusion_vector_retriever_corpus_index(x))

Generated queries:
1. What are examples of 0-dimensional biomaterials?
2. How do 0-dimensional biomaterials differ from other biomaterials?
3. What are the potential applications of 0-dimensional biomaterials?
Generated queries:
1. What is PrP positivity and its significance in the UK population?
2. Prevalence of abnormal PrP positivity in the UK population.
3. Causes and risk factors associated with abnormal PrP positivity in the UK.
Generated queries:
1. What are the common symptoms of colorectal cancer?
2. What are the risk factors for developing metastatic colorectal cancer?
3. How is the stage of colorectal cancer determined?
Generated queries:
1. What are the risk factors for sudden infant death syndrome (SIDS) in newborns?
2. How can SIDS deaths be prevented in newborns under 6 months of age?
Generated queries:
- What were the reasons for liver transplantation programs requiring patients to discontinue methadone treatment in 2001?
- How did liver transplantation programs handle 

Generated queries:
1. What is the susceptibility of hematopoietic progenitor cells to HIV-1 infection in vivo?
2. Are there any methods to increase the susceptibility of hematopoietic progenitor cells to HIV-1 infection ex vivo?
3. How does the low percentage of susceptible hematopoietic progenitor cells affect the progression of HIV-1 infection?
Generated queries:
1. What are the symptoms of HNF4A mutation in diabetes patients?
2. How does the mutation in HNF4A affect insulin production?
3. Are there any known treatments for individuals with HNF4A mutation and an increased risk of diabetes?
Generated queries:
1. What are the symptoms of HNF4A mutation in diabetes?
2. How does HNF4A mutation affect the development of diabetes?
3. Is there a genetic test available for detecting HNF4A mutation and predicting the risk of diabetes?
Generated queries:
1. What is the role of the DGKK gene in hypospadias?
2. Are there any other genes associated with increased risk of hypospadias?
3. How does 

Generated queries:
1. What is the role of adult tissue-resident macrophages in the immune system?
2. How do embryonal yolk sac and fetal liver contribute to the development of tissue-resident macrophages?
3. Are there any specific markers or characteristics that distinguish adult tissue-resident macrophages from other macrophage populations?
Generated queries:
1. What are the factors that make aged patients less susceptible to ischaemia/reperfusion injury?
2. Are there any specific treatments or interventions that can further reduce the susceptibility of aged patients to ischaemia/reperfusion injury?
3. What are the physiological changes that occur in aged patients that contribute to their decreased susceptibility to ischaemia/reperfusion injury?
Generated queries:
1. How does aging affect the susceptibility of patients to ischaemia/reperfusion injury?
2. What are the risk factors for ischaemia/reperfusion injury in aged patients?
3. Are there any preventive measures or treatments spec

Generated queries:
1. What is the role of B3-Galectin in cell resistance to tyrosine kinase inhibitors?
2. How does B3-Galectin engage the alternate KRAS-RalB signaling complex downstream of EGFR?
3. What are the mechanisms by which B3-Galectin decreases cell resistance to tyrosine kinase inhibitors?
Generated queries:
1. What is the mechanism of action of B3-Galectin in increasing cell resistance to tyrosine kinase inhibitors?
2. How does the alternate KRAS-RalB signaling complex downstream of EGFR affect cell resistance to tyrosine kinase inhibitors in the presence of B3-Galectin?
3. Are there any other molecules or proteins involved in the interaction between B3-Galectin and the alternate KRAS-RalB signaling complex that contribute to cell resistance to tyrosine kinase inhibitors?
Generated queries:
1. Role of BCL-2 activation in apoptosis regulation
2. Mechanisms by which c-Myc induces apoptosis
3. Interplay between BCL-2 activation and c-Myc in cell survival and apoptosis
Generate

In [None]:
df_fusion_eval_merge['recall@3'] = df_fusion_eval_merge.apply(lambda x:calculate_recall(x,k=3), axis=1)
df_fusion_eval_merge['recall@5'] = df_fusion_eval_merge.apply(lambda x:calculate_recall(x,k=5), axis=1)
df_fusion_eval_merge['recall@7'] = df_fusion_eval_merge.apply(lambda x:calculate_recall(x,k=7), axis=1)

In [332]:
new_data = {
    'Retreival_Technique': 'Simple Fusion Retreiver',
    'Recall@3': round(df_fusion_eval_merge['recall@3'].mean(), 2),
    'Recall@5': round(df_fusion_eval_merge['recall@5'].mean(), 2),
    'Recall@7': round(df_fusion_eval_merge['recall@7'].mean(), 2),
    'HT@3': round(df_fusion_eval_merge[df_fusion_eval_merge['recall@3'] != 0].shape[0] / len(df_fusion_eval_merge), 2),
    'HT@5': round(df_fusion_eval_merge[df_fusion_eval_merge['recall@5'] != 0].shape[0] / len(df_fusion_eval_merge), 2),
    'HT@7': round(df_fusion_eval_merge[df_fusion_eval_merge['recall@7'] != 0].shape[0] / len(df_fusion_eval_merge), 2)
}

eval_df.loc[len(eval_df)] = new_data

Unnamed: 0,Retreival_Technique,Recall@3,Recall@5,Recall@7,MRR@3,MRR@5,MRR@7,HT@3,HT@5,HT@7
0,text-embedding-ada-002,0.72,0.82,0.84,0.66,0.69,0.69,0.73,0.84,0.86
1,text-embedding-ada-002 + BGE Re_ranker_top5,,0.82,,,0.67,,,0.85,
2,BM25 Retreiver,0.63,0.68,0.7,0.49,0.5,0.5,0.64,0.69,0.71
3,BM25 Retreiver + BGE Re_ranker_top5,,0.71,,,0.61,,,0.73,
4,Hybrid reteiver - text-ada+BM25),,0.77,0.84,,,,,0.77,0.86
5,Hybrid retreiver + BGE Re_ranker_top5,,0.82,,,0.68,,,0.84,
6,Simple Fusion Retreiver,0.61,0.81,0.83,,,,0.63,0.82,0.84


In [334]:
# After replacing NAN with '-' : 
eval_df.fillna('_',inplace=True)

Unnamed: 0,Retreival_Technique,Recall@3,Recall@5,Recall@7,MRR@3,MRR@5,MRR@7,HT@3,HT@5,HT@7
0,text-embedding-ada-002,0.72,0.82,0.84,0.66,0.69,0.69,0.73,0.84,0.86
1,text-embedding-ada-002 + BGE Re_ranker_top5,-,0.82,-,-,0.67,-,-,0.85,-
2,BM25 Retreiver,0.63,0.68,0.7,0.49,0.5,0.5,0.64,0.69,0.71
3,BM25 Retreiver + BGE Re_ranker_top5,-,0.71,-,-,0.61,-,-,0.73,-
4,Hybrid reteiver - text-ada+BM25),-,0.77,0.84,-,-,-,-,0.77,0.86
5,Hybrid retreiver + BGE Re_ranker_top5,-,0.82,-,-,0.68,-,-,0.84,-
6,Simple Fusion Retreiver,0.61,0.81,0.83,-,-,-,0.63,0.82,0.84


## Trying with different chunking strategies

In [96]:
from llama_index.readers.json import JSONReader
reader = JSONReader()
documents = reader.load_data(input_file="corpus.json", extra_info={})



In [97]:
from llama_index.core.node_parser import SentenceSplitter

splitter = SentenceSplitter(
    chunk_size=1024,
    chunk_overlap=20,
)
nodes = splitter.get_nodes_from_documents(documents)
nodes[0]

TextNode(id_='a1c1d18a-c8c0-42b0-b487-ff011cb7c120', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='1b59f911-6826-408b-b63d-39f8889907d1', node_type=<ObjectType.DOCUMENT: '4'>, metadata={}, hash='60bb4a20379ab6ffd178340273c6b06e4e6f5429c3d028ca67a266a5d064e592'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='795571e9-aabe-4468-adfa-225a533202cb', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='d3ecde2d3da4f8b1552fe47c0f6544354c0d19975ba1b45f00d62768e7b5f3bb')}, text='"_id": "4983",\n"title": "Microstructural development of human newborn cerebral white matter assessed in vivo by diffusion tensor magnetic resonance imaging.",\n"text": "Alterations of the architecture of cerebral white matter in the developing human brain can affect cortical development and result in functional disabilities. A line scan diffusion-weighted magnetic resonance imaging (MRI) sequ

## Langchain

In [99]:
from langchain.docstore.document import Document
import json
docs_langchain=[]
        # Load JSON file
with open("corpus.json") as file:
    data = json.load(file)

    for item in data:
        index = item['_id']
        title = item['title']
        text = item['text']
        metadata = dict(idx =index, title= title, extra = item['metadata'])
        docs_langchain.append(Document(page_content=text, metadata=metadata))

In [125]:
docs_langchain[0]

Document(page_content='Alterations of the architecture of cerebral white matter in the developing human brain can affect cortical development and result in functional disabilities. A line scan diffusion-weighted magnetic resonance imaging (MRI) sequence with diffusion tensor analysis was applied to measure the apparent diffusion coefficient, to calculate relative anisotropy, and to delineate three-dimensional fiber architecture in cerebral white matter in preterm (n = 17) and full-term infants (n = 7). To assess effects of prematurity on cerebral white matter development, early gestation preterm infants (n = 10) were studied a second time at term. In the central white matter the mean apparent diffusion coefficient at 28 wk was high, 1.8 microm2/ms, and decreased toward term to 1.2 microm2/ms. In the posterior limb of the internal capsule, the mean apparent diffusion coefficients at both times were similar (1.2 versus 1.1 microm2/ms). Relative anisotropy was higher the closer birth was 

In [70]:
from langchain_community.vectorstores import FAISS
vectorstore = FAISS.from_documents(docs_langchain[:1000], embeddings)

In [72]:
retriever = vectorstore.as_retriever()

In [123]:
result = retriever.invoke("RANK-RANKL pathway signalling has no known association with development of Aire-expressing medullary thymic epithelial cells.")
result

[Document(page_content='Medullary thymic epithelial cells (mTECs) establish T cell self-tolerance through the expression of autoimmune regulator (Aire) and peripheral tissue-specific self-antigens. However, signals underlying mTEC development remain largely unclear. Here, we demonstrate crucial regulation of mTEC development by receptor activator of NF-kappaB (RANK) and CD40 signals. Whereas only RANK signaling was essential for mTEC development during embryogenesis, in postnatal mice, cooperation between CD40 and RANK signals was required for mTEC development to successfully establish the medullary microenvironment. Ligation of RANK or CD40 on fetal thymic stroma in vitro induced mTEC development in a tumor necrosis factor-associated factor 6 (TRAF6)-, NF-kappaB inducing kinase (NIK)-, and IkappaB kinase beta (IKKbeta)-dependent manner. These results show that developmental-stage-dependent cooperation between RANK and CD40 promotes mTEC development, thereby establishing self-tolerance

In [104]:
result = vectorstore.similarity_search_with_score("RANK-RANKL pathway signalling has no known association with development of Aire-expressing medullary thymic epithelial cells.")

### Appendix

In [57]:
"""Loader that loads data from JSON."""
import json
from pathlib import Path
from typing import Callable, Dict, List, Optional, Union

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader


class JSONLoader(BaseLoader):
    def __init__(self,file_path: Union[str, Path],content_key: Optional[str] = None):
        self.file_path = Path(file_path).resolve()
        self._content_key = content_key
        
    def load(self) -> List[Document]:
        """Load and return documents from the JSON file."""

        docs=[]
        # Load JSON file
        with open("corpus.json") as file:
            data = json.load(file)

            for item in data:
                index = item['_id']
                title = item['title']
                text = item['text']
                metadata = dict(idx =index, title= title, extra = item['metadata'])
                docs.append(Document(page_content=text, metadata=metadata))
        
        return docs

file_path='corpus.json'
loader = JSONLoader(file_path=file_path)
data = loader.load()