## Example

The code raises an error saying "embedding dimension 384 does not match collection dimensionality 768" if you try to build two dense vector indices with different embedding models. My guess is that it has something to do with the CUDA setup.

In [1]:
from jiao_rag import RAG
# Create the RAG helper used by the cells below; cuda_device is set to 'cuda:1'
# (presumably the GPU the helper's models run on — confirm in jiao_rag).
my_rag = RAG(cuda_device='cuda:1')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load ./data/dataset.txt into my_rag under the name 'full_data'; later cells pass this
# name when building the dense indices and when fetching original documents.
my_rag.load_data_from_file('./data/dataset.txt', 'full_data')

In [3]:
# BGE embedding: build the dense vector index 'bge_embedding' over 'full_data'.
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

model_name = "BAAI/bge-base-en-v1.5"
model_kwargs = dict(device='cuda:1')
encode_kwargs = dict(normalize_embeddings=True)

bge_emb = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)
my_rag.create_dense_vector_index(bge_emb, 'bge_embedding', 'full_data')

In [3]:
from langchain_community.embeddings import GPT4AllEmbeddings
# Build a second dense index, 'gpt_embedding', over the same 'full_data' with a different
# embedding model.
# NOTE(review): presumably this is the call that triggers the dimension-mismatch error
# described at the top of the notebook — confirm.
my_rag.create_dense_vector_index(GPT4AllEmbeddings(), 'gpt_embedding', 'full_data')

In [4]:
# Init the BM25 retriever with lemmatized texts: the same dataset file is loaded again under
# the name 'full_data_lemma' with content_key='lemma' (presumably the field holding the
# lemmatized text — confirm against the dataset), and the BM25 index 'bm25_retriever' is
# built on that copy.
my_rag.load_data_from_file('./data/dataset.txt', 'full_data_lemma', content_key='lemma')
my_rag.create_bm25_index('bm25_retriever', 'full_data_lemma')

In [8]:
# Expand course codes in the query, then retrieve with both the dense (BGE) index and the
# BM25 index.
import pickle

from query_processing import expand_course_code_in_query

# SECURITY: pickle.load can execute arbitrary code embedded in the file. Only load this
# pickle if it was produced by a trusted source (ideally store the mapping as JSON instead).
with open('./util/course_code_to_name_dict.pkl', 'rb') as f:
    loaded_dict = pickle.load(f)

query = 'Do instructors have the right to determine their own policies on attendance requirements?'
q = expand_course_code_in_query(query, loaded_dict)
res = my_rag.dense_retrieval(q, 'bge_embedding', top_k=2, use_mmr=True)
res1 = my_rag.bm25_retrieval(q, 'bm25_retriever', if_lemmatize=True)

# The BM25 index was built over the lemmatized copy of the data, so replace each hit with
# the original document from 'full_data' that has the same seq_num.
res1 = [my_rag.get_document('full_data', r.metadata['seq_num']) for r in res1]

In [13]:
# This shows how the replacement works: per the output below, 'CSSE220' and 'CSSE 220' are
# expanded to the code plus the full course name, while the differently spaced 'CSS E220'
# is left untouched.
tmp = "CSSE220, CSS E220, CSSE 220"
tmp = expand_course_code_in_query(tmp, loaded_dict)
print(tmp)

CSSE 220 - Object-Oriented Software Development, CSS E220, CSSE 220 - Object-Oriented Software Development


In [10]:
# Combine the dense hits (res) with the BM25 hits (res1) and rerank the combined list
# against the expanded query q.
# NOTE(review): unlike the evaluation loop later in this notebook, hits are not
# de-duplicated by seq_num here — confirm that is intended.
hybrid_res = res + res1
rerank_res = my_rag.rerank(q, hybrid_res)

In [11]:
# Display the reranked results for the example query.
rerank_res

[[Document(page_content="Student Handbook\nClick HERE to directly access the Student Handbook.\nRules & Procedures\nThe pages below are excerpts from the Rules and Procedures maintained by the Office of the Registrar.\xa0 Click HERE to access all of the rules and procedures.\xa0\xa0\nAttendance\nThe\n cumulative nature, complexity, and fast pace of the courses at \nRose-Hulman make regular classroom attendance a necessity. There are \nalso important benefits to be gained from entering into classroom \ndiscussion, learning\xa0to express one's own ideas, and learning from \nthe ideas of others. The Faculty of Rose-Hulman\nagree that regular attendance is necessary,\nendorse the faculty member's right to require attendance,\nsupport the assessment of grade penalties, including failure in the course for excessive absence,\xa0and\nexpect each instructor to give careful thought to and to announce attendance policy.\nStipulation\nAt\n the beginning of each course, the instructor has the autho

<h1>Evaluation</h1>

In [4]:
def metadata_func(record: dict, metadata: dict):
    """Copy the record's 'source' value into the metadata under the key 'src'.

    Args:
        record: The parsed record; its 'source' entry is read with ``dict.get``,
            so a missing key yields ``None``.
        metadata: The metadata dict to extend. It is modified in place.

    Returns:
        The same ``metadata`` object, now containing the 'src' key.
    """
    metadata.update(src=record.get('source'))
    return metadata

In [5]:
from langchain_community.document_loaders import JSONLoader
import json
# Loader for the evaluation corpus. The file is read as JSON Lines (json_lines=True),
# content_key='text' selects the field used as the document content, and metadata_func
# adds each record's 'source' value to the document metadata as 'src'.
data_loader = JSONLoader(
    file_path='./data/dataset_semantic_sb_req_grad_ug_ctlg.txt',
    jq_schema='.',
    content_key='text',
    text_content=False,
    json_lines=True,
    metadata_func=metadata_func
)

In [6]:
# Load the evaluation documents; the evaluation loop below iterates over this list.
data = data_loader.load()

In [7]:
# Number of documents loaded (1483 in the recorded run).
len(data)

1483

In [8]:
from langchain.chains import LLMChain
from langchain_core.prompts import PromptTemplate
from langchain_openai import OpenAI

In [9]:
import os

# Prompt that asks the LLM to write a question about a given passage; the passage is
# substituted for {corpus}.
template = """Ask a question based on the following content: {corpus}
"""

prompt = PromptTemplate.from_template(template)
# SECURITY: never hard-code API keys in a notebook. The key is read from the
# OPENAI_API_KEY environment variable (raises KeyError if it is not set). Any key that
# was previously committed in this cell must be treated as leaked and revoked.
llm = OpenAI(openai_api_key=os.environ["OPENAI_API_KEY"])
llm_chain = LLMChain(prompt=prompt, llm=llm)

In [10]:
# Initialise the BM25 retriever over 'full_data' for the evaluation below.
# NOTE(review): this section calls init_bm25_retriever / bge_query / bm25_query, whereas the
# Example section calls create_bm25_index / dense_retrieval / bm25_retrieval — confirm which
# of these the current jiao_rag.RAG actually exposes.
my_rag.init_bm25_retriever("full_data")

In [14]:
# Evaluate hybrid retrieval (BGE dense + BM25, then rerank) over every document in `data`.
# For each document the LLM generates a question from its content; we then record whether
# that document comes back as the top-1 hit and whether it is within the top-3 reranked hits.
# NOTE(review): assumes the i-th loaded document has metadata['seq_num'] == i + 1 — confirm
# against how my_rag assigns seq_num.
top_3_hits = 0
top_1_hits = 0
# Explicit UTF-8 so non-ASCII page content (e.g. '\xa0') is written correctly regardless of
# the platform's default encoding.
with open('./eval/rag_eval_res_bge_bm25.txt', 'w', encoding='utf-8') as f:
    for i, doc in enumerate(data):
        if i != 0 and i % 50 == 0:
            print(f'{i} done')
        curr_seq_num = i + 1
        question = llm_chain.run(doc.page_content)
        bge_res = my_rag.bge_query(question, 'full_data')
        bm25_res = my_rag.bm25_query(question, 'full_data')

        # Merge both result lists, keeping only the first occurrence of each seq_num
        # (dense hits are visited first, then BM25 hits).
        hybrid_res = []
        seen_seq_nums = set()
        for hits in (bge_res, bm25_res):
            for res in hits:
                seq_num = res.metadata['seq_num']
                if seq_num in seen_seq_nums:
                    continue
                seen_seq_nums.add(seq_num)
                hybrid_res.append(res)

        rerank_res = my_rag.rerank(question, hybrid_res)
        # Slice rather than index 0..2 so that fewer than three reranked results cannot
        # raise IndexError. Each reranked entry is indexed with [0] to get the document.
        top_seq_nums = [hit[0].metadata['seq_num'] for hit in rerank_res[:3]]
        top_1_if_hits = bool(top_seq_nums) and top_seq_nums[0] == curr_seq_num
        top_3_if_hits = curr_seq_num in top_seq_nums
        top_1_hits += top_1_if_hits
        top_3_hits += top_3_if_hits
        f.write(f'Q: {question}\n {doc.page_content}\nTop 1 hits: {top_1_if_hits}\nTop 3 hits: {top_3_if_hits}\n\n')

50 done
100 done
150 done
200 done
250 done
300 done
350 done
400 done
450 done
500 done
550 done
600 done
650 done
700 done
750 done
800 done
850 done
900 done
950 done
1000 done
1050 done
1100 done
1150 done
1200 done
1250 done
1300 done
1350 done
1400 done
1450 done


In [15]:
# Turn the hit counters into rates over the number of evaluated documents.
n_docs = len(data)
top_3_hit_rate, top_1_hit_rate = top_3_hits / n_docs, top_1_hits / n_docs
print(f'top 3 hit rate {top_3_hit_rate}, top 1 hit rate {top_1_hit_rate}')

top 3 hit rate 0.9635873229939312, top 1 hit rate 0.916385704652731


In [18]:
import time

# Time one retrieve-and-rerank round trip: a BGE query over 'full_data' followed by reranking.
q = "I am the bone of my sword"
# perf_counter() is a monotonic, high-resolution clock meant for measuring intervals;
# time.time() follows the wall clock and can jump if the system time is adjusted.
t0 = time.perf_counter()
rerank_res = my_rag.rerank(q, my_rag.bge_query(q, 'full_data'))
t1 = time.perf_counter()
print(t1-t0)

0.18260669708251953


In [None]:
# NOTE(review): this unexecuted cell references `self` and `cuda_device`, neither of which
# is defined in the visible notebook cells — it looks like a snippet copied from a class
# method (presumably RAG.__init__; confirm). Run as-is at notebook level it would raise
# NameError; consider deleting it or moving it to markdown.
model_name = "BAAI/bge-base-en-v1.5"
model_kwargs = {"device": cuda_device}
encode_kwargs = {"normalize_embeddings": True}
self.bge_emb = HuggingFaceBgeEmbeddings(
    model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)