## An Instance

In [1]:
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

# BGE embedding model used for the dense vector index. The model is placed on
# the 'cuda:5' device and embeddings are normalized at encode time.
model_name = "BAAI/bge-base-en-v1.5"
model_kwargs = {"device": 'cuda:5'}
encode_kwargs = {"normalize_embeddings": True}
bge_emb = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from jiao_rag import RAG

my_rag = RAG(cuda_device='cuda:5')
catalog_path = './data/dataset_courseCtlgProcessed.txt'

# Default content of the catalog file -> dense index built with bge_emb.
my_rag.load_data_from_file(catalog_path, 'dataset')
my_rag.create_dense_vector_index(bge_emb, 'bge_embedding_dataset', 'dataset')

# 'lemma' field of the same file -> BM25 index.
my_rag.load_data_from_file(catalog_path, 'dataset_lem', content_key='lemma')
my_rag.create_bm25_index('bm25_retriever_dataset', 'dataset_lem')

In [16]:
import pickle
from data.query_processing import expand_course_code_in_query

# Dictionary passed to expand_course_code_in_query (presumably maps course
# codes to course names, judging by the file name).
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('./data/course_code_to_name_dict.pkl', 'rb') as f:
    loaded_dict = pickle.load(f)

query = 'What are the required courses if I major in civil engineering?'
q = expand_course_code_in_query(query, loaded_dict)

# Dense (MMR) and BM25 retrieval, merged, de-duplicated, then reranked.
res = my_rag.dense_retrieval(q, 'bge_embedding_dataset', top_k=3, use_mmr=True)
res1 = my_rag.bm25_retrieval(q, 'bm25_retriever_dataset', if_lemmatize=True)
hybrid_res = my_rag.remove_duplicate_doc(res + res1)
rerank_res = my_rag.rerank(q, hybrid_res)

# Print only the first four reranked entries (indices 0..3).
for i, r in enumerate(rerank_res):
    if i > 3:
        continue
    print(r)
    print()

[Document(page_content='Here is the csv that describes the required courses for civil engineering: b,Requirements for,CIVIL ENGINEERING,Unnamed: 2,Unnamed: 3,...Catalog Year 2023-2024,Unnamed: 5,\r\n0,Student Name:,Insert your name here,,Priority of this major:,,1st or 2nd,\r\n1,Course Requirements,,Course Title,Cr\nHr,Term courses were or\nwill be taken,Course\nGrade,whether required for second major\r\n2,HSSA Requirements,36 credits,,,,,\r\n3,   HSSA #1 - HUM H190,,First-Year Writing Seminar,4 credits,,,required for first major in civil engineering; no information is provided about whether this course is required for second major in civil engineering\r\n4,   HSSA #2  ,,Humanities & Arts (H),4 credits,,,required for first major in civil engineering; no information is provided about whether this course is required for second major in civil engineering\r\n5,   HSSA #3  ,,Social Sciences (S),4 credits,,,required for first major in civil engineering; no information is provided about wheth

## Evaluation Setup

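Building two dense vector indices with different embedding models fails with the error "embedding dimension 384 does not match collection dimensionality 768". My guess is that it has something to do with the CUDA setup, but this is unconfirmed; the message itself says that vectors of one size (384) are being written to a collection that already holds vectors of another size (768).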

In [1]:
from jiao_rag import RAG
# RAG instance used by the evaluation cells below, created on 'cuda:5'.
my_rag = RAG(cuda_device='cuda:5')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load each evaluation file './data/Eval-<split>.txt' under the key '<split>'.
for split in ('sem45', 'sem55', 'sem65', 'sem75', 'sem85', 'sem95'):
    my_rag.load_data_from_file(f'./data/Eval-{split}.txt', split)

In [3]:
#BGE embedding
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

model_name = "BAAI/bge-base-en-v1.5"
model_kwargs = {"device": 'cuda:5'}
encode_kwargs = {"normalize_embeddings": True}
bge_emb = HuggingFaceBgeEmbeddings(
    model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)

# One dense index per evaluation split, named 'bge_embedding_<split>'.
for split in ('sem45', 'sem55', 'sem65', 'sem75', 'sem85', 'sem95'):
    my_rag.create_dense_vector_index(bge_emb, f'bge_embedding_{split}', split)

# NOTE(review): res, res1 and q are not assigned in this cell; they must
# already exist in the kernel from a retrieval cell that ran earlier.
hybrid_res = res + res1
hybrid_res = my_rag.remove_duplicate_doc(hybrid_res)
rerank_res = my_rag.rerank(q, hybrid_res)
# Fixed: this previously printed the misspelled name `rereank_res`, which
# raised NameError.
print(rerank_res)

In [4]:
# Disabled experiment: build a second dense index from GPT4All embeddings.
# NOTE(review): presumably commented out because of the embedding-dimension
# mismatch error described in the "Evaluation Setup" note -- confirm.
# from langchain_community.embeddings import GPT4AllEmbeddings
# my_rag.create_dense_vector_index(GPT4AllEmbeddings(), 'gpt_embedding', 'full_data')

In [5]:
#Init bm25 retriever with lemmatized texts
eval_splits = ('sem45', 'sem55', 'sem65', 'sem75', 'sem85', 'sem95')

# First load the 'lemma' field of every split under the key '<split>_lem' ...
for split in eval_splits:
    my_rag.load_data_from_file(f'./data/Eval-{split}.txt', f'{split}_lem', content_key='lemma')

# ... then build one BM25 index per loaded split.
for split in eval_splits:
    my_rag.create_bm25_index(f'bm25_retriever_{split}', f'{split}_lem')

In [26]:
# Expanding course code in the query
from query_processing import expand_course_code_in_query
import pickle
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('./course_code_to_name_dict.pkl', 'rb') as f:
  loaded_dict = pickle.load(f)
query = 'What courses do I want to double major in Math and Biochemistry'
q = expand_course_code_in_query(query, loaded_dict)
res = my_rag.dense_retrieval(q, 'bge_embedding_sem75', top_k=3, use_mmr=True)
res1 = my_rag.bm25_retrieval(q, 'bm25_retriever_sem75', if_lemmatize=True)
# replace lemmatized documents with original ones
# The list is built with append instead of a pre-sized [0, 0, 0]: the old form
# raised IndexError for more than three BM25 hits and left 0 placeholders when
# there were fewer.
res2 = []
for r in res1:
    print(r.metadata['seq_num'])
    res2.append(my_rag.get_document('sem75', r.metadata['seq_num']))

476
544
114


In [30]:
# Display the second BM25 hit; the recorded output shows lemmatized text.
res1[1]

Document(page_content='CSSE be require physic major be plan double major CSSE, CPE, EE, MA, ME **MA (F S) be substitute MA (W) †Free, Math technical elective be only suggestion change subject offering.', metadata={'source': '/home/jiaoq/rag_proj/data/Eval-sem75.txt', 'seq_num': 544, 'src': 'Physics.txt'})

In [7]:
# This shows how the replacement works. In the recorded output, "CSSE220" and
# "CSSE 220" are expanded while "CSS E220" is left unchanged.
tmp = expand_course_code_in_query("CSSE220, CSS E220, CSSE 220", loaded_dict)
print(tmp)

CSSE 220 - Object-Oriented Software Development, CSS E220, CSSE 220 - Object-Oriented Software Development


In [None]:
# Print every dense hit followed by every BM25 hit, one per paragraph.
for tmp in res + res1:
    print(tmp, '\n')


In [9]:
# Merge dense and BM25 hits, drop duplicates, report how many remain, then rerank.
hybrid_res = my_rag.remove_duplicate_doc(res + res1)
print(len(hybrid_res))
rerank_res = my_rag.rerank(q, hybrid_res)

5


In [10]:
# Number of documents left after merging and de-duplicating the two result lists.
len(hybrid_res)

5

In [None]:
# Display the reranked results.
rerank_res

In [10]:
# Load the 'lemma' field of the full dataset and build a BM25 index over it.
my_rag.load_data_from_file('./data/new_complete_dataset_credit_replaced.txt', 'full_data_lemma', content_key='lemma')
my_rag.create_bm25_index('bm25_retriever', 'full_data_lemma')
# Query with a sentence unrelated to the catalog (presumably a sanity check -- confirm).
# This overwrites res1 from the earlier retrieval cell.
res1 = my_rag.bm25_retrieval('I am the bone of my sword', 'bm25_retriever', if_lemmatize=True)

[Document(page_content='BE Research Methods Biomechanics Credit Hours:4 Term Available: Winter Graduate Studies Eligible: Yes Prerequisites: BE consent instructor Corequisites: None Focuses wide range research method use field biomechanics. Current literature be review analyze advantage disadvantage various research methodologies. Topics vary base student interest background, include topic such motion/force analysis, soft tissue bone mechanics, joint biomechanics, analysis joint replacements, fracture fixation. Laboratory activity reinforce lecture topic student have opportunity investigate biomechanics research topic area interest.', metadata={'source': '/home/jiaoq/rag_proj/data/new_complete_dataset_credit_replaced.txt', 'seq_num': 608, 'src': 'https://www.rose-hulman.edu/academics/course-catalog/current/programs/Biomedical%20Engineering/be-550.html'}),
 Document(page_content='Orthopedic Mobility Impairment Guidance Information Orthopedic impairment mean severe orthopedic impairment 

<h1>Evaluation</h1>

In [6]:
def metadata_func(record: dict, metadata: dict):
    """Copy the record's 'source' value into the metadata under the key 'src'.

    Args:
        record: The parsed record; its 'source' entry is read (None if absent).
        metadata: The metadata dict to extend. It is modified in place.

    Returns:
        The same ``metadata`` dict, now containing the 'src' entry.
    """
    metadata.update(src=record.get('source'))
    return metadata

In [19]:
from langchain_community.document_loaders import JSONLoader
import json
# Loader for the sem55 evaluation file. The file is read with json_lines=True,
# each document's content comes from the 'text' key, and metadata_func is
# passed in to extend each document's metadata.
data_loader = JSONLoader(
    file_path='./data/Eval-sem55.txt',
    jq_schema='.',
    content_key='text',
    text_content=False,
    json_lines=True,
    metadata_func=metadata_func
)

In [20]:
# Load all documents from the evaluation file and show how many were read.
data = data_loader.load()
len(data)

1390

In [18]:
# Second loader over the sem55 evaluation file; the settings are collected in
# a dict and unpacked into the constructor.
loader2_kwargs = {
    'file_path': './data/Eval-sem55.txt',
    'jq_schema': '.',
    'content_key': 'text',
    'text_content': False,
    'json_lines': True,
    'metadata_func': metadata_func,
}
data_loader2 = JSONLoader(**loader2_kwargs)
data2 = data_loader2.load()
len(data2)

1390

In [9]:
from langchain.chains import LLMChain
from langchain_core.prompts import PromptTemplate
from langchain_openai import OpenAI

In [10]:
import os

# Prompt that asks the LLM to write a question about a given passage; the
# passage is substituted for {corpus}.
template = """Ask a question based on the following content: {corpus}
"""

prompt = PromptTemplate.from_template(template)
# The API key is read from the OPENAI_API_KEY environment variable instead of
# being hard-coded in the notebook. A KeyError here means the variable is unset.
llm = OpenAI(openai_api_key=os.environ["OPENAI_API_KEY"])
llm_chain = LLMChain(prompt=prompt, llm=llm)

In [10]:
# NOTE(review): looks like a leftover scratch value for q -- confirm it is still needed.
q = "c"

In [21]:
# Retrieval evaluation on the sem55 split.
# For every document an LLM-generated question is run through dense + BM25
# retrieval and reranking; a hit is counted when the document's own seq_num
# (its 1-based position in `data`) shows up at rank 1 / within the top 3.
top_3_hits = 0
top_1_hits = 0
with open('./eval/rag_eval_res_sem55_bge_bm25_run2.txt', 'w') as f:
    for i in range(0, len(data)):
        if (i != 0 and i % 50 == 0):
            print(f'{i} done')
        curr_seq_num = i + 1
        question = llm_chain.run(data[i].page_content)
        bge_res = my_rag.dense_retrieval(question, 'bge_embedding_sem55')
        bm25_res = my_rag.bm25_retrieval(question, 'bm25_retriever_sem55', if_lemmatize=True)
        # replace lemmatized documents with original ones
        bm25_res_new = [
            my_rag.get_document('sem55', hit.metadata['seq_num']) for hit in bm25_res
        ]
        hybrid_res = bge_res + bm25_res_new
        hybrid_res = my_rag.remove_duplicate_doc(hybrid_res)
        rerank_res = my_rag.rerank(question, hybrid_res)
        # Slice instead of indexing 0..2 so fewer than three results cannot
        # raise IndexError; each entry's first element is the document.
        top_seq_nums = [entry[0].metadata['seq_num'] for entry in rerank_res[:3]]
        top_1_if_hits = bool(top_seq_nums) and top_seq_nums[0] == curr_seq_num
        top_3_if_hits = curr_seq_num in top_seq_nums
        # Each question contributes at most one hit to each counter.
        if top_1_if_hits:
            top_1_hits += 1
        if top_3_if_hits:
            top_3_hits += 1
        f.write(f'Q: {question}\n {data[i].page_content}\nTop 1 hits: {top_1_if_hits}\nTop 3 hits: {top_3_if_hits}\n\n')

50 done
100 done
150 done
200 done
250 done
300 done
350 done
400 done
450 done
500 done
550 done
600 done
650 done
700 done
750 done
800 done
850 done
900 done
950 done
1000 done
1050 done
1100 done
1150 done
1200 done
1250 done
1300 done
1350 done


In [54]:
# Hit rates for the "95" run (presumably the sem95 split -- confirm).
top_3_hit_rate, top_1_hit_rate = top_3_hits / len(data), top_1_hits / len(data)
print(f'top 3 hit rate {top_3_hit_rate}, top 1 hit rate {top_1_hit_rate}')

top 3 hit rate 0.8388157894736842, top 1 hit rate 0.34539473684210525


In [59]:
# Hit rates for the "85" run (presumably the sem85 split -- confirm).
top_3_hit_rate, top_1_hit_rate = top_3_hits / len(data), top_1_hits / len(data)
print(f'top 3 hit rate {top_3_hit_rate}, top 1 hit rate {top_1_hit_rate}')

top 3 hit rate 0.8202054794520548, top 1 hit rate 0.3082191780821918


In [64]:
# Hit rates for the "75" run (presumably the sem75 split -- confirm).
top_3_hit_rate, top_1_hit_rate = top_3_hits / len(data), top_1_hits / len(data)
print(f'top 3 hit rate {top_3_hit_rate}, top 1 hit rate {top_1_hit_rate}')

top 3 hit rate 0.8801410105757932, top 1 hit rate 0.6733254994124559


In [72]:
# Hit rates for the "65" run (presumably the sem65 split -- confirm).
top_3_hit_rate, top_1_hit_rate = top_3_hits / len(data), top_1_hits / len(data)
print(f'top 3 hit rate {top_3_hit_rate}, top 1 hit rate {top_1_hit_rate}')

top 3 hit rate 0.7909654561558902, top 1 hit rate 0.4065544729849424


In [22]:
# Hit rates for the "55" run (presumably the sem55 split -- confirm).
top_3_hit_rate, top_1_hit_rate = top_3_hits / len(data), top_1_hits / len(data)
print(f'top 3 hit rate {top_3_hit_rate}, top 1 hit rate {top_1_hit_rate}')

top 3 hit rate 0.7589928057553957, top 1 hit rate 0.24244604316546764


In [14]:
# Hit rates for the "45" run (presumably the sem45 split -- confirm).
top_3_hit_rate, top_1_hit_rate = top_3_hits / len(data), top_1_hits / len(data)
print(f'top 3 hit rate {top_3_hit_rate}, top 1 hit rate {top_1_hit_rate}')

top 3 hit rate 0.8402903811252269, top 1 hit rate 0.5523290986085905


In [18]:
# Time one dense query followed by reranking.
q = "I am the bone of my sword"
import time
# perf_counter is used for the interval because time.time() is a wall clock
# that can jump when the system time is adjusted.
# NOTE(review): no visible cell loads a dataset named 'full_data' or defines
# bge_query -- confirm both still exist on RAG.
t0 = time.perf_counter()
rerank_res = my_rag.rerank(q, my_rag.bge_query(q, 'full_data'))
t1 = time.perf_counter()
print(t1-t0)

0.18260669708251953


In [None]:
# NOTE(review): this cell has never been executed (no execution count) and
# refers to `self` and `cuda_device`, neither of which is defined in any
# visible notebook cell -- it looks like a fragment pasted from a class
# method. Confirm whether it should be removed.
model_name = "BAAI/bge-base-en-v1.5"
model_kwargs = {"device": cuda_device}
encode_kwargs = {"normalize_embeddings": True}
self.bge_emb = HuggingFaceBgeEmbeddings(
    model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)