In [None]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Document
from llama_index.core import VectorStoreIndex



In [None]:
#Specify Storage Context as Pinecone Index
from pinecone import Pinecone
from pinecone import ServerlessSpec
from llama_index.vector_stores.pinecone import PineconeVectorStore
from llama_index.core import StorageContext

# Global chunking configuration picked up by llama_index when documents are split.
Settings.chunk_size = 1024
Settings.chunk_overlap = 100

# Never commit credentials to a notebook: take the Pinecone key from the
# PINECONE_API_KEY environment variable, falling back to a hidden prompt.
# NOTE(review): the key that used to be hard-coded here should be considered
# leaked and rotated in the Pinecone console.
import os
from getpass import getpass

pinecone_api_key = os.environ.get("PINECONE_API_KEY") or getpass("Enter Pinecone API key: ")
pc = Pinecone(api_key=pinecone_api_key)

# Existing Pinecone index that holds the document vectors.
pinecone_index = pc.Index("vector-only-rag-768")

# Wrap the Pinecone index so llama_index can use it as its vector store.
vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [None]:
#Load Documents
# Build a reader over the local USCIS corpus folder, then load it with
# four workers and a progress bar.
uscis_reader = SimpleDirectoryReader(
    input_dir="/Users/sameerprasadkoppolu/Desktop/MSDS Coursework/LLMs/Project/uscis_new"
)
documents = uscis_reader.load_data(show_progress=True, num_workers=4)
documents

In [None]:
#Load Embedding Model
import torch 

# Register the HuggingFace embedding model globally (768-dimensional embeddings
# per the original author's note).
# trust_remote_code=True lets the model repository run its own code on load.
arctic_embedder = HuggingFaceEmbedding(
    trust_remote_code=True,
    model_name="Snowflake/snowflake-arctic-embed-m-v1.5",
)
Settings.embed_model = arctic_embedder


In [None]:
# #Create VectorStoreIndex - Run this only the first time when the vectors need to be stored in Pinecone DB
# #Ensure that the vector dimension in the DB matches the d_model of your embedding model
# #The library default is a chunk size of 1024 with an overlap of 200; this notebook sets Settings.chunk_overlap = 100 instead - https://docs.llamaindex.ai/en/stable/optimizing/basic_strategies/basic_strategies/

# index = VectorStoreIndex.from_documents(documents, 
#                                         embed_model = Settings.embed_model,
#                                         storage_context = storage_context, show_progress = True)

In [None]:
#Getting the Vectors from Pinecone DB - Do this if the Pinecone Index already has vectors

# Attach to the vectors already stored in Pinecone instead of re-embedding the corpus.
index = VectorStoreIndex.from_vector_store(
    embed_model=Settings.embed_model,
    vector_store=vector_store,
)

In [None]:
#Choose the LLM for Response Synthesis
from llama_index.llms.ollama import Ollama

# Local Ollama model used for response synthesis throughout the notebook.
# NOTE(review): `num_beams` is passed through as-is; confirm that the Ollama
# integration actually honours it rather than silently ignoring it.
Settings.llm = Ollama(
    model="llama3:instruct",
    temperature=0.3,
    num_beams=3,
    request_timeout=360.0,
)

In [None]:
#Define Prompt Template, Retriever, and Response Synthesizer
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core import get_response_synthesizer
from llama_index.core import PromptTemplate
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core.retrievers import QueryFusionRetriever
import Stemmer

# Question-answering prompt for the RAG query engine.
# The template is assembled from adjacent string literals, so every fragment
# must carry its own trailing newline; otherwise sentences run together
# (e.g. "...answer the queryQuery: ...") or a stray "." starts the next line.
# Placeholders: {context_str} receives the retrieved context, {query_str} the question.
qa_prompt = PromptTemplate(
    "You are a USCIS policy helper.\n"
    "You will be provided with a query about USCIS policies and guidelines and you must answer it clearly and provide detailed steps using only the context information and not any prior knowledge.\n"
    "If the steps need to follow a certain order then ensure that the order is stated clearly. If any mathematical calculations need to be done make sure to show them clearly. If any forms need to be filed, make sure to specify what those forms are. Also cite any actual URLs if required to provide more clarity and make sure that these URLs are not broken.\n"
    "Remember to always answer the question as if you were chatting with a person.\n"
    "Context information is below.\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "Given the context information and not prior knowledge, answer the query.\n"
    "Query: {query_str}\n"
    "Answer: "
)

# Dense retriever over the Pinecone-backed index; fetches the 10 nearest chunks per query.
retriever = VectorIndexRetriever(
    embed_model=Settings.embed_model,
    similarity_top_k=10,
    index=index,
)

# Synthesizer that feeds retrieved text into `qa_prompt` using the "compact" response mode.
response_synthesizer = get_response_synthesizer(
    text_qa_template=qa_prompt,
    llm=Settings.llm,
    response_mode="compact",
)

In [None]:
#Define Reranker
#The reranker re-scores each of the top_k retrieved nodes against the query and keeps only the top_n best matches
from llama_index.postprocessor.flag_embedding_reranker import FlagEmbeddingReranker

# Post-retrieval reranker; only the 5 best-scoring nodes are kept.
# (The original author's note labels this model "768 Dimension".)
rerank = FlagEmbeddingReranker(
    top_n=5,
    model="BAAI/bge-reranker-base",
)

In [None]:
#Setup the Query Engine
from IPython.display import display, Markdown
from llama_index.core.query_engine import RetrieverQueryEngine

# Full RAG pipeline: retrieve -> rerank -> synthesize an answer.
rag_postprocessors = [rerank]
query_engine = RetrieverQueryEngine(
    node_postprocessors=rag_postprocessors,
    response_synthesizer=response_synthesizer,
    retriever=retriever,
)

In [None]:
#Setup LLM to allow for LLM only Response
from llama_index.llms.ollama import Ollama
# Baseline model with no retrieval, configured like Settings.llm so the
# comparison against the RAG pipeline is like-for-like.
llm_only = Ollama(
    model="llama3:instruct",
    temperature=0.3,
    num_beams=3,
    request_timeout=360.0,
)

### Getting Answers for Vector Rag and LLM on Test Set

In [None]:
#Load Test Set
import pandas as pd 

# Test questions with their reference answers; the 'question' column is used in the answering loop.
test_set_csv = "/Users/sameerprasadkoppolu/Desktop/MSDS Coursework/LLMs/Project/test_dataset/test_question_answers.csv"
df = pd.read_csv(test_set_csv)
df

In [None]:
from tqdm import tqdm 


# Answer every test question twice: once with the bare LLM and once through
# the RAG query engine, writing both answers back onto the same row.
# Row labels come from the frame's index (0..n-1 as produced by read_csv).
for row_label, question in tqdm(df['question'].items(), total=len(df)):
    df.loc[row_label, 'llm_only_response'] = llm_only.complete(question).text
    df.loc[row_label, 'rag_response'] = query_engine.query(question).response

# Stamp each row with the configuration that produced it, so result files
# from different runs can be compared later.
run_config = {
    'embed_model': Settings.embed_model.model_name,
    'embedding_dimension': 768,
    'chunk_size': Settings.chunk_size,
    'chunk_overlap': Settings.chunk_overlap,
    'similarity_top_k': retriever.similarity_top_k,
    'rerank_top_n': rerank.top_n,
}
for column_name, column_value in run_config.items():
    df[column_name] = column_value

df

In [None]:
#Define file name for Results
# Results file name: <llm>_<embedding model>_<embedding dim>_<chunk size>_
# <chunk overlap>_<similarity top k>_<rerank top n>.csv
# The numeric suffix is taken from the live configuration so the name cannot
# drift from the values stored in the frame (the hand-typed suffix said
# rerank top_n = 3 while the reranker is configured separately, and it also
# contained an accidental double underscore).
file_name = (
    "test_results_llama_3_8B_instruct_snowflake-arctic-embed-m-v1.5_"
    f"768_{Settings.chunk_size}_{Settings.chunk_overlap}_"
    f"{retriever.similarity_top_k}_{rerank.top_n}.csv"
)
file_name

In [None]:
#Save File to Path
# Write the answered test set (including the index column) into the Results folder.
results_dir = "/Users/sameerprasadkoppolu/Desktop/MSDS Coursework/LLMs/Project/test_dataset/Results"
df.to_csv(results_dir + "/" + file_name)

### Defining a Chat Engine for continued Conversations with Memory Store

In [None]:
#Create a Chat Engine
from llama_index.core.memory import ChatMemoryBuffer
from llama_index.core.chat_engine.context import ContextChatEngine
from llama_index.core.chat_engine.condense_plus_context import CondensePlusContextChatEngine

# Conversation memory for the chat engine, capped at 5000 tokens.
memory = ChatMemoryBuffer.from_defaults(token_limit=5000)

# Context prompt for the chat engine.
# The template is assembled from adjacent string literals, so each fragment
# needs its own trailing newline; otherwise a stray "." starts the next line
# and "...internal knowledge." runs straight into "Context information is below.".
# Placeholder: {context_str} receives the retrieved context.
chat_qa_prompt = PromptTemplate(
    "You are a USCIS policy helper. Always introduce yourself and be courteous and friendly.\n"
    "You will be provided with a query about USCIS policies and guidelines and you must answer it clearly and provide detailed steps using only the context information and not any prior knowledge.\n"
    "If the steps need to follow a certain order then ensure that the order is stated clearly. If any mathematical calculations need to be done make sure to show them clearly. If any forms need to be filed, make sure to specify what those forms are. Also cite any actual URLs if required to provide more clarity and make sure that these URLs are not broken.\n"
    "Remember to always answer the question using the Context information to enhance your internal knowledge.\n"
    "Context information is below.\n"
    "---------------------\n"
    "{context_str}"
)



# Conversational engine: condenses the follow-up question, retrieves and
# reranks context, then answers with the shared LLM while tracking history
# in `memory`.
# NOTE(review): confirm that `from_defaults` actually consumes
# `response_synthesizer`; it may be swallowed as an unused keyword argument.
chat_engine = CondensePlusContextChatEngine.from_defaults(
    retriever=retriever,
    llm=Settings.llm,
    memory=memory,
    context_prompt=chat_qa_prompt,
    node_postprocessors=[rerank],
    response_synthesizer=response_synthesizer,
)

In [None]:
#Use this to check memory (i.e. previous chat history).
#Being the last expression in the cell, the chat store behind `memory` is rendered as the cell output.
memory.chat_store

In [None]:
#Chat with Chat Engine 
#Chat with Chat Engine
# `chat_engine` was built with `memory`, so it keeps the conversation history
# itself. Passing the stored history back in on every turn was redundant, and
# reaching into `memory.chat_store.store['chat_history']` tied this cell to the
# store's internal key (raising KeyError whenever the store is non-empty under
# a different key).
message = input()
chat_response = chat_engine.chat(message=message)
display(Markdown(chat_response.response))

In [None]:
#To reset (i.e. delete Chat Memory)
#Run this before starting an unrelated conversation so earlier turns are not carried over.
chat_engine.reset()

### Generating Answers for Context Based Questions

In [None]:
# Load the context-based evaluation sheet and keep only its first two columns.
context_questions_csv = "/Users/sameerprasadkoppolu/Desktop/MSDS Coursework/LLMs/Project/test_dataset/responses_evaluation - responses_evaluation.csv"
context_df = pd.read_csv(context_questions_csv).iloc[:, :2]
context_df

In [None]:
# Same comparison as for the test set: a bare-LLM answer and a RAG answer for
# every context-based question, stored on the question's own row.
# Row labels come from the frame's index (0..n-1 as produced by read_csv).
for row_label, question in tqdm(context_df['Question'].items(), total=len(context_df)):
    context_df.loc[row_label, 'llm_only_response'] = llm_only.complete(question).text
    context_df.loc[row_label, 'vector_rag_response'] = query_engine.query(question).response

context_df

### Evaluating With RAGAS Framework

In [13]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
import os

from getpass import getpass

# Ask for the OpenAI key with a hidden prompt: plain input() echoes the key
# into the cell output, which is then saved with the notebook. Skip the prompt
# when the variable is already set in the environment.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("Enter OPEN AI API Key: ")

# The test-set generator runs on local models: the notebook's main LLM
# writes the questions and a second Ollama model acts as the critic.
generator_llm = Settings.llm

critic_llm = Ollama(model="mistral-nemo")
embeddings = Settings.embed_model

generator = TestsetGenerator.from_llama_index(
    generator_llm=generator_llm,
    critic_llm=critic_llm,
    embeddings=embeddings,
)

In [None]:
# Mix of question types for the synthetic test set: half simple, a quarter
# reasoning, a quarter multi-context.
question_mix = {simple: 0.5, reasoning: 0.25, multi_context: 0.25}

# Generate 5 synthetic questions from the loaded documents.
testset = generator.generate_with_llamaindex_docs(
    documents,
    distributions=question_mix,
    test_size=5,
)

In [None]:
# Keep a CSV copy of the generated test set in the working directory.
ragas_testset_csv = 'Test set from RAGAS.csv'
test_df = testset.to_pandas()
test_df.to_csv(ragas_testset_csv)

In [None]:
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
)
from ragas.metrics.critique import harmfulness

# RAGAS metrics passed to the evaluation run, in reporting order.
metrics = [faithfulness, answer_relevancy, context_precision, context_recall, harmfulness]

In [None]:
# Convert the generated test set into a plain dict of columns for the evaluator.
ds = testset.to_dataset()
ds_dict = ds.to_dict()

# Only a cell's last expression is rendered, so a bare `ds_dict["question"]`
# followed by another expression is never shown; display both explicitly.
display(ds_dict["question"], ds_dict["ground_truth"])

In [None]:
# Local judge model intended for the RAGAS evaluation.
# NOTE(review): the `llm=evaluator_llm` argument of the evaluate() call is
# commented out, so this object is currently unused there.
evaluator_llm = Ollama(
    request_timeout=360.0,
    model="llama3.1",
)

In [None]:
from ragas.integrations.llama_index import evaluate

# Score the vector-only RAG engine on the generated test set.
# Metric failures are not raised (raise_exceptions=False).
# Optional arguments left disabled by the author:
#   llm=evaluator_llm
#   run_config=RunConfig(max_retries=3, max_wait=20)
result_vector_only = evaluate(
    dataset=ds_dict,
    query_engine=query_engine,
    embeddings=Settings.embed_model,
    metrics=metrics,
    raise_exceptions=False,
)

print(result_vector_only)