In [None]:
# Experimenting with creating a new RAG system from a HuggingFace dataset and text corpus
# https://huggingface.co/datasets/rag-datasets/rag-mini-wikipedia

In [1]:
import numpy as np
import pandas as pd
import os
import logging
import sys
import time

# Replace these two Google Gemini imports with imports for your LLM
import google.generativeai as genai
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings

from langchain_community.document_loaders import DirectoryLoader
from llama_index.core import Document, VectorStoreIndex, Settings, StorageContext, load_index_from_storage
from llama_index.vector_stores.faiss import FaissVectorStore
import faiss

# trying new faiss...
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS

In [2]:
# Authenticate the Gemini SDK with the API key stored in the local environment.
# The GOOGLE_API_KEY environment variable must be set; a missing key raises KeyError.
genai.configure(
    api_key=os.environ["GOOGLE_API_KEY"],
)

In [3]:
# load corpus
# corpus = pd.read_parquet("hf://datasets/rag-datasets/rag-mini-wikipedia/data/passages.parquet/part.0.parquet")
# corpus = pd.read_csv("datasets/rag_mini_wikipedia_corpus.csv", index_col=['id'])

In [111]:
# corpus

In [5]:
# load query & answer dataset
# query_answers = pd.read_parquet("hf://datasets/rag-datasets/rag-mini-wikipedia/data/test.parquet/part.0.parquet")

In [None]:
# # Create a Faiss vector store for RAG
# # If you already have an index created, skip a few coding cells to the LLM / embeddings setup

# convert corpus into Document format
# documents = [Document(text=passage) for passage in corpus['passage']] 

In [4]:
# Create the (empty) faiss index that will hold the document embeddings.
# d is the vector dimension of the embedding model that we're going to use;
# in this case, the google embedding model (768).
d = 768
faiss_index = faiss.IndexFlatL2(d)

# A flat L2 index has no training step, so this is expected to print True right away
print(faiss_index.is_trained)

True


In [5]:
# Create the chat model and the embeddings model, then register both on the
# LlamaIndex Settings object (used for the LlamaIndex FaissVectorStore).

# Replace with your LLM
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
)

# Replace with your embeddings model
doc_embeddings = GoogleGenerativeAIEmbeddings(
    model="models/text-embedding-004",
    embed_batch_size=1,
)

Settings.embed_model = doc_embeddings
Settings.llm = llm

In [6]:
# logging.basicConfig(stream=sys.stdout, level=logging.INFO)
# logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

In [None]:
# # Uncomment for when you need to re-embed and vectorize documents (define `test`, the list of Document objects to index, before running)

# vector_store = FaissVectorStore(faiss_index=faiss_index)
# storage_context = StorageContext.from_defaults(vector_store=vector_store)
# index = VectorStoreIndex.from_documents(
#     test, storage_context=storage_context, show_progress=True
# )


In [None]:
# My rate limits were exceeded when I tried to index the entire corpus,
# so I had to divide the documents into smaller sections (150 at a time) for embedding and indexing.
#
# Prerequisites: `documents` (the corpus converted to Document objects) and `index`
# must already exist in the kernel before this cell is run.
#
# Updating an index:
# https://docs.llamaindex.ai/en/stable/module_guides/indexing/document_management/

BATCH_START = 0           # position of the first document in this batch; move it forward for the next batch
BATCH_SIZE = 150          # how many documents to embed and index per run
INSERT_DELAY_SECONDS = 5  # pause after each insertion to stay under the API rate limit

# try dividing documents into smaller parts for ingestion
fewer_docs = documents[BATCH_START:BATCH_START + BATCH_SIZE]

# insert one document at a time, waiting between insertions
for doc in fewer_docs:
    index.insert(doc)
    time.sleep(INSERT_DELAY_SECONDS)

# index.refresh_ref_docs(fewer_docs) # Alternatively, to add new documents to index as well as to update existing docs as needed

In [102]:
# Check the size of the index to be sure all documents were indexed
len(index.ref_doc_info) # number of entries the index tracks (3200 once the whole rag_mini_wikipedia corpus is indexed)

3200

In [101]:
# # Save index to disk
# index.storage_context.persist()

# # Save/remember index id for loading next time
# # '95634851-570e-454e-983f-6634eeb72aee' is index for rag_mini_wikipedia dataset
# index.index_id

'95634851-570e-454e-983f-6634eeb72aee'

In [8]:
# After you have a saved index, load that index from disk for RAG answer generation
# instead of re-embedding the corpus.

PERSIST_DIR = "./storage"
# My local index id; it contains 3200 documents from the rag_mini_wikipedia dataset
INDEX_ID = '95634851-570e-454e-983f-6634eeb72aee'

# The faiss vector store and the rest of the storage context are both read from PERSIST_DIR
vector_store = FaissVectorStore.from_persist_dir(PERSIST_DIR)
storage_context = StorageContext.from_defaults(vector_store=vector_store, persist_dir=PERSIST_DIR)
index = load_index_from_storage(storage_context=storage_context, index_id=INDEX_ID)

INFO:root:Loading llama_index.vector_stores.faiss.base from ./storage/default__vector_store.json.
Loading llama_index.vector_stores.faiss.base from ./storage/default__vector_store.json.
INFO:llama_index.core.indices.loading:Loading indices with ids: ['95634851-570e-454e-983f-6634eeb72aee']
Loading indices with ids: ['95634851-570e-454e-983f-6634eeb72aee']


In [11]:
# Optional - if you'd like to query your index:
# set up a query engine on top of the index (similarity_top_k=10 requests the top 10 matches per query)
query_engine = index.as_query_engine(similarity_top_k=10)
# Chat engine alternative, left disabled:
# chat_engine = index.as_chat_engine(similarity_top_k=10, chat_mode='context')

In [None]:
# Example query and response with Gemini and query_engine.
# This cell is kept live because the following cell reads `response`;
# with it commented out, a fresh Restart & Run All fails there with a NameError.
query = "Where was Lincoln born?"
response = query_engine.query(query)
print(response.response)

In [14]:
# Print the score and text of each of the top k RAG source nodes behind the response
for source_node in response.source_nodes:
    score = source_node.get_score()
    print(f"{score} -> {source_node.text}")

# Note on ranking: lower scores are better here, which is the opposite of a similarity score.
# The index is built on faiss.IndexFlatL2, which ranks by L2 distance, so these scores look like
# distances (smaller = closer) -- TODO confirm against the FaissVectorStore docs.
# The text itself / node order of ranking matches most relevant to least relevant.

0.5635491609573364 -> Abraham Lincoln was born on February 12, 1809, to Thomas Lincoln and Nancy Hanks, two uneducated farmers. Lincoln was born in a one-room log cabin on the   Sinking Spring Farm, in southeast Hardin County, Kentucky (now part of LaRue County). This area was at the time considered the "frontier." The name "Abraham" was chosen to commemorate his grandfather, who was killed in an American Indian raid in 1786. Donald (1995) p 21  His elder sister, Sarah Lincoln, was born in 1807; a younger brother, Thomas Jr, died in infancy. It is sometimes debated whether Lincoln had Marfan syndrome, an autosomal dominant disorder of the connective tissue characterized by long limbs and great physical stature.  Marfan syndrome: Introduction Aug 1, 2006
0.6031718850135803 -> Lincoln's birthplace and family home are national historic memorials: the Abraham Lincoln Birthplace National Historic Site in Hodgenville, and the Lincoln Home National Historic Site in Springfield, Illinois. The 

In [19]:
# Load the HuggingFace query/answer set from the local CSV; it still needs to be
# populated with generated answers & retrieved context for evaluation.
testset_pd = pd.read_csv(
    "datasets/rag_mini_wikipedia_query_answers.csv",
    index_col=['id'],
)

In [None]:
# Generate an answer from the LLM for every question in the test set
query_engine = index.as_query_engine(similarity_top_k=10)
# answers = [query_engine.query(q) for q in testset_pd['question']] # if not concerned on rate limiting

# To avoid rate limit issues, send one question at a time and pause after each request
answers = []
for question in testset_pd['question']:
    answers.append(query_engine.query(question))
    time.sleep(5)

In [96]:
# Split every response object into the values for the new 'answer' and 'contexts' columns:
#   answers_new -> the generated answer text of each response
#   context_new -> for each response, the content of every retrieved source node
answers_new = [result.response for result in answers]
context_new = [
    [source.node.get_content() for source in result.source_nodes]
    for result in answers
]

In [None]:
# Attach the generated answers and their retrieved contexts to the evaluation set.
# The dataset's original `answer` column is kept as the `ground_truth` answer.
# The rename is guarded so this cell can be re-run safely: without the guard, a second
# run would rename the freshly generated `answer` column too, leaving two
# `ground_truth` columns in the frame.
if "ground_truth" not in testset_pd.columns:
    testset_pd = testset_pd.rename(columns={"answer": "ground_truth"})
testset_pd['contexts'] = context_new
testset_pd['answer'] = answers_new

In [107]:
# # Save newly completed dataset
# testset_pd.to_csv('datasets/rag_mini_wikipedia_complete.csv', index=True, index_label=["id"])