In [None]:
# Experimenting with creating a new RAG system from a HuggingFace dataset and text corpus
# https://huggingface.co/datasets/rag-datasets/rag-mini-wikipedia

In [77]:
import numpy as np
import pandas as pd
import os
import logging
import sys
import time

# Replace these two Google Gemini imports with imports for your LLM
import google.generativeai as genai
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings

from langchain_community.document_loaders import DirectoryLoader
from llama_index.core import Document, VectorStoreIndex, Settings, StorageContext, load_index_from_storage
from llama_index.vector_stores.faiss import FaissVectorStore
import faiss

# trying new faiss...
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS

In [38]:
# Set up local API key
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

In [13]:
# load corpus
corpus = pd.read_parquet("hf://datasets/rag-datasets/rag-mini-wikipedia/data/passages.parquet/part.0.parquet")

In [18]:
corpus['passage'][0]

'Uruguay (official full name in  ; pron.  , Eastern Republic of  Uruguay) is a country located in the southeastern part of South America.  It is home to 3.3 million people, of which 1.7 million live in the capital Montevideo and its metropolitan area.'

In [32]:
# load query & answer dataset
query_answers = pd.read_parquet("hf://datasets/rag-datasets/rag-mini-wikipedia/data/test.parquet/part.0.parquet")

In [None]:
# # Create a Faiss vector store for RAG
# # If you already have an index created, skip a few coding cells to the LLM / embeddings setup

In [40]:
# convert corpus into Document format
documents = [Document(text=passage) for passage in corpus['passage']] 

In [44]:
corpus['passage'][0]

'Uruguay (official full name in  ; pron.  , Eastern Republic of  Uruguay) is a country located in the southeastern part of South America.  It is home to 3.3 million people, of which 1.7 million live in the capital Montevideo and its metropolitan area.'

In [101]:
corpus

Unnamed: 0_level_0,passage
id,Unnamed: 1_level_1
0,"Uruguay (official full name in ; pron. , Eas..."
1,"It is bordered by Brazil to the north, by Arge..."
2,Montevideo was founded by the Spanish in the e...
3,The economy is largely based in agriculture (m...
4,"According to Transparency International, Urugu..."
...,...
3196,"*In 2007, a duck in Tallahassee, Florida survi..."
3197,*A rare genetic mutation sees some ducks born ...
3198,*The Moche people of ancient Peru worshipped n...
3199,*Angel Wing - A disease common in ducks.


In [102]:
test = documents[0:1000]
test2 = documents[1000:2000]
test3 = documents[2000:3201]

In [103]:
test3

[Document(id_='13269335-993d-4786-9622-1aab06facec7', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text="Ford joined the Boy Scouts of America, and attained that program's highest rank, Eagle Scout. He always regarded this as one of his proudest accomplishments, even after attaining the White House.     In subsequent years, Ford received the Distinguished Eagle Scout Award in May 1970 and Silver Buffalo Award from the Boy Scouts of America. He is the only US president who was an Eagle Scout.    Scouting was so important to Ford that his family asked that Scouts participate in his funeral. About 400 Eagle Scouts were part of the funeral procession, where they formed an honor guard as the casket went by in front of the museum, and served as ushers.", mimetype='text/plain', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'),
 Docu

In [108]:
test3[-2]

Document(id_='bb927233-c61f-45c8-bfe7-058dd8b668ed', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='*Angel Wing - A disease common in ducks.', mimetype='text/plain', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')

In [None]:
# try dividing documents into smaller parts for ingestion

In [81]:
# # Set up the faiss index
d = 768 # dimensions of the input vector of the embedding model that we're going to use; in this case, the google embedding model
faiss_index = faiss.IndexFlatL2(d)
print(faiss_index.is_trained) # double check that the training worked

True


In [82]:
# Set up the llm, embeddings, and Settings for Faiss 
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash") # Replace with your LLM
doc_embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004", embed_batch_size=1) # Replace with your embeddings model
Settings.embed_model = doc_embeddings # used for LlamaIndex FaissVectorStore
Settings.llm = llm # used for LlamaIndex FaissVectorStore

In [83]:
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

In [84]:
# # Uncomment for when you need to re-embed and vectorize documents

vector_store = FaissVectorStore(faiss_index=faiss_index)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
    test, storage_context=storage_context, show_progress=True
)



Parsing nodes:   0%|          | 0/1000 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

In [169]:
len(index.ref_doc_info)

1950

In [165]:
test4 = documents[1850:1950] # stopped here

In [166]:
test4[-1]

Document(id_='d612c739-15bb-48b2-befa-0c20dc2646af', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text="When he was 17, his mother died and his father's health had begun to fail.  Watt travelled to London to study instrument-making for a year, then returned to Scotland   to Glasgow   intent on setting up his own instrument-making business. However, because he had not served at least seven years as an apprentice, the Glasgow Guild of Hammermen (any artisans using hammers) blocked his application, despite there being no other mathematical instrument makers in Scotland.", mimetype='text/plain', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')

In [168]:
'd612c739-15bb-48b2-befa-0c20dc2646af' in index.ref_doc_info

True

In [167]:
# insert one at a time
for i in test4:
    index.insert(i)
    time.sleep(1)

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

In [155]:
# index.ref_doc_info # gives list of all nodes
# 8e83bd67-3100-42ae-8820-64651c63a726 is 0th doc

# updating an index
# https://docs.llamaindex.ai/en/stable/module_guides/indexing/document_management/

index.refresh_ref_docs(test4) # to add new documents to index as well as to update existing docs as needed

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

[True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True]

In [170]:
# Save index to disk
index.storage_context.persist()

# Save/remember index id for loading next time
index.index_id

# '95634851-570e-454e-983f-6634eeb72aee' is 1950 stopped indexd

'95634851-570e-454e-983f-6634eeb72aee'

In [None]:
# trying new faiss...
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash") # Replace with your LLM
doc_embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004", embed_batch_size=1) 

index = faiss.IndexFlatL2(len(embeddings.embed_query("hello world")))

vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)