<a href="https://colab.research.google.com/github/rajaveljp/rajaveljp/blob/main/Table_Extraction_RAG_Local.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# End to End Implementation of Retrieval Augmented Generation (RAG) with Unstructured, LangChain and ChromaDB

In the following Python notebook, we will go over how to extract tables from quarterly earnings reports using Unstructured's Python library. We will then chunk, embed, and store the tables in a vector database for retrieval.




In [None]:
!pip install chromadb

In [None]:
!pip install unstructured unstructured-inference

In [None]:
!pip install openai langchain

In [None]:
!sudo apt-get install poppler-utils
!sudo apt-get install tesseract-ocr


In [None]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tiktoken
Successfully installed tiktoken-0.4.0


In [None]:
import getpass
import json
import os
import pprint

import chromadb
import openai

from chromadb.utils import embedding_functions
from unstructured.partition.pdf import partition_pdf
from unstructured.staging.base import elements_to_json
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import TextLoader
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from langchain.embeddings.openai import OpenAIEmbeddings

In [None]:
# Path of the PDF to process. For this notebook, Nvidia's earnings report was
# uploaded into Google Colab's files directory, "/content/".
filename = "/Path/To/Your/File"

# Desired output location; the original run also used the "/content" directory.
output_dir = "/Path/To/Your/Desired/Output"

In [None]:
# Define parameters for Unstructured's library
strategy = "hi_res" # Used for analyzing PDFs and extracting table structure
model_name = "yolox" # Best model for table extraction. Other options are detectron2_onnx and chipper depending on file layout

In [None]:
# Partition the PDF into elements, asking Unstructured to infer table structure too.
elements = partition_pdf(
    filename=filename,
    strategy=strategy,
    model_name=model_name,
    infer_table_structure=True,
)

In [None]:
# Dump the partitioned elements to "<filename>.json".
# Note: it takes a while for the file to show up in Google Colab.
elements_to_json(
    elements,
    filename=f"{filename}.json",
)

In [None]:
def process_json_file(input_filename, output_filename="/content/nvidia-yolox.txt"):
    """Write the HTML of every table element in a JSON file to a text file.

    Args:
        input_filename: Path to a JSON file holding a list of element dicts.
            Only entries whose "type" is "Table" are used.
        output_filename: Path of the text file to (over)write. Defaults to
            "/content/nvidia-yolox.txt", the path read by the later cells.

    Returns:
        None. The result is the file written at ``output_filename``: each
        table's ``metadata["text_as_html"]`` value followed by a blank line.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If ``input_filename`` is not valid JSON.
    """
    # JSON is UTF-8 by specification, so do not depend on the platform default.
    with open(input_filename, 'r', encoding="utf-8") as file:
        data = json.load(file)

    # Collect the HTML rendering of each table. Entries that lack the HTML
    # field are skipped instead of aborting the whole run with a KeyError.
    extracted_elements = []
    for entry in data:
        if entry.get("type") != "Table":
            continue
        table_html = entry.get("metadata", {}).get("text_as_html")
        if table_html:
            extracted_elements.append(table_html)

    # Output keeps the platform-default encoding, exactly as before, so any
    # existing reader of this file sees the same bytes.
    with open(output_filename, 'w') as output_file:
        for element in extracted_elements:
            output_file.write(element + "\n\n")  # Two newlines separate tables


In [None]:
# Extract the tables from the JSON dump written above.
# Note: it takes a while for the .txt file to show up in Colab.
process_json_file(input_filename=f"{filename}.json")

In [None]:
# Text file holding the extracted tables; this is the path process_json_file writes to.
text_file = "/content/nvidia-yolox.txt"

In [None]:
# Load the extracted-table text file through LangChain's TextLoader.
loader = TextLoader(file_path=text_file)
documents = loader.load()

In [None]:
# split it into chunks
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
docs = text_splitter.split_documents(documents)

In [None]:
# Never store the API key in the notebook. Reuse a key that is already set in
# the environment; otherwise prompt for it without echoing the input.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass.getpass("OpenAI API key: ")
embeddings = OpenAIEmbeddings()

In [None]:
# Embed the chunks and index them in a Chroma vector store.
db = Chroma.from_documents(documents=docs, embedding=embeddings)

In [None]:
# Initialize your model and retriever
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
qa_chain = RetrievalQA.from_chain_type(llm, retriever=db.as_retriever())

# List of questions
questions = [
    "How much revenue did Nvidia make in Q2 FY24?",
    "What was the operating income for Q2 FY24?",
    "How much was the net income in Q1 FY24?",
    "What was the Q/Q revenue growth between Q1 and Q2 of FY24?",
    "How much revenue did Nvidia's Data Center produce in Q2 FY24?",
    "How much revenue did Nvidia's Gaming sector produce in Q2 FY24?",
    "What percent of the total revenue in Q2 FY24 did Data Centers represent?"
]

# Store responses in output_list
output_list = []

for query in questions:
    response = qa_chain({"query": query})
    output_list.append(response)

In [None]:
# Use pprint to pretty print the output list
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(output_list)