In [5]:
# Use Case
"""
Ingestion:
1. Load files from directory
2. Ingest text extracted from file into vectorstore

Retrieval:
1. Use retrievrs to fetch relevant chunks/nodes/data from vectorstore/index
2. Use LLM to generate answers for queries

Llamaindex
1. Background
2. Usecases

"""


print()




# Setup

## Install

In [6]:
# !pip install llama-index
# !pip install openai
# !pip install tiktoken
!pip install llama-index-core
!pip install llama-index-llms-openai
!pip install llama-index-embeddings-huggingface
!pip install llama-index-readers-file
!pip install llama-index-retrievers-bm25

Collecting llama-index-core
  Downloading llama_index_core-0.10.53.post1-py3-none-any.whl (15.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.4/15.4 MB[0m [31m38.8 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json (from llama-index-core)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl (28 kB)
Collecting deprecated>=1.2.9.3 (from llama-index-core)
  Downloading Deprecated-1.2.14-py2.py3-none-any.whl (9.6 kB)
Collecting dirtyjson<2.0.0,>=1.0.8 (from llama-index-core)
  Downloading dirtyjson-1.0.8-py3-none-any.whl (25 kB)
Collecting httpx (from llama-index-core)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting llama-cloud<0.0.7,>=0.0.6 (from llama-index-core)
  Downloading llama_cloud-0.0.6-py3-none-any.whl (130 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m130.8/130.8 kB[0m [31m16.2 MB

## Import

In [7]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
embed_model

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

HuggingFaceEmbedding(model_name='BAAI/bge-small-en-v1.5', embed_batch_size=10, callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x7c41fd81e440>, num_workers=None, max_length=512, normalize=True, query_instruction=None, text_instruction=None, cache_folder=None)

In [8]:
import logging
import sys
import os
from IPython.display import Markdown, display
from llama_index.core import (
    SimpleDirectoryReader,
    StorageContext,
    VectorStoreIndex,
)
from llama_index.core.settings import Settings
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader


In [9]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
os.listdir("/content/drive/MyDrive/colab_data/bits_webinar_rag")

['Q3 results.pdf']

In [11]:
os.listdir("/content/drive/MyDrive/colab_data/bits_webinar_rag_v2")

['pfizer_results.pdf']

In [None]:

# Use Case
"""
Ingestion:
1. Load files from directory
2. Ingest text extracted from file into vectorstore

Retrieval:
1. Use retrievrs to fetch relevant chunks/nodes/data from vectorstore
2. Use LLM to generate answers for queries

Llamaindex
1. Background
2. Usecases

"""


print()




# Basic RAG

In [14]:
from llama_index.core import Settings
from llama_index.llms.openai import OpenAI
from llama_index.core.node_parser import SentenceSplitter

## Ingestion Service

In [25]:
# Load documents
# doc_loader = SimpleDirectoryReader("/content/drive/MyDrive/colab_data/bits_webinar_rag")
doc_loader = SimpleDirectoryReader("/content/drive/MyDrive/colab_data/bits_webinar_rag_v2")

documents = doc_loader.load_data()
len(documents)

2

In [26]:
documents[0]

Document(id_='e7274df8-b46f-4259-81df-4b0b8882b19d', embedding=None, metadata={'page_label': '1', 'file_name': 'pfizer_results.pdf', 'file_path': '/content/drive/MyDrive/colab_data/bits_webinar_rag_v2/pfizer_results.pdf', 'file_type': 'application/pdf', 'file_size': 45953, 'creation_date': '2024-07-10', 'last_modified_date': '2024-02-15'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='Pfizer\nReports\nSecond-Quarter\n2023\nResults\n▪\nSecond-Quarter\n2023\nRevenues\nof\n$12.7\nBillion\n–\nExpected\nDecline\nin\nPaxlovid\nand\nComirnaty(1)\nRevenues\nDrove\n53%\nOperational\nDecrease\nin\nSecondQuarter\n2023\nRevenues\n–\nSecond-Quarter\n2023\nRevenues\nfrom\nComirnaty(1)\nand\nPaxlovid\nof\n$1.6\nBillion\n–\nExcluding\nContributions\nfrom\nComirnaty(1)

In [16]:
# Create text-splitter
text_splitter = SentenceSplitter(
  separator=" ",
  chunk_size=100,
  chunk_overlap=20,
  paragraph_separator="\n\n\n",
  secondary_chunking_regex="[^,.;。]+[,.;。]?"

)

In [17]:
# Set components
Settings.text_splitter = text_splitter
Settings.embed_model = embed_model

In [18]:
# Create-index
index = VectorStoreIndex.from_documents(
    documents, transformations=[text_splitter]
)

## Setup

In [19]:
os.environ['OPENAI_API_KEY'] = 'sk-xxxx'

## Retrieval Service

In [20]:
llm = OpenAI()
Settings.llm = llm

In [21]:
# Create Query Engine
query_engine = index.as_query_engine()

In [22]:
question = "Info about sales"

# Ask question
response = query_engine.query(question)
display(Markdown(f"<b>{response}</b>"))

<b>The company is planning to use the net proceeds to finance Pfizer's proposed acquisition of Seagen.</b>

In [30]:
# response

In [23]:
response.metadata

{'31f7d4fd-e1f7-46cb-a03c-551587403f64': {'page_label': '2',
  'file_name': 'pfizer_results.pdf',
  'file_path': '/content/drive/MyDrive/colab_data/bits_webinar_rag_v2/pfizer_results.pdf',
  'file_type': 'application/pdf',
  'file_size': 45953,
  'creation_date': '2024-07-10',
  'last_modified_date': '2024-02-15'},
 '218df663-47d3-424b-97b5-eaaa1fe68572': {'page_label': '2',
  'file_name': 'pfizer_results.pdf',
  'file_path': '/content/drive/MyDrive/colab_data/bits_webinar_rag_v2/pfizer_results.pdf',
  'file_type': 'application/pdf',
  'file_size': 45953,
  'creation_date': '2024-07-10',
  'last_modified_date': '2024-02-15'}}

# Advance RAG

In [32]:
import re
from llama_index.core.schema import TransformComponent
from llama_index.core.ingestion import IngestionPipeline

## Ingestion

In [33]:
# Load documents
doc_loader = SimpleDirectoryReader("/content/drive/MyDrive/colab_data/bits_webinar_rag")
documents = doc_loader.load_data()
len(documents)

11

In [34]:
# Create text-splitter
text_splitter = SentenceSplitter(
  separator=" ",
  chunk_size=200,
  chunk_overlap=20,
  paragraph_separator="\n\n\n",
  secondary_chunking_regex="[^,.;。]+[,.;。]?"

)

In [35]:
class TextCleaner(TransformComponent):
    def __call__(self, nodes, **kwargs):
        for node in nodes:
            # add pp steps here
            node.text = re.sub(r"[^0-9A-Za-z ]", "", node.text)
        return nodes

In [36]:
pipeline = IngestionPipeline(
    transformations=[
        text_splitter,
        TextCleaner(),
        embed_model,
    ],
)

In [37]:
documents_tmp = documents[:2]
nodes = pipeline.run(documents=documents_tmp)
len(nodes)

8

### Explore Nodes

In [None]:
from llama_index.core.schema import MetadataMode

In [None]:
node = nodes[0]
nodes[0].metadata

{'page_label': '1',
 'file_name': 'Q3 results.pdf',
 'file_path': '/content/drive/MyDrive/colab_data/bits_webinar_rag/Q3 results.pdf',
 'file_type': 'application/pdf',
 'file_size': 328710,
 'creation_date': '2024-07-10',
 'last_modified_date': '2024-07-06'}

In [None]:
print(node.get_content(metadata_mode=MetadataMode.LLM))

page_label: 1
file_path: /content/drive/MyDrive/colab_data/bits_webinar_rag/Q3 results.pdf

1  Clermont Ferrand  October 24 2023  545 pm   COMPAGNIE GNRALE DES TABLISSEMENTS MICHELIN   Ninemonth sales up 2 to 21 2 billion despite soft volumes and a forex headwind supported by mix enhancement non tire activities and  brand leadership    Nine month 2023 sellin markets in Europe and North America were shaped by inventory  drawdowns  o PCLT tire markets were stable  vs 2022  as robust OE demand in most regions offset slightly negative RT demand dampened by destocking in Europe and the Americas Demand for 18 inch and larger tires is steadily expanding


In [None]:
print(node.get_content(metadata_mode=MetadataMode.EMBED))

page_label: 1
file_path: /content/drive/MyDrive/colab_data/bits_webinar_rag/Q3 results.pdf

1  Clermont Ferrand  October 24 2023  545 pm   COMPAGNIE GNRALE DES TABLISSEMENTS MICHELIN   Ninemonth sales up 2 to 21 2 billion despite soft volumes and a forex headwind supported by mix enhancement non tire activities and  brand leadership    Nine month 2023 sellin markets in Europe and North America were shaped by inventory  drawdowns  o PCLT tire markets were stable  vs 2022  as robust OE demand in most regions offset slightly negative RT demand dampened by destocking in Europe and the Americas Demand for 18 inch and larger tires is steadily expanding


## Retrieval

In [38]:
from llama_index.core.retrievers import QueryFusionRetriever
from llama_index.core import get_response_synthesizer
from llama_index.core.query_engine.retriever_query_engine import (
    RetrieverQueryEngine,
)
from llama_index.core.postprocessor.types import BaseNodePostprocessor
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core import PromptTemplate

In [39]:
# Retriever-1 : BM25
retriever_bm25 = BM25Retriever.from_defaults(nodes=nodes, similarity_top_k=4)

DEBUG:bm25s:Building index from IDs objects


BM25S Count Tokens:   0%|          | 0/8 [00:00<?, ?it/s]

BM25S Compute Scores:   0%|          | 0/8 [00:00<?, ?it/s]

In [40]:
# Retriever-2 : Semantic
retriever_basic = index.as_retriever(
    similarity_top_k=5,
)

In [41]:
# Retriever : Hybrid
retriever_fusion = QueryFusionRetriever(
    [retriever_basic, retriever_bm25],
    similarity_top_k=5,
    num_queries=4,  # set this to 1 to disable query generation
    mode="reciprocal_rerank",
    use_async=True,
    verbose=True
)


In [42]:
# Custom node post processor

class NodePostprocessorBasic(BaseNodePostprocessor):
    """
    This processing is applied mandatory at framework level
    """
    def _postprocess_nodes(
        self, nodes, query_bundle
    ):
        # add more steps here
        return nodes

In [45]:
# Query rewrite
prompt_basic = """\
You are a helpful assistant that re write the user queries. You need to perform below operations:
1. You only need to correct the grammer of the query.
2. Convert original query into qustion if it is already not a question.

Original Query: {query}
Corrected Query:
"""
qa_prompt = PromptTemplate(prompt_basic)

In [46]:
# Response generator
response_synthesizer = get_response_synthesizer(
    response_mode="compact", text_qa_template=qa_prompt
)

In [47]:
# Create query engine

retriever = retriever_fusion
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    node_postprocessors=[NodePostprocessorBasic()],
    response_synthesizer=response_synthesizer,
)

In [48]:
# apply nested async to run in a notebook
import nest_asyncio

nest_asyncio.apply()

In [49]:
query_text = "Info abouttttttt saljdljsabdlj sales"
query = query_text

query_engine = RetrieverQueryEngine.from_args(retriever)

response = query_engine.query(
    query_text
)

from llama_index.core.response.notebook_utils import display_response
display_response(response)

Generated queries:
1. Sales statistics for the current quarter
2. Best practices for increasing sales in a competitive market
3. How to effectively track and analyze sales data for business growth


**`Final Response:`** Sales for the nine months ended September 30, 2023 were up 20% to 21.2 billion, supported by mix enhancement, non-tire activities, and the valorization of offers. In Q3, sales were stable excluding the currency effect, with tire sales volumes down 3.6% due to market destocking and a focus on value accretive segments. The price effect was at 6.2%, reflecting the value of products and solutions, and the mix effect reached 10%, driven by growth in the 18-inch and larger Passenger car tire segment.

In [50]:
response.metadata

{'753fa151-ada2-4b6a-b67d-b6dcb6c73225': {'page_label': '1',
  'file_name': 'Q3 results.pdf',
  'file_path': '/content/drive/MyDrive/colab_data/bits_webinar_rag/Q3 results.pdf',
  'file_type': 'application/pdf',
  'file_size': 328710,
  'creation_date': '2024-07-06',
  'last_modified_date': '2024-07-06'},
 'b53a159d-1a60-4d76-a873-9e4442fe9993': {'page_label': '1',
  'file_name': 'Q3 results.pdf',
  'file_path': '/content/drive/MyDrive/colab_data/bits_webinar_rag/Q3 results.pdf',
  'file_type': 'application/pdf',
  'file_size': 328710,
  'creation_date': '2024-07-06',
  'last_modified_date': '2024-07-06'},
 '314aee39-8894-4903-880a-1771b577759b': {'page_label': '1',
  'file_name': 'Q3 results.pdf',
  'file_path': '/content/drive/MyDrive/colab_data/bits_webinar_rag/Q3 results.pdf',
  'file_type': 'application/pdf',
  'file_size': 328710,
  'creation_date': '2024-07-06',
  'last_modified_date': '2024-07-06'},
 '7bb60404-a3a3-4768-8a11-6e1d9d2547a9': {'page_label': '2',
  'file_name': 'Q3