# RAG using LlamaIndex and KDB.AI vector store

## Install dependencies

In [1]:
# !pip install llama-index llama-index-embeddings-huggingface llama-index-llms-openai llama-index-readers-file llama-index-vector-stores-kdbai
# !pip install kdbai_client pandas wikipedia tqdm

## Download data

**Libraries**

In [2]:
import os
from tqdm import tqdm
import wikipedia
import urllib.request
import pandas as pd

**Data directories and paths**

In [3]:
# Root path
root_path = os.path.abspath(os.getcwd())

# Data directory and path
data_dir = "data"
data_path = os.path.join(root_data_path, data_dir)
if not os.path.exists(data_path):
    os.mkdir(data_path)

**Downloading data**

In [4]:
def download_data(WIKI_TITLES):
    text_metadata = dict()
    
    # Download data - text and images from wiki pages
    for title in tqdm(WIKI_TITLES):
        print(title)
        try:
            wiki_page = wikipedia.page(title)

            # Text - data and metadata
            page_content = wiki_page.content
            text_file_name = f"{title}.txt"
            content_path = os.path.join(data_path, f"{title}.txt")
            with open(content_path, 'w') as f:
                f.write(page_content)

            text_metadata[f"{root_data_dir}/{data_dir}/{title}.txt"] = {
                "title": title,
                "publication_date": pd.to_datetime("2024-07-23")
            }
    
        except Exception:
            print(str(Exception("No images found for Wikipedia page: ")) + title)
            continue

    return text_metadata

In [5]:
# Wikipedia titles to retrieve data
WIKI_TITLES = [
    "Beethoven",
    "Video Game"
]

text_metadata = download_data(WIKI_TITLES)

  0%|                                                     | 0/2 [00:00<?, ?it/s]

Beethoven


 50%|██████████████████████▌                      | 1/2 [00:03<00:03,  3.74s/it]

Video Game


100%|█████████████████████████████████████████████| 2/2 [00:06<00:00,  3.23s/it]


**Show texts**

In [6]:
# Helper function - to display downloaded texts
def show_texts(text_paths):
    texts_shown = 0
    for text_path in text_paths:
        with open(text_path, 'r') as text_file:
            text_content = text_file.read()
        print(text_content[0:512])
        print("="*80)

        texts_shown += 1
        if texts_shown >=3:
            break

In [7]:
text_paths = []
for text_path in text_metadata:
    text_paths.append(text_path)
show_texts(text_paths)

Ludwig van Beethoven (baptised 17 December 1770 – 26 March 1827) was a German composer and pianist. He is one of the most revered figures in the history of Western music; his works rank among the most performed of the classical music repertoire and span the transition from the Classical period to the Romantic era in classical music. His early period, during which he forged his craft, is typically considered to have lasted until 1802. From 1802 to around 1812, his middle period showed an individual developme
A video game or computer game is an electronic game that involves interaction with a user interface or input device (such as a joystick, controller, keyboard, or motion sensing device) to generate visual feedback from a display device, most commonly shown in a video format on a television set, computer monitor, flat-panel display or touchscreen on handheld devices, or a virtual reality headset. Most modern video games are audiovisual, with audio complement delivered through speakers

## KDB.AI session and table

**Libraries**

In [8]:
import kdbai_client as kdbai

**KDB.ai session**

In [9]:
# KDB.ai session
KDBAI_ENDPOINT = "http://localhost:8085"
session = kdbai.Session(endpoint=KDBAI_ENDPOINT)

**KDBai table**

In [10]:
# Table name and schema
table_name = "rag_docs"
table_schema = dict(
    columns=[
        dict(name="document_id", pytype="bytes"),
        dict(name="text", pytype="bytes"),
        dict(
            name="embedding",
            vectorIndex=dict(type="flat", metric="L2", dims=768),
        ),
        dict(name="title", pytype="bytes"),
        dict(name="publication_date", pytype="datetime64[ns]")
    ]
)

In [11]:
# Delete the table if exists
if table_name in session.list():
    session.table(table_name).drop()

In [12]:
# Create KDB.ai table with table name and schema
table = session.create_table(table_name, table_schema)

In [13]:
# Show table schema
table.schema()

{'columns': [{'name': 'document_id', 'qtype': 'string', 'pytype': 'bytes'},
  {'name': 'text', 'qtype': 'string', 'pytype': 'bytes'},
  {'name': 'embedding',
   'vectorIndex': {'type': 'flat', 'metric': 'L2', 'dims': 768},
   'qtype': 'reals',
   'pytype': 'float32'},
  {'name': 'title', 'qtype': 'string', 'pytype': 'bytes'},
  {'name': 'publication_date',
   'qtype': 'timestamp',
   'pytype': 'datetime64[ns]'}]}

## Load local PDF files with LlamaIndex

**Input file paths**

In [14]:
local_files = [input_file for input_file in text_metadata]
local_files

['data/rag_data/Beethoven.txt', 'data/rag_data/Video Game.txt']

**Loading data**

In [15]:
from llama_index.core import SimpleDirectoryReader

In [16]:
# Helper function - for getting metadata
def get_metadata(filepath):
    return text_metadata[filepath]

In [17]:
%%time

documents = SimpleDirectoryReader(input_files=local_files, file_metadata=get_metadata)

docs = documents.load_data()
len(docs)

CPU times: user 6.78 ms, sys: 7.74 ms, total: 14.5 ms
Wall time: 14.1 ms


2

## Setup LlamaIndex RAG pipeline using KDB.ai vector store

**OpenAI API Key**

In [18]:
from getpass import getpass

In [19]:
os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

OpenAI API key:  ········


**Text embeddings model**

In [20]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

In [21]:
EMBEDDING = "sentence-transformers/all-mpnet-base-v2"
embeddings_model = HuggingFaceEmbedding(model_name=EMBEDDING)

**LLM model**

In [22]:
from llama_index.llms.openai import OpenAI

In [23]:
LLM = "gpt-3.5-turbo"
llm_model = OpenAI(temperature=0, model=LLM)

**Setting up LlamaIndex RAG pipeline**

In [24]:
from llama_index.vector_stores.kdbai import KDBAIVectorStore

from llama_index.core import Settings
from llama_index.core import StorageContext
from llama_index.core import VectorStoreIndex

from llama_index.core.node_parser import SentenceSplitter

In [25]:
%%time

# KDBAI vector store
vector_store = KDBAIVectorStore(table)

# LlamaIndex settings
Settings.embed_model = embeddings_model
Settings.llm = llm_model

# storage context
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# vector store index
index = VectorStoreIndex.from_documents(
    docs,
    storage_context=storage_context,
    transformations=[SentenceSplitter(chunk_size=512, chunk_overlap=0)],
)

CPU times: user 5.06 s, sys: 120 ms, total: 5.18 s
Wall time: 5.83 s


## Setup the LlamaIndex Query Engine

**Setting up Query Engine**

In [26]:
%%time

K = 5
query_engine = index.as_query_engine(llm=llm_model, similarity_top_k=K)

CPU times: user 1.42 ms, sys: 165 μs, total: 1.58 ms
Wall time: 4.98 ms


**Querying**

In [27]:
%%time

result = query_engine.query("""Tell me important fields in Computer Science ?""")
print(result.response)

Important fields in Computer Science include software development, artificial intelligence, data science, cybersecurity, computer networking, database management, computer graphics, human-computer interaction, and theoretical computer science.
CPU times: user 143 ms, sys: 4.5 ms, total: 147 ms
Wall time: 1.83 s


In [28]:
%%time

result = query_engine.query("""What are the most famous works by Beethoven ?""")
print(result.response)

Some of the most famous works by Beethoven include his Symphony No. 3 (Eroica), Symphony No. 5, Symphony No. 9 (Choral), Piano Sonata No. 14 (Moonlight Sonata), Piano Sonata No. 8 (Pathetique), and his Violin Concerto in D major.
CPU times: user 54.9 ms, sys: 384 μs, total: 55.3 ms
Wall time: 1.78 s
