## Notebook to show load of pdf to KDB.AI Vector DB 

### Imports

In [3]:
import os
import pypdf
from nltk.tokenize import sent_tokenize
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import kdbai_client as kdbai
from getpass import getpass
import glob

### Load pdf files, tokenize into separate sentences

In [4]:
pdf_files = glob.glob("../Documents/pwc_*.pdf")
all_pdf_texts = []
# Loop through the list of files and open each one
for pdf_file in pdf_files:
    with open(pdf_file, "rb") as file:
        pdf_reader = pypdf.PdfReader(file)
        pdf_pages = pdf_reader.pages
        page_list = [page.extract_text() for page in pdf_pages]
        full_pdf_text = "".join(page_list)
        all_pdf_texts.append(full_pdf_text)
# Join all texts from all PDFs into one single text
combined_pdf_text = "".join(all_pdf_texts)
# Tokenize the combined text into sentences
ai_sentences = sent_tokenize(combined_pdf_text)
# Get the number of sentences
len(ai_sentences)

15

In [5]:
ai_sentences

['ft.comPwC India boss pleads case for seat atﬁrm’s global top tableSimon Foy, Stephen Foley4–5 λεπτάPwC’s India boss is lobbying for a seat on the Big Four ﬁrm’s globalexecutive committee alongside his counterpart in China, arguingthat the fast growth of its business and the rising importance of theIndian economy merit a position at the $53bn network’s top table.Sanjeev Krishan, chair of PwC India, in recent months petitionedsenior ﬁgures for him to be added to the ﬁrm’s inﬂuential networkleadership team, people familiar with the matter told the FinancialTimes.The move has caused ructions within the accounting and consultingﬁrm, with one senior ﬁgure familiar with the discussions saying thatwhile the issue has not yet risen to the level of “tensions”, it isregarded by some within the network as a “problem”.One partner said Krishan had been “pushing very hard” for a seatwhile another, who has held global and national leadership roles atthe ﬁrm, said it was “highly unlikely” that the In

### Load model for embedding and apply it on the sentences 

In [6]:
model = SentenceTransformer("all-MiniLM-L6-v2")

In [7]:
embeddings_list = model.encode(np.array(ai_sentences)).tolist()
embeddings_df = pd.DataFrame({"vectors": embeddings_list, "sentences": ai_sentences})

### Connect to KDB.AI cloud edition and init session

In [8]:
KDBAI_ENDPOINT = (
    os.environ["KDBAI_ENDPOINT"]
    if "KDBAI_ENDPOINT" in os.environ
    else getpass("KDB.AI endpoint: ")
)
KDBAI_API_KEY = (
    os.environ["KDBAI_API_KEY"]
    if "KDBAI_API_KEY" in os.environ
    else getpass("KDB.AI API key: ")
)

KDB.AI endpoint:  ········
KDB.AI API key:  ········


In [9]:
session = kdbai.Session(api_key=KDBAI_API_KEY, endpoint=KDBAI_ENDPOINT)

### Create schema for the table to create and the table itself

#### Note dims (number of dimensions), type of index and metric L2 which is Euclidean distance

In [10]:
pwc_schema = {
    "columns": [
        {"name": "sentences", "pytype": "str"},
        {
            "name": "vectors",
            "vectorIndex": {"dims": 384, "metric": "L2", "type": "hnsw"},
        },
    ]
}

In [11]:
try:
    session.table("pwc_pdf").drop()
    time.sleep(5)
except kdbai.KDBAIException:
    pass

In [12]:
table = session.create_table("pwc_pdf", pwc_schema)

In [13]:
table.query()

Unnamed: 0,sentences,vectors


### Insert embedded data to table

In [14]:
table.insert(embeddings_df)

True

In [16]:
table.query()

Unnamed: 0,sentences,vectors
0,ft.comPwC India boss pleads case for seat atﬁr...,"[-0.03110901080071926, -0.06976129859685898, -..."
1,The ﬁrm’s network leadership team sets overall...,"[-0.012049359269440174, -0.0012233656598255038..."
2,“There are no current plans to add any additio...,"[-0.003842049278318882, -0.032716259360313416,..."
3,One of the seniorpartners said: “India is very...,"[-0.03188921883702278, -0.055128637701272964, ..."
4,That performance propped up PwC’s overall Asia...,"[-0.041160259395837784, 0.03492829203605652, 0..."
5,The Indian ﬁrmexpects to surpass $1bn in sales...,"[-0.014143252745270729, 0.03203411027789116, -..."
6,PwC’sChina boss was added to the ﬁrm’s network...,"[-0.060793209820985794, 0.002126440405845642, ..."
7,"However, the Indian business remains small com...","[0.09558597952127457, 0.0015580591280013323, -..."
8,“To put them on thePwC India boss pleads case ...,"[-0.024587363004684448, -0.023464664816856384,..."
9,Suddenly the network leadership teamhas 10 cou...,"[0.008721278049051762, -0.04857706278562546, 0..."


#### Run a similarity search based on a query and the embedded data

In [18]:
query="New boss of PwC?"
encoded_query = model.encode(query).tolist()
results = table.search([encoded_query], n=1)  # Get top 1 result
df = pd.DataFrame(results[0])
top_sentence = df.loc[0, 'sentences']
str(top_sentence)

'That would not be workable.”PwC’s network leadership team, which met 25 times last year, isalready set to be overhauled at the start of next month, whenKande, Marco Amitrano and Daniel Li take up their positions asrespective heads of PwC global, UK and Asia-Paciﬁc.'