# Building Accurate Retrieval Method

LLM is pulling context from vector db. This allows the 

Document Loader

load documents from list of filepaths
get metadata from each pdf for more context

In [144]:
# Get PDF files
import os
import requests
from tqdm.auto import tqdm # for progress bars, requires !pip install tqdm 
import random
import re
import torch

list_of_pdf_docs = [
    "https://www.faa.gov/sites/faa.gov/files/regulations_policies/handbooks_manuals/aviation/airplane_handbook/00_afh_full.pdf",
    "https://www.faa.gov/sites/faa.gov/files/regulations_policies/handbooks_manuals/aviation/FAA-H-8083-15B.pdf",
    "https://www.faa.gov/regulations_policies/handbooks_manuals/aviation/faa-h-8083-25c.pdf",
    "https://www.faa.gov/sites/faa.gov/files/2022-06/risk_management_handbook_2A.pdf",
    "https://www.faa.gov/sites/faa.gov/files/regulations_policies/handbooks_manuals/aviation/FAA-H-8083-1.pdf",
]

## Extract Data from PDFs

data will be extracted form loaded pdf files. this data will be text, tables, and images

Below are multiple functions to extract as much data as possible from a PDF document.

Text, images, and soon tables are extracted then populate a dictionary for each page.

The text is cleaned before being populated to remove any noise. This will help increase the accuracy of embeddings further down the process.

In [99]:
### Document Loader

import pymupdf

# Extract text from PDF
def get_text(filepath):
    print("---GET_TEXT---")
    text = ""
    try:
        print("getting PDF")
        doc = pymupdf.open(filepath)
        print("getting tables")
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            text += page.get_text()
        print("completed")
    except Exception as e: 
        print(f"error occurred getting text from pdf: {e}")
    return text

# Extract tables from PDF
def get_tables(filepath):
    print("---GET_TABLES---")
    tables = []
    try:
        print("getting PDF")
        doc = pymupdf.open(filepath)
        print("getting tables")
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            tables.extend(page.get_tables())
        print("completed")
    except Exception as e:
        print(f"error occurred getting tables from PDF: {e}")
    return tables

# Extract images from page
def get_page_images(doc, page, page_index):
    # print("---GET_IMAGES---")
    image_paths =[]
    try:
        image_list = page.get_images()

        # print number of images found on page
        if image_list:
            print(f"found {len(image_list)} images on page {page_index}")

        for image_index, img in enumerate(image_list, start=1): # enumerate the image list
            xref = img[0] # get the XREF of image
            pix = pymupdf.Pixmap(doc, xref) # create a Pixmap

            if pix.n - pix.alpha > 3: # CMYK: convert to RGB first
                pix = pymupdf.Pixmap(pymupdf.csRGB, pix)

            image_path = "page_%s-image_%s.png" % (page_index, image_index)
            pix.save(image_path) # save the image as png
            pix = None

            image_paths.append(image_path)  
              
    except Exception as e:
        print(f"error occurred getting images: {e}")

    return image_paths

# Clean text
def clean_text(text: str) -> str:
    """
    Formats text to remove noise
    """
    # Replace multiple dots (.............) with a single space
    text = re.sub(r'\.{2,}', ' ', text)

    # replace new line character with space
    clean_text = text.replace("\n", " ").strip()

    # Add more formatting if needed
    return clean_text

# Parse document
def parse_document(filepath):
    """
    This will extract all data from document and 
    populate a dictionary with the extracted data
    """

    print("---PARSING DOCUMENT---")
    doc = pymupdf.open(filepath) # open document
    pages_and_texts = []
    for page_number, page in enumerate(doc):
        text = page.get_text() # get text from page
        text = clean_text(text)
        pages_and_texts.append({
            "page_number": page_number,
            "page_char_count": len(text),
            "page_word_count": len(text.split(" ")),
            "page_sentence_count": len(text.split(". ")),
            "page_token_count": len(text) / 4, # average token = ~4 char
            "images": get_page_images(doc, page, page_number),
            "text": text,
        })

    return pages_and_texts

Lets see if this code actually works...

We will parse a PDF file locally stored then view the data extracted form that document

Looks good...

In [100]:
# Test block

doc_data = parse_document("./pdf-files/61-65-certifications.pdf")

doc_data

---PARSING DOCUMENT---
found 1 images on page 0


[{'page_number': 0,
  'page_char_count': 573,
  'page_word_count': 91,
  'page_sentence_count': 3,
  'page_token_count': 143.25,
  'images': ['page_0-image_1.png'],
  'text': 'U.S. Department  of Transportation  Federal Aviation  Administration  Advisory  Circular  Subject: Certification: Pilots and Flight and  Ground Instructors  Date: 8/27/18  AC No: 61-65H  Initiated by: AFS-800  Change:  This advisory circular (AC) provides guidance for pilot and instructor applicants, pilots, flight  instructors, ground instructors, and examiners on the certification standards, knowledge test  procedures, and other requirements in Title 14 of the Code of Federal Regulations (14 CFR)  part 61.  Rick Domingo  Executive Director, Flight Standards Service'},
 {'page_number': 1,
  'page_char_count': 1436,
  'page_word_count': 323,
  'page_sentence_count': 1,
  'page_token_count': 359.0,
  'images': [],
  'text': '8/27/18  AC 61-65H  ii  CONTENTS  Paragraph  Page  1  Purpose of This Advisory Circular (A

**Get stats on page text**

Here we will gather an analysis of the average size of each page in terms of word count, character count, etc.

This will allow us to identify how to chunk the data to retain as much context as possible.

Some embedding models limit token input. So if the text you input is above that limit, you will loose information.

(1 token ~= 4 character ~= 0.75 words).

In [101]:
import pandas as pd

df = pd.DataFrame(doc_data)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count,page_token_count,images,text
0,0,573,91,3,143.25,[page_0-image_1.png],U.S. Department of Transportation Federal Av...
1,1,1436,323,1,359.0,[],8/27/18 AC 61-65H ii CONTENTS Paragraph P...
2,2,533,119,3,133.25,[],8/27/18 AC 61-65H iii 31 Expired Flight ...
3,3,2546,400,26,636.5,[],8/27/18 AC 61-65H 1 1 PURPOSE OF THIS ADV...
4,4,2989,453,26,747.25,[],8/27/18 AC 61-65H 2 • Practical Test Standa...


Now that we are viewing our data within a DataFrame, we can manipulate it however necessary.


You can see below stats on word count, character count, and token count. 

Average token count is: 681, pretty big...

In [102]:
# get stats
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count,page_token_count
count,58.0,58.0,58.0,58.0,58.0
mean,28.5,2571.09,434.95,18.55,642.77
std,16.89,789.81,128.35,8.75,197.45
min,0.0,374.0,71.0,1.0,93.5
25%,14.25,2329.25,402.25,17.25,582.31
50%,28.5,2753.5,465.5,20.0,688.38
75%,42.75,3116.0,514.75,25.0,779.0
max,57.0,3675.0,644.0,34.0,918.75


A recommended method to process text before embedding is to break down text into chunks of sentences. There are multiple ways of accomplishing this tasks, this is just one way.

For a page, chunk groups of text into 5 - 10 sentences

`Get text --> split into chunks --> embed chunks --> use embeddings`

We will use spaCy to break text into sentences. This is a NLP library, therefore, it will be more accurate than splitting b: `text.split(". ")`.

In [103]:
from spacy.lang.en import English

nlp = English()

# add a sentencizer pipeline
nlp.add_pipe("sentencizer")

# create document instance for example
doc = nlp("This is a sentence. This is another sentence!")
assert len(list(doc.sents)) == 2

# access the sentences of document
list(doc.sents)

[This is a sentence., This is another sentence!]

In [104]:
for item in tqdm(doc_data):
    item["sentences"] = list(nlp(item["text"]).sents)

    # make sure all sentences are strings
    item["sentences"] = [str(sentence) for sentence in item["sentences"]]

    # count the sentences
    item["page_sentence_count_spacy"] = len(item["sentences"])

100%|██████████| 58/58 [00:00<00:00, 157.37it/s]


Now, lets inspect our results

In [105]:
# inspect example
random.sample(doc_data, k=1)

[{'page_number': 15,
  'page_char_count': 3409,
  'page_word_count': 591,
  'page_sentence_count': 20,
  'page_token_count': 852.25,
  'images': [],
  'text': '8/27/18    AC 61-65H  13  accordance with § 61.56 before conducting solo operations. Under § 61.56(g), a  certificated pilot is not considered a student pilot when seeking solo privileges in a  category/class aircraft in which the pilot is not already certificated.  20.7 Endorsement for Each Solo Cross-Country. In addition to the endorsements required  by § 61.93(c)(1) and (2), for each solo cross-country flight, a student pilot must have an  endorsement from an instructor after the instructor reviews the student’s cross-country  planning. That endorsement must comply with § 61.93(c)(3). An authorized instructor  may not permit a student pilot to conduct a solo cross-country flight unless the instructor  has complied with § 61.93(d)(1)–(5). This endorsement does not need to be made by the  instructor who normally provides traini

Now we will turn list of dictionaries into DataFrame to inspect furthur...

In [106]:
df = pd.DataFrame(doc_data)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count,page_token_count,page_sentence_count_spacy
count,58.0,58.0,58.0,58.0,58.0,58.0
mean,28.5,2571.09,434.95,18.55,642.77,17.64
std,16.89,789.81,128.35,8.75,197.45,8.42
min,0.0,374.0,71.0,1.0,93.5,1.0
25%,14.25,2329.25,402.25,17.25,582.31,16.25
50%,28.5,2753.5,465.5,20.0,688.38,20.0
75%,42.75,3116.0,514.75,25.0,779.0,22.75
max,57.0,3675.0,644.0,34.0,918.75,33.0


If you compare `page_sentence_count` and `page_sentence_count_spacy` you will see that pretty similar to each other

**Chunk sentences together**

Now we have a list of sentences. We need to group these sentences into larger chunks in order to embed and maintain some context in between sentences.

Why do we do this: 
1. Easier to manage similar size chunks of text
2. Don't overload the embedding models capacity for tokens
3. The LLM context window may be limited and require compute power, so we want to be efficient

On average we have 36 tokens per sentence (643 tokens per page / 18 sentences per page)

So we will chunk 10 sentences together (10 sentences * 36 tokens per sentence = 360). 
380 is usually the limit we don't want to pass.

In [107]:
# split size
sentences_in_chunk = 10

# recursively split list into desired sizes
def split_list(input_list: list,
               slice_size: int) -> list[list[str]]:
    """
    Split the input_list into sublists of size slice_size (as close as possible)
    """

    return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]

# looper through pages and text and split sentences into chunks
for item in tqdm(doc_data):
    item["sentence_chunks"] = split_list(input_list=item["sentences"],
                                          slice_size=sentences_in_chunk)
    item["num_chunks"] = len(item["sentence_chunks"])

100%|██████████| 58/58 [00:00<00:00, 167195.62it/s]


In [108]:
# sample example from group
random.sample(doc_data, k=2)

[{'page_number': 9,
  'page_char_count': 3675,
  'page_word_count': 644,
  'page_sentence_count': 27,
  'page_token_count': 918.75,
  'images': [],
  'text': '8/27/18    AC 61-65H  7  13.2 Testing From the Ground. With certain limitations, a DPE or an FAA ASI may conduct  the practical test for a sport pilot certificate in an LSA having a single seat. Refer to  § 61.45(f) for specific provisions. The examiner must agree to conduct the test  (§ 61.45(f)(1)) and must conduct the practical test in accordance with the sport pilot PTS  or ACS, as appropriate. Knowledge of all tasks applicable to their category/class of  aircraft will be evaluated orally. The examiner must be in a position to observe the  operation of the aircraft while evaluating the proficiency of the applicant (§ 61.45(f)(2)).  13.3 Single-Seat Limitation. The limitation, “No passenger carriage and flight in a single-seat  light-sport aircraft only,” will be placed on the person’s pilot certificate per § 61.45(f)(3).  Onl

In [110]:
# DataFrame to get stats
df = pd.DataFrame(doc_data)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count,page_token_count,page_sentence_count_spacy,num_chunks
count,58.0,58.0,58.0,58.0,58.0,58.0,58.0
mean,28.5,2571.09,434.95,18.55,642.77,17.64,2.21
std,16.89,789.81,128.35,8.75,197.45,8.42,0.77
min,0.0,374.0,71.0,1.0,93.5,1.0,1.0
25%,14.25,2329.25,402.25,17.25,582.31,16.25,2.0
50%,28.5,2753.5,465.5,20.0,688.38,20.0,2.0
75%,42.75,3116.0,514.75,25.0,779.0,22.75,3.0
max,57.0,3675.0,644.0,34.0,918.75,33.0,4.0


Our average chunks per page is 2.2. This makes sense because the average amount of sentences per page is 18. 

**Splitting each chunk into its own item**

Now we will embed each chunk into its own numerical representation.

For clarity, new list of dictionaries containing a single chunk of sentences with relative information (metadata) and stats.

In [111]:
import re

# split each chunk into its own item
pages_and_chunks = []
for item in tqdm(doc_data):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]

        # join sentences together into a paragraph-like structure, aka a chunk (single string)
        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk) # ".A" -> ". A" for any full stop/capital letter combo
        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        # get stats about chunk
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4 # 1 token = ~4 char

        pages_and_chunks.append(chunk_dict)

# amount of chunks
len(pages_and_chunks)

100%|██████████| 58/58 [00:00<00:00, 12259.10it/s]


128

In [116]:
# view random sample
random.sample(pages_and_chunks, k=1)

[{'page_number': 12,
  'sentence_chunk': 'After verifying the application is complete, forward this application to the responsible FSDO for data entry and processing, even if the applicant does not meet the eligibility requirements. Find the address and contact information for the responsible FSDO at http://www.faa.gov/about/office_org/field_offices/fsdo/. Note: If the applicant does not meet the eligibility requirements of § 61.83, DO NOT check the “Accepted Student Pilot Application” box. The “Accepted Student Pilot Application” box should only be checked once it is determined the applicant meets all eligibility requirements. Check the “Rejected Student Pilot Application” box and continue processing the application. The terms “accepted” and “rejected” are used to notify the Civil Aviation Registry that the applicant meets, or may not meet, the requirements for the issuance of a Student Pilot Certificate. If the applicant does not meet the eligibility requirements for English language

Now the PDF is broken into chunks of 10 sentences

This means we can reference a chunk of text and know its source

stats on chunks:

In [113]:
# stats on chunks
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,128.0,128.0,128.0,128.0
mean,27.23,1144.62,177.23,286.16
std,16.7,489.96,77.43,122.49
min,0.0,8.0,1.0,2.0
25%,12.0,882.75,139.0,220.69
50%,26.0,1218.5,187.5,304.62
75%,42.25,1414.25,226.5,353.56
max,57.0,2271.0,382.0,567.75


This is great, our average token count per chunk is below the limit for our embedding model. 

We can also see that the count matches our earlier test (128)

Some chunks are pretty small...the min is 2 tokens in a chunk

To fix this issue, we can find chunks with token counts less than 10

In [137]:
# show random chunks with under 10 tokens in length
min_token_length = 10
for row in df[df["chunk_token_count"] <= min_token_length].sample(1).iterrows():
    print(f'Chunk token count: {row[1]["chunk_token_count"]} | Text: {row[1]["sentence_chunk"]} | Page number: {row[1]["page_number"]}')

Chunk token count: 2.0 | Text: 12-31-19 | Page number: 46


There is still some data that could be useful for context. These all have a token count above 10 though. We will remove the token count of 2

In [138]:
pages_and_chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")
pages_and_chunks_over_min_token_len[:2]

[{'page_number': 0,
  'sentence_chunk': 'U. S. Department of Transportation Federal Aviation Administration Advisory Circular Subject: Certification: Pilots and Flight and Ground Instructors Date: 8/27/18 AC No: 61-65H Initiated by: AFS-800 Change: This advisory circular (AC) provides guidance for pilot and instructor applicants, pilots, flight instructors, ground instructors, and examiners on the certification standards, knowledge test procedures, and other requirements in Title 14 of the Code of Federal Regulations (14 CFR) part 61. Rick Domingo Executive Director, Flight Standards Service',
  'chunk_char_count': 557,
  'chunk_word_count': 75,
  'chunk_token_count': 139.25},
 {'page_number': 1,
  'sentence_chunk': '8/27/18 AC 61-65H ii CONTENTS Paragraph Page 1 Purpose of This Advisory Circular (AC) 1 2 Audience 1 3 Safety Message 1 4 Where You Can Find This AC 1 5 What This AC Cancels 1 6 Related Reading Material (current editions)  1 7 Summary of Changes  2 8 Pilot Training and Tes

**Embedding text chunks**

Embedding will turn text into a n-dimension vector. This vector holds the semantic meaning of the chunk and allows a computer to understand it better.

The goal here is to turn the chunks into an embedding vector.

This notebook will use `sentence-transformers` 

In [163]:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",
                                      device="cpu") # choose the device to load model to

# Notes: this will embed using local computing power. Learn more about the benefits (if any)
# of computing in the cloud

single_sentence = "this is a single sentence."
single_embedding = embedding_model.encode(single_sentence)
print(f"Sentence: {single_sentence}")
print(f"Embedding:\n{single_embedding}")
print(f"Embedding size: {single_embedding.shape}")

type(single_embedding)



Sentence: this is a single sentence.
Embedding:
[ 2.15313751e-02  2.73083849e-03  1.18756639e-02  2.44902857e-02
 -5.05314730e-02  1.96375493e-02  5.77450246e-02  5.01573365e-03
 -3.51084694e-02 -7.63766933e-03  2.54492182e-02 -1.95119565e-03
  2.18195785e-02 -5.32809608e-02  3.14275995e-02 -2.64189225e-02
  7.93224871e-02  1.81338433e-02  2.93792458e-03  5.09483740e-03
  8.36728048e-03  4.24091239e-03  3.87116782e-02  2.42951256e-03
 -7.54352566e-03 -4.28659320e-02 -1.25959404e-02 -3.01759522e-02
 -1.89403351e-02 -3.44804749e-02  1.69881117e-02 -4.24740079e-04
 -1.82319302e-02 -6.29487261e-02  1.97511531e-06  1.53348250e-02
 -5.93721494e-03 -2.95606945e-02 -6.08280934e-02  4.73272987e-02
 -7.36614689e-02  2.33938098e-02  3.66637832e-03  4.47262265e-02
  6.39448501e-03  8.02465305e-02  2.33598594e-02  6.56794757e-02
 -2.29862742e-02  6.21852130e-02  1.80407005e-04 -6.98949397e-02
  1.36415586e-02 -1.76094770e-02  1.67349484e-02  2.21291576e-02
  1.92022379e-02 -4.77676541e-02 -1.645993

numpy.ndarray

The embedding shape: `(768,)` means the embedding is a vector with 768 dimensions

Now we will add embedding field to each chunk

Will start with trying to embed local on the CPU. While we do this we will use `%%time` to see how long it takes

In [159]:
%%time


# Uncomment to see how long it takes to create embeddings on CPU
# Make sure the model is on the CPU
embedding_model.to("cpu")

# Embed each chunk one by one
for item in tqdm(pages_and_chunks_over_min_token_len):
    item["embedding"] = embedding_model.encode(item["sentence_chunk"])

100%|██████████| 127/127 [00:22<00:00,  5.70it/s]

CPU times: user 2min 48s, sys: 7.47 s, total: 2min 55s
Wall time: 22.5 s





This didn't take too long...but its now noticeable that a larger dataset will need more computer power, or else it will take pretty long.

In [145]:
%%time

# Check if MPS is available and set the device
# device = 'mps' if torch.backends.mps.is_available() else 'cpu'
# embedding_model.to(device)

# # Embed each chunk one by one
# for item in tqdm(pages_and_chunks_over_min_token_len):
#     item["embedding"] = embedding_model.encode(item["sentence_chunk"])

100%|██████████| 127/127 [01:14<00:00,  1.71it/s]

CPU times: user 2min 39s, sys: 11.4 s, total: 2min 50s
Wall time: 1min 14s





This took much longer using the GPU on the MacBook. It would be best to stick with the CPU. 

What we could experiment with though, is using a GPU such as the Nvidia Geforce on the windows machine


In the meantime we can increase the speed by handling batched predictions. This will compute multiple samples at once.

This will perform batched operations by turning text samples into a single list and then passing that list to our embedding model.

In [146]:
# turn text chunks into single list
text_chunks = [item["sentence_chunk"] for item in pages_and_chunks_over_min_token_len]

In [147]:
%%time

# embed all text in batches
text_chunk_embeddings = embedding_model.encode(text_chunks,
                                                batch_size=32, # you can use different batch sizes for speed/peformance
                                                convert_to_tensor=True) # optional to return embeddings as tensor instead of array

text_chunk_embeddings

CPU times: user 6.32 s, sys: 968 ms, total: 7.29 s
Wall time: 27.6 s


tensor([[-0.0060, -0.0327, -0.0125,  ..., -0.0285,  0.0053, -0.0359],
        [ 0.0165, -0.0025,  0.0069,  ..., -0.0388, -0.0244, -0.0341],
        [ 0.0468, -0.0046,  0.0281,  ..., -0.0486, -0.0220, -0.0318],
        ...,
        [ 0.0342, -0.0009, -0.0039,  ..., -0.0378, -0.0290, -0.0353],
        [ 0.0364, -0.0775,  0.0157,  ..., -0.0401, -0.0294, -0.0569],
        [-0.0008, -0.0429, -0.0078,  ..., -0.0224, -0.0170, -0.0680]],
       device='mps:0')

This was still slower than the CPU. So for now, we will stick with the CPU.

**Save embeddings to file**

In [166]:
# Save to file
text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_token_len)
embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index=False)

In [167]:
# import saved file and view
text_chunks_and_embeddings_df_load = pd.read_csv(embeddings_df_save_path)
text_chunks_and_embeddings_df_load.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,0,U. S. Department of Transportation Federal Avi...,557,75,139.25,[-6.03953516e-03 -3.27171981e-02 -1.24537116e-...
1,1,8/27/18 AC 61-65H ii CONTENTS Paragraph Page 1...,1344,231,336.0,[ 1.64755117e-02 -2.51875585e-03 6.85117068e-...
2,2,8/27/18 AC 61-65H iii 31 Expired Flight Instr...,495,81,123.75,[ 4.67593521e-02 -4.63527534e-03 2.81367991e-...
3,3,8/27/18 AC 61-65H 1 1 PURPOSE OF THIS ADVISOR...,1112,158,278.0,[-5.55143319e-03 -5.90340756e-02 3.23559437e-...
4,3,This AC is an aviation safety resource that he...,971,142,242.75,[-3.57428528e-02 -7.42107257e-02 1.04744267e-...


**Save embeddings to Pinecone**

Now that our embeddings are created we need to save them to a vector database

We will need to create an Pinecone index. This will be the location where we will store our embeddings. 

First, we will initialize a connection

In [148]:
# Initialize connection
import dotenv
dotenv.load_dotenv()

from pinecone import Pinecone

# configure client
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))


Now that we configured a client to initialize a connection we will setup an index. 

There might be one already created so we need to take that into consideration.

In [151]:
from pinecone import ServerlessSpec

cloud = os.environ.get('PINECONE_CLOUD') or 'aws'
region = os.environ.get('PINECONE_REGION') or 'us-east-1'

spec = ServerlessSpec(cloud=cloud, region=region)

In [152]:
index_name = "rag-retriever-v1"

In [155]:
# check if index already exists
if index_name not in pc.list_indexes().names():
    # if does not exist, create index
    pc.create_index(
        index_name,
        dimension=768,
        metric="cosine",
        spec=spec,
    )
#connect to index
index = pc.Index(index_name)
# view index stats
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

There is currently no vector embeddings inside the index. Now we can start adding embeddings

In [172]:
# add ids to embeddings
ids = [str(i) for i in range(len(pages_and_chunks_over_min_token_len))]

# embeddings
embeddings_list = [item.get("embedding") for item in pages_and_chunks_over_min_token_len if item.get("embedding") is not None]

# function to batch embeddings
batch_size = 10 # adjust if needed

def batch_data(embeddings, ids, batch_size):
    for i in range(0, len(embeddings_list), batch_size):
        yield ids[i:i+batch_size], embeddings[i:i+batch_size]

# upsert embeddings to Pinecone in batches
for batch_ids, batch_embeddings in tqdm(batch_data(embeddings_list, ids, batch_size), total=len(embeddings_list) // batch_size):
    vectors = [
        (batch_ids[j], batch_embeddings[j].tolist())
        for j in range(len(batch_ids))
    ]
    index.upsert(vectors=vectors)
    

13it [00:03,  3.79it/s]                        
