In [1]:
import os
import requests
import fitz
from tqdm.auto import tqdm
import random
import pandas as pd 
from spacy.lang.en import English
import re
from sentence_transformers import SentenceTransformer
import torch
import numpy as np 
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.utils import is_flash_attn_2_available 

In [2]:
pdf_path = "Human-nutrition-text.pdf"

In [None]:

if not os.path.exists(pdf_path):
    print("file doesn't exist , downloading ...")
    url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"

    filename = pdf_path

    response = requests.get(url)

    if response.status_code == 200 :
        # open the file in binary write mode
        with open(filename , 'wb') as file:
            file.write(response.content)
        print(f"file has been downloaded and saved as {filename}")
    else :
        print(f"download failed")
else :
    print(f"{pdf_path} already exist")

Human-nutrition-text.pdf already exist


Read the PDF file and convert it into chunks

In [6]:
def open_and_read_pdf(pdf_path : str) -> list[dict] :

    doc = fitz.open(pdf_path)
    pages_and_texts = []

    for page_number , page in tqdm(enumerate(doc)) :
        text = page.get_text()
        pages_and_texts.append({'page_numer' : page_number - 41,
                                'page_char_count' : len(text),
                                'page_word_count' : len(text.split(" ")),
                                'page_sentence_count_raw' : len(text.split(". ")),
                                'page_token_count' : len(text) /4 ,
                                'text' : text
                                })
        
    return pages_and_texts
        
def text_formatter(text : str) -> str :
    cleaned_text = text.replace("\n", " ").strip()

    return cleaned_text

pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)

0it [00:00, ?it/s]

In [7]:
df = pd.DataFrame(pages_and_texts)
df

Unnamed: 0,page_numer,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,-41,31,5,1,7.75,Human Nutrition: 2020 Edition \n
1,-40,0,1,1,0.00,
2,-39,322,43,1,80.50,Human Nutrition: 2020 \nEdition \nUNIVERSITY O...
3,-38,214,31,2,53.50,Human Nutrition: 2020 Edition by University of...
4,-37,799,115,2,199.75,Contents \nPreface \nUniversity of Hawai‘i at ...
...,...,...,...,...,...,...
1203,1162,1677,217,18,419.25,39. Exercise 10.2 & 11.3 reused “Egg Oval Food...
1204,1163,1618,223,10,404.50,Images / Pixabay License; “Pumpkin Cartoon Ora...
1205,1164,1716,229,13,429.00,Flashcard Images \nNote: Most images in the fl...
1206,1165,1734,233,13,433.50,ShareAlike \n11. Organs reused “Pancreas Organ...


In [8]:
nlp = English()
nlp.add_pipe("sentencizer")

doc = nlp("This is a sentence. This another sentence.")
list(doc.sents)

[This is a sentence., This another sentence.]

In [9]:
for item in tqdm(pages_and_texts) :
    item["sentences"] = list(nlp(item["text"]).sents)

    item["sentences"] = [str(sentence) for sentence in item["sentences"]]
    item["page_sentence_count_spacy"] = len(item["sentences"])

  0%|          | 0/1208 [00:00<?, ?it/s]

In [11]:
df.head()

Unnamed: 0,page_numer,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,-41,31,5,1,7.75,Human Nutrition: 2020 Edition \n
1,-40,0,1,1,0.0,
2,-39,322,43,1,80.5,Human Nutrition: 2020 \nEdition \nUNIVERSITY O...
3,-38,214,31,2,53.5,Human Nutrition: 2020 Edition by University of...
4,-37,799,115,2,199.75,Contents \nPreface \nUniversity of Hawai‘i at ...


In [12]:
num_sentence_chunk_size = 10 

# Create a function that recursively splits a list into desired sizes
def split_list(input_list: list, 
               slice_size: int) -> list[list[str]]:
    return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]

# Loop through pages and texts and split sentences into chunks
for item in tqdm(pages_and_texts):
    item["sentence_chunks"] = split_list(input_list=item["sentences"],
                                         slice_size=num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

  0%|          | 0/1208 [00:00<?, ?it/s]

In [16]:

# Split each chunk into its own item
pages_and_chunks = []
for item in tqdm(pages_and_texts):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_numer"]
        
        # Join the sentences together into a paragraph-like structure, aka a chunk (so they are a single string)
        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk) # ".A" -> ". A" for any full-stop/capital letter combo 
        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        # Get stats about the chunk
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4 # 1 token = ~4 characters
        
        pages_and_chunks.append(chunk_dict)

  0%|          | 0/1208 [00:00<?, ?it/s]

In [17]:
df = pd.DataFrame(pages_and_chunks)

df.head(2)

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count
0,-41,Human Nutrition: 2020 Edition,29,4,7.25
1,-39,Human Nutrition: 2020 \nEdition \nUNIVERSITY O...,320,42,80.0


In [18]:
# Show random chunks with under 30 tokens in length
min_token_length = 30
filt = df["chunk_token_count"] <= min_token_length

for row in df[filt].sample(5).iterrows() :
    print(f'Chunk token count: {row[1]["chunk_token_count"]} | Text: {row[1]["sentence_chunk"]}')

Chunk token count: 3.5 | Text: 190 | Chloride
Chunk token count: 16.25 | Text: PART XVII 
CHAPTER 17. FOOD SAFETY 
Chapter 17. Food Safety | 985
Chunk token count: 12.0 | Text: Polan EU, Taylor DR. (2003), 
782 | Introduction
Chunk token count: 22.0 | Text: Advanced nutrition and human metabolism. Boston, MA: 
Cengage Learning.
Molybdenum | 693
Chunk token count: 9.0 | Text: 354 | A Personal Choice about Lipids


In [19]:
pages_and_chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")
pages_and_chunks_over_min_token_len[:2]

[{'page_number': -39,
  'sentence_chunk': 'Human Nutrition: 2020 \nEdition \nUNIVERSITY OF HAWAI‘I AT MĀNOA \nFOOD SCIENCE AND HUMAN \nNUTRITION PROGRAM \nALAN TITCHENAL, SKYLAR HARA, \nNOEMI ARCEO CAACBAY, WILLIAM \nMEINKE-LAU, YA-YUN YANG, MARIE \nKAINOA FIALKOWSKI REVILLA, \nJENNIFER DRAPER, GEMADY \nLANGFELDER, CHERYL GIBBY, CHYNA \nNICOLE CHUN, AND ALLISON \nCALABRESE',
  'chunk_char_count': 320,
  'chunk_word_count': 42,
  'chunk_token_count': 80.0},
 {'page_number': -38,
  'sentence_chunk': 'Human Nutrition: 2020 Edition by University of Hawai‘i at Mānoa Food Science and \nHuman Nutrition Program is licensed under a Creative Commons Attribution 4.0 \nInternational License, except where otherwise noted.',
  'chunk_char_count': 212,
  'chunk_word_count': 30,
  'chunk_token_count': 53.0}]

In [25]:
df.describe()


Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,1843.0,1843.0,1843.0,1843.0
mean,583.381443,749.149756,109.727075,187.287439
std,347.78867,456.156581,70.817254,114.039145
min,-41.0,12.0,3.0,3.0
25%,280.5,320.0,42.0,80.0
50%,586.0,762.0,110.0,190.5
75%,890.0,1136.5,170.0,284.125
max,1166.0,1868.0,296.0,467.0


In [12]:
import base64
from openai import OpenAI

OPEN_API_KEY = os.getenv("OPEN_API_KEY")
OPEN_API_KEY


'get-token-here-from-openai'

In [29]:
client = OpenAI(api_key=OPEN_API_KEY)

def get_embedding(text, model="text-embedding-3-small"):
   text = text.replace("\n", " ")
   return client.embeddings.create(input = [text], model=model).data[0].embedding


In [34]:
emd = get_embedding(pages_and_chunks_over_min_token_len[0]['sentence_chunk'])
len(emd)

1536

In [35]:
for index, page_cont in enumerate(pages_and_chunks_over_min_token_len):
    emb_list = get_embedding(page_cont['sentence_chunk'])
    if isinstance(emb_list, list):
        page_cont['embedding_list'] = emb_list
        pages_and_chunks_over_min_token_len[index] = page_cont
    else:
        print("No data at index", index)

In [38]:
import sys

sys.getsizeof(pages_and_chunks_over_min_token_len)



14360

In [40]:
from uuid import uuid4

vector_pages_list = []
for index, page_cont in enumerate(pages_and_chunks_over_min_token_len):
    page_cont['id'] = uuid4().hex
    page_cont['embedding_numpy'] = np.array(page_cont['embedding_list'])
    del page_cont['embedding_list']
    vector_pages_list.append(page_cont)

In [41]:
vector_pages_list[:2]

[{'page_number': -39,
  'sentence_chunk': 'Human Nutrition: 2020 \nEdition \nUNIVERSITY OF HAWAI‘I AT MĀNOA \nFOOD SCIENCE AND HUMAN \nNUTRITION PROGRAM \nALAN TITCHENAL, SKYLAR HARA, \nNOEMI ARCEO CAACBAY, WILLIAM \nMEINKE-LAU, YA-YUN YANG, MARIE \nKAINOA FIALKOWSKI REVILLA, \nJENNIFER DRAPER, GEMADY \nLANGFELDER, CHERYL GIBBY, CHYNA \nNICOLE CHUN, AND ALLISON \nCALABRESE',
  'chunk_char_count': 320,
  'chunk_word_count': 42,
  'chunk_token_count': 80.0,
  'id': '1bb0eb1b87a146f8986f42cfe65bcc4f',
  'embedding_numpy': array([-0.01989893,  0.01859011,  0.0209169 , ...,  0.00927082,
         -0.01116134, -0.01302762])},
 {'page_number': -38,
  'sentence_chunk': 'Human Nutrition: 2020 Edition by University of Hawai‘i at Mānoa Food Science and \nHuman Nutrition Program is licensed under a Creative Commons Attribution 4.0 \nInternational License, except where otherwise noted.',
  'chunk_char_count': 212,
  'chunk_word_count': 30,
  'chunk_token_count': 53.0,
  'id': 'f4c9c0382f4c450fbf051e

In [47]:
import h5py

recreate_data = []

# Create an HDF5 file
with h5py.File('data.h5', 'w') as f:
    for i, item in enumerate(vector_pages_list):
        group = f.create_group(f'dict_{i}')
        for key, value in item.items():
            group.create_dataset(key, data=value)



In [50]:
# Verify the content of the HDF5 file
with h5py.File('data.h5', 'r') as f:
    for group_name in f:
        print(f'Group: {group_name}')
        group = f[group_name]
        # dict_grp = {key: group[key][:] for key in group}
        for key in group:
            
            print(f'  {key}: {group[key]}')
        # recreate_data.append(dict_grp)

Group: dict_0
  chunk_char_count: <HDF5 dataset "chunk_char_count": shape (), type "<i8">
  chunk_token_count: <HDF5 dataset "chunk_token_count": shape (), type "<f8">
  chunk_word_count: <HDF5 dataset "chunk_word_count": shape (), type "<i8">
  embedding_numpy: <HDF5 dataset "embedding_numpy": shape (1536,), type "<f8">
  id: <HDF5 dataset "id": shape (), type "|O">
  page_number: <HDF5 dataset "page_number": shape (), type "<i8">
  sentence_chunk: <HDF5 dataset "sentence_chunk": shape (), type "|O">
Group: dict_1
  chunk_char_count: <HDF5 dataset "chunk_char_count": shape (), type "<i8">
  chunk_token_count: <HDF5 dataset "chunk_token_count": shape (), type "<f8">
  chunk_word_count: <HDF5 dataset "chunk_word_count": shape (), type "<i8">
  embedding_numpy: <HDF5 dataset "embedding_numpy": shape (1536,), type "<f8">
  id: <HDF5 dataset "id": shape (), type "|O">
  page_number: <HDF5 dataset "page_number": shape (), type "<i8">
  sentence_chunk: <HDF5 dataset "sentence_chunk": shape (

In [52]:
import h5py
import numpy as np

# Example list of dictionaries with mixed data types

# Function to store data in HDF5 file
def save_to_hdf5(data, filename):
    with h5py.File(filename, 'w') as f:
        for i, item in enumerate(data):
            group = f.create_group(f'dict_{i}')
            for key, value in item.items():
                if isinstance(value, np.ndarray):
                    group.create_dataset(key, data=value)
                elif isinstance(value, str):
                    dt = h5py.special_dtype(vlen=str)
                    group.create_dataset(key, (1,), dtype=dt, data=value)
                elif isinstance(value, int):
                    group.create_dataset(key, (1,), dtype='i', data=value)

# Function to load data from HDF5 file
def load_from_hdf5(filename):
    data = []
    with h5py.File(filename, 'r') as f:
        for group_name in f:
            group = f[group_name]
            item = {}
            for key in group:
                value = group[key][()]
                if isinstance(value, bytes):
                    value = value.decode('utf-8')
                if group[key].shape == (1,) and group[key].dtype == 'i':
                    value = int(value)
                item[key] = value
            data.append(item)
    return data

# Save the data to HDF5
save_to_hdf5(vector_pages_list, 'mixed_data.h5')

# Load the data back from HDF5
loaded_data = load_from_hdf5('mixed_data.h5')

# Verify the content
# for item in loaded_data:
    # print(item)


  value = int(value)


In [54]:
import pickle

with open("data.pkl", "wb") as f:
    pickle.dump(vector_pages_list, f)

In [3]:
import pickle

with open("data.pkl", "rb") as f:
    reloaded_data = pickle.load(f)
    
reloaded_data[:3]

[{'page_number': -39,
  'sentence_chunk': 'Human Nutrition: 2020 \nEdition \nUNIVERSITY OF HAWAI‘I AT MĀNOA \nFOOD SCIENCE AND HUMAN \nNUTRITION PROGRAM \nALAN TITCHENAL, SKYLAR HARA, \nNOEMI ARCEO CAACBAY, WILLIAM \nMEINKE-LAU, YA-YUN YANG, MARIE \nKAINOA FIALKOWSKI REVILLA, \nJENNIFER DRAPER, GEMADY \nLANGFELDER, CHERYL GIBBY, CHYNA \nNICOLE CHUN, AND ALLISON \nCALABRESE',
  'chunk_char_count': 320,
  'chunk_word_count': 42,
  'chunk_token_count': 80.0,
  'id': '1bb0eb1b87a146f8986f42cfe65bcc4f',
  'embedding_numpy': array([-0.01989893,  0.01859011,  0.0209169 , ...,  0.00927082,
         -0.01116134, -0.01302762])},
 {'page_number': -38,
  'sentence_chunk': 'Human Nutrition: 2020 Edition by University of Hawai‘i at Mānoa Food Science and \nHuman Nutrition Program is licensed under a Creative Commons Attribution 4.0 \nInternational License, except where otherwise noted.',
  'chunk_char_count': 212,
  'chunk_word_count': 30,
  'chunk_token_count': 53.0,
  'id': 'f4c9c0382f4c450fbf051e

In [6]:
import sys
sys.getsizeof(reloaded_data)


13528

In [8]:
from pinecone.grpc import PineconeGRPC as Pinecone, ServerlessSpec


ImportError: cannot import name 'ServerlessSpec' from 'pinecone.grpc' (/home/animeshmohapatra/miniconda3/envs/sb_new/lib/python3.9/site-packages/pinecone/grpc/__init__.py)

In [4]:
# TODO: remove api key
pine_cone_api = os.getenv("PINE_CONE_API_KEY")

ba
pc = Pinecone(api_key=pine_cone_api)

index_name = "human-nutrition-index"

if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(
            cloud='aws', 
            region='us-east-1'
        ) 
    ) 
    
index = pc.Index(index_name)

index.upsert(
    vectors=[
        {"id": "vec1", "values": [1.0, 1.5]},
        {"id": "vec2", "values": [2.0, 1.0]},
        {"id": "vec3", "values": [0.1, 3.0]},
    ],
    namespace="ns1"
)

In [None]:
import chromadb
chroma_client = chromadb.Client()


In [None]:
collection = chroma_client.create_collection(name="human-nutrition-index")


In [9]:
from transformers import AutoTokenizer, AutoModelForCausalLM


In [None]:
AutoModelForCausalLM()