In [1]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from glob import glob
import os
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from dotenv import load_dotenv
import shutil
import gc
import time

In [29]:
import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objects as go

In [2]:
# Load environment variables from a .env file, overriding any values already
# set in the process (override=True).
# The location can be supplied through the DOTENV_PATH environment variable so
# the notebook is not tied to one machine; the original absolute path is kept
# as the fallback so existing setups behave exactly as before.
dotenv_path = os.environ.get("DOTENV_PATH", r"C:\Users\rajme\Documents\LLMs_\.env")
load_dotenv(dotenv_path=dotenv_path, override=True)

True

In [3]:
# Directory where the Chroma vector store is persisted.
# NOTE(review): the name is "chrome_store" — possibly a typo for "chroma_store";
# left unchanged because renaming it would move the on-disk store.
chroma_dir = "../chrome_store"

# Destructive: delete any previously persisted store so each run of the
# notebook starts from an empty directory.
if os.path.exists(chroma_dir):
    shutil.rmtree(chroma_dir)

In [5]:
# Load the PDF as a list of Documents (PyPDFLoader yields one Document per page).
# load() is used instead of load_and_split(): the latter is marked deprecated in
# LangChain and would pre-chunk the text with a default splitter, after which the
# notebook splits again with its own page_splitter. Splitting once, later, keeps
# chunk boundaries governed by a single, explicit configuration.
loader = PyPDFLoader(r"../books/vol1.pdf")
pages = loader.load()

In [6]:
# Stamp the same producer/creator attribution onto every loaded document,
# replacing whatever values the PDF metadata carried for those two keys.
for page in pages:
    page.metadata.update(producer="Rajit", creator="Rajit")

In [7]:
# Embedding client created with the library's default settings; the vectors
# inspected at the end of this notebook come back 1536-dimensional.
# NOTE(review): presumably reads the OpenAI API key from the environment
# populated by the load_dotenv call earlier — confirm.
embedding_model = OpenAIEmbeddings()

In [10]:
# Splitter used to cut the loaded documents into chunks of at most 1000
# characters, with 100 characters shared between neighbouring chunks.
page_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)

# Chroma vector store persisted under chroma_dir, using embedding_model to
# embed whatever is added to it.
vectorstore = Chroma(
    embedding_function=embedding_model,
    persist_directory=chroma_dir,
)

In [11]:
# Split the loaded documents into chunks using the splitter configured above.
chunks = page_splitter.split_documents(pages)

In [12]:
# Add the chunks to the vector store. The call returns the list of ids assigned
# to the stored chunks, which is what this cell displays.
# NOTE(review): presumably this is where the embedding API is called for every
# chunk, making it the expensive cell of the notebook — confirm.
vectorstore.add_documents(chunks)

['2093c08c-5b4d-4bb8-bacb-427b8018e733',
 '32f2375e-9949-4bb5-abac-2cf65823bb43',
 '0b3dcb43-a079-4df5-999f-8636ad79c782',
 '23622d7f-c0e9-4ded-b464-98a5d0ac7e80',
 'b42bcd11-ef43-41fb-86bb-22edef1fdc03',
 'd456598f-2cde-4255-a8e6-a39b9107c606',
 '7d33ae81-c00d-4d99-b7f6-c62470d1263a',
 'd25ed910-0737-45f1-93ee-6bca363891d5',
 'f00bb9b4-b629-48a1-bf47-87098195d413',
 '7b2db008-07eb-4538-ac91-1f529cb839bd',
 '776a3872-af20-43ff-88bb-3040ded1e9fa',
 '4d9cda51-c7cb-4360-b0cd-df14ad3f63a4',
 '66f43146-62ee-4ffd-9dca-aabae1c80ce8',
 'ae4b5af6-d4d6-403a-9f69-69118aff827b',
 'd63234f7-5cd7-4483-991e-6cf809e4ae8c',
 '01d5e25b-654b-4e5f-b85a-e3b0b727e1f7',
 '2c3a6a26-f433-49e3-8768-ffd93ad61d20',
 '405efc09-c523-4b04-91ef-077298b1ebb5',
 '1feb33ca-eb79-4a26-b788-8b63f6dc55eb',
 '54298b81-4894-482d-bd14-7bb6568c0f2a',
 'e6362735-0a7b-4221-a02f-a4303dda7182',
 '5bd34831-85a4-4acc-a6b5-6fee6d66f861',
 '2c95639c-9d41-458e-b1f8-a1bc53db2683',
 'db85b48b-afd0-4abb-85ff-2fbed4750485',
 '100959b1-69e0-

In [13]:
# Grab the collection object behind the LangChain wrapper so stored embeddings
# can be fetched directly.
# NOTE(review): `_collection` is underscore-prefixed (private by convention) and
# may change between langchain_chroma versions.
collection = vectorstore._collection

In [24]:
# Fetch at most 100 stored records, asking for the "embeddings" field to be
# included in the result.
sample_embedding = collection.get(limit=100, include=["embeddings"])

In [27]:
# Pull the embeddings out of the result; as the following cells show, this is a
# numpy array of shape (100, 1536).
vectors = sample_embedding["embeddings"]

In [30]:
# Inspect the container type of the retrieved embeddings.
type(vectors)

numpy.ndarray

In [31]:
# Number of vectors fetched and the length of each one.
vectors.shape

(100, 1536)

In [32]:
# Peek at the first embedding vector.
vectors[0]

array([ 0.00531599, -0.02999113, -0.02362815, ...,  0.00450205,
        0.0083016 , -0.02137206])