In [53]:
import chromadb
from typing import List, Dict, Any, Tuple
import os
from pypdf import PdfReader
import uuid

In [7]:
client = chromadb.PersistentClient("./chromaDB")
print(client)

<chromadb.api.client.Client object at 0x0000017263E80620>


In [9]:
collection = client.get_or_create_collection(name="test")
print(collection)

Collection(name=test)


In [11]:
print(collection.count())

0


In [13]:
collection.peek()

{'ids': [],
 'embeddings': array([], dtype=float64),
 'documents': [],
 'uris': None,
 'included': ['metadatas', 'documents', 'embeddings'],
 'data': None,
 'metadatas': []}

In [43]:
def load_text_files(root_dir: str) -> List[Tuple[str, str]]:
    docs = []
    for dirpath, _, filenames in os.walk(root_dir):
        for f in filenames:
            if f.endswith((".pdf")):
                reader = PdfReader(os.path.join(dirpath, f)) 
                num_pages = len(reader.pages)
                print(f"Number of pages: {num_pages}")
                full_text = ""
                for page_num in range(num_pages):
                    page = reader.pages[page_num]
                    full_text += page.extract_text()

                print("Full Text: ", full_text)
                print("****************")
                docs.append((os.path.join(dirpath, f), full_text))
        
        
    return docs

docs = load_text_files("local_docs")
print(docs)
print(len(docs))

Number of pages: 1
Full Text:  MAHANATI.  
 
Mahanati is one of the best movies in Indian cinema which truly depicts the ups and downs of 
Savithri amma's life which we all could connect to because we are also going through same 
emotions. 
It was directed by Nag Ashwin and produced my Vyjyanthi, Swapna movies by 2 daring and dashing 
sisters – Swapna, Priyanka Dutt. Music done by Mickey J Meyer, till-date, remains his best work and 
one couldn’t even imagine that he did this movie. Just a soothing, heart-touching songs and 
background score by this man. Just when you realize that music alone touches your heart and soul, 
there comes another man with his lyrics to make it even more worser – Sirivennala Seetharama 
Sastry – with Gelupuleni Samaram song. “naranaralona visham ayinidi prema”, SUMS IT ALL – PEAK 
CINEMA.   
One couldn’t end this without mentioning about Keerthi suresh, she literally lived in Savithri amma 
role and no better than anyone could portray her, sorry – no better 

In [47]:
texts = [t for _, t in docs]
print(texts)

["MAHANATI.  \n \nMahanati is one of the best movies in Indian cinema which truly depicts the ups and downs of \nSavithri amma's life which we all could connect to because we are also going through same \nemotions. \nIt was directed by Nag Ashwin and produced my Vyjyanthi, Swapna movies by 2 daring and dashing \nsisters – Swapna, Priyanka Dutt. Music done by Mickey J Meyer, till-date, remains his best work and \none couldn’t even imagine that he did this movie. Just a soothing, heart-touching songs and \nbackground score by this man. Just when you realize that music alone touches your heart and soul, \nthere comes another man with his lyrics to make it even more worser – Sirivennala Seetharama \nSastry – with Gelupuleni Samaram song. “naranaralona visham ayinidi prema”, SUMS IT ALL – PEAK \nCINEMA.   \nOne couldn’t end this without mentioning about Keerthi suresh, she literally lived in Savithri amma \nrole and no better than anyone could portray her, sorry – no better than anyone coul

In [49]:
metadatas = [{"path": p} for p, _ in docs]
metadatas

[{'path': 'local_docs\\Mahanati.pdf'},
 {'path': 'local_docs\\SagaraSangamam.pdf'},
 {'path': 'local_docs\\VaaranamAayiram.pdf'}]

In [55]:
ids = [str(uuid.uuid4()) for _ in docs]
ids

['815b14d6-1dae-4a50-847b-3159c605b9ad',
 '50824fca-9204-41b3-9abc-fd9c8c1d2b54',
 'dbc95660-2252-4245-a1de-c28633fa3557']

In [57]:
collection.add(
        documents=texts,
  metadatas=metadatas,
  ids=ids
 )

C:\Users\amrutha sai\.cache\chroma\onnx_models\all-MiniLM-L6-v2\onnx.tar.gz: 100%|█| 79.3M/79.3M [00:57<00:00, 1.44MiB/


In [59]:
collection.peek()

{'ids': ['815b14d6-1dae-4a50-847b-3159c605b9ad',
  '50824fca-9204-41b3-9abc-fd9c8c1d2b54',
  'dbc95660-2252-4245-a1de-c28633fa3557'],
 'embeddings': array([[-0.01306105, -0.00560636, -0.05814646, ..., -0.07089222,
          0.01127236, -0.02155333],
        [-0.02295787,  0.0407757 , -0.00211664, ..., -0.01304192,
         -0.0267619 ,  0.02111504],
        [-0.03030932, -0.02060116, -0.06263233, ..., -0.05818316,
          0.02634144,  0.04189683]]),
 'documents': ["MAHANATI.  \n \nMahanati is one of the best movies in Indian cinema which truly depicts the ups and downs of \nSavithri amma's life which we all could connect to because we are also going through same \nemotions. \nIt was directed by Nag Ashwin and produced my Vyjyanthi, Swapna movies by 2 daring and dashing \nsisters – Swapna, Priyanka Dutt. Music done by Mickey J Meyer, till-date, remains his best work and \none couldn’t even imagine that he did this movie. Just a soothing, heart-touching songs and \nbackground score by 

In [79]:
print(collection.count())

3


In [71]:
results = collection.get(include=['embeddings'])
print(results)

{'ids': ['815b14d6-1dae-4a50-847b-3159c605b9ad', '50824fca-9204-41b3-9abc-fd9c8c1d2b54', 'dbc95660-2252-4245-a1de-c28633fa3557'], 'embeddings': array([[-0.01306105, -0.00560636, -0.05814646, ..., -0.07089222,
         0.01127236, -0.02155333],
       [-0.02295787,  0.0407757 , -0.00211664, ..., -0.01304192,
        -0.0267619 ,  0.02111504],
       [-0.03030932, -0.02060116, -0.06263233, ..., -0.05818316,
         0.02634144,  0.04189683]]), 'documents': None, 'uris': None, 'included': ['embeddings'], 'data': None, 'metadatas': None}


In [77]:
print(len(results['embeddings'][0]))

384


In [91]:
results = collection.query(
    query_texts=[
        "What is my favorite all time movie?",
        "What is GOAT movie?"
    ],
    n_results=1,
    include =['metadatas', 'documents', 'embeddings','distances'],
)

print(results['documents'])
print(results['distances'])

[['I also like SagaraSangamam by Vishwanath, which truly depicts of the story of a loser. Same as in \nMahanti, this too depicts the ups-downs of life. But mahanati tops for me as favourite movie. '], ['I think, Vaaranam Aayiram deserves the place here in third position. Again, its also slice of coming-\nage movie depicting life. ']]
[[1.1203243732452393], [1.5175092220306396]]
