In [38]:
# load data
import os
import shutil
import PyPDF2
import spacy
from pathlib import Path

In [39]:
# embeddings
import numpy as np
import pandas as pd

In [40]:
#Load spacy English Corpus
!python -m spacy download en_core_web_sm -q
nlp = spacy.load('en_core_web_sm')

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [41]:
# Module-level accumulators filled while the PDFs are processed:
# sentence texts, their per-sentence metadata dicts, and their unique ids.
documents, metadatas, ids = [], [], []

In [42]:
# Make sure the output folder for the renamed PDF copies (pdfIDs) exists,
# creating any missing parent directories along the way.
destination_folder = '../data/out/pdfIDs'
os.makedirs(destination_folder, exist_ok=True)

In [43]:
# Function to process each PDF file
def process_pdf(pdf_path, pdfID):
    """Extract the text of every page of a PDF and index it sentence by sentence.

    For each sentence produced by the spaCy pipeline ``nlp``, this appends to
    the module-level lists: the sentence text to ``documents``, a dict with
    the keys ``"pageID"`` and ``"pdfID"`` (both values as strings) to
    ``metadatas``, and an id of the form
    ``"{pdfID}-{page index}-{sentence index}"`` to ``ids``.

    Args:
        pdf_path: Path of the PDF file to read.
        pdfID: Identifier of this PDF, used in the metadata and in the ids.

    Returns:
        A list with the extracted text of each page, in page order.
    """
    list_of_pages = []  # Extracted text, one string per page

    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)

        for page_index, page in enumerate(reader.pages):
            # Defensive: treat a missing extraction result as empty text so
            # that nlp() is never called with None.
            page_text = page.extract_text() or ""
            list_of_pages.append(page_text)

            # Split the page into sentences using spaCy; page and sentence
            # indices are both zero-based.
            for sentenceID, sent in enumerate(nlp(page_text).sents):
                documents.append(sent.text)
                metadatas.append({"pageID": str(page_index), "pdfID": str(pdfID)})
                ids.append(f"{pdfID}-{page_index}-{sentenceID}")

    return list_of_pages

In [44]:
# Iterate through the PDF files in source_folder and process them.
source_folder = "../data/source"

# os.listdir returns entries in arbitrary order, so sort the names to give each
# file the same pdfID on every run. The extension is matched case-insensitively
# so that names such as "x.Pdf" are picked up too.
pdf_files = sorted(f for f in os.listdir(source_folder) if f.lower().endswith('.pdf'))

# Start from empty accumulators so that re-running this cell does not append
# duplicate sentences and ids to the module-level lists.
documents.clear()
metadatas.clear()
ids.clear()

for pdfID, pdf_file in enumerate(pdf_files):
    # Process each PDF (fills documents / metadatas / ids as a side effect)
    pdf_path = os.path.join(source_folder, pdf_file)
    list_of_pages = process_pdf(pdf_path, pdfID)

    # Copy the PDF into destination_folder, renamed to "<pdfID>.pdf"
    destination_path = os.path.join(destination_folder, f"{pdfID}.pdf")
    shutil.copy(pdf_path, destination_path)

In [45]:
# Build one row per sentence (id, text, metadata), save the table as CSV
# without the index column, and preview the first 50 rows.
sentence_columns = {"id": ids, "text": documents, "metadata": metadatas}
df = pd.DataFrame(data=sentence_columns)
df.to_csv(path_or_buf='../data/out/pdf_data.csv', index=False)
df.head(n=50)


Unnamed: 0,id,text,metadata
0,0-0-0,IN THE HIGH COURT OF KERALA AT ERNAKULAM\nPRES...,"{'pageID': '0', 'pdfID': '0'}"
1,0-0-1,"NO. 970 OF 2022\nPETITIONERS:\n1DR.VIJIL,\nAGE...","{'pageID': '0', 'pdfID': '0'}"
2,0-0-2,"2DR.SONIA, \nSM HOSPITAL, THANA, \nKANNUR, PIN...","{'pageID': '0', 'pdfID': '0'}"
3,0-0-3,"3DR.B.V.BHAT, M/S. ASHOKA HOSPITAL, \nSOUTH BA...","{'pageID': '0', 'pdfID': '0'}"
4,0-0-4,"4DR.ASHOK RAJ, \nJYOTHIS HOSPITAL, PALLIKUNNU,...","{'pageID': '0', 'pdfID': '0'}"
5,0-0-5,"6DR.SUCHITHARA BHAT,\nASHOKA HOSPITAL, SOUTH B...","{'pageID': '0', 'pdfID': '0'}"
6,0-0-6,BY ADVS.\nS.GOPAKUMARAN NAIR (SR.)\n,"{'pageID': '0', 'pdfID': '0'}"
7,0-0-7,SOORAJ T.ELENJICKAL\nRENOY VINCENT\nARUN ROY\n...,"{'pageID': '0', 'pdfID': '0'}"
8,0-1-0,W.P.(C) No.970/2022\n: 2 :\nRESPONDENTS:\n,"{'pageID': '1', 'pdfID': '0'}"
9,0-1-1,"1AMBUJAKSHI .T.P.,\nW/O.JANARDHANAN, CHANDROTH...","{'pageID': '1', 'pdfID': '0'}"


EMBEDDINGS

In [46]:
import chromadb
from chromadb.config import Settings

# Persistent Chroma client stored under ../embeddings/example,
# configured with allow_reset enabled.
chroma_settings = Settings(allow_reset=True)
client = chromadb.PersistentClient(
    path="../embeddings/example",
    settings=chroma_settings,
)

In [47]:
# CAUTION CAUTION CAUTION Empties and completely resets the database. ⚠️ This is destructive and not reversible.
# client.reset() # Reset the database
# CAUTION CAUTION CAUTION Empties and completely resets the database. ⚠️ This is destructive and not reversible.

In [48]:
# Fetch the 'exampleDB' collection from the client, creating it if it does not exist yet
exampleDB = client.get_or_create_collection(name='exampleDB')

OperationalError: attempt to write a readonly database

In [None]:
# exampleDB.add(
#     documents=documents,
#     metadatas=metadatas,
#     ids=ids
# )

In [None]:
# Ask the collection for the 5 nearest stored sentences to a free-text query.
# NOTE(review): "Viction" looks like a typo for "Victim" — confirm before changing,
# since the query text determines the results.
query_text = "Viction went through hell"
results = exampleDB.query(query_texts=[query_text], n_results=5)

results

{'ids': [['0-3-1', '0-3-3', '0-7-1', '0-3-4', '0-10-7']],
 'distances': [[1.3381223627503263,
   1.5288106663427716,
   1.5588837089660148,
   1.6004951168959924,
   1.620079943962757]],
 'metadatas': [[{'pageID': '3', 'pdfID': '0'},
   {'pageID': '3', 'pdfID': '0'},
   {'pageID': '7', 'pdfID': '0'},
   {'pageID': '3', 'pdfID': '0'},
   {'pageID': '10', 'pdfID': '0'}]],
 'embeddings': None,
 'documents': [['There was no relief.',
   'After the treatment by the opposite\nparties, the complainant lost the sight of her left eye.',
   'The  question  whether  medical  negligence/\ndeficiency in medical services would fall within the ambit of\n‘service’ came up for consideration before the Hon’ble Apex\nCourt, in V.P. Shantha  (supra). \n',
   'The\ncomplainant alleged that loss of eye sight was due to medical\nnegligence and sought for a compensation of ₹32,52,000/-. \n',
   'The words “Medical Service” were not expressly']],
 'uris': None,
 'data': None}