In [11]:
# load data
import os
import shutil
import PyPDF2
import spacy
from pathlib import Path

In [12]:
# embeddings
import numpy as np
import pandas as pd

In [13]:
#Load spacy English Corpus
!python -m spacy download en_core_web_sm -q
nlp = spacy.load('en_core_web_sm')

✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [14]:
# Module-level accumulators filled by process_pdf:
#   documents - sentence texts
#   metadatas - one {"pageID", "pdfID"} dict per sentence
#   ids       - one "<pdfID>-<pageID>-<sentenceID>" string per sentence
documents, metadatas, ids = [], [], []

In [15]:
# Create the output folder for the renamed PDF copies (and any missing
# parent directories); do nothing if it is already there.
destination_folder = '../data/out/pdfIDs'
os.makedirs(destination_folder, exist_ok=True)

In [16]:
# Function to process each PDF file
def process_pdf(pdf_path, pdfID):
    """Extract a PDF's text page by page and record every sentence.

    Each page is split into sentences with the module-level spaCy pipeline
    ``nlp``. For every sentence this function appends, as a side effect:

    * the sentence text to ``documents``,
    * ``{"pageID": <page index>, "pdfID": <pdfID>}`` (both as strings) to
      ``metadatas``,
    * ``"<pdfID>-<page index>-<sentence index>"`` to ``ids``.

    Page and sentence indices are zero-based. Calling this twice for the same
    PDF appends the same entries twice, so reset the three lists before
    re-processing.

    Args:
        pdf_path: Path of the PDF file to read.
        pdfID: Identifier of this PDF, used in the metadata and ids.

    Returns:
        A list with the extracted text of each page, in page order.
    """
    list_of_pages = []  # Text of each page, in page order

    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)

        for i, page in enumerate(reader.pages):
            # Normalise a falsy result (e.g. None) to "" so that spaCy always
            # receives a str and the returned list only ever contains strings.
            page_text = page.extract_text() or ""
            list_of_pages.append(page_text)

            # Split the page into sentences using spaCy
            for sentenceID, sent in enumerate(nlp(page_text).sents):
                documents.append(sent.text)
                metadatas.append({"pageID": str(i), "pdfID": str(pdfID)})
                ids.append(f"{pdfID}-{i}-{sentenceID}")

    return list_of_pages

In [17]:
# Process every PDF in the source folder and store a copy named after its pdfID
source_folder = "../data/source"

# os.listdir returns entries in arbitrary order, so sort them: otherwise the
# pdfID given to each file (and therefore every id and the <pdfID>.pdf copies)
# could change from one run or machine to the next.
# The extension check is case-insensitive (.pdf, .PDF, .Pdf, ...), and entries
# that are not regular files are skipped so that a directory named "x.pdf"
# cannot break open() inside process_pdf.
pdf_files = sorted(
    f for f in os.listdir(source_folder)
    if f.lower().endswith('.pdf') and os.path.isfile(os.path.join(source_folder, f))
)

for pdfID, pdf_file in enumerate(pdf_files):
    # Extract the sentences of this PDF into documents / metadatas / ids
    pdf_path = os.path.join(source_folder, pdf_file)
    list_of_pages = process_pdf(pdf_path, pdfID)

    # Copy the PDF into the destination folder as "<pdfID>.pdf"
    destination_path = os.path.join(destination_folder, f"{pdfID}.pdf")
    shutil.copy(pdf_path, destination_path)

In [18]:
# Assemble one row per sentence (id, text, metadata), save the table as CSV
# without the index column, and preview the first 50 rows.
df = pd.DataFrame(dict(id=ids, text=documents, metadata=metadatas))
df.to_csv('../data/out/pdf_data.csv', index=False)
df.head(50)


Unnamed: 0,id,text,metadata
0,0-0-0,Allauddin Mian & Ors.,"{'pageID': '0', 'pdfID': '0'}"
1,0-0-1,Sharif Mian & Anr vs State Of Bihar on 13\nApr...,"{'pageID': '0', 'pdfID': '0'}"
2,0-0-2,A.M. Ahmadi\n PETITIONER:\nALLAUDDIN...,"{'pageID': '0', 'pdfID': '0'}"
3,0-0-3,SHARIF MIAN & ANR.\n,"{'pageID': '0', 'pdfID': '0'}"
4,0-0-4,Vs,"{'pageID': '0', 'pdfID': '0'}"
5,0-0-5,.\nRESPONDENT:\n,"{'pageID': '0', 'pdfID': '0'}"
6,0-0-6,STATE OF BIHAR\nDATE OF JUDGMENT13/04/1989\nBE...,"{'pageID': '0', 'pdfID': '0'}"
7,0-0-7,Section 235 and\nsection 354(3)--Sentenc...,"{'pageID': '0', 'pdfID': '0'}"
8,0-0-8,"Indian Penal Code, 1860:","{'pageID': '0', 'pdfID': '0'}"
9,0-0-9,"Sections 34, 141, 149--Unlawful\nassembly--Fas...","{'pageID': '0', 'pdfID': '0'}"
