In [1]:
from pathlib import Path
from loguru import logger
from typing import List
from tqdm import tqdm
from llama_index.core import SimpleDirectoryReader, Document

# Folder containing the input PDFs (absolute, Windows-style path).
DATA_DIR = Path("D:\\NIPL2093\\work\\long_doc_summarization\\data\\")
# File name of the first judgment PDF used to exercise the reader.
TEST_FILE = "37657_2017_1_1501_39247_Judgement_19-Oct-2022_149pg.pdf"
# Full path to that PDF inside DATA_DIR.
test_file_path = DATA_DIR.joinpath(TEST_FILE)

In [2]:
def read_pdf_file(file_path: str | Path, use_llama_parse: bool = False) -> List[Document]:
    """
    Read a single PDF file and return its content as a list of Document objects.

    On success, the text of every returned document is also written (each followed
    by a blank line) to ``DATA_DIR / "converted_markdown" / "<pdf stem>.md"``.
    Writing that copy is best-effort: if it fails, a warning is logged and the
    parsed documents are still returned.

    Args:
    file_path (str | Path): Path to the PDF file.
    use_llama_parse (bool): Whether to use LlamaParse for parsing.

    Returns:
    List[Document]: List of Document objects containing the PDF content, or an
    empty list if the reader produced no documents or reading failed.
    """
    logger.info(f"Attempting to read PDF file: {file_path}")

    try:
        reader_kwargs = {}
        if use_llama_parse:
            # Built inside the try so that a parser set-up failure is logged and
            # reported as "no documents", like every other read failure.
            # NOTE(review): LlamaParse is not imported in the notebook's imports
            # cell; it must be brought into scope before this branch can work.
            reader_kwargs["file_extractor"] = {".pdf": LlamaParse(result_type="markdown")}

        documents = SimpleDirectoryReader(
            input_files=[file_path], **reader_kwargs
        ).load_data()

        if not documents:
            logger.warning(f"PDF file is empty: {file_path}")
            return []

        # TODO: citation preprocessing (preprocess_citations on each doc.text) is
        # currently disabled, so the message below only reports the read.
        logger.success(f"Successfully read PDF file: {file_path}")
        logger.debug(f"Number of documents: {len(documents)}")
        logger.debug(f"First document sample: {documents[0].text[:100]}")
    except Exception as e:
        logger.error(f"Error reading PDF file {file_path}: {str(e)}")
        return []

    # Save the extracted text next to the data. This is a convenience copy, so a
    # failure here must not throw away the documents that were already parsed.
    output_dir = DATA_DIR / "converted_markdown"
    output_file = output_dir / f"{Path(file_path).stem}.md"
    try:
        output_dir.mkdir(parents=True, exist_ok=True)
        with open(output_file, "w", encoding="utf-8") as f:
            for doc in documents:
                f.write(doc.text + "\n\n")
    except Exception as e:
        logger.warning(f"Could not save markdown content to {output_file}: {str(e)}")
    else:
        logger.info(f"Saved markdown content to: {output_file}")

    return documents

In [3]:
# Read the first test PDF with the default reader (use_llama_parse is left at False);
# the path is converted to str before being passed in.
documents: List[Document] = read_pdf_file(str(test_file_path))

[32m2024-10-24 14:17:08.804[0m | [1mINFO    [0m | [36m__main__[0m:[36mread_pdf_file[0m:[36m15[0m - [1mAttempting to read PDF file: D:\NIPL2093\work\long_doc_summarization\data\37657_2017_1_1501_39247_Judgement_19-Oct-2022_149pg.pdf[0m
[32m2024-10-24 14:17:12.236[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mread_pdf_file[0m:[36m30[0m - [32m[1mSuccessfully read and preprocessed PDF file: D:\NIPL2093\work\long_doc_summarization\data\37657_2017_1_1501_39247_Judgement_19-Oct-2022_149pg.pdf[0m
[32m2024-10-24 14:17:12.236[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mread_pdf_file[0m:[36m31[0m - [34m[1mNumber of documents: 149[0m
[32m2024-10-24 14:17:12.236[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mread_pdf_file[0m:[36m32[0m - [34m[1mFirst document sample:  1 
REPORTABLE  
 
IN THE SUPREME COURT OF INDIA  
     CIVIL APPELLATE JURISDICTION  
 
CIVIL APPEA[0m
[32m2024-10-24 14:17:12.253[0m | [1mINFO    [0m | [36m__main__[0m:[36m

In [12]:
# Inspect the raw extracted text of the first Document (index 0).
documents[0].text

' 1 \nREPORTABLE  \n \nIN THE SUPREME COURT OF INDIA  \n     CIVIL APPELLATE JURISDICTION  \n \nCIVIL APPEAL NO. 21762 OF 2017  \n \nASSISTANT COMMISSIONER OF  \nINCOME TAX (EXEMPTIONS)            APPELLANT(S)  \n \nVERSUS  \n \nAHMEDABAD URBAN DEVELOPMENT  \nAUTHORITY                   RESPONDENT(S)  \n \nWITH  \n \nC.A. No. 8193/2012; C.A. No. 5057/2012; C.A. No. 5058/2014; C.A. No. 9974/2018; C.A. No. \n5056/2012; C.A. No. 4196/2015; C.A. No. 4374/2015; C.A. No. 9380/2017; C.A. No. \n13071/2017; C.A. No. 12058/2017; C.A. No. 16375/2017; C.A. No. 12869/2017; C.A. No. \n17527/2017; C.A. No. 21845/2017; C.A. No. 5719/2018; C.A. No. 9886/2018; C.A. No. \n9200/2018; C.A. No. 9860/2018; C.A. No. 10114/2018; C.A. No. 1643/2019; C.A. No. \n3596/2018; C.A. No. 6762/2018; C.A. No. 3972/2018; C.A. No. 3343/2018; C.A. No. 3359/2018; \nC.A. No. 3971/2018; C.A. No. 3347/2018; C.A. No. 6489/2018; C.A. No. 10598/2018; C.A. No. \n7643/2018; C.A. No. 8321/2018; C.A. No. 8554/2018; C.A. No. 9172/2018;

In [6]:
# Inspect the raw extracted text of the Document at index 3.
documents[3].text

' 4 \n1. Leave grante d in all matters where leave has not already been granted.  \nC.A. No. 21762/2017 ( Assistant Commission of Income Tax, Exemptions v. \nAhmedabad Urban Development Authority ) is taken as the lead matter .  \n2.  Religious and charitable trusts have existed in one form or the other, tracing \ntheir origins to the instinct of benevolence, which is part of human nature.  Indian \nphilanthropy has enriched its cultural heritage, particu larly in catering to the \neducational, medical, socio -economic, and religious needs of the people. Here its \nrole has been supplementary to the efforts of the State, which has recognized the \npublic utility of this impulse, and granted tax exemptions.  Indian income -tax laws \nhave favoured charities, even granted preferential treatment since 1886. The law, \nwhile granting exemption to income from religious and charitable  trusts has taken \neffective measures to minimise misuse of trust funds. As a result, a \ncharita ble trust

In [8]:
# Inspect the raw extracted text of the Document at index 4.
documents[4].text

' 5 \n2(15)2 of the IT Act  introduced by amendment w.e.f. 01.04.2009. It is necessary, \nat this stage, to notice that the IT Act visualized three kinds of charitable \npurposes: medical relief, education, and relief for the poor – which are described \nhereafter as “per se purposes” . To this list, Parliament has, by amendments, \nadded other categories, such as preservation of environment (including \nwatersheds, forests, and wildlife) and preservation of monuments or places or \nobjects of artistic or historic interest, and yoga. The last – or the residual purpose \nincluded by the definition - is “advancement of any other object of general public \nutility”  (hereafter referred to as  “GPU category”), which is the subject of \ninterpretation in the present case.  \n5. The Director General of Income Tax f or exemptions, Commissioner of \nIncome Tax (“CIT”) in various states, and other officials of the Income tax \ndepartment (hereafter compendiously referred to as “the revenue”) ha

In [9]:
# Inspect the raw extracted text of a Document near the end of the file (index 123).
documents[123].text

' 124 \nthese state cricket associations with the Board of Cricket Control of India (BCCI), \nthe amounts received by these associations from BCCI were in the nature of \nconsideration or fees, for granting media right s, and collecting their share, among \nother things. This amounted to a business or commercial activity. It would, in this \ncontext, be useful to quote the observations set out in the ITAT’s order (which \nwere part of the commissioner’s order). The Commissione r had taken note of \nassessment proceedings in relation to BCCI, and set out its submissions:  \n“9.7.2 The AO of BCCI, based on the communication of DIT(E), Mumbai, has \nnot granted benefit of section 11 & 12 of the Act to BCCI. The stand taken by \nBCCI during its assessment proceedings is mentioned below. The BCCI vide \nits submission dated 03/12/2012 to the AO has explained its relationship with \nState Cricket Association as follows: - \n"1. BCCI is society registered under the Tamil Nadu Societies Regist

In [10]:
# Second sample judgment from DATA_DIR, read with the same default settings
# (the Path object is passed directly this time).
test_file_path_2 = DATA_DIR.joinpath("51059_2023_1_1502_56228_Judgement_03-Oct-2024_148pg.pdf")
documents_2 = read_pdf_file(test_file_path_2)

[32m2024-10-24 14:33:55.838[0m | [1mINFO    [0m | [36m__main__[0m:[36mread_pdf_file[0m:[36m15[0m - [1mAttempting to read PDF file: D:\NIPL2093\work\long_doc_summarization\data\51059_2023_1_1502_56228_Judgement_03-Oct-2024_148pg.pdf[0m
[32m2024-10-24 14:33:57.843[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mread_pdf_file[0m:[36m30[0m - [32m[1mSuccessfully read and preprocessed PDF file: D:\NIPL2093\work\long_doc_summarization\data\51059_2023_1_1502_56228_Judgement_03-Oct-2024_148pg.pdf[0m
[32m2024-10-24 14:33:57.844[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mread_pdf_file[0m:[36m31[0m - [34m[1mNumber of documents: 148[0m
[32m2024-10-24 14:33:57.844[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mread_pdf_file[0m:[36m32[0m - [34m[1mFirst document sample: 2024 INSC 753
1 
 Reportable  
IN THE SUPREME COURT OF INDIA  
CIVIL ORIGINAL JURISDICTION  
 
 
Wri[0m
[32m2024-10-24 14:33:57.851[0m | [1mINFO    [0m | [36m__main__[0m:[36m

In [11]:
# Inspect the raw extracted text of the second file's Document at index 3.
documents_2[3].text

'PART I & II  \n4 \n I.  The Writ Petition  \n1 The petitioner, Sukanya Shantha, a journalist,  wrote an article “From \nSegregation to Labour, Manu’s Caste Law Governs the Indian Prison System”, which \nwas published on 10 December 2020. The article highlighted caste- based \ndiscrimination in the prisons in the country. The petitioner has sought directions for \nrepeal of the offending provisions in State prison manuals.  By an order dated 10 July \n2024, judgment was reserved.  We have heard a broad diversity of viewpoints from \nacross India.  Besides counsel for the petitioner and the intervenor, the Additional \nSolicitor General (ASG) of India  appeared for the Union of India. T he States of \nJharkhand, Uttar Pradesh, West Bengal, Maharashtra, Orissa, Karnataka, Andhra Pradesh,  and Tamil Nadu appeared through counsel.  \n \nII. Submissions  \n2 Dr. S. Muralidhar, Senior Advocate, appearing for the petitioner highlighted the \nissue of caste -based discrimination in the prisons

In [13]:
# Inspect the raw extracted text of the second file's Document at index 77.
documents_2[77].text

'PART XII  \n78 \n 128 The Court in State of Madhya Pradesh v. Ram Krishna Balothia216 held that \nthe offences under PoA Act “constitute a separate class and cannot be compared with \noffences under the Penal Code”. These offences are “committed to humiliate and subjugate members of Scheduled Castes and Scheduled Tribes with a view to keeping them in a state of servitude”, and “prevent them from leading a life of dignity and self -\nrespect”. The Court quoted the Statement of Objects and Reasons of the Act to highlight that “when members of the Scheduled Castes and Scheduled Tribes assert their rig hts and demand statutory protection, vested interests try to cow them down \nand terrorise them” if they are on anticipatory bail. For this reason, the Court dismissed a challenge to Section 18 of the PoA Act, which debarred the opportunity to seek anticipat ory bail in respect of offences committed under the Act.  \n129 In Safai Karamchari Andolan v. Union of India ,\n217 the Court noted t

In [14]:
# Inspect the raw extracted text of the second file's Document at index 9.
documents_2[9].text

"PART IV  \n10 \n “India’s founding fathers and mothers established in \nthe Constitution both the nation's ideals and the \ninstitutions and processes for achieving them. The \nideals were national unity and integrity and a democratic and equitable society. The new society was to be achieved through a social -economic \nrevolution pursued with a democratic spirit using constitutional, democratic institutions. I later came to \nthink of unity, social revolution, and democracy as \nthree strands of a seamless web. The founders \nbelieved that none of these goals was to be pursued, nor could any be achieved, separately . They were \nmutually dependent and had to be sought \ntogether.”\n20 \nMarc Galanter noted in this regard:  \n“Independent India embraced equality as a cardinal \nvalue against a background of elaborate, valued and \nclearly perceived inequalities. Her constitutional \npolicies to offset these proceeded from an awareness of the entrenched and cumulative nature \nof group