In [1]:
import os
import re

import pdfplumber

In [2]:
# Location of the source PDF. Defaults to the original local copy; set the
# CRIMINAL_CODE_PDF environment variable to run the notebook on another
# machine without editing this cell.
pdf_path = os.environ.get(
    "CRIMINAL_CODE_PDF", "/home/aditya/Desktop/criminal-code-nepal.pdf"
)

In [3]:
# Extract the text of every page, in page order.
page_texts = []
with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        # Some pdfplumber versions return None (not "") for a page with no
        # extractable text, e.g. a scanned image; treat that as an empty page
        # instead of failing on `None + "\n"`.
        page_texts.append(page.extract_text() or "")

# Every page's text is followed by a newline, as before; joining once avoids
# rebuilding an ever-growing string on each page.
full_text = "".join(text + "\n" for text in page_texts)

In [4]:
# Sanity check: show the first 1000 characters of the extracted text.
print(full_text[:1000])  

Revised
The National Penal (Code) Act, 2017
Date of Authentication:
16 October 2017
Act number 36 of the year 2017
An Act Made To Amend And Consolidate Laws In Force Relating To
Criminal Offences
Preamble:
Whereas, it is expedient to provide for a timely code on criminal
offences, by amending and consolidating the laws in force relating to
criminal offences, in order to uphold morality, decency, etiquette,
convenience, economic interest of the public, by maintaining law and order
in the country, maintain harmonious relationship and peace among various
religious and cultural communities, and prevent and control criminal
offences;
Now, therefore, the Legislature-Parliament referred to in clause (1)
of Article 296 of the Constitution of Nepal has enacted this Act.
Part -1
General Provisions
Chapter-1
Preliminary
1. Short title and commencement: (1) This Act may be cited as the
"National Penal (Code) Act, 2017".
1
(2) It shall commence on 17 August 2018 (first day of the month
of Bhadra of

In [4]:
# Front matter: everything from the very start of the text up to (but not
# including) the "Preamble:" heading. DOTALL lets `.` span line breaks.
metadata_pattern = r"^(.*?)(?=\bPreamble:)"
metadata = [
    match.group(1)
    for match in re.finditer(metadata_pattern, full_text, flags=re.DOTALL)
]
metadata

['Revised\nThe National Penal (Code) Act, 2017\nDate of Authentication:\n16 October 2017\nAct number 36 of the year 2017\nAn Act Made To Amend And Consolidate Laws In Force Relating To\nCriminal Offences\n']

In [5]:
# The preamble runs from its "Preamble:" heading up to the first structural
# heading (a part heading, a "Chapter-N" heading, or the end of the text).
# The part heading is matched with optional whitespace around the hyphen
# ("Part -1", "Part-1", "Part - 1"), the same tolerance part_pattern uses, so
# a differently spaced heading cannot leak into the preamble.
preamble_pattern = r"(Preamble:.*?)(?=Part\s*-\s*\d+|Chapter-\d+|$)"
preamble = re.findall(preamble_pattern, full_text, re.DOTALL)
preamble

['Preamble:\nWhereas, it is expedient to provide for a timely code on criminal\noffences, by amending and consolidating the laws in force relating to\ncriminal offences, in order to uphold morality, decency, etiquette,\nconvenience, economic interest of the public, by maintaining law and order\nin the country, maintain harmonious relationship and peace among various\nreligious and cultural communities, and prevent and control criminal\noffences;\nNow, therefore, the Legislature-Parliament referred to in clause (1)\nof Article 296 of the Constitution of Nepal has enacted this Act.\n']

In [37]:
# A part heading is "Part", a hyphen and a number, with optional whitespace
# on either side of the hyphen (the extracted text contains "Part -1").
# The captured body runs up to the next part heading or the end of the text.
part_pattern = r"Part\s*-\s*\d+\s*(.*?)(?=Part\s*-\s*\d+|$)"

parts = [
    match.group(1)
    for match in re.finditer(part_pattern, full_text, flags=re.DOTALL)
]

len(parts)

3

In [38]:
# Display the captured body text of every part (the list built by re.findall).
parts

['General Provisions\nChapter-1\nPreliminary\n1. Short title and commencement: (1) This Act may be cited as the\n"National Penal (Code) Act, 2017".\n1\n(2) It shall commence on 17 August 2018 (first day of the month\nof Bhadra of the year 2075).\n2. Extra-territorial application of the Act: Any person who commits any\nof the following offences outside Nepal shall be punished under this Act\nas if he or she committed such offence in Nepal:\n(a) Any offence under Chapter 1 of Part 2 except Section\n54,\n(b) Any offence under Chapter 17 of Part 2 committed by\nkidnapping any person from Nepal,\n(c) Any offence under Sections 167, 276 and 279,\n(d) Any offence under Chapters 22 and 23 of Part 2\ncommitted with intent to bring into use in or import into\nNepal,\n(e) Any offence under this Act committed in the course of\ndischarging duties of any position in the Government of\nNepal, State Government or Local Level or a body\ncorporate under full or majority ownership or control of\nthe Gove

In [30]:
# Maps a part label ("Part-1", "Part-2", ...) to {chapter heading: chapter text}.
part_chapters = {}

# A chapter heading ("Chapter", optional hyphen, number) is matched at the
# start of a line; its text runs up to the next such heading or the end of
# the part.
chapter_pattern = r"^(Chapter\s*-?\s?\d+)(.*?)(?=\nChapter\s*-?\s?\d+|\Z)"

for idx, part in enumerate(parts, start=1):
    chapters = re.findall(chapter_pattern, part, flags=re.MULTILINE | re.DOTALL)
    chapters_by_heading = {}
    for heading, body in chapters:
        # Surrounding whitespace is dropped from the chapter text.
        chapters_by_heading[heading] = body.strip()
    part_chapters["Part-" + str(idx)] = chapters_by_heading

In [31]:
# Display the nested mapping: part label -> chapter heading -> chapter text.
part_chapters

{'Part-1': {'Chapter-1': 'Preliminary\n1. Short title and commencement: (1) This Act may be cited as the\n"National Penal (Code) Act, 2017".\n1\n(2) It shall commence on 17 August 2018 (first day of the month\nof Bhadra of the year 2075).\n2. Extra-territorial application of the Act: Any person who commits any\nof the following offences outside Nepal shall be punished under this Act\nas if he or she committed such offence in Nepal:\n(a) Any offence under Chapter 1 of Part 2 except Section\n54,\n(b) Any offence under Chapter 17 of Part 2 committed by\nkidnapping any person from Nepal,\n(c) Any offence under Sections 167, 276 and 279,\n(d) Any offence under Chapters 22 and 23 of Part 2\ncommitted with intent to bring into use in or import into\nNepal,\n(e) Any offence under this Act committed in the course of\ndischarging duties of any position in the Government of\nNepal, State Government or Local Level or a body\ncorporate under full or majority ownership or control of\nthe Government 

In [25]:
# Number of chapter headings found in the third part.
len(part_chapters['Part-3'])

1

In [39]:
# Break every chapter's text into overlapping chunks with LangChain's
# RecursiveCharacterTextSplitter.
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter settings: target chunk size and the overlap between neighbours.
chunk_size = 1000
chunk_overlap = 100
splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)

# Same part -> chapter layout as part_chapters, but each chapter maps to its
# list of chunks. Every part gets an entry, even one with no chapters.
chunked_part_chapter_dict = {label: {} for label in part_chapters}
for part, chapters in part_chapters.items():
    for chapter, content in chapters.items():
        chunks = splitter.split_text(content)
        chunked_part_chapter_dict[part][chapter] = chunks

chunked_part_chapter_dict

{'Part-1': {'Chapter-1': ['Preliminary\n1. Short title and commencement: (1) This Act may be cited as the\n"National Penal (Code) Act, 2017".\n1\n(2) It shall commence on 17 August 2018 (first day of the month\nof Bhadra of the year 2075).\n2. Extra-territorial application of the Act: Any person who commits any\nof the following offences outside Nepal shall be punished under this Act\nas if he or she committed such offence in Nepal:\n(a) Any offence under Chapter 1 of Part 2 except Section\n54,\n(b) Any offence under Chapter 17 of Part 2 committed by\nkidnapping any person from Nepal,\n(c) Any offence under Sections 167, 276 and 279,\n(d) Any offence under Chapters 22 and 23 of Part 2\ncommitted with intent to bring into use in or import into\nNepal,\n(e) Any offence under this Act committed in the course of\ndischarging duties of any position in the Government of\nNepal, State Government or Local Level or a body\ncorporate under full or majority ownership or control of\nthe Government

1