### Use pdfplumber to convert the PDF to text

In [1]:
import pdfplumber
import re

# Build one plain-text string (`document`) from every page of the PDF.
page_texts = []
with pdfplumber.open(r'A2014_40.pdf') as pdf:
    for page in pdf.pages:
        # extract_text() can yield None for a page without any text
        # (older pdfplumber releases); treat that as an empty page
        # instead of crashing on None.strip().
        raw_text = (page.extract_text() or '').strip()
        body_lines = raw_text.split("\n")[:-1]    #Remove last line which is the page number
        # Pages are separated by a blank line in the combined text.
        page_texts.append("\n".join(body_lines).strip() + '\n\n')

# Join once at the end rather than growing a string inside the loop.
document = ''.join(page_texts)
# Drop every non-ASCII character, then collapse runs of spaces.
document = document.encode("ascii", "ignore").decode()
document = re.sub(' +', ' ', document).strip()

In [2]:
# Show the cleaned text so the extraction can be checked by eye.
print(document)

THE NATIONAL JUDICIAL APPOINTMENTS COMMISSION ACT, 2014 
_________ 
ARRANGEMENT OF SECTIONS 
_________ 
SECTIONS 
1. Short title and commencement. 
2. Definitions. 
3. Headquarters of Commission. 
4. Reference to Commission for filling up of vacancies. 
5. Procedure for selection of Judge of Supreme Court. 
6. Procedure for selection of Judge of High Court. 
7. Power of President to require reconsideration. 
8. Officers and employees of Commission. 
9. Procedure for transfer of Judges. 
10. Procedure to be followed by Commission in discharge of its functions. 
11. Power to make rules. 
12. Power to make regulations. 
13. Rules and regulations to be laid before Parliament. 
14. Power to remove difficulties.

THE NATIONAL JUDICIAL APPOINTMENTS COMMISSION ACT, 2014 
ACT NO. 40 OF 2014 
[31st December, 2014.] 
An Act to regulate the procedure to be followed by the National Judicial Appointments 
Commission for recommending persons for appointment as the Chief Justice of India and 
other Ju

### Split the document by newline and create a List

In [3]:
# Break the document into individual lines, trimming the whitespace
# around each one, and collect them in `Lines`.
Lines = []
for i, line in enumerate(document.split('\n')):
    Lines.append(line.strip())

### Extract Act ID and Act Title

In [4]:
# Result container for everything extracted from the Act.
# NOTE: this name shadows the built-in `dict` type; it is kept unchanged
# because other cells of the notebook refer to it by this name.
dict = {}


#Act ID
# The first line that starts with "ACT NO" is taken as the Act ID.
# Raw string: `\s` must reach the regex engine as-is (a plain string
# literal triggers an invalid-escape warning on newer Python versions).
for line in Lines:
    if re.match(r"^ACT\s+NO.*", line.strip()):
        dict["Act ID"] = line.strip()
        break


#Act Title
# The title is every line that precedes the first line made up solely
# of underscores.
title_parts = []
for line in Lines:
    if re.match(r"^_+$", line.strip()):
        break
    title_parts.append(line.strip())
# Strip unconditionally so the title has no stray whitespace even when
# no underscore line is found.
dict["Act Title"] = ' '.join(title_parts).strip()

In [5]:
# Display the dictionary built so far.
dict

{'Act ID': 'ACT NO. 40 OF 2014',
 'Act Title': 'THE NATIONAL JUDICIAL APPOINTMENTS COMMISSION ACT, 2014'}

### Extract Act Definition

In [6]:
#Act Definition
# Collect the lines from the one starting with "An Act to" up to (but not
# including) the line starting with "BE it enacted by", and join them into
# a single space-separated string.
definition_parts = []
in_definition = False
for line in Lines:
    lowered = line.lower()
    # The enacting clause ends the definition. A line cannot match both
    # anchored patterns, so testing this one first is safe.
    if re.match(r"^be\s+it\s+enacted\s+by.*", lowered):
        break
    if re.match(r"^an\s+act\s+to.*", lowered):
        in_definition = True
    if in_definition:
        definition_parts.append(line.strip())
# Strip unconditionally so there is no trailing space even when the
# enacting clause is never found.
dict["Act Definition"] = ' '.join(definition_parts).strip()

In [7]:
# Display the extracted "Act Definition" entry.
dict["Act Definition"]

'An Act to regulate the procedure to be followed by the National Judicial Appointments Commission for recommending persons for appointment as the Chief Justice of India and other Judges of the Supreme Court and Chief Justices and other Judges of High Courts and for their transfers and for matters connected therewith or incidental thereto.'

### Add the list of sections to the dictionary

In [8]:
# Read the section titles from the "SECTIONS" listing that precedes the
# enacting clause. Each title is appended to `list_of_sections` and also
# registered under dict['Sections'] with an empty 'content' placeholder.
list_of_sections = []
dict['Sections'] = {}

in_listing = False
for line_no, line in enumerate(Lines):
    if line.strip() == 'SECTIONS':
        in_listing = True
    # The listing ends where the enacting clause begins.
    if re.match(r"^be\s+it\s+enacted\s+by.*", line.lower()):
        break
    if not in_listing:
        continue

    # An entry looks like "<number>. <title>"; the title starts with a letter.
    mo = re.match(r"^([0-9]+)\.\s*([a-zA-Z].*)", line)
    if mo is None:
        continue
    sec_name = mo.group(2).strip()

    # A title that wraps onto following lines is continued until it ends
    # with a full stop. The continuation index is derived from the position
    # of the CURRENT line (not from a loop variable left over from another
    # cell) and is bounded by the end of `Lines`.
    next_no = line_no + 1
    while not sec_name.endswith('.') and next_no < len(Lines):
        sec_name = (sec_name + ' ' + Lines[next_no].strip()).strip()
        next_no += 1

    list_of_sections.append(sec_name)

    key = 'Section ' + mo.group(1) + '.'
    dict['Sections'][key] = {'name': sec_name, 'content': ''}

In [9]:
# Display the dictionary built so far.
dict

{'Act ID': 'ACT NO. 40 OF 2014',
 'Act Title': 'THE NATIONAL JUDICIAL APPOINTMENTS COMMISSION ACT, 2014',
 'Act Definition': 'An Act to regulate the procedure to be followed by the National Judicial Appointments Commission for recommending persons for appointment as the Chief Justice of India and other Judges of the Supreme Court and Chief Justices and other Judges of High Courts and for their transfers and for matters connected therewith or incidental thereto.',
 'Sections': {'Section 1.': {'name': 'Short title and commencement.',
   'content': ''},
  'Section 2.': {'name': 'Definitions.', 'content': ''},
  'Section 3.': {'name': 'Headquarters of Commission.', 'content': ''},
  'Section 4.': {'name': 'Reference to Commission for filling up of vacancies.',
   'content': ''},
  'Section 5.': {'name': 'Procedure for selection of Judge of Supreme Court.',
   'content': ''},
  'Section 6.': {'name': 'Procedure for selection of Judge of High Court.',
   'content': ''},
  'Section 7.': {'nam

### Create a new document for extracting section content
We need a copy of the document without any newlines so that section names spanning multiple lines can be detected and the content of each section can be extracted.

In [10]:
# Find the first line containing the enacting clause ("be it enacted by");
# everything from that line onward is kept.
start_index = next(
    (line_no for line_no, line in enumerate(Lines)
     if re.search(r"be\s+it\s+enacted\s+by.*", line.lower()) is not None),
    None,
)
if start_index is None:
    # Fail with a clear message instead of a NameError (or a silently
    # reused value from an earlier run) further down.
    raise ValueError("Enacting clause ('be it enacted by ...') not found in the document")

new_list = Lines[start_index:]

# One long line: the kept lines joined by single spaces.
new_doc = ' '.join(new_list)

In [11]:
# Display the section titles collected so far.
list_of_sections

['Short title and commencement.',
 'Definitions.',
 'Headquarters of Commission.',
 'Reference to Commission for filling up of vacancies.',
 'Procedure for selection of Judge of Supreme Court.',
 'Procedure for selection of Judge of High Court.',
 'Power of President to require reconsideration.',
 'Officers and employees of Commission.',
 'Procedure for transfer of Judges.',
 'Procedure to be followed by Commission in discharge of its functions.',
 'Power to make rules.',
 'Power to make regulations.',
 'Rules and regulations to be laid before Parliament.',
 'Power to remove difficulties.']

In [17]:
# Slice the text of every section out of `new_doc`.
# A section's content runs from the end of its own title to the start of
# the next section's numbered heading ("<number>. <title>"); the last
# section runs to the end of the document.
# `contents` always receives exactly one entry per title (an empty string
# when a title or heading cannot be located) so that positions in
# `contents` stay aligned with `list_of_sections`.
contents = []
search_from = 0  # searches only move forward through new_doc

for i, section in enumerate(list_of_sections):
    title_at = new_doc.find(section, search_from)
    if title_at == -1:
        print("Pattern Not found!!!")
        contents.append('')
        continue
    begin = title_at + len(section)

    if i == len(list_of_sections)-1:
        contents.append(new_doc[begin:].strip())
        continue

    # re.escape: the title is literal text, so characters such as '.',
    # '(' or '[' must not be interpreted as regex syntax.
    pattern = r"[0-9]+\.\s*" + re.escape(list_of_sections[i+1])
    next_heading = re.compile(pattern).search(new_doc, begin)
    if next_heading is None:
        print("Pattern Not found!!!")
        contents.append('')
        search_from = begin
        continue

    end = next_heading.start()
    contents.append(new_doc[begin:end].strip())
    search_from = end
        

In [18]:
# Spot-check one extracted entry (index 8 of `contents`).
contents[8]

'The Commission shall recommend for transfer of Chief Justices and other Judges of High Courts from one High Court to any other High Court, and for this purpose, specify, by regulations, the procedure for such transfer.'

### Add the content of each section to the dictionary

In [19]:
# Fill each section's 'content' field with the text found at the same
# position in `contents` (sections are visited in insertion order).
for position, section_entry in enumerate(dict['Sections'].values()):
    section_entry['content'] = contents[position]

In [20]:
# Display the dictionary with the section contents filled in.
dict

{'Act ID': 'ACT NO. 40 OF 2014',
 'Act Title': 'THE NATIONAL JUDICIAL APPOINTMENTS COMMISSION ACT, 2014',
 'Act Definition': 'An Act to regulate the procedure to be followed by the National Judicial Appointments Commission for recommending persons for appointment as the Chief Justice of India and other Judges of the Supreme Court and Chief Justices and other Judges of High Courts and for their transfers and for matters connected therewith or incidental thereto.',
 'Sections': {'Section 1.': {'name': 'Short title and commencement.',
   'content': '(1) This Act may be called the National Judicial Appointments Commission Act, 2014. (2) It shall come into force on such date1 as the Central Government may, by notification in the Official Gazette, appoint.'},
  'Section 2.': {'name': 'Definitions.',
   'content': 'In this Act, unless the context otherwise requires, (a) Chairperson means the Chairperson of the Commission; (b) Commission means the National Judicial Appointments Commission refe

### Create a json format using json.dumps() and write it to a file

In [21]:
import json

# Serialise the collected data as JSON with 4-space indentation.
json_object = json.dumps(dict, indent = 4)

# The context manager closes the file even if the write raises.
with open("FileToJSON.json","w") as f:
    f.write(json_object)