In [1]:
!pip install --quiet pypdf

In [2]:
from pypdf import PdfReader

In [3]:
# Location of the source PDF, relative to the notebook's working directory.
PDF_PATH = 'pdf/TheConstitutionOfKenya.pdf'
reader = PdfReader(PDF_PATH)

In [4]:
# Number of pages in the loaded PDF.
len(reader.pages)

166

In [5]:
# Spot-check the text extraction on a single page (index 22) to see how
# headers, article titles and numbered clauses come out.
sample = reader.pages[22]

print(sample.extract_text())

[Rev. 2022] Constitution of Kenya 23
Human dignity.
28.  Every person has inherent dignity and the right to have that dignity respected
and protected.
Freedom and security of the person.
29.  Every person has the right to freedom and security of the person, which
includes the right not to be—
(a) deprived of freedom arbitrarily or without just cause;
(b) detained without trial, except during a state of emergency, in which
case the detention is subject to Article 58;
(c) subjected to any form of violence from either public or private sources;
(d) subjected to torture in any manner, whether physical or psychological;
(e) subjected to corporal punishment; or
(f)treated or punished in a cruel, inhuman or degrading manner.
Slavery, servitude and forced labour.
30.   (1)  A person shall not be held in slavery or servitude.
(2)  A person shall not be required to perform forced labour.
Privacy.
31.  Every person has the right to privacy, which includes the right not to have—
(a) their person, 

In [6]:
# Extract the text of every page once, in page order.
page_texts = [page.extract_text() for page in reader.pages]

In [7]:
# Keep only page indices 13..122 (110 pages) and join them into one string.
# NOTE(review): presumably this is the span holding the chapters/articles and
# excludes front matter and schedules -- confirm against the PDF.
BODY_FIRST_PAGE, BODY_END_PAGE = 13, 123
relevant_text = "\n".join(page_texts[BODY_FIRST_PAGE:BODY_END_PAGE])

In [8]:
import io

# Read the joined text back as a list of lines; each entry keeps its
# trailing newline, exactly as when iterating over the in-memory file.
mem_file = io.StringIO(relevant_text)
all_lines = mem_file.readlines()

In [9]:
# Total number of text lines across the selected pages.
len(all_lines)

4657

In [10]:
import re

# A line that opens an article: one or more digits (captured as group 1),
# a full stop, whitespace, then at least one non-space character.
# It is applied with .match() below, so it only matches at the start of a line.
article_num_pattern = re.compile(r'(\d+)\.\s+\S+')

In [11]:
# Sanity check: group 1 should hold just the article number as a string.
article_num_pattern.match('22. x y z').group(1)

'22'

In [12]:
# Collect, in document order, the number of every line that opens an article.
numbered_matches = (article_num_pattern.match(raw.strip()) for raw in all_lines)
article_nums = [int(found.group(1)) for found in numbered_matches if found]

In [13]:
# How many article-opening lines were found.
len(article_nums)

264

In [14]:
# The matches must be exactly the articles 1..264, consecutive and in order;
# this guards against numbered clauses being mistaken for article openers.
expected_numbers = list(range(1, 264 + 1))
assert expected_numbers == article_nums

In [15]:
# Pair every article number with its title, which is the line printed
# immediately before the numbered line.
articles = []

for i, line in enumerate(all_lines):
    match = article_num_pattern.match(line.strip())
    if match:
        # Guard the look-behind: at i == 0 the index -1 would silently wrap
        # around and pick up the last line of the document as the title.
        title = all_lines[i - 1].strip() if i > 0 else ''
        articles.append((int(match.group(1)), title))

In [16]:
# Inspect the (number, title) pairs.
articles

[(1, 'Sovereignty of the people.'),
 (2, 'Supremacy of this Constitution.'),
 (3, 'Defence of this Constitution.'),
 (4, 'Declaration of the Republic.'),
 (5, 'Territory of Kenya.'),
 (6, 'Devolution and access to services.'),
 (7, 'National, ofﬁcial and other languages.'),
 (8, 'State and religion.'),
 (9, 'National symbols and national days.'),
 (10, 'National values and principles of governance.'),
 (11, 'Culture.'),
 (12, 'Entitlements of citizens.'),
 (13, 'Retention and acquisition of citizenship.'),
 (14, 'Citizenship by birth.'),
 (15, 'Citizenship by registration.'),
 (16, 'Dual citizenship.'),
 (17, 'Revocation of citizenship.'),
 (18, 'Legislation on citizenship.'),
 (19, 'Rights and fundamental freedoms.'),
 (20, 'Application of Bill of Rights.'),
 (21, 'Implementation of rights and fundamental freedoms.'),
 (22, 'Enforcement of Bill of Rights.'),
 (23, 'Authority of courts to uphold and enforce the Bill of Rights.'),
 (24, 'Limitation of rights and fundamental freedoms.'),

In [17]:
# A chapter heading is a line starting with "CHAPTER" (e.g. "CHAPTER ONE");
# the chapter's title is on the line that follows it.
chapter_pattern = re.compile(r'CHAPTER')

chapters = []

for i, line in enumerate(all_lines):
    line = line.strip()
    if chapter_pattern.match(line):
        # Guard the look-ahead so a heading on the very last line does not
        # raise IndexError; it simply gets an empty title.
        title = all_lines[i + 1].strip() if i + 1 < len(all_lines) else ''
        chapters.append((line, title))

In [18]:
# Inspect the (chapter heading, chapter title) pairs.
chapters

[('CHAPTER ONE',
  'SOVEREIGNTY OF THE PEOPLE AND SUPREMACY OF THIS CONSTITUTION'),
 ('CHAPTER TWO', 'THE REPUBLIC'),
 ('CHAPTER THREE', 'CITIZENSHIP'),
 ('CHAPTER FOUR', 'THE BILL OF RIGHTS'),
 ('CHAPTER FIVE', 'LAND AND ENVIRONMENT'),
 ('CHAPTER SIX', 'LEADERSHIP AND INTEGRITY'),
 ('CHAPTER SEVEN', 'REPRESENTATION OF THE PEOPLE'),
 ('CHAPTER EIGHT', 'THE LEGISLATURE'),
 ('CHAPTER NINE', 'THE EXECUTIVE'),
 ('CHAPTER TEN', 'JUDICIARY'),
 ('CHAPTER ELEVEN', 'DEVOLVED GOVERNMENT'),
 ('CHAPTER TWELVE', 'PUBLIC FINANCE'),
 ('CHAPTER THIRTEEN', 'THE PUBLIC SERVICE'),
 ('CHAPTER FOURTEEN', 'NATIONAL SECURITY'),
 ('CHAPTER FIFTEEN', 'COMMISSIONS AND INDEPENDENT OFFICES'),
 ('CHAPTER SIXTEEN', 'AMENDMENT OF THIS CONSTITUTION'),
 ('CHAPTER SEVENTEEN', 'GENERAL PROVISIONS'),
 ('CHAPTER EIGHTEEN', 'TRANSITIONAL AND CONSEQUENTIAL PROVISIONS')]

In [19]:
# A part heading is any line that starts with "PART".
part_pattern = re.compile(r'PART')

# Print each matching line as-is. Only the matching physical line is shown,
# so a heading that wraps onto a second line presumably appears cut short.
for stripped in map(str.strip, all_lines):
    if part_pattern.match(stripped):
        print(stripped)

PART 1 – GENERAL PROVISIONS TO THE BILL OF RIGHTS
PART 2 – RIGHTS AND FUNDAMENTAL FREEDOMS
PART 3 – SPECIFIC APPLICATION OF RIGHTS
PART 4 – STATE OF EMERGENCY
PART 5 – KENYA NATIONAL HUMAN RIGHTS AND EQUALITY COMMISSION
PART 1 – LAND
PART 2 – ENVIRONMENT AND NATURAL RESOURCES
PART 1 – ELECTORAL SYSTEM AND PROCESS
PART 2 – INDEPENDENT ELECTORAL AND BOUNDARIES
PART 3 – POLITICAL PARTIES
PART 1 – ESTABLISHMENT AND ROLE OF PARLIAMENT
PART 2 – COMPOSITION AND MEMBERSHIP OF PARLIAMENT
PART 3 – OFFICES OF PARLIAMENT
PART 4 – PROCEDURES FOR ENACTING LEGISLATION
PART 5 – PARLIAMENT'S GENERAL PROCEDURES AND RULES
PART 6 – MISCELLANEOUS
PART 1 – PRINCIPLES AND STRUCTURE OF THE NATIONAL EXECUTIVE
PART 2 – THE PRESIDENT AND DEPUTY PRESIDENT
PART 3 – THE CABINET
PART 4 – OTHER OFFICES
PART 1 – JUDICIAL AUTHORITY AND LEGAL SYSTEM
PART 2 – SUPERIOR COURTS
PART 3 – SUBORDINATE COURTS
PART 4 – JUDICIAL SERVICE COMMISSION
PART 1 – OBJECTS AND PRINCIPLES OF DEVOLVED GOVERNMENT
PART 2 – COUNTY GOVERNMENTS


In [20]:
# Marker found in the running page header lines, e.g.
# "14 Constitution of Kenya [Rev. 2022]". The revision year is hard-coded,
# so this needs updating for a different revision of the document.
header_pattern = re.compile(r'\[Rev\.\s+2022\]')

In [21]:
# Show every line containing the revision marker, side by side, to confirm
# that only running page headers are matched.
header_lines = [text for text in map(str.strip, all_lines) if header_pattern.search(text)]
for header in header_lines:
    print(header, end="       ")

14 Constitution of Kenya [Rev. 2022]       [Rev. 2022] Constitution of Kenya 15       16 Constitution of Kenya [Rev. 2022]       [Rev. 2022] Constitution of Kenya 17       18 Constitution of Kenya [Rev. 2022]       [Rev. 2022] Constitution of Kenya 19       20 Constitution of Kenya [Rev. 2022]       [Rev. 2022] Constitution of Kenya 21       22 Constitution of Kenya [Rev. 2022]       [Rev. 2022] Constitution of Kenya 23       24 Constitution of Kenya [Rev. 2022]       [Rev. 2022] Constitution of Kenya 25       26 Constitution of Kenya [Rev. 2022]       [Rev. 2022] Constitution of Kenya 27       28 Constitution of Kenya [Rev. 2022]       [Rev. 2022] Constitution of Kenya 29       30 Constitution of Kenya [Rev. 2022]       [Rev. 2022] Constitution of Kenya 31       32 Constitution of Kenya [Rev. 2022]       [Rev. 2022] Constitution of Kenya 33       34 Constitution of Kenya [Rev. 2022]       [Rev. 2022] Constitution of Kenya 35       36 Constitution of Kenya [Rev. 2022]       [Rev. 2022]

In [22]:
# Drop the running page header lines; everything else is kept unchanged
# (including trailing newlines).
relevant_lines = [raw for raw in all_lines if header_pattern.search(raw) is None]

In [23]:
# Line count after removing the page headers.
len(relevant_lines)

4547

In [60]:
# Preview the first 30 remaining lines (raw, so odd characters are visible).
relevant_lines[:30]

['CHAPTER ONE\n',
 'SOVEREIGNTY OF THE PEOPLE AND SUPREMACY OF THIS CONSTITUTION\n',
 'Sovereignty of the people.\n',
 '1. \xa0 (1)\xa0\xa0All sovereign power belongs to the people of Kenya and shall be exercised\n',
 'only in accordance with this Constitution.\n',
 '(2)\xa0\xa0The people may exercise their sovereign power either directly or through their\n',
 'democratically elected representatives.\n',
 '(3)\xa0\xa0Sovereign power under this Constitution is delegated to the following State\n',
 'organs, which shall perform their functions in accordance with this Constitution—\n',
 '(a) Parliament and the legislative assemblies in the county governments;\n',
 '(b) the national executive and the executive structures in the county\n',
 'governments; and\n',
 '(c) the Judiciary and independent tribunals.\n',
 '(4)\xa0\xa0The sovereign power of the people is exercised at—\n',
 '(a) the national level; and\n',
 '(b) the county level.\n',
 'Supremacy of this Constitution.\n',
 '2. \xa0 (1)\

In [77]:
import unicodedata

# Compare one line before and after NFKC normalisation: the non-breaking
# spaces (\xa0) in the extracted text become ordinary spaces.
l = relevant_lines[3]
unicodedata.normalize('NFKC', l), l

('1.   (1)  All sovereign power belongs to the people of Kenya and shall be exercised\n',
 '1. \xa0 (1)\xa0\xa0All sovereign power belongs to the people of Kenya and shall be exercised\n')

In [78]:
# Apply NFKC normalisation to every line so non-breaking spaces become plain
# spaces, then preview the result.
relevant_lines = [unicodedata.normalize('NFKC', raw_line) for raw_line in relevant_lines]
relevant_lines[:30]

['CHAPTER ONE\n',
 'SOVEREIGNTY OF THE PEOPLE AND SUPREMACY OF THIS CONSTITUTION\n',
 'Sovereignty of the people.\n',
 '1.   (1)  All sovereign power belongs to the people of Kenya and shall be exercised\n',
 'only in accordance with this Constitution.\n',
 '(2)  The people may exercise their sovereign power either directly or through their\n',
 'democratically elected representatives.\n',
 '(3)  Sovereign power under this Constitution is delegated to the following State\n',
 'organs, which shall perform their functions in accordance with this Constitution—\n',
 '(a) Parliament and the legislative assemblies in the county governments;\n',
 '(b) the national executive and the executive structures in the county\n',
 'governments; and\n',
 '(c) the Judiciary and independent tribunals.\n',
 '(4)  The sovereign power of the people is exercised at—\n',
 '(a) the national level; and\n',
 '(b) the county level.\n',
 'Supremacy of this Constitution.\n',
 '2.   (1)  This Constitution is the supr

TODO: Iterate through all lines, classifying each article by its chapter and part (if one exists).

In [79]:
# Walk the cleaned lines once and group them into articles, tagging each
# article with the chapter and (optional) part it appears under.
#
# Expected layout of the text:
#   CHAPTER <NAME>            <- chapter heading
#   <CHAPTER TITLE>           <- always the line after the heading
#   PART <n> – <TITLE>        <- optional; may wrap onto further upper-case lines
#   <Article title.>          <- the line immediately before the numbered line
#   <n>.  <article text ...>  <- numbered line opens the article
#   <more article text ...>
current_chapter, current_part, current_article = None, None, None
article_list = []
index = 0
# True while we are directly below a PART heading and may still be reading
# wrapped continuation lines of that heading.
part_heading_open = False


def is_article_title(index):
    """Return True if the line at ``index`` is the title of the next article.

    A title is recognised purely by position: it is the line immediately
    followed by a line matching ``article_num_pattern``.

    The final line of the text has no successor, so it can never be a title;
    returning False there keeps it as part of the last article instead of
    silently dropping it.
    """
    next_index = index + 1
    if next_index >= len(relevant_lines):
        return False
    next_line = relevant_lines[next_index].strip()
    return article_num_pattern.match(next_line) is not None


while index < len(relevant_lines):
    line = relevant_lines[index].strip()
    if chapter_pattern.match(line):
        # A new chapter ends the running article and clears the part.
        if current_article:
            article_list.append(current_article)
        current_article = None
        current_part = None
        part_heading_open = False

        # Current line is the chapter number, the next line is its title.
        # Guard the look-ahead in case the heading is the very last line.
        has_title_line = index + 1 < len(relevant_lines)
        chapter_title = relevant_lines[index + 1].strip() if has_title_line else ''
        current_chapter = (line, chapter_title)
        if has_title_line:
            index += 1  # swallow the title line
    elif part_pattern.match(line):
        current_part = line
        part_heading_open = True
    elif match := article_num_pattern.match(line):
        part_heading_open = False

        if current_article:
            # Store the previous article before starting the new one.
            article_list.append(current_article)

        current_article = dict(
            number=int(match.group(1)),
            # Stripped so it matches the titles in `articles`; the index
            # guard avoids wrapping around to the last line at index 0.
            title=relevant_lines[index - 1].strip() if index > 0 else '',
            lines=[relevant_lines[index]],
            part=current_part,
            chapter=current_chapter,
        )
    elif is_article_title(index):
        # Title of the upcoming article: it is picked up via index-1 when the
        # numbered line is reached, so it must not be added to the current one.
        part_heading_open = False
    elif part_heading_open and line.isupper():
        # Long part headings wrap onto a second, all-caps line. Fold it into
        # the part title instead of appending it to the preceding article.
        current_part = f'{current_part} {line}'
    else:
        part_heading_open = False
        if current_article:
            # Ordinary continuation line of the running article.
            current_article['lines'].append(relevant_lines[index])
    index += 1

# Store the last article (if any article was found at all).
if current_article:
    article_list.append(current_article)

In [80]:
# Number of articles grouped by the loop above.
len(article_list)

264

In [81]:
# Rebuild the (number, title) pairs from the cleaned, normalised lines so
# they can be compared with the grouped articles.
articles = []

for i, line in enumerate(relevant_lines):
    match = article_num_pattern.match(line.strip())
    if match:
        # Guard the look-behind: at i == 0 the index -1 would silently wrap
        # around and pick up the last line of the document as the title.
        title = relevant_lines[i - 1].strip() if i > 0 else ''
        articles.append((int(match.group(1)), title))

In [82]:
# Should equal the number of grouped articles.
len(articles)

264

In [83]:
# pprint was never imported anywhere in the notebook, so this cell failed
# with NameError on a fresh kernel (Restart & Run All).
from pprint import pprint

# Eyeball the last four grouped articles.
for article in article_list[-4:]:
    pprint(article)
    print('*******')

{'chapter': ('CHAPTER EIGHTEEN', 'TRANSITIONAL AND CONSEQUENTIAL PROVISIONS'),
 'lines': ['261.   (1)  Parliament shall enact any legislation required by '
           'this Constitution\n',
           'to be enacted to govern a particular matter within the period '
           'specified in the Fifth\n',
           'Schedule, commencing on the effective date.\n',
           '(2)  Despite clause (1), the National Assembly may, by resolution '
           'supported by\n',
           'the votes of at least two-thirds of all the members of the '
           'National Assembly, extend\n',
           'the period prescribed in respect of any particular matter under '
           'clause (1), by a period\n',
           'not exceeding one year.\n',
           '(3)  The power of the National Assembly contemplated under clause '
           '(2), may be\n',
           'exercised—\n',
           '(a) only once in respect of any particular matter; and\n',
           '(b) only in exceptional circumstanc

In [84]:
# Every grouped article must sit at the position given by its number
# (article 1 first, article 2 second, ...).
for position, article in enumerate(article_list, start=1):
    assert article['number'] == position