In [2]:
import pdfplumber, json
from fastcore.all import *
from configparser import ConfigParser

In [3]:
# Load the notebook settings. `ConfigParser.read` skips files it cannot open and
# returns the list of files it did parse, so fail here with a clear message rather
# than with a KeyError when a setting is looked up later.
config = ConfigParser()
if not config.read('settings.ini'):
    raise FileNotFoundError("Could not read 'settings.ini' from the working directory")
cfg = config['DEFAULT']

In [4]:
def download_pdf(url, dest):
    "Save `url` to `dest` with `urlsave` unless `dest` already exists; return `dest`."
    if Path(dest).exists(): return dest
    urlsave(url, dest)
    return dest

In [5]:
# Fetch the PDF given by the 'url' setting to the 'dest' setting (no download if it is already there).
path = download_pdf(cfg['url'], cfg['dest'])

In [6]:
def get_pdf_text(path):
    "Open the PDF at `path` and return an `L` holding `extract_text()` of every page."
    with pdfplumber.open(path) as pdf:
        # Extract while the file is still open; the list is fully built before the `with` exits.
        return L([page.extract_text() for page in pdf.pages])

In [7]:
# One extracted-text entry per page of the downloaded PDF.
pdf_text = get_pdf_text(path)

In [8]:
def remove_header(s):
    "Pass `s` through `remove_prefix` with the running page header as the prefix."
    header = 'La Constitución \n'
    return remove_prefix(s, header)

In [9]:
# Trailing page number: a newline, at least one whitespace character, digits,
# then optional whitespace up to the end of the string.
pat_pagenum = re.compile(r'\n\s+\d+\s*$')

def remove_pagenum(s):
    "Return `s` with a trailing page number removed; `s` is returned unchanged if there is none."
    return re.sub(pat_pagenum, '', s)

In [10]:
def process_page(s):
    "Clean one page of text: apply `remove_header` first, then `remove_pagenum`."
    return remove_pagenum(remove_header(s))

In [11]:
# Apply the header / page-number clean-up to every page.
proc_pages = pdf_text.map(process_page)

In [12]:
# Footnote area of a page: from the first newline that is followed by digits and a
# whitespace character through to the end of the string.
pat_footnote = re.compile(r'\n\d+\s(?:.|\s)*')
# Leading footnote number (captured), one whitespace character, then the rest of the line.
pat_footnote_n = re.compile(r'(\d+)\s.*')

def split_footnote(s):
    "Split `s`, which starts with '<number><whitespace>', into `(number, stripped remainder)`."
    mat = pat_footnote_n.match(s)
    return (int(mat.group(1)),s[mat.end(1):].strip())

def extract_footnotes(s):
    """Separate the footnotes of page text `s` from its body.

    Returns `(new_s, footnotes)`: `footnotes` maps footnote number -> footnote text
    (newlines removed) and `new_s` is `s` without the footnote area, with the first
    in-text occurrence of each footnote number rewritten as `[^n]`.
    When the page has no footnote area the result is `(s, {})`.

    Raises `ValueError` if a footnote number has no matching reference in the body.
    """
    footnote = pat_footnote.search(s)
    if footnote is None: return s,{}

    # The footnote area can hold several consecutively numbered notes: keep cutting the
    # last chunk at the line that starts with the next expected number.
    chunks = [footnote.group().strip()]
    next_n = int(pat_footnote_n.match(chunks[0]).group(1)) + 1
    while True:
        mat = re.search(f'\n({next_n}) ', chunks[-1])
        if mat is None: break
        i = mat.start(1)
        chunks[-1],rest = chunks[-1][:i],chunks[-1][i:]
        chunks.append(rest)
        next_n += 1

    footnotes = dict(split_footnote(c.strip().replace('\n', '')) for c in chunks)
    # `pat_footnote` always matches through the end of the string, so the body is
    # everything before the match.
    new_s = s[:footnote.start()]

    for idx in footnotes:
        # Raw f-string so `\s` / `\d` reach the regex engine without invalid-escape warnings.
        # `(?!\d)` instead of a consumed `[^\d]` lets a reference at the very end of the
        # body match as well.
        # NOTE(review): the lookbehind is checked before the single `[^\d\n]` character,
        # so it only rejects 'Artículo' + whitespace + one more character + number
        # (e.g. two spaces) — confirm that matches the spacing in the PDF text.
        pat = rf'(?<!Artículo\s)[^\d\n]({idx})(?!\d)'
        mat = re.search(pat, new_s)
        if mat is None: raise ValueError(f'No in-text reference found for footnote {idx}')
        i,j = mat.span(1)
        new_s = f'{new_s[:i]}[^{idx}]{new_s[j:]}'

    return new_s,footnotes

In [13]:
# Split every page into (body text, footnotes), then stitch the pages together:
# the body texts are concatenated and the per-page footnotes combined with `merge`.
page_texts,page_footnotes = zip(*proc_pages.map(extract_footnotes))
proc_text = ''.join(page_texts)
proc_footnotes = merge(*page_footnotes)

In [14]:
def roman_to_int(s):
    """Convert the Roman numeral `s` (any letter case) to an int.

    Every symbol is added; whenever a symbol is larger than the one before it, the
    previous symbol is subtracted twice (once to undo its addition, once for the
    subtractive notation). An unknown symbol raises `KeyError`.
    """
    symbol_value = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
    values = [symbol_value[ch] for ch in s.upper()]
    total = sum(values)
    for prev,cur in zip(values, values[1:]):
        if cur > prev: total -= 2 * prev
    return total

Extract intro

In [15]:
# 'TITULO', one whitespace character and the following word; used to locate the first title heading.
pat_title = re.compile(r'(TITULO)\s(\w+)\b')
# Footnote reference marker of the form [^12]; captures the footnote number.
pat_ref = re.compile(r'\[\^(\d+)\]')

In [16]:
# Text that is still to be parsed; starts out as the whole processed document.
source_text = proc_text

In [17]:
# Build the intro from `proc_text` (the value `source_text` starts with) instead of
# consuming `source_text` in place, so re-running this cell gives the same result.
heading = re.match(r'\w+\b', proc_text)             # first word of the document
rest = proc_text[heading.end():]
first_title = pat_title.search(rest)                # the intro ends where the first title starts
intro_text = rest[:first_title.start()].strip()
ref = int(pat_ref.search(intro_text).group(1))      # number of the first footnote marker in the intro
intro = {'title': heading.group(),
         'text': intro_text,
         'footnotes': [{'ref': ref, 'text': proc_footnotes[ref]}]}
# Everything from the first title heading onwards is left for the title parser.
source_text = rest[first_title.start():]
intro

{'title': 'PREAMBULO',
 'text': 'El  Congreso  Constituyente  Democrático,  invocando  a  Dios  Todopoderoso, \nobedeciendo  el  mandato  del  pueblo  peruano  y  recordando  el  sacrificio  de  todas  las \ngeneraciones  que  nos  han  precedido  en  nuestra  Patria,  ha  resuelto  dar  la  siguiente \nConstitución: \n \nCONSTITUCION POLITICA DEL PERU DE 1993 [^1]',
 'footnotes': [{'ref': 1,
   'text': 'De conformidad con el artículo 1 de la Ley N.º 27600, publicada el 16 diciembre 2001 en el diario oficial El Peruano, se suprime la firma de Alberto Fujimori Fujimori, del texto de la Constitución Política del Estado de 1993.'}]}

In [18]:
# Title heading: 'TITULO', one whitespace character, the title number word, whitespace,
# then the rest of that line (captured as the title name).
pat_title_cont = re.compile(r'(TITULO)\s(\w+)[\s\n]+([^\n]+)')

def get_titles(text):
    """Split `text` into titles.

    Returns `{number: {'name': ..., 'text': ...}}`, where `number` is the heading's
    numeral converted with `roman_to_int`, `name` is the stripped remainder of the
    heading line and `text` is everything up to the next heading (or the end of `text`).
    Returns `{}` when `text` contains no title heading.
    """
    mats = list(pat_title_cont.finditer(text))
    titles = {}
    # Pair every heading with the one that follows it (None after the last heading).
    for cur,nxt in zip(mats, mats[1:] + [None]):
        n = roman_to_int(cur.group(2))
        body_end = len(text) if nxt is None else nxt.start()
        titles[n] = {'name': cur.group(3).strip(), 'text': text[cur.end():body_end]}
    return titles

In [19]:
# Split the text that follows the intro into titles keyed by title number.
titles = get_titles(source_text)

In [20]:
# Chapter heading: 'CAPITULO', one whitespace character, the chapter number word,
# whitespace, then the rest of that line (captured as the chapter name).
pat_chapter_cont = re.compile(r'(CAPITULO)\s(\w+)[\s\n]+([^\n]+)')

def get_chapters(text):
    """Split a title's `text` into chapters.

    Returns `{number: {'name': ..., 'text': ...}}` keyed by the numeral converted with
    `roman_to_int`. If `text` has no chapter heading, returns `{'text': text}` instead.
    """
    heads = list(pat_chapter_cont.finditer(text))
    if not heads: return {'text': text}

    chapters = {}
    for pos,head in enumerate(heads):
        # A chapter's body stops where the next heading starts, or at the end of the text.
        stop = heads[pos+1].start() if pos+1 < len(heads) else len(text)
        num = roman_to_int(head.group(2))
        chapters[num] = {'name': head.group(3).strip(), 'text': text[head.end():stop]}
    return chapters

def get_all_chapters(data):
    "For each `number -> title` item of `data`, return `{'title': number, 'chapters': get_chapters(title['text'])}`."
    out = []
    for num,title in data.items():
        out.append({'title': num, 'chapters': get_chapters(title['text'])})
    return out

In [21]:
# One entry per title, in the order of `titles`, each with its chapters.
chapters = get_all_chapters(titles)

Special end sections

In [22]:
end_titles = ['DISPOSICIONES FINALES Y TRANSITORIAS', 'DISPOSICIONES TRANSITORIAS ESPECIALES', 'DECLARACION']

def get_end_sections(text, titles):
    """Cut the closing sections named by `titles` out of `text`.

    Each title is located as a literal substring (first occurrence). Returns
    `(new_text, sections)`: `new_text` is the part of `text` before the first title and
    `sections` is a list of `{'title': ..., 'text': ...}`, where each text runs from the
    end of its own title to the start of the next title (to the end of `text` for the
    last one) and is stripped.

    Returns `(text, [])` for an empty `titles`; raises `ValueError` if a title is missing.
    """
    if not titles: return text,[]

    spans = []
    for t in titles:
        # Plain substring search: titles are literal text, not regular expressions.
        start = text.find(t)
        if start < 0: raise ValueError(f'Section title not found in text: {t!r}')
        spans.append((start, start + len(t)))

    new_text = text[:spans[0][0]]
    # A section's body stops at the start of the next title; None means "to the end".
    body_stops = [start for start,_ in spans[1:]] + [None]
    sections = [{'title': t, 'text': text[end:stop].strip()} for t,(_,end),stop in zip(titles, spans, body_stops)]
    return new_text,sections

In [23]:
# Peel the closing sections off the last title's text and keep only what precedes them.
# NOTE(review): this overwrites the last entry of `chapters` in place, so the cell does not look safe to run twice — confirm.
new_end,end_sections = get_end_sections(chapters[-1]['chapters']['text'], end_titles)
chapters[-1]['chapters']['text'] = new_end

Get articles

In [24]:
# Article heading: 'Artículo', whitespace, the article number, '.-', whitespace,
# then the rest of that line (captured as the article name).
pat_art_cont = re.compile(r'(Artículo)\s+(\d+)\.-[\s\n]+([^\n]+)')

def get_articles(text):
    """Split `text` into articles.

    Returns `{number: {'name': ..., 'text': ...}}`: `name` is the stripped remainder of
    the heading line and `text` is the stripped content up to the next article heading
    (or the end of `text`). Anything before the first heading is ignored.
    Returns `{}` when `text` contains no article heading.
    """
    mats = list(pat_art_cont.finditer(text))
    articles = {}
    # Pair every heading with the one that follows it (None after the last heading).
    for cur,nxt in zip(mats, mats[1:] + [None]):
        body_end = len(text) if nxt is None else nxt.start()
        articles[int(cur.group(2))] = {'name': cur.group(3).strip(), 'text': text[cur.end():body_end].strip()}
    return articles

def get_all_articles(data):
    """Flatten the title/chapter structure in `data` into a list of article records.

    Each record is `{'title', 'chapter', 'article', **fields from get_articles}`.
    A title whose `chapters` dict only has a 'text' entry (no chapter split) yields
    records with `chapter` set to None.
    """
    rows = []
    for entry in data:
        title = entry['title']
        for key,chapter in entry['chapters'].items():
            # Under the 'text' key the value is the raw text itself; otherwise it is a
            # chapter dict whose text is stored under 'text'.
            if key == 'text': chapter_n,body = None,chapter
            else:             chapter_n,body = key,chapter['text']
            rows.extend({'title': title, 'chapter': chapter_n, 'article': n, **article}
                        for n,article in get_articles(body).items())
    return rows

In [25]:
# Flat list of article records across all titles and chapters.
articles = get_all_articles(chapters)

In [26]:
def add_references(data, footnotes):
    """Add a 'footnotes' list to every dict in `data`, in place.

    For each `[^n]` marker found in an item's 'text', the list gets
    `{'ref': n, 'text': footnotes[n]}`, in order of appearance. Returns None.
    """
    for item in data:
        numbers = [int(found) for found in pat_ref.findall(item['text'])]
        item['footnotes'] = [{'ref': k, 'text': footnotes[k]} for k in numbers]

In [27]:
# Attach the referenced footnotes to every article record (modifies `articles` in place).
add_references(articles, proc_footnotes)

In [28]:
# Same for the closing sections (modifies `end_sections` in place).
add_references(end_sections, proc_footnotes)

Names

In [29]:
def get_chapter_names(x):
    "Return `{chapter number: chapter name}` for the entry `x`, or `{}` if its 'chapters' dict has a 'text' key."
    chapters = x['chapters']
    if 'text' in chapters: return {}
    names_by_number = {}
    for number,chapter in chapters.items(): names_by_number[number] = chapter['name']
    return names_by_number

# Title number -> title name plus its chapter names. `chapters` was built from
# `titles.items()` in the same order, so pair the two by position rather than by
# `chapters[k-1]`, which only works when the title numbers are exactly 1..N.
names = {k:{'name': v['name'], 'chapters': get_chapter_names(c)} for (k,v),c in zip(titles.items(), chapters)}

In [37]:
objs = [names, intro, articles, end_sections]
names = ['names', 'intro', 'articles', 'end_sections']
out_path = Path('../src/data')
out_path.mkdir(exist_ok=True)

In [45]:
for o,n in zip(objs,names):
    with open(out_path / (n+'.json'), 'w') as f: json.dump(o, f)