In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re

In [2]:
d = {'Article': [], 'Title': [], 'Paragraphs':[]}
df = pd.DataFrame(data=d)

In [3]:
# Create empty dataframe with three columns
df=pd.DataFrame(columns = ["Article", "Title", "Paragraphs"])
df

Unnamed: 0,Article,Title,Paragraphs


In [4]:
# Extracts data from the CRR (HTML format) and put the articles with titles 
# and paragraphs into the DF

page = requests.get("http://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:32013R0575&from=en")
soup = BeautifulSoup(page.content, 'lxml')

title, subtitle, para = '', '', ''
i = 1
for p in soup.find_all('p', class_=['ti-art', 'sti-art', 'normal']):
    if p['class'][0] == 'ti-art':
        if title:
            df.loc[i] = [title, subtitle, para]
            i+= 1
        title = p.text
        para = ''
        continue
    if p['class'][0] == 'sti-art':
        subtitle = p.text
        continue

    para += p.text + ' '

In [5]:
df.head()

Unnamed: 0,Article,Title,Paragraphs
1,Article 1,Scope,This Regulation lays down uniform rules concer...
2,Article 2,Supervisory powers,For the purposes of ensuring compliance with t...
3,Article 3,Application of stricter requirements by instit...,This Regulation shall not prevent institutions...
4,Article 4,Definitions,"1. For the purposes of this Regulation, the ..."
5,Article 5,Definitions specific to capital requirements f...,"For the purposes of Part Three, Title II, the ..."


In [6]:
def references(text):
    res = re.findall(r'(?=(?P<section>Articles?\W+(\w+)(\(\w+\))?((,| and) (\w+)(\(\w+\))?)*))(?P=section)(?! of)', text)
    res2 = re.findall(r'(?=(?P<section>Article?\W+(\w+)(\(\w+\))))(?P=section)(?! of)', text)
    res.append(res2)
    return res

In [7]:
df['References_internal'] = df['Paragraphs'].apply(references)

In [8]:
df.head()

Unnamed: 0,Article,Title,Paragraphs,References_internal
1,Article 1,Scope,This Regulation lays down uniform rules concer...,"[(Article 460, 460, , , , , ), []]"
2,Article 2,Supervisory powers,For the purposes of ensuring compliance with t...,[[]]
3,Article 3,Application of stricter requirements by instit...,This Regulation shall not prevent institutions...,[[]]
4,Article 4,Definitions,"1. For the purposes of this Regulation, the ...","[(Article 4(1), 4, (1), , , , ), (Article 4(1)..."
5,Article 5,Definitions specific to capital requirements f...,"For the purposes of Part Three, Title II, the ...",[[]]


In [9]:
def clean_references(text2):
    res2 = re.findall('\d+(?!\))', text2)
    return res2

In [10]:
def f4(seq): 
    noDupes = []
    [noDupes.append(i) for i in seq if not noDupes.count(i)]
    return noDupes

In [11]:
df['References_internal'] = df['References_internal'].astype(str)

In [12]:
df['References_internal_clean'] = df['References_internal'].apply(clean_references)

In [13]:
df['References_internal_clean'] = df['References_internal_clean'].apply(f4)

In [14]:
df.head()

Unnamed: 0,Article,Title,Paragraphs,References_internal,References_internal_clean
1,Article 1,Scope,This Regulation lays down uniform rules concer...,"[('Article 460', '460', '', '', '', '', ''), []]",[460]
2,Article 2,Supervisory powers,For the purposes of ensuring compliance with t...,[[]],[]
3,Article 3,Application of stricter requirements by instit...,This Regulation shall not prevent institutions...,[[]],[]
4,Article 4,Definitions,"1. For the purposes of this Regulation, the ...","[('Article 4(1)', '4', '(1)', '', '', '', ''),...","[4, 2, 115, 25, 71, 301, 113, 1]"
5,Article 5,Definitions specific to capital requirements f...,"For the purposes of Part Three, Title II, the ...",[[]],[]


In [15]:
# Save to CSV
header = ["Article", "Title", "Paragraphs", "References_internal", "References_internal_clean"]
df.to_csv("CRR_regulation_references.csv", sep=',', encoding='utf8', columns = header)