In [214]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re

In [215]:
d = {'Article': [], 'Title': [], 'Paragraphs':[]}
df = pd.DataFrame(data=d)

In [216]:
# Create empty dataframe with three columns
df=pd.DataFrame(columns = ["Article", "Title", "Paragraphs"])
df

Unnamed: 0,Article,Title,Paragraphs


In [217]:
# Extracts data from the CRR (HTML format) and put the articles with titles 
# and paragraphs into the DF

page = requests.get("http://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:32013R0575&from=en")
soup = BeautifulSoup(page.content, 'lxml')

title, subtitle, para = '', '', ''
i = 1
for p in soup.find_all('p', class_=['ti-art', 'sti-art', 'normal']):
    if p['class'][0] == 'ti-art':
        if title:
            df.loc[i] = [title, subtitle, para]
            i+= 1
        title = p.text
        para = ''
        continue
    if p['class'][0] == 'sti-art':
        subtitle = p.text
        continue

    para += p.text + ' '

In [218]:
df.head()

Unnamed: 0,Article,Title,Paragraphs
1,Article 1,Scope,This Regulation lays down uniform rules concer...
2,Article 2,Supervisory powers,For the purposes of ensuring compliance with t...
3,Article 3,Application of stricter requirements by instit...,This Regulation shall not prevent institutions...
4,Article 4,Definitions,"1. For the purposes of this Regulation, the ..."
5,Article 5,Definitions specific to capital requirements f...,"For the purposes of Part Three, Title II, the ..."


In [219]:
def references(text):
    res = re.findall(r'(?=(?P<section>Articles?\W+(\w+)(\(\w+\))?((,| and) (\w+)(\(\w+\))?)*))(?P=section)(?! of)', text)
    res2 = re.findall(r'(?=(?P<section>Article?\W+(\w+)(\(\w+\))))(?P=section)(?! of)', text)
    res.append(res2)
    return res

In [220]:
df['References_internal'] = df['Paragraphs'].apply(references)

In [221]:
df.head()

Unnamed: 0,Article,Title,Paragraphs,References_internal
1,Article 1,Scope,This Regulation lays down uniform rules concer...,"[(Article 460, 460, , , , , ), []]"
2,Article 2,Supervisory powers,For the purposes of ensuring compliance with t...,[[]]
3,Article 3,Application of stricter requirements by instit...,This Regulation shall not prevent institutions...,[[]]
4,Article 4,Definitions,"1. For the purposes of this Regulation, the ...","[(Article 4(1), 4, (1), , , , ), (Article 4(1)..."
5,Article 5,Definitions specific to capital requirements f...,"For the purposes of Part Three, Title II, the ...",[[]]


In [222]:
def clean_references(text2):
    res2 = re.findall('\d+(?!\))', text2)
    return res2

In [223]:
def f4(seq): 
    noDupes = []
    [noDupes.append(i) for i in seq if not noDupes.count(i)]
    return noDupes

In [224]:
df['References_internal'] = df['References_internal'].astype(str)

In [225]:
df['References_internal_clean'] = df['References_internal'].apply(clean_references)

In [226]:
df['References_internal_clean'] = df['References_internal_clean'].apply(f4)

In [227]:
def references_replaced(text):
    res = re.sub(r'(?=(?P<section>Articles?\W+(\w+)(\(\w+\))?((,| and) (\w+)(\(\w+\))?)*))(?P=section)(?! of)', '', text)
    res2 = re.sub(r'(?=(?P<section>Article?\W+(\w+)(\(\w+\))))(?P=section)(?! of)', '', res)
    return res2

In [228]:
df['Paragraphs_cleaned'] = df['Paragraphs'].apply(references_replaced)

In [229]:
df.head()

Unnamed: 0,Article,Title,Paragraphs,References_internal,References_internal_clean,Paragraphs_cleaned
1,Article 1,Scope,This Regulation lays down uniform rules concer...,"[('Article 460', '460', '', '', '', '', ''), []]",[460],This Regulation lays down uniform rules concer...
2,Article 2,Supervisory powers,For the purposes of ensuring compliance with t...,[[]],[],For the purposes of ensuring compliance with t...
3,Article 3,Application of stricter requirements by instit...,This Regulation shall not prevent institutions...,[[]],[],This Regulation shall not prevent institutions...
4,Article 4,Definitions,"1. For the purposes of this Regulation, the ...","[('Article 4(1)', '4', '(1)', '', '', '', ''),...","[4, 2, 115, 25, 71, 301, 113, 1]","1. For the purposes of this Regulation, the ..."
5,Article 5,Definitions specific to capital requirements f...,"For the purposes of Part Three, Title II, the ...",[[]],[],"For the purposes of Part Three, Title II, the ..."


In [15]:
# Save to CSV
header = ["Article", "Title", "Paragraphs", "References_internal", "References_internal_clean"]
df.to_csv("CRR_regulation_references.csv", sep=',', encoding='utf8', columns = header)

In [230]:
from nltk.stem.snowball import SnowballStemmer 
df['tokenized'] = df['Paragraphs_cleaned'].astype(str).str.split()
stemmer = SnowballStemmer("english") 
df['stemmed'] = df['tokenized'].apply(lambda x: [stemmer.stem(y) for y in x])

In [231]:
del df['tokenized']
del df['References_internal']

In [232]:
df.head()

Unnamed: 0,Article,Title,Paragraphs,References_internal_clean,Paragraphs_cleaned,stemmed
1,Article 1,Scope,This Regulation lays down uniform rules concer...,[460],This Regulation lays down uniform rules concer...,"[this, regul, lay, down, uniform, rule, concer..."
2,Article 2,Supervisory powers,For the purposes of ensuring compliance with t...,[],For the purposes of ensuring compliance with t...,"[for, the, purpos, of, ensur, complianc, with,..."
3,Article 3,Application of stricter requirements by instit...,This Regulation shall not prevent institutions...,[],This Regulation shall not prevent institutions...,"[this, regul, shall, not, prevent, institut, f..."
4,Article 4,Definitions,"1. For the purposes of this Regulation, the ...","[4, 2, 115, 25, 71, 301, 113, 1]","1. For the purposes of this Regulation, the ...","[1., for, the, purpos, of, this, regulation,, ..."
5,Article 5,Definitions specific to capital requirements f...,"For the purposes of Part Three, Title II, the ...",[],"For the purposes of Part Three, Title II, the ...","[for, the, purpos, of, part, three,, titl, ii,..."


In [233]:
import numpy as np
import pandas as pd
import nltk
import re
import os
import codecs
from sklearn import feature_extraction
#import mpld3

In [234]:
paragraphs = df['Paragraphs_cleaned'].tolist()
articles = df['Article'].tolist()
internal_references = df['References_internal_clean'].tolist()

In [235]:
from sklearn.feature_extraction.text import TfidfVectorizer

#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                 min_df=0.2, stop_words='english',
                                 use_idf=True, ngram_range=(1,3))

#tokenizer=tokenize_and_stem

tfidf_matrix = tfidf_vectorizer.fit_transform(paragraphs) #fit the vectorizer to synopses

print(tfidf_matrix.shape)

(524, 59)


In [236]:
terms = tfidf_vectorizer.get_feature_names()

In [237]:
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(tfidf_matrix)
print(dist)

[[ 0.00000000e+00  6.01156660e-01  6.04823963e-01 ...  6.52632556e-01
   7.26070350e-01  8.84983268e-01]
 [ 6.01156660e-01  0.00000000e+00  6.53937797e-01 ...  5.50781798e-01
   4.90104691e-01  8.86698911e-01]
 [ 6.04823963e-01  6.53937797e-01  0.00000000e+00 ...  5.36497097e-01
   7.69091088e-01  9.18418571e-01]
 ...
 [ 6.52632556e-01  5.50781798e-01  5.36497097e-01 ... -2.22044605e-16
   4.88642044e-01  6.79274471e-01]
 [ 7.26070350e-01  4.90104691e-01  7.69091088e-01 ...  4.88642044e-01
  -2.22044605e-16  4.68592992e-01]
 [ 8.84983268e-01  8.86698911e-01  9.18418571e-01 ...  6.79274471e-01
   4.68592992e-01 -2.22044605e-16]]


In [238]:
#K-means clustering
from sklearn.cluster import KMeans
num_clusters = 100
km = KMeans(n_clusters=num_clusters)
%time km.fit(tfidf_matrix)
clusters = km.labels_.tolist()

Wall time: 1.83 s


In [239]:
from sklearn.externals import joblib

joblib.dump(km,  'doc_cluster.pkl')

km = joblib.load('doc_cluster.pkl')
clusters = km.labels_.tolist()

In [244]:
crr_articles = { 'article': articles, 'paragraph': paragraphs, 'cluster': clusters, 'internal_references': internal_references}

frame = pd.DataFrame(crr_articles, index = [clusters], columns = ['article', 'cluster', 'internal_references'])

In [263]:
frame.head(10)

Unnamed: 0,article,cluster,internal_references,assigned_articles
93,Article 1,93,[460],"[1, 397]"
79,Article 2,79,[],"[2, 146, 295, 380, 407, 414]"
83,Article 3,83,[],"[3, 436]"
19,Article 4,19,"[4, 2, 115, 25, 71, 301, 113, 1]","[4, 12, 98, 118, 142, 349, 405, 411, 450]"
67,Article 5,67,[],"[5, 192, 392]"
66,Article 6,66,"[19, 89, 90, 91, 508, 95, 7, 96]","[6, 16, 459]"
36,Article 7,36,"[6, 11]","[7, 8, 9, 10, 17, 18, 49, 314, 396]"
36,Article 8,36,"[21, 113]","[7, 8, 9, 10, 17, 18, 49, 314, 396]"
36,Article 9,36,"[6, 7]","[7, 8, 9, 10, 17, 18, 49, 314, 396]"
36,Article 10,36,[],"[7, 8, 9, 10, 17, 18, 49, 314, 396]"


In [246]:
def assigned_articles(number):
    res = []
    for article in frame.ix[number]['article'].values.tolist():
        res.append(article)
    return res

In [247]:
frame['assigned_articles'] = frame['cluster'].apply(assigned_articles)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  This is separate from the ipykernel package so we can avoid doing imports until


In [264]:
frame.head()

Unnamed: 0,article,cluster,internal_references,assigned_articles
93,Article 1,93,[460],"[1, 397]"
79,Article 2,79,[],"[2, 146, 295, 380, 407, 414]"
83,Article 3,83,[],"[3, 436]"
19,Article 4,19,"[4, 2, 115, 25, 71, 301, 113, 1]","[4, 12, 98, 118, 142, 349, 405, 411, 450]"
67,Article 5,67,[],"[5, 192, 392]"


In [257]:
frame['assigned_articles'] = frame['assigned_articles'].astype(str)
frame['assigned_articles'] = frame['assigned_articles'].apply(clean_references)

In [262]:
joblib.dump(frame,'kmeans_cluster_frame.pkl')

['kmeans_cluster_frame.pkl']

In [265]:
frame["matches"] = [set(frame.loc[r, "internal_references"]) & set(frame.loc[r, "assigned_articles"]) for r in range(len(frame))]

TypeError: unhashable type: 'list'

In [181]:
from __future__ import print_function

#sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

for i in range(num_clusters):
    print("Cluster %d articles:" % i, end='')
    for article in frame.ix[i]['article'].values.tolist():
        print(' %s,' % article, end='')
    print() #add whitespace
    print() #add whitespace
    
print()
print()

Cluster 0 articles: Article 296, Article 298, Article 416, Article 418, Article 421, Article 422, Article 425, Article 509,

Cluster 1 articles: Article 101, Article 124, Article 143, Article 144, Article 150, Article 164, Article 173, Article 394, Article 495,

Cluster 2 articles: Article 212, Article 248, Article 308, Article 309, Article 342, Article 378, Article 391,

Cluster 3 articles: Article 129, Article 160, Article 201, Article 202, Article 247, Article 254, Article 399,

Cluster 4 articles: Article 303, Article 326, Article 327, Article 328, Article 333, Article 334, Article 337, Article 348, Article 350, Article 371, Article 373,

Cluster 5 articles: Article 34, Article 89, Article 465, Article 492, Article 499,

Cluster 6 articles: Article 51, Article 64, Article 198, Article 483, Article 486,

Cluster 7 articles: Article 217, Article 299, Article 403, Article 444,

Cluster 8 articles: Article 504, Article 508, Article 514,

Cluster 9 articles: Article 275, Article 392,

C