In [16]:
import scipy
import numpy
import gensim
import pandas as pd
import re
from sklearn import feature_extraction

In [4]:
df = pd.read_pickle("df_cleaned.pkl")

In [5]:
df.head()

Unnamed: 0,Article,Title,Paragraphs,References_internal_clean,Paragraphs_cleaned
1,Article 1,Scope,This Regulation lays down uniform rules concer...,[460],This Regulation lays down uniform rules concer...
2,Article 2,Supervisory powers,For the purposes of ensuring compliance with t...,[],For the purposes of ensuring compliance with t...
3,Article 3,Application of stricter requirements by instit...,This Regulation shall not prevent institutions...,[],This Regulation shall not prevent institutions...
4,Article 4,Definitions,"1. For the purposes of this Regulation, the ...","[4, 2, 115, 25, 71, 301, 113, 1]","1. For the purposes of this Regulation, the ..."
5,Article 5,Definitions specific to capital requirements f...,"For the purposes of Part Three, Title II, the ...",[],"For the purposes of Part Three, Title II, the ..."


In [7]:
paragraphs = df['Paragraphs_cleaned'].tolist()
articles = df['Article'].tolist()
internal_references = df['References_internal_clean'].tolist()

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                 min_df=0.2, stop_words='english',
                                 use_idf=True, ngram_range=(1,3))

tfidf_matrix = tfidf_vectorizer.fit_transform(paragraphs) #fit the vectorizer to paragraphs

print(tfidf_matrix.shape)

(524, 59)


In [9]:
terms = tfidf_vectorizer.get_feature_names()

In [10]:
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(tfidf_matrix)
print(dist)

[[ 0.00000000e+00  6.01156660e-01  6.04823963e-01 ...  6.52632556e-01
   7.26070350e-01  8.84983268e-01]
 [ 6.01156660e-01  0.00000000e+00  6.53937797e-01 ...  5.50781798e-01
   4.90104691e-01  8.86698911e-01]
 [ 6.04823963e-01  6.53937797e-01  0.00000000e+00 ...  5.36497097e-01
   7.69091088e-01  9.18418571e-01]
 ...
 [ 6.52632556e-01  5.50781798e-01  5.36497097e-01 ... -2.22044605e-16
   4.88642044e-01  6.79274471e-01]
 [ 7.26070350e-01  4.90104691e-01  7.69091088e-01 ...  4.88642044e-01
  -2.22044605e-16  4.68592992e-01]
 [ 8.84983268e-01  8.86698911e-01  9.18418571e-01 ...  6.79274471e-01
   4.68592992e-01 -2.22044605e-16]]


In [11]:
#K-means clustering
from sklearn.cluster import KMeans
num_clusters = 100
km = KMeans(n_clusters=num_clusters)
%time km.fit(tfidf_matrix)
clusters = km.labels_.tolist()

Wall time: 2.17 s


In [12]:
crr_articles = { 'article': articles, 'paragraph': paragraphs, 'cluster': clusters, 'internal_references': internal_references}

frame = pd.DataFrame(crr_articles, index = [clusters], columns = ['article', 'cluster', 'internal_references'])

In [24]:
def assigned_articles(number):
    res = []
    for article in frame.ix[number]['article'].values.tolist():
        res.append(article)
    return res

In [25]:
frame['assigned_articles'] = frame['cluster'].apply(assigned_articles)

In [26]:
frame.head()

Unnamed: 0,article,cluster,internal_references,assigned_articles
21,Article 1,21,[460],"[Article 1, Article 364, Article 397, Article ..."
39,Article 2,39,[],"[Article 2, Article 18, Article 20, Article 21..."
35,Article 3,35,[],"[Article 3, Article 135, Article 463, Article ..."
23,Article 4,23,"[4, 2, 115, 25, 71, 301, 113, 1]","[Article 4, Article 19, Article 49, Article 396]"
59,Article 5,59,[],"[Article 5, Article 192]"


In [28]:
def clean_references(text2):
    res2 = re.findall('\d+(?!\))', text2)
    return res2

In [29]:
frame['assigned_articles'] = frame['assigned_articles'].astype(str)
frame['assigned_articles'] = frame['assigned_articles'].apply(clean_references)

In [30]:
frame.head()

Unnamed: 0,article,cluster,internal_references,assigned_articles
21,Article 1,21,[460],"[1, 364, 397, 445, 457]"
39,Article 2,39,[],"[2, 18, 20, 21, 76, 100, 415, 430]"
35,Article 3,35,[],"[3, 135, 463, 513, 515]"
23,Article 4,23,"[4, 2, 115, 25, 71, 301, 113, 1]","[4, 19, 49, 396]"
59,Article 5,59,[],"[5, 192]"


In [31]:
def count_references(references):
    references_count = 0
    for i in range(1, len(frame)):
        i = str(i)
        if i in references:
            references_count +=1
    return references_count

In [32]:
frame['references_count'] = frame['internal_references'].apply(count_references).astype(int)

In [33]:
frame['assigned_articles_count'] = frame['assigned_articles'].apply(count_references).astype(int)

In [34]:
frame['correctly_labeled'] = [len(set(a).intersection(b)) for a, b in zip(frame['internal_references'], frame['assigned_articles'])]

In [36]:
frame['precision'] = (frame['correctly_labeled']/ frame['assigned_articles_count'])

In [37]:
frame['recall'] = (frame['correctly_labeled']/ frame['references_count'])

In [39]:
frame.head(10)

Unnamed: 0,article,cluster,internal_references,assigned_articles,references_count,assigned_articles_count,correctly_labeled,precision,recall
21,Article 1,21,[460],"[1, 364, 397, 445, 457]",1,5,0,0.0,0.0
39,Article 2,39,[],"[2, 18, 20, 21, 76, 100, 415, 430]",0,8,0,0.0,
35,Article 3,35,[],"[3, 135, 463, 513, 515]",0,5,0,0.0,
23,Article 4,23,"[4, 2, 115, 25, 71, 301, 113, 1]","[4, 19, 49, 396]",8,4,1,0.25,0.125
59,Article 5,59,[],"[5, 192]",0,2,0,0.0,
44,Article 6,44,"[19, 89, 90, 91, 508, 95, 7, 96]","[6, 16, 459]",8,3,0,0.0,0.0
17,Article 7,17,"[6, 11]","[7, 8, 9, 10, 27, 146, 148, 283, 295, 296, 312...",2,15,0,0.0,0.0
17,Article 8,17,"[21, 113]","[7, 8, 9, 10, 27, 146, 148, 283, 295, 296, 312...",2,15,0,0.0,0.0
17,Article 9,17,"[6, 7]","[7, 8, 9, 10, 27, 146, 148, 283, 295, 296, 312...",2,15,1,0.066667,0.5
17,Article 10,17,[],"[7, 8, 9, 10, 27, 146, 148, 283, 295, 296, 312...",0,15,0,0.0,


In [40]:
avg_precision = frame['precision'].mean()
avg_recall = frame['recall'].mean()

print('Average Precision: ')
print(avg_precision)
print('Average Recall: ')
print(avg_recall)

Average Precision: 
0.03007732036835088
Average Recall: 
0.11513755646921477


In [41]:
from __future__ import print_function

#sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

for i in range(num_clusters):
    print("Cluster %d articles:" % i, end='')
    for article in frame.ix[i]['article'].values.tolist():
        print(' %s,' % article, end='')
    print() #add whitespace
    print() #add whitespace
    
print()
print()

Cluster 0 articles: Article 63, Article 77,

Cluster 1 articles: Article 174, Article 175, Article 179, Article 180, Article 190, Article 376, Article 446,

Cluster 2 articles: Article 33, Article 198, Article 229, Article 366,

Cluster 3 articles: Article 24, Article 466, Article 479, Article 481, Article 487,

Cluster 4 articles: Article 247, Article 384, Article 389, Article 399, Article 500,

Cluster 5 articles: Article 92, Article 163, Article 291, Article 293, Article 323, Article 367, Article 372, Article 407, Article 454,

Cluster 6 articles: Article 112, Article 172, Article 176, Article 302, Article 387, Article 398, Article 409, Article 447,

Cluster 7 articles: Article 103, Article 338, Article 345, Article 357, Article 359, Article 374,

Cluster 8 articles: Article 14,

Cluster 9 articles: Article 32, Article 41, Article 304, Article 341,

Cluster 10 articles: Article 25, Article 86, Article 494,

Cluster 11 articles: Article 199, Article 211, Article 227,

Cluster 12 arti