In [1]:
import pandas as pd

In [2]:
df = pd.read_pickle("df_cleaned_links.pkl")

In [4]:
paragraphs = df['Paragraphs_cleaned'].tolist()
articles = df['Article'].tolist()
internal_references = df['References_internal_clean'].tolist()

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                 min_df=0.2, stop_words='english',
                                 use_idf=True, ngram_range=(1,3))

tfidf_matrix = tfidf_vectorizer.fit_transform(paragraphs) #fit the vectorizer to paragraphs

print(tfidf_matrix.shape)

(524, 59)


In [6]:
npm_tfidf = tfidf_matrix.todense()

In [144]:
terms = tfidf_vectorizer.get_feature_names()

In [7]:
#TF-idf bigram
from sklearn.feature_extraction.text import TfidfVectorizer

#define vectorizer parameters
tfidf_vectorizer_bigram = TfidfVectorizer(max_df=0.8, max_features=200000,
                                 min_df=0.2, stop_words='english',
                                 use_idf=True, ngram_range=(2,2))

tfidf_matrix_bigram = tfidf_vectorizer_bigram.fit_transform(paragraphs) #fit the vectorizer to paragraphs

npm_tfidf_bigram = tfidf_matrix_bigram.todense()

In [8]:
#TF-idf trigram
from sklearn.feature_extraction.text import TfidfVectorizer

#define vectorizer parameters
tfidf_vectorizer_trigram = TfidfVectorizer(max_df=0.9, max_features=200000,
                                 min_df=0.1, stop_words='english',
                                 use_idf=True, ngram_range=(3,3))

tfidf_matrix_trigram = tfidf_vectorizer_trigram.fit_transform(paragraphs) #fit the vectorizer to paragraphs

npm_tfidf_trigram = tfidf_matrix_trigram.todense()

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer()
count_vectorizer_matrix = count_vectorizer.fit_transform(paragraphs)

count_vectorizer = count_vectorizer_matrix.todense()

In [10]:
import scipy
import numpy as np
import gensim
import re
import pandas as pd
from sklearn import feature_extraction
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models

In [11]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stopwords = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sophiekamuf/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
paragraphs = df['Paragraphs_cleaned'].tolist()

In [13]:
tokenizer = RegexpTokenizer(r'\w+')
p_stemmer = PorterStemmer()
texts = []

In [14]:
for i in paragraphs:
    
    # clean and tokenize document string
    raw = i.lower()
    tokens = tokenizer.tokenize(raw)

    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in stopwords]
    
    # stem tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    
    # add tokens to list
    texts.append(stemmed_tokens)

In [15]:
# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(texts)
# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts]

In [16]:
# generate LDA model 100 topics
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=100, id2word = dictionary, passes=20)

In [17]:
lda_corpus = [max(prob,key=lambda y:y[1])
                for prob in ldamodel[corpus] ]
playlists = [[] for i in range(100)]
for i, x in enumerate(lda_corpus):
    playlists[x[0]].append(paragraphs[i])

In [18]:
docTopicProbMat = ldamodel.get_document_topics(corpus,minimum_probability=0)
listDocProb = list(docTopicProbMat)

In [19]:
probMatrix = np.zeros(shape=(len(df["Paragraphs"]),100))
for i,x in enumerate(listDocProb):      #each document i
    for t in x:     #each topic j
        probMatrix[i, t[0]] = t[1] 

In [20]:
matrix100 = pd.DataFrame(probMatrix)

In [21]:
# generate LDA model 200 topics
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=200, id2word = dictionary, passes=20)

In [22]:
lda_corpus = [max(prob,key=lambda y:y[1])
                for prob in ldamodel[corpus] ]
playlists = [[] for i in range(200)]
for i, x in enumerate(lda_corpus):
    playlists[x[0]].append(paragraphs[i])
    
docTopicProbMat = ldamodel.get_document_topics(corpus,minimum_probability=0)
listDocProb = list(docTopicProbMat)

probMatrix = np.zeros(shape=(len(df["Paragraphs"]),200))
for i,x in enumerate(listDocProb):      #each document i
    for t in x:     #each topic j
        probMatrix[i, t[0]] = t[1] 
        
matrix200 = pd.DataFrame(probMatrix)

In [23]:
# generate LDA model 50 topics
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=50, id2word = dictionary, passes=20)

In [24]:
lda_corpus = [max(prob,key=lambda y:y[1])
                for prob in ldamodel[corpus] ]
playlists = [[] for i in range(50)]
for i, x in enumerate(lda_corpus):
    playlists[x[0]].append(paragraphs[i])
    
docTopicProbMat = ldamodel.get_document_topics(corpus,minimum_probability=0)
listDocProb = list(docTopicProbMat)

probMatrix = np.zeros(shape=(len(df["Paragraphs"]),50))
for i,x in enumerate(listDocProb):      #each document i
    for t in x:     #each topic j
        probMatrix[i, t[0]] = t[1] 
        
matrix50 = pd.DataFrame(probMatrix)

In [89]:
articles = list()
paragraphs = list()
link = list()
tfidf = list()
tfidf_bigram = list()
tfidf_trigram = list()
count_vec = list()
lda_50topics = list()
lda_100topics = list()
lda_200topics = list()


for x in range (1, len(df)):
        articles.append(df['Article'][x])
        paragraphs.append(df['Paragraphs_cleaned'][x])
        link.append(df['Link'][x])
        tfidf.append(npm_tfidf[x-1].tolist()[0])
        tfidf_bigram.append(npm_tfidf_bigram[x-1].tolist()[0])
        tfidf_trigram.append(npm_tfidf_trigram[x-1].tolist()[0])
        count_vec.append(count_vectorizer[x-1].tolist()[0])
        lda_50topics.append(matrix50[(x-1):x].values.tolist()[0])
        lda_100topics.append(matrix100[(x-1):x].values.tolist()[0])
        lda_200topics.append(matrix200[(x-1):x].values.tolist()[0])

In [95]:
crr_articles2 = { 'articles': articles,'paragraphs': paragraphs, 'link': link, 'tfidf': tfidf}

frame2 = pd.DataFrame(crr_articles2, columns = ['articles', 'paragraphs', 'link',  'tfidf'])

In [148]:
frame2

Unnamed: 0,articles,paragraphs,link,tfidf
0,Article 1,This Regulation lays down uniform rules concer...,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.11003925..."
1,Article 2,For the purposes of ensuring compliance with t...,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.37334782..."
2,Article 3,This Regulation shall not prevent institutions...,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,Article 4,"1. For the purposes of this Regulation, the ...",1,"[0.12488787918320336, 0.05709706285484591, 0.0..."
4,Article 5,"For the purposes of Part Three, Title II, the ...",0,"[0.0, 0.0, 0.0, 0.0, 0.31599520554972177, 0.0,..."
5,Article 6,1. Institutions shall comply with the obliga...,1,"[0.07344805553565283, 0.12312470721749982, 0.0..."
6,Article 7,1. Competent authorities may waive the appli...,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.31045806..."
7,Article 8,1. The competent authorities may waive in fu...,1,"[0.0, 0.0, 0.11443928865201265, 0.0, 0.0956361..."
8,Article 9,1. Subject to paragraphs 2 and 3 of this to...,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1144967085015..."
9,Article 10,"1. Competent authorities may, in accordance ...",0,"[0.0854840773389406, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [100]:
frame3 = frame2['tfidf'].apply(pd.Series)

In [105]:
frame3['articles'] = articles
frame3['link'] = link

In [106]:
frame3

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,articles,link
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.110039,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,Article 1,1
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.373348,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,Article 2,0
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,Article 3,0
3,0.124888,0.057097,0.018073,0.132917,0.007552,0.028447,0.341359,0.067431,0.090819,0.000000,...,0.009221,0.018634,0.107438,0.000000,0.007822,0.028048,0.071383,0.000000,Article 4,1
4,0.000000,0.000000,0.000000,0.000000,0.315995,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.385838,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,Article 5,0
5,0.073448,0.123125,0.000000,0.000000,0.000000,0.000000,0.000000,0.193878,0.534117,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,Article 6,1
6,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.310458,0.171057,0.000000,...,0.000000,0.096515,0.238495,0.000000,0.081030,0.000000,0.000000,0.000000,Article 7,1
7,0.000000,0.000000,0.114439,0.000000,0.095636,0.060040,0.180121,0.379524,0.313668,0.000000,...,0.000000,0.000000,0.097184,0.057602,0.000000,0.000000,0.000000,0.000000,Article 8,1
8,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.114497,0.452344,0.199388,0.000000,...,0.000000,0.000000,0.092665,0.000000,0.094451,0.000000,0.000000,0.000000,Article 9,1
9,0.085484,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.338473,0.248658,0.000000,...,0.000000,0.000000,0.000000,0.136991,0.117790,0.000000,0.000000,0.000000,Article 10,0


In [26]:
crr_articles = { 'articles': articles,'paragraphs': paragraphs, 'link': link, 'tfidf': tfidf, 'tfidf_bigram': tfidf_bigram, 'tfidf_trigram': tfidf_trigram, 
                'count_vec': count_vec, 
                'lda_50topics': lda_50topics, 'lda_100topics': lda_100topics, 'lda_200topics': lda_200topics}

frame = pd.DataFrame(crr_articles, columns = ['articles', 'paragraphs', 'link',  'tfidf', 'tfidf_bigram', 'tfidf_trigram', 'count_vec', 'lda_50topics', 'lda_100topics', 'lda_200topics'])

In [44]:
frame['tfidf'] = frame['tfidf'].apply(flatten1)
frame['tfidf_bigram'] = frame['tfidf_bigram'].apply(flatten1)
frame['tfidf_trigram'] = frame['tfidf_trigram'].apply(flatten1)
frame['count_vec'] = frame['count_vec'].apply(flatten1)
frame['lda_50topics'] = frame['lda_50topics'].apply(flatten1)
frame['lda_100topics'] = frame['lda_100topics'].apply(flatten1)
frame['lda_200topics'] = frame['lda_200topics'].apply(flatten1)

In [45]:
feature_headers = ['tfidf', 'tfidf_bigram', 'tfidf_trigram', 'count_vec', 'lda_50topics', 'lda_100topics', 'lda_200topics']
target_header = ['link']

In [117]:
feature_headers = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
50, 51, 52, 53, 54, 55, 56, 57, 58]
target_header = ['link']

In [118]:
from sklearn.model_selection import train_test_split
train_x, test_x, train_y, test_y = train_test_split(frame3[feature_headers], frame3[target_header],
                                                        train_size=0.7)



In [119]:
print(test_y.apply(pd.value_counts))
#test_y.groupby('label').count()
print(train_y.apply(pd.value_counts))

   link
1   125
0    32
   link
1   272
0    94


In [120]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB 
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn import datasets
import numpy as np
import pandas as pd

In [121]:
# Initializing Classifiers
clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()
clf4 = SVC()

In [122]:
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [124]:
clf1.fit(train_x, train_y)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [125]:
clf2.fit(train_x, train_y)

  """Entry point for launching an IPython kernel.


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=1, verbose=0, warm_start=False)

In [126]:
clf3.fit(train_x, train_y)

  y = column_or_1d(y, warn=True)


GaussianNB(priors=None)

In [127]:
clf4.fit(train_x, train_y)

  y = column_or_1d(y, warn=True)


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [129]:
predictions1 = clf1.predict(test_x)

In [130]:
print('Logistic Regression:')
print('Accuracy:', accuracy_score(test_y, predictions1))
print('F1 score:', f1_score(test_y, predictions1))
print('Recall:', recall_score(test_y, predictions1))
print('Precision:', precision_score(test_y, predictions1))

Logistic Regression:
Accuracy: 0.7961783439490446
F1 score: 0.8840579710144927
Recall: 0.976
Precision: 0.8079470198675497


In [132]:
predictions2 = clf2.predict(test_x)
print('Random Forest:')
print('Accuracy:', accuracy_score(test_y, predictions2))
print('F1 score:', f1_score(test_y, predictions2))
print('Recall:', recall_score(test_y, predictions2))
print('Precision:', precision_score(test_y, predictions2))

Random Forest:
Accuracy: 0.8280254777070064
F1 score: 0.8941176470588236
Recall: 0.912
Precision: 0.8769230769230769


In [133]:
predictions3 = clf3.predict(test_x)
print('GaussianNB:')
print('Accuracy:', accuracy_score(test_y, predictions3))
print('F1 score:', f1_score(test_y, predictions3))
print('Recall:', recall_score(test_y, predictions3))
print('Precision:', precision_score(test_y, predictions3))

GaussianNB:
Accuracy: 0.6369426751592356
F1 score: 0.7443946188340808
Recall: 0.664
Precision: 0.8469387755102041


In [134]:
predictions4 = clf4.predict(test_x)
print('SVM:')
print('Accuracy:', accuracy_score(test_y, predictions4))
print('F1 score:', f1_score(test_y, predictions4))
print('Recall:', recall_score(test_y, predictions4))
print('Precision:', precision_score(test_y, predictions4))

SVM:
Accuracy: 0.7961783439490446
F1 score: 0.8865248226950355
Recall: 1.0
Precision: 0.7961783439490446


In [40]:
train_x['tfidf'] = train_x['tfidf'].apply(flatten1)
train_x['tfidf_bigram'] = train_x['tfidf_bigram'].apply(flatten1)
train_x['tfidf_trigram'] = train_x['tfidf_trigram'].apply(flatten1)
train_x['count_vec'] = train_x['count_vec'].apply(flatten1)
train_x['lda_50topics'] = train_x['lda_50topics'].apply(flatten1)
train_x['lda_100topics'] = train_x['lda_100topics'].apply(flatten1)
train_x['lda_200topics'] = train_x['lda_200topics'].apply(flatten1)

In [35]:
def flatten1(list_of_lists):
    flattened_list = []
    for x in list_of_lists:
            flattened_list.append(x)
    return flattened_list

In [137]:
predictions2

array([1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
       1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1])

In [140]:
test_x['predictions_rf'] = predictions2

In [141]:
test_x

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,50,51,52,53,54,55,56,57,58,predictions_rf
455,0.536653,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.155201,0.000000,0.088182,...,0.252993,0.000000,0.087569,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1
405,0.000000,0.000000,0.156861,0.066843,0.335710,0.000000,0.066315,0.146154,0.000000,0.000000,...,0.059561,0.081616,0.082464,0.000000,0.080519,0.000000,0.000000,0.140404,0.000000,1
273,0.000000,0.173648,0.170274,0.145117,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.129309,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.304819,0.000000,1
285,0.198586,0.000000,0.000000,0.000000,0.065959,0.000000,0.000000,0.000000,0.063281,0.000000,...,0.093619,0.000000,0.000000,0.000000,0.000000,0.108822,0.195104,0.000000,0.000000,1
511,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
71,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
151,0.000000,0.241450,0.000000,0.057651,0.000000,0.000000,0.057196,0.000000,0.138894,0.035811,...,0.179798,0.070393,0.106686,0.146460,0.034724,0.029857,0.000000,0.000000,0.142249,1
275,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.097103,0.100145,...,0.574626,0.000000,0.000000,0.000000,0.000000,0.083493,0.099795,0.507962,0.000000,1
516,0.000000,0.000000,0.000000,0.000000,0.546765,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
177,0.000000,0.000000,0.000000,0.176514,0.055408,0.055408,0.087560,0.000000,0.000000,0.000000,...,0.039322,0.107764,0.163325,0.000000,0.106316,0.045707,0.000000,0.000000,0.000000,1


In [142]:
test_x_label1 = test_x[test_x.predictions_rf == 1]

In [143]:
test_x_label1

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,50,51,52,53,54,55,56,57,58,predictions_rf
455,0.536653,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.155201,0.000000,0.088182,...,0.252993,0.000000,0.087569,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1
405,0.000000,0.000000,0.156861,0.066843,0.335710,0.000000,0.066315,0.146154,0.000000,0.000000,...,0.059561,0.081616,0.082464,0.000000,0.080519,0.000000,0.000000,0.140404,0.000000,1
273,0.000000,0.173648,0.170274,0.145117,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.129309,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.304819,0.000000,1
285,0.198586,0.000000,0.000000,0.000000,0.065959,0.000000,0.000000,0.000000,0.063281,0.000000,...,0.093619,0.000000,0.000000,0.000000,0.000000,0.108822,0.195104,0.000000,0.000000,1
151,0.000000,0.241450,0.000000,0.057651,0.000000,0.000000,0.057196,0.000000,0.138894,0.035811,...,0.179798,0.070393,0.106686,0.146460,0.034724,0.029857,0.000000,0.000000,0.142249,1
275,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.097103,0.100145,...,0.574626,0.000000,0.000000,0.000000,0.000000,0.083493,0.099795,0.507962,0.000000,1
177,0.000000,0.000000,0.000000,0.176514,0.055408,0.055408,0.087560,0.000000,0.000000,0.000000,...,0.039322,0.107764,0.163325,0.000000,0.106316,0.045707,0.000000,0.000000,0.000000,1
422,0.000000,0.000000,0.079570,0.000000,0.000000,0.000000,0.201834,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.250986,0.137822,0.081689,0.000000,0.000000,0.071222,0.000000,1
476,0.000000,0.076014,0.149075,0.254098,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.232695,0.000000,0.129105,0.000000,0.000000,0.000000,0.066717,0.078371,1
507,0.000000,0.000000,0.000000,0.139007,0.698151,0.000000,0.137910,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1


In [152]:
test_x_label1['paragraphs'] = ''

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [None]:
index = test_x_label1.index.tolist()

In [215]:
paragraphs2 = []
for x in index:
    paragraphs2.append(paragraphs[x-1])

In [216]:
test_x_label1['paragraphs'] = paragraphs2

#References_internal_clean

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [217]:
references_internal = []
for x in index:
    references_internal.append(internal_references[x-1])

In [218]:
test_x_label1['references_internal'] = references_internal

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [219]:
articles2 = []
for x in index:
    articles2.append(articles[x-1])
test_x_label1['articles'] = articles2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [220]:
test_x_label1

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,53,54,55,56,57,58,predictions_rf,paragraphs,references_internal,articles
455,0.536653,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.155201,0.000000,0.088182,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1,Institutions calculating their capital require...,"[363, 104, 105, 364]",Article 455
405,0.000000,0.000000,0.156861,0.066843,0.335710,0.000000,0.066315,0.146154,0.000000,0.000000,...,0.000000,0.080519,0.000000,0.000000,0.140404,0.000000,1,"1. An institution, other than when acting as...","[408, 409]",Article 405
273,0.000000,0.173648,0.170274,0.145117,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.304819,0.000000,1,1. Institutions shall determine the exposure...,"[94, 282, 283, 233, 153, 183, 143, 299, 33, 291]",Article 273
285,0.198586,0.000000,0.000000,0.000000,0.065959,0.000000,0.000000,0.000000,0.063281,0.000000,...,0.000000,0.000000,0.108822,0.195104,0.000000,0.000000,1,1. If the netting set is subject to a margin...,[284],Article 285
151,0.000000,0.241450,0.000000,0.057651,0.000000,0.000000,0.057196,0.000000,0.138894,0.035811,...,0.146460,0.034724,0.029857,0.000000,0.000000,0.142249,1,1. The risk-weighted exposure amounts for cr...,"[157, 152, 147, 155, 153, 143, 161, 166, 1472,...",Article 151
275,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.097103,0.100145,...,0.000000,0.000000,0.083493,0.099795,0.507962,0.000000,1,1. The exposure value is the notional amount...,[],Article 275
177,0.000000,0.000000,0.000000,0.176514,0.055408,0.055408,0.087560,0.000000,0.000000,0.000000,...,0.000000,0.106316,0.045707,0.000000,0.000000,0.000000,1,1. An institution shall have in place sound ...,[153],Article 177
422,0.000000,0.000000,0.079570,0.000000,0.000000,0.000000,0.201834,0.000000,0.000000,0.000000,...,0.137822,0.081689,0.000000,0.000000,0.071222,0.000000,1,1. Institutions shall multiply liabilities r...,"[192, 418, 416, 113, 400, 425, 20, 460, 10]",Article 422
476,0.000000,0.076014,0.149075,0.254098,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.129105,0.000000,0.000000,0.000000,0.066717,0.078371,1,By way of derogation from the period from 1 J...,"[66, 477]",Article 476
507,0.000000,0.000000,0.000000,0.139007,0.698151,0.000000,0.137910,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1,"By 31 December 2015, the Commission shall revi...",[400],Article 507


In [222]:
paragraphs3 = test_x_label1['paragraphs'].tolist()
articles3 = test_x_label1['articles'].tolist()
internal_references3 = test_x_label1['references_internal'].tolist()

In [223]:
from sklearn.feature_extraction.text import TfidfVectorizer

#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                 min_df=0.2, stop_words='english',
                                 use_idf=True, ngram_range=(1,3))

tfidf_matrix = tfidf_vectorizer.fit_transform(paragraphs3) #fit the vectorizer to paragraphs

print(tfidf_matrix.shape)

(130, 82)


In [225]:
#K-means clustering
from sklearn.cluster import KMeans
num_clusters = 30
km = KMeans(n_clusters=num_clusters)
%time km.fit(tfidf_matrix)
clusters = km.labels_.tolist()

CPU times: user 302 ms, sys: 3.16 ms, total: 305 ms
Wall time: 305 ms


In [226]:
crr_articles = { 'article': articles3, 'paragraph': paragraphs3, 'cluster': clusters, 'internal_references': internal_references3}

frame3 = pd.DataFrame(crr_articles, index = [clusters], columns = ['article', 'cluster', 'internal_references'])

In [229]:
def assigned_articles(number):
    res = []
    for article in frame3.ix[number]['article'].values.tolist():
        res.append(article)
    return res

In [230]:
frame3['assigned_articles'] = frame3['cluster'].apply(assigned_articles)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  This is separate from the ipykernel package so we can avoid doing imports until


In [232]:
frame3.head()

Unnamed: 0,article,cluster,internal_references,assigned_articles
28,Article 455,28,"[363, 104, 105, 364]","[Article 455, Article 180, Article 188]"
9,Article 405,9,"[408, 409]","[Article 405, Article 142, Article 12, Article..."
26,Article 273,26,"[94, 282, 283, 233, 153, 183, 143, 299, 33, 291]","[Article 273, Article 207, Article 111, Articl..."
12,Article 285,12,[284],"[Article 285, Article 263, Article 149, Articl..."
8,Article 151,8,"[157, 152, 147, 155, 153, 143, 161, 166, 1472,...","[Article 151, Article 250, Article 246, Articl..."


In [233]:
def clean_references(text2):
    res2 = re.findall('\d+(?!\))', text2)
    return res2

In [234]:
frame3['assigned_articles'] = frame3['assigned_articles'].astype(str)
frame3['assigned_articles'] = frame3['assigned_articles'].apply(clean_references)

In [235]:
frame3.head()

Unnamed: 0,article,cluster,internal_references,assigned_articles
28,Article 455,28,"[363, 104, 105, 364]","[455, 180, 188]"
9,Article 405,9,"[408, 409]","[405, 142, 12, 118, 450]"
26,Article 273,26,"[94, 282, 283, 233, 153, 183, 143, 299, 33, 291]","[273, 207, 111, 383]"
12,Article 285,12,[284],"[285, 263, 149, 7, 363, 221]"
8,Article 151,8,"[157, 152, 147, 155, 153, 143, 161, 166, 1472,...","[151, 250, 246, 157, 241, 155, 109, 92]"


In [236]:
def count_references(references):
    references_count = 0
    for i in range(1, len(frame)):
        i = str(i)
        if i in references:
            references_count +=1
    return references_count

In [237]:
frame3['references_count'] = frame3['internal_references'].apply(count_references).astype(int)

In [238]:
frame3['assigned_articles_count'] = frame3['assigned_articles'].apply(count_references).astype(int)

In [239]:
frame3['correctly_labeled'] = [len(set(a).intersection(b)) for a, b in zip(frame3['internal_references'], frame3['assigned_articles'])]

In [240]:
frame3['precision'] = (frame3['correctly_labeled']/ frame3['assigned_articles_count'])

In [241]:
frame3['recall'] = (frame3['correctly_labeled']/ frame3['references_count'])

In [243]:
frame3.head(10)

Unnamed: 0,article,cluster,internal_references,assigned_articles,references_count,assigned_articles_count,correctly_labeled,precision,recall
28,Article 455,28,"[363, 104, 105, 364]","[455, 180, 188]",4,3,0,0.0,0.0
9,Article 405,9,"[408, 409]","[405, 142, 12, 118, 450]",2,5,0,0.0,0.0
26,Article 273,26,"[94, 282, 283, 233, 153, 183, 143, 299, 33, 291]","[273, 207, 111, 383]",10,4,0,0.0,0.0
12,Article 285,12,[284],"[285, 263, 149, 7, 363, 221]",1,6,0,0.0,0.0
8,Article 151,8,"[157, 152, 147, 155, 153, 143, 161, 166, 1472,...","[151, 250, 246, 157, 241, 155, 109, 92]",9,8,2,0.25,0.222222
7,Article 275,7,[],"[275, 236]",0,2,0,0.0,
5,Article 177,5,[153],"[177, 345, 322, 61, 297, 378, 308]",1,7,0,0.0,0.0
2,Article 422,2,"[192, 418, 416, 113, 400, 425, 20, 460, 10]","[422, 39, 35, 41, 420, 416]",9,6,1,0.166667,0.111111
13,Article 476,13,"[66, 477]","[476, 474, 490, 469]",2,4,0,0.0,0.0
0,Article 507,0,[400],"[507, 503, 505]",1,3,0,0.0,0.0


In [245]:
avg_precision = frame3['precision'].mean()
avg_recall = frame3['recall'].mean()

print('Average Precision: ')
print(avg_precision)
print('Average Recall: ')
print(avg_recall)

Average Precision: 
0.016923076923076926
Average Recall: 
0.0452156334231806
