In [39]:
import scipy
import numpy as np
import gensim
import pandas as pd
from sklearn import feature_extraction
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models

In [10]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stopwords = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\skamuf\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [5]:
df = pd.read_pickle("df_cleaned.pkl")

In [6]:
df.head()

Unnamed: 0,Article,Title,Paragraphs,References_internal_clean,Paragraphs_cleaned
1,Article 1,Scope,This Regulation lays down uniform rules concer...,[460],This Regulation lays down uniform rules concer...
2,Article 2,Supervisory powers,For the purposes of ensuring compliance with t...,[],For the purposes of ensuring compliance with t...
3,Article 3,Application of stricter requirements by instit...,This Regulation shall not prevent institutions...,[],This Regulation shall not prevent institutions...
4,Article 4,Definitions,"1. For the purposes of this Regulation, the ...","[4, 2, 115, 25, 71, 301, 113, 1]","1. For the purposes of this Regulation, the ..."
5,Article 5,Definitions specific to capital requirements f...,"For the purposes of Part Three, Title II, the ...",[],"For the purposes of Part Three, Title II, the ..."


In [7]:
paragraphs = df['Paragraphs_cleaned'].tolist()

In [8]:
tokenizer = RegexpTokenizer(r'\w+')

In [13]:
p_stemmer = PorterStemmer()

In [14]:
texts = []

In [15]:
for i in paragraphs:
    
    # clean and tokenize document string
    raw = i.lower()
    tokens = tokenizer.tokenize(raw)

    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in stopwords]
    
    # stem tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    
    # add tokens to list
    texts.append(stemmed_tokens)

In [16]:
texts

[['regul',
  'lay',
  'uniform',
  'rule',
  'concern',
  'gener',
  'prudenti',
  'requir',
  'institut',
  'supervis',
  'direct',
  '2013',
  '36',
  'eu',
  'shall',
  'compli',
  'relat',
  'follow',
  'item',
  'fund',
  'requir',
  'relat',
  'entir',
  'quantifi',
  'uniform',
  'standardis',
  'element',
  'credit',
  'risk',
  'market',
  'risk',
  'oper',
  'risk',
  'settlement',
  'risk',
  'b',
  'requir',
  'limit',
  'larg',
  'exposur',
  'c',
  'deleg',
  'act',
  'refer',
  'enter',
  'forc',
  'liquid',
  'requir',
  'relat',
  'entir',
  'quantifi',
  'uniform',
  'standardis',
  'element',
  'liquid',
  'risk',
  'report',
  'requir',
  'relat',
  'point',
  'b',
  'c',
  'leverag',
  'e',
  'public',
  'disclosur',
  'requir',
  'regul',
  'govern',
  'public',
  'requir',
  'compet',
  'author',
  'field',
  'prudenti',
  'regul',
  'supervis',
  'institut',
  'set',
  'direct',
  '2013',
  '36',
  'eu'],
 ['purpos',
  'ensur',
  'complianc',
  'regul',
  'compe

In [17]:
# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(texts)

In [18]:
# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts]

In [20]:
# generate LDA model
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=100, id2word = dictionary, passes=20)

  diff = np.log(self.expElogbeta)


In [23]:
print(ldamodel)

LdaModel(num_terms=2442, num_topics=100, decay=0.5, chunksize=2000)


In [28]:
ldamodel.print_topics(10)

[(0,
  '0.041*"exposur" + 0.029*"mean" + 0.025*"set" + 0.024*"transact" + 0.021*"net" + 0.017*"institut" + 0.016*"counterparti" + 0.015*"market" + 0.013*"margin" + 0.013*"valu"'),
 (16,
  '0.051*"shall" + 0.045*"risk" + 0.036*"institut" + 0.024*"loss" + 0.016*"stress" + 0.015*"data" + 0.014*"test" + 0.012*"oper" + 0.011*"measur" + 0.011*"exposur"'),
 (98,
  '0.144*"exposur" + 0.089*"risk" + 0.069*"weight" + 0.058*"amount" + 0.052*"shall" + 0.048*"calcul" + 0.044*"accord" + 0.027*"institut" + 0.026*"loss" + 0.021*"expect"'),
 (31,
  '0.037*"institut" + 0.025*"shall" + 0.021*"asset" + 0.020*"liquid" + 0.014*"author" + 0.013*"compet" + 0.011*"paragraph" + 0.010*"set" + 0.010*"invest" + 0.010*"refer"'),
 (91,
  '0.049*"requir" + 0.038*"fund" + 0.028*"1" + 0.026*"institut" + 0.026*"articl" + 0.025*"regul" + 0.024*"calcul" + 0.024*"eu" + 0.024*"refer" + 0.023*"2013"'),
 (79,
  '0.057*"institut" + 0.040*"liquid" + 0.035*"sub" + 0.027*"part" + 0.026*"compet" + 0.026*"group" + 0.025*"author" + 

In [29]:
#lda_corpus = ldamodel[corpus]

In [34]:
lda_corpus = [max(prob,key=lambda y:y[1])
                for prob in ldamodel[corpus] ]
playlists = [[] for i in range(100)]
for i, x in enumerate(lda_corpus):
    playlists[x[0]].append(paragraphs[i])

In [35]:
playlists

[["1.\xa0\xa0\xa0Institutions that have not received permission to use own LGDs and own conversion factors for exposures to corporates, institutions or central governments and central banks shall assign to exposures arising from repurchase transactions or securities or commodities lending or borrowing transactions a maturity value (M) of 0,5 years and to all other exposures an M of 2,5 years. Alternatively, as part of the permission referred to in  competent authorities shall decide on whether the institution shall use maturity (M) for each exposure as set out under paragraph 2. 2.\xa0\xa0\xa0Institutions that have received the permission of the competent authority to use own LGDs and own conversion factors for exposures to corporates, institutions or central governments and central banks pursuant to  shall calculate M for each of these exposures as set out in points (a) to (e) of this paragraph and subject to paragraphs 3 to 5 of this  shall be no greater than five years except in the

In [36]:
docTopicProbMat = ldamodel.get_document_topics(corpus,minimum_probability=0)

In [37]:
listDocProb = list(docTopicProbMat)

In [41]:
probMatrix = np.zeros(shape=(len(df["Paragraphs"]),100))
for i,x in enumerate(listDocProb):      #each document i
    for t in x:     #each topic j
        probMatrix[i, t[0]] = t[1] 

In [42]:
matrix = pd.DataFrame(probMatrix)

In [46]:
matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.000119,0.000119,0.000119,0.000119,0.000119,0.000119,0.000119,0.000119,0.000119,0.000119,...,0.000119,0.330982,0.000119,0.000119,0.000119,0.000119,0.000119,0.000119,0.000119,0.026353
1,0.000588,0.000588,0.000588,0.000588,0.000588,0.000588,0.000588,0.000588,0.000588,0.000588,...,0.000588,0.000588,0.000588,0.000588,0.000588,0.000588,0.000588,0.000588,0.000588,0.000588
2,0.000714,0.000714,0.000714,0.000714,0.000714,0.000714,0.000714,0.000714,0.000714,0.000714,...,0.000714,0.000714,0.000714,0.000714,0.000714,0.000714,0.000714,0.000714,0.000714,0.000714
3,0.221598,0.000003,0.009640,0.000003,0.000003,0.016681,0.000003,0.000736,0.000003,0.020191,...,0.000003,0.019573,0.006125,0.006794,0.000003,0.002234,0.003722,0.034663,0.000003,0.026780
4,0.269140,0.000189,0.000189,0.000189,0.000189,0.000189,0.000189,0.000189,0.404372,0.000189,...,0.000189,0.000189,0.000189,0.000189,0.000189,0.000189,0.000189,0.000189,0.000189,0.000189
5,0.000078,0.000078,0.000078,0.000078,0.000078,0.000078,0.000078,0.000078,0.000078,0.000078,...,0.000078,0.000078,0.000078,0.000078,0.000078,0.000078,0.000078,0.000078,0.000078,0.000078
6,0.000049,0.000049,0.000049,0.000049,0.000049,0.000049,0.000049,0.000049,0.000049,0.037508,...,0.000049,0.000049,0.000049,0.000049,0.000049,0.000049,0.000049,0.000049,0.000049,0.000049
7,0.000031,0.000031,0.000031,0.000031,0.000031,0.000031,0.000031,0.000031,0.000031,0.048269,...,0.000031,0.000031,0.000031,0.000031,0.000031,0.000031,0.000031,0.000031,0.000031,0.000031
8,0.000089,0.000089,0.000089,0.000089,0.000089,0.000089,0.000089,0.000089,0.000089,0.000089,...,0.000089,0.370169,0.000089,0.000089,0.000089,0.000089,0.000089,0.110282,0.000089,0.000089
9,0.000081,0.039598,0.000081,0.000081,0.000081,0.000081,0.000081,0.000081,0.000081,0.294215,...,0.000081,0.000081,0.000081,0.082969,0.000081,0.000081,0.000081,0.000081,0.000081,0.000081


In [69]:
list1= [];
for i in range(0, len(df["Paragraphs"])):
    x = 0;
    for j in range(0, 99):
        if matrix[j][i] > x:
            x = matrix[j][i];
            z = j;
    list1.append(z);

In [70]:
results = [int(i) for i in list1]

In [71]:
se = pd.Series(results)

In [72]:
df['cluster'] = se.values

In [73]:
df

Unnamed: 0,Article,Title,Paragraphs,References_internal_clean,Paragraphs_cleaned,cluster
1,Article 1,Scope,This Regulation lays down uniform rules concer...,[460],This Regulation lays down uniform rules concer...,91
2,Article 2,Supervisory powers,For the purposes of ensuring compliance with t...,[],For the purposes of ensuring compliance with t...,48
3,Article 3,Application of stricter requirements by instit...,This Regulation shall not prevent institutions...,[],This Regulation shall not prevent institutions...,41
4,Article 4,Definitions,"1. For the purposes of this Regulation, the ...","[4, 2, 115, 25, 71, 301, 113, 1]","1. For the purposes of this Regulation, the ...",73
5,Article 5,Definitions specific to capital requirements f...,"For the purposes of Part Three, Title II, the ...",[],"For the purposes of Part Three, Title II, the ...",8
6,Article 6,General principles,1. Institutions shall comply with the obliga...,"[19, 89, 90, 91, 508, 95, 7, 96]",1. Institutions shall comply with the obliga...,10
7,Article 7,Derogation to the application of prudential re...,1. Competent authorities may waive the appli...,"[6, 11]",1. Competent authorities may waive the appli...,10
8,Article 8,Derogation to the application of liquidity req...,1. The competent authorities may waive in fu...,"[21, 113]",1. The competent authorities may waive in fu...,79
9,Article 9,Individual consolidation method,1. Subject to paragraphs 2 and 3 of this Art...,"[6, 7]",1. Subject to paragraphs 2 and 3 of this to...,91
10,Article 10,Waiver for credit institutions permanently aff...,"1. Competent authorities may, in accordance ...",[],"1. Competent authorities may, in accordance ...",9


In [74]:
def assigned_articles(number):
    res = []
    for article in df.ix[number]['Article'].values.tolist():
        res.append(article)
    return res

In [75]:
df['assigned_articles'] = df['cluster'].apply(assigned_articles)

AttributeError: 'str' object has no attribute 'values'