#### Topic Modelling 

In [1]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
import random
from pprint import pprint

import gensim
from gensim import corpora, models
from gensim.models import LdaModel, LdaMulticore
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import nltk
# Fetch the WordNet data used by WordNetLemmatizer; quiet=True keeps the
# download log out of the cell output.
nltk.download('wordnet', quiet=True)

True

In [2]:
# Load the corpus and drop exact duplicate rows.
# reset_index(drop=True) renumbers the remaining rows 0..n-1 so that DataFrame
# labels stay aligned with positions in the plain Python lists built later
# (e.g. bow_corpus), which are indexed positionally.
corpus = (
    pd.read_csv('corpus.csv')
    .drop_duplicates()
    .reset_index(drop=True)
)
corpus.shape

(4911, 2)

In [3]:
# Peek at the first five rows to check the loaded columns.
corpus.head(5)

Unnamed: 0,Section_ID,Section_Text
0,Cite-1,authority. this part is issued pursuant to 12 ...
1,Cite-2,purpose this part prescribes standards under w...
2,Cite-3,scope. the standards set forth in this part ap...
3,Cite-4,reservation of authority. the occ may determin...
4,Cite-43,obligation issued by an obligor not possessing...


#### Cleaning, lemmatizing and stemming docs

In [4]:
# Single English Snowball stemmer instance, reused by lemmatize_stemming.
stemmer = SnowballStemmer(language="english")

In [5]:
# Built once and reused; the previous version constructed a new lemmatizer
# for every single token.
_lemmatizer = WordNetLemmatizer()


def lemmatize_stemming(text):
    """Lemmatize a token as a verb, then stem the result.

    Args:
        text: A single token (string).

    Returns:
        The stem produced by the module-level ``stemmer`` for the
        verb-lemmatized (``pos='v'``) form of ``text``.
    """
    return stemmer.stem(_lemmatizer.lemmatize(text, pos='v'))

def preprocess(text):
    """Tokenize ``text`` and return its normalized tokens.

    Tokens come from gensim's ``simple_preprocess``. A token is kept only when
    it is longer than three characters and is not in gensim's ``STOPWORDS``;
    each kept token is then passed through ``lemmatize_stemming``.
    """
    return [
        lemmatize_stemming(word)
        for word in simple_preprocess(text)
        if len(word) > 3 and word not in STOPWORDS
    ]

# Compiled once: matches every character that is not a lowercase ASCII letter or digit.
_NON_ALNUM_RE = re.compile(r"[^a-z0-9]")


def clean_text(text):
    """Lowercase ``text`` and replace every character outside a-z / 0-9 with a space.

    Missing values (``None`` or a float NaN, which pandas uses for empty cells)
    yield an empty string instead of raising ``AttributeError``; any other
    non-string value is converted with ``str`` first.

    Args:
        text: The raw text of one document.

    Returns:
        The lowercased text with each non-alphanumeric character replaced by a
        single space (runs are not collapsed).
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    return _NON_ALNUM_RE.sub(" ", str(text).lower())

In [6]:
# Clean each section's raw text, then tokenize / lemmatize / stem it.
preprocessed = (
    corpus["Section_Text"]
    .apply(clean_text)
    .apply(preprocess)
)

In [7]:
# First five token lists after preprocessing.
preprocessed.head(5)

0                    [author, issu, pursuant, seventh]
1    [purpos, prescrib, standard, nation, bank, pur...
2    [scope, standard, forth, appli, nation, bank, ...
3    [reserv, author, determin, case, case, basi, n...
4    [oblig, issu, obligor, possess, general, power...
Name: Section_Text, dtype: object

In [8]:
# The matching raw texts, for comparison with the token lists above.
corpus["Section_Text"].head(5)

0    authority. this part is issued pursuant to 12 ...
1    purpose this part prescribes standards under w...
2    scope. the standards set forth in this part ap...
3    reservation of authority. the occ may determin...
4    obligation issued by an obligor not possessing...
Name: Section_Text, dtype: object

#### Creating bag-of-words (BOW) corpus

In [9]:
# Build the token <-> integer id mapping from the tokenized documents.
dictionary = corpora.Dictionary(documents=preprocessed)

In [10]:
# Prune the vocabulary in place before building the corpus.
dictionary.filter_extremes(
    no_below=15,    # drop tokens that appear in fewer than 15 documents
    no_above=0.5,   # drop tokens that appear in more than 50% of documents
    keep_n=100000,  # then keep at most the 100000 most frequent tokens
)

In [11]:
# Convert each tokenized document to its bag-of-words form:
# a list of (token id, count) pairs.
bow_corpus = list(map(dictionary.doc2bow, preprocessed))

In [12]:
# Bag-of-words example: list the token id, token and count for one document.
index = 4310
ex = bow_corpus[index]
for token_id, count in ex:
    print("Word {} (\"{}\"),  Count - {}.".format(token_id, dictionary[token_id], count))

Word 1 ("issu"),  Count - 2.
Word 4 ("bank"),  Count - 7.
Word 6 ("contain"),  Count - 1.
Word 9 ("nation"),  Count - 5.
Word 22 ("appli"),  Count - 5.
Word 27 ("feder"),  Count - 5.
Word 36 ("state"),  Count - 3.
Word 38 ("subject"),  Count - 1.
Word 49 ("determin"),  Count - 2.
Word 56 ("paragraph"),  Count - 1.
Word 63 ("section"),  Count - 6.
Word 67 ("general"),  Count - 2.
Word 76 ("requir"),  Count - 3.
Word 85 ("applic"),  Count - 9.
Word 91 ("combin"),  Count - 1.
Word 94 ("credit"),  Count - 1.
Word 104 ("follow"),  Count - 1.
Word 121 ("provis"),  Count - 1.
Word 122 ("public"),  Count - 4.
Word 128 ("statutori"),  Count - 1.
Word 156 ("shall"),  Count - 1.
Word 193 ("institut"),  Count - 1.
Word 214 ("rule"),  Count - 1.
Word 236 ("organ"),  Count - 2.
Word 240 ("busi"),  Count - 1.
Word 255 ("relat"),  Count - 1.
Word 271 ("cover"),  Count - 1.
Word 274 ("compani"),  Count - 1.
Word 291 ("legal"),  Count - 2.
Word 297 ("separ"),  Count - 1.
Word 305 ("conclud"),  Count - 2

#### Creating TF-IDF corpus

In [13]:
# Fit a TF-IDF weighting model on the bag-of-words corpus.
tfidf = models.TfidfModel(corpus=bow_corpus)

In [15]:
# Apply the fitted TF-IDF model to the whole bag-of-words corpus.
tfidf_corpus = tfidf[bow_corpus]

In [17]:
# TF-IDF example: the same document as the BOW example, now with weights.
ex = tfidf_corpus[index]
for token_id, weight in ex:
    print("Word {} (\"{}\"),  TF-IDF - {}.".format(token_id, dictionary[token_id], weight))

Word 1 ("issu"),  TF-IDF - 0.08045598595140308.
Word 4 ("bank"),  TF-IDF - 0.09896513502771173.
Word 6 ("contain"),  TF-IDF - 0.05877374788891292.
Word 9 ("nation"),  TF-IDF - 0.13564750133198109.
Word 22 ("appli"),  TF-IDF - 0.19975940808941667.
Word 27 ("feder"),  TF-IDF - 0.10374349398314156.
Word 36 ("state"),  TF-IDF - 0.10351053252792156.
Word 38 ("subject"),  TF-IDF - 0.03823172690952112.
Word 49 ("determin"),  TF-IDF - 0.06782336468808096.
Word 56 ("paragraph"),  TF-IDF - 0.0276611726337707.
Word 63 ("section"),  TF-IDF - 0.10869137589313682.
Word 67 ("general"),  TF-IDF - 0.0785855822653381.
Word 76 ("requir"),  TF-IDF - 0.06343088439992764.
Word 85 ("applic"),  TF-IDF - 0.2779455339514064.
Word 91 ("combin"),  TF-IDF - 0.0779753402236377.
Word 94 ("credit"),  TF-IDF - 0.03482970439821828.
Word 104 ("follow"),  TF-IDF - 0.03730462288261047.
Word 121 ("provis"),  TF-IDF - 0.04982873762762001.
Word 122 ("public"),  TF-IDF - 0.19851449852946293.
Word 128 ("statutori"),  TF-IDF - 

#### Topic Modelling using BOW  

In [18]:
# Train a 20-topic LDA model on the raw bag-of-words counts.
# random_state pins the model's random initialisation; without it every run
# starts from a different state and the printed topics cannot be reproduced.
# NOTE(review): with several workers the result may still vary slightly
# between runs - confirm if exact reproducibility is required.
lda_bow = LdaMulticore(
    bow_corpus,
    num_topics=20,
    id2word=dictionary,
    passes=2,
    workers=16,
    random_state=42,
)

In [19]:
# Show every BOW-trained topic with its weighted terms.
for topic_id, terms in lda_bow.print_topics(num_topics=-1):
    print('Topic: {} \nWords: {}\n'.format(topic_id, terms))

Topic: 0 
Words: 0.024*"bank" + 0.016*"section" + 0.016*"paragraph" + 0.015*"requir" + 0.014*"institut" + 0.014*"associ" + 0.012*"exposur" + 0.012*"save" + 0.012*"notic" + 0.012*"provid"

Topic: 1 
Words: 0.027*"feder" + 0.026*"bank" + 0.020*"associ" + 0.015*"institut" + 0.015*"requir" + 0.015*"nation" + 0.014*"board" + 0.014*"section" + 0.013*"save" + 0.013*"regul"

Topic: 2 
Words: 0.022*"insur" + 0.021*"save" + 0.021*"bank" + 0.020*"associ" + 0.018*"agreement" + 0.014*"section" + 0.013*"paragraph" + 0.013*"secur" + 0.013*"swap" + 0.012*"cover"

Topic: 3 
Words: 0.024*"institut" + 0.024*"section" + 0.023*"regul" + 0.022*"board" + 0.020*"transact" + 0.019*"requir" + 0.016*"bank" + 0.016*"risk" + 0.015*"paragraph" + 0.015*"securit"

Topic: 4 
Words: 0.044*"loan" + 0.015*"properti" + 0.014*"secur" + 0.013*"applic" + 0.011*"institut" + 0.011*"mean" + 0.010*"section" + 0.010*"includ" + 0.009*"shall" + 0.009*"real"

Topic: 5 
Words: 0.040*"institut" + 0.025*"capit" + 0.021*"requir" + 0.020

#### Topic Modelling using TF-IDF 

In [20]:
# Train a 20-topic LDA model on the TF-IDF-weighted corpus.
# random_state pins the model's random initialisation; without it every run
# starts from a different state and the printed topics cannot be reproduced.
# NOTE(review): with several workers the result may still vary slightly
# between runs - confirm if exact reproducibility is required.
lda_tfidf = LdaMulticore(
    tfidf_corpus,
    num_topics=20,
    id2word=dictionary,
    passes=2,
    workers=16,
    random_state=42,
)

In [21]:
# Show every TF-IDF-trained topic with its weighted terms.
for topic_id, terms in lda_tfidf.print_topics(num_topics=-1):
    print('Topic: {} \nWord: {}\n'.format(topic_id, terms))

Topic: 0 
Word: 0.030*"author" + 0.009*"bank" + 0.008*"board" + 0.008*"issu" + 0.007*"feder" + 0.007*"associ" + 0.007*"seventh" + 0.007*"save" + 0.007*"transact" + 0.007*"nation"

Topic: 1 
Word: 0.008*"associ" + 0.008*"institut" + 0.008*"bank" + 0.007*"save" + 0.006*"feder" + 0.006*"secur" + 0.006*"hear" + 0.006*"section" + 0.006*"nation" + 0.005*"dealer"

Topic: 2 
Word: 0.011*"bank" + 0.010*"secur" + 0.009*"foreign" + 0.009*"compani" + 0.006*"branch" + 0.006*"institut" + 0.006*"feder" + 0.006*"entiti" + 0.006*"exposur" + 0.006*"transact"

Topic: 3 
Word: 0.013*"secur" + 0.012*"exchang" + 0.008*"account" + 0.007*"invest" + 0.007*"bank" + 0.007*"mean" + 0.007*"section" + 0.007*"share" + 0.007*"applic" + 0.006*"requir"

Topic: 4 
Word: 0.010*"bank" + 0.008*"reserv" + 0.007*"item" + 0.006*"comptrol" + 0.006*"state" + 0.006*"fiduciari" + 0.006*"secur" + 0.006*"nation" + 0.006*"person" + 0.005*"account"

Topic: 5 
Word: 0.009*"board" + 0.008*"secur" + 0.007*"institut" + 0.007*"account" + 