# Using LDA to analyse legisalative text

In [1]:
import PyPDF2

Extract and clean the text from the pdf file. We will use the Residential Tenancy Act as the legislative text

In [6]:
file = open('residential.pdf', 'rb')
fileReader = PyPDF2.PdfFileReader(file)
number_of_pages = fileReader.getNumPages()

print('number of pages:', number_of_pages)

number of pages: 131




In [13]:
page = fileReader.getPage(0)
page_content = page.extractText()

print(page_content)

This version of the legislation is compiled and maintained 
in a database of legislation by the Parliamentary Counsel™s
Office and published on the NSW legislation website.
New South WalesResidential Tenancies Act 2010 No42
Status informationCurrency of version
Historical version for 6 January 2012 to 5 July 2012 (generated 12 July 2012 at 11:13). 
Legislation on the NSW legislation website is usually updated within 3 working days.
Provisions in force
All the provisions displayed in this version of the legislation have commenced. For 
commencement and other details
 see the Historical notes.
Does not include amendments by:
Statute Law (Miscellaneous Provisions) Act 2012 No 42 (not commenced Š to commence 
on 6.7.2012)
See also:
Residential Tenancies Amendment (Occupancy Agreements) Bill 2011
Community Housing Providers (Adoption of National Law) Bill 2012



In [11]:
# put the whole document in a list
def extract_data(pages, fileReader):
    page_text = []
    
    for i in range(pages):
        page = fileReader.getPage(i)
        page_content = page.extractText()
        page_text.append(page_content)
        
    return page_text

data = extract_data(number_of_pages, fileReader)
    
print(data[0])
print(len(data))

This version of the legislation is compiled and maintained 
in a database of legislation by the Parliamentary Counsel™s
Office and published on the NSW legislation website.
New South WalesResidential Tenancies Act 2010 No42
Status informationCurrency of version
Historical version for 6 January 2012 to 5 July 2012 (generated 12 July 2012 at 11:13). 
Legislation on the NSW legislation website is usually updated within 3 working days.
Provisions in force
All the provisions displayed in this version of the legislation have commenced. For 
commencement and other details
 see the Historical notes.
Does not include amendments by:
Statute Law (Miscellaneous Provisions) Act 2012 No 42 (not commenced Š to commence 
on 6.7.2012)
See also:
Residential Tenancies Amendment (Occupancy Agreements) Bill 2011
Community Housing Providers (Adoption of National Law) Bill 2012

131


### Applying the LDA algorithm, implementation from sci-kit learn

In [12]:
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
NUM_TOPICS = 12

vectorizer = CountVectorizer(min_df=5, max_df=0.9, 
                             stop_words='english', lowercase=True, 
                             token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(data)

print(data_vectorized[0])

  (0, 177)	1
  (0, 23)	1
  (0, 232)	1
  (0, 203)	2
  (0, 181)	1
  (0, 124)	1
  (0, 243)	1
  (0, 113)	1
  (0, 74)	1
  (0, 73)	2
  (0, 158)	1
  (0, 312)	3
  (0, 101)	1
  (0, 372)	1
  (0, 241)	1
  (0, 261)	1
  (0, 98)	1


In [15]:
lda_model = LatentDirichletAllocation(n_topics=NUM_TOPICS, max_iter=10, learning_method='online')
lda_Z = lda_model.fit_transform(data_vectorized)
print(lda_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)



(131, 12)


In [16]:
def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])
 
print("LDA Model:")
print_topics(lda_model, vectorizer)
print("=" * 20)

LDA Model:
Topic 0:
[('account', 0.5148762479599001), ('bond', 0.34035789616743756), ('rental', 0.3375389609132897), ('division', 0.31924135523232616), ('director-general', 0.2863052785027784), ('accommodation', 0.27935096200997017), ('tribunal', 0.27456957839607715), ('powers', 0.25689589559370696), ('proceedings', 0.25519865580885887), ('board', 0.24770925108948907)]
Topic 1:
[('person', 30.737007307467316), ('landlord', 29.569870890836263), ('goods', 25.19525295141994), ('section', 25.14761182602638), ('bond', 18.181813237348653), ('paid', 14.133165129852793), ('documents', 13.771197264040975), ('offence', 13.368142974977701), ('penalty', 13.138750240630417), ('tenant', 12.713426590142564)]
Topic 2:
[('tribunal', 31.210229156875837), ('rental', 31.165442035191273), ('bond', 27.24196628666371), ('order', 24.15993497872607), ('director-general', 22.513160284434036), ('proceedings', 19.392553361377516), ('claim', 15.57346242494831), ('person', 14.572283284242435), ('tenant', 13.4674696

### Visualization with 12 topics

In [17]:
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer, mds='tsne')
panel

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.

 [_prepare.py:257]


### What happens if we have fewer topics?

In [18]:
NUM_TOPICS = 5

lda_model = LatentDirichletAllocation(n_topics=NUM_TOPICS, max_iter=10, learning_method='online')
lda_Z = lda_model.fit_transform(data_vectorized)
print(lda_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)



(131, 5)


In [19]:
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer, mds='tsne')
panel

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.

 [_prepare.py:257]


In [20]:
import numpy as np
import mglearn



In [22]:
sorting = np.argsort(lda_model.components_)[:, ::-1]
features = np.array(vectorizer.get_feature_names())



In [24]:
mglearn.tools.print_topics(topics=range(5), feature_names=features, sorting=sorting, topics_per_chunk=5, n_words=20)

topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
agreement     landlord      sec           tenancy       landlord      
tenancy       notice        date          existing      tenant        
tenant        person        commencement  commencement  premises      
termination   tenant        provisions    does          agreement     
premises      section       law           months        tenancy       
landlord      premises      miscellaneous schedule      person        
section       tribunal      notes         clause        section       
notice        goods         amended       charges       rent          
term          board         used          agreement     agent         
order         rent          following     water         information   
tribunal      order         office        section       penalty       
housing       given         member        payment       paid          
fixed 



In [32]:
filters = [('agreement', ''), ('tenancy', ''), ('tenant', ''), 
           ('landlord', ''), ('section', ''), ('premises', ''), ('provision', ''), ('person', ''), ('tribunal', '')]

for k, v in filters:
    data = [d.replace(k, v) for d in data]



In [37]:
NUM_TOPICS = 5

data_vectorized = vectorizer.fit_transform(data)

lda_model = LatentDirichletAllocation(n_topics=NUM_TOPICS, max_iter=10, learning_method='online')
lda_Z = lda_model.fit_transform(data_vectorized)

pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer, mds='tsne')
panel


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.

 [_prepare.py:257]


In [39]:
sorting = np.argsort(lda_model.components_)[:, ::-1]
features = np.array(vectorizer.get_feature_names())
mglearn.tools.print_topics(topics=range(5), feature_names=features, sorting=sorting, topics_per_chunk=5, n_words=35)

topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
order         sec           information   rent          termination   
tribunal      commencement  database      term          notice        
possession    date          means         penalty       housing       
notice        existing      agent         paid          social        
agent         law           investigator  payable       term          
make          miscellaneous includes      agent         fixed         
proceedings   schedule      division      terms         given         
application   does          listing       notice        ground        
goods         clause        operator      consent       tribunal      
director-generalbond          does          tribunal      end           
given         months        right         regulations   division      
regulations   used          penalty       section       charges       
divi

