Here I build a topic model with 15 topics for the collection of bills from the past four years that I previously collected

In [None]:
import os

from pymongo import MongoClient

# Connection settings for the MongoDB instance that stores the bills.
# Every value can be overridden through an environment variable so that real
# credentials never have to be written into the notebook; the literals below
# are the original (redacted) placeholders, kept as fallbacks.
config = {
    'host': os.environ.get('MONGO_HOST', 'xx.xxx.xxx.xx'),
    'username': os.environ.get('MONGO_USERNAME', 'xxxxxxxx'),
    'password': os.environ.get('MONGO_PASSWORD', 'xxxxxxxx'),
    'authSource': os.environ.get('MONGO_AUTH_SOURCE', 'cool'),
}

client = MongoClient(**config)
# All collections used in this notebook live in the 'cool' database
db = client.cool

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np
import re

from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
import pyLDAvis
import pyLDAvis.sklearn

In [None]:
def text_clean(text):
    '''
    Normalise the raw text of a bill so it can be fed to a vectoriser.

    The steps run in this order:
      1. newlines and underscores become spaces
      2. short bracketed spans ("[...]" up to 50 chars, "<...>" up to 10 chars)
         are dropped
      3. the listed punctuation characters become spaces and the text is
         lower-cased
      4. every token containing a digit is removed
      5. stand-alone tokens made only of the letters x, v, i, l (roman-numeral
         style section markers) are removed
      6. runs of whitespace are collapsed to a single space

    Parameters
    ----------
    text : str
        Raw text of one bill.

    Returns
    -------
    str
        The cleaned, lower-cased text.
    '''
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'_', ' ', text)
    text = re.sub(r'\[.{,50}\]', ' ', text)
    text = re.sub(r'\<.{,10}\>', ' ', text)
    text = re.sub(r"[.'`,;():\-$%&\^#?!><]", ' ', text).lower()
    text = re.sub(r'\w*\d+\w*', '', text)
    # Lookarounds leave the surrounding whitespace in place, so consecutive
    # numerals ("i ii iii") are all removed; a pattern that consumes the
    # whitespace would skip every second one. The leftover spaces are
    # collapsed by the next substitution.
    text = re.sub(r'(?<=\s)[xvil]+(?=\s)', '', text)
    text = re.sub(r'\s{2,}', ' ', text)
    return text

While iterating on the topic model, I collected a number of frequently occurring words that added no meaning to my topics. I add them to the stop words that are later passed to CountVectorizer

In [None]:
# Corpus-specific stop words: terms that showed up in the topics without
# helping to tell them apart. They are appended to sklearn's English stop
# words and handed to CountVectorizer.
# NOTE(review): 'subsection' is listed twice, and 'revies' looks like a typo
# (perhaps 'reviews') -- confirm before editing, since changing this list
# changes the fitted topics.
# NOTE(review): entries shorter than four letters (e.g. 'faa', 'usc', 'mr')
# can presumably never match, because the CountVectorizer in this notebook
# uses token_pattern='[a-z]{4,}'.
real_stops = ['secretary', 'insert','inserting','striking','program','plan','note','rule','agency',
             'administration','subparagraph','later','services','appropriate','administrator',
             'commission','faa', 'national','usc','assistance','eligible','service','clause',
             'development','respect','apply','case','percent','determined','board','subtitle',
             'person','period','office','use','programs','individuals','systems','revies','study',
             'public','management','administering','imposed','authority','relating','state',
             'federal','project','law','review','central','agencies','subsection','san',
             'subsection','activities','attorney','department','resolution','joint','rules',
             'date','effect','described','motion','report','major','days','day','consideration',
             'order', 'amended','paragraph','entities','counsel','mr','ms','proposed','information',
             'local','written','affairs', 'non','year','month','members','committees','government',
             'head','member','general','shall','action','assessment','subject','including',
             'available','purposes','term','provided','remain','necessary','used','end','president',
             'zzz','zz','zuni','zuhair','zte','zour','zou','zoster','zoris','zor', 'heading','subheading',
             'chapter','change','new','subchapter','numerical','sequence','designed','appropriated',
             'september',]

I add my stop words to the list of English stop words already used in sklearn

In [None]:
from sklearn.feature_extraction import text
# Full stop-word list: sklearn's built-in English set followed by the
# corpus-specific additions
stops = [*text.ENGLISH_STOP_WORDS, *real_stops]

Here, I iterate through every bill in my database. I add the preprocessed bill to the list 'texts' to be used for topic modeling, while all the other details of the bill are made into a dictionary and appended to the list 'details' to be converted into a DataFrame and used for reference.

In [None]:
# DataFrame column name -> key of the bill document it is read from
# (only the sponsor column is renamed: it comes from the 'name' key)
detail_fields = {'_id': '_id', 'congress': 'congress', 'date': 'date',
                 'track': 'track', 'sponsor': 'name', 'party': 'party',
                 'state': 'state'}

details, texts = [], []
cursor = db.bills.find({})
for bill in cursor:
    # Cleaned bill text goes to the modelling corpus ...
    texts.append(text_clean(bill['bill']))
    # ... and the remaining metadata is kept, in the same order, for reference
    details.append({column: bill[key] for column, key in detail_fields.items()})

In [None]:
# One row per bill, built from the metadata dictionaries collected above
df_bills = pd.DataFrame(details)

In [None]:
# Use the bill's '_id' as the row index
df_bills = df_bills.set_index('_id')

In [None]:
# Bag-of-words counts: tokens are runs of at least four lowercase letters,
# terms appearing in more than 80% of the bills are dropped, and only the
# 3000 most frequent remaining terms are kept.
cv = CountVectorizer(max_df=0.8, stop_words=stops, token_pattern='[a-z]{4,}', max_features=3000)
X = cv.fit_transform(texts)

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old name on installations that predate it.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# Dense document-term table indexed like df_bills, one column per term
df = pd.DataFrame(X.toarray(), index=df_bills.index, columns=feature_names)

I opted to use LDA for my topic modeling because these are larger texts, and LDA seemed to provide better results than NMF for this data. I also use pyLDAvis to help judge whether the number of topics is appropriate and to identify the characteristics of each topic

In [None]:
n_components = 15

# Settings for the LDA fit; random_state is fixed so reruns use the same seed
lda_params = {
    'n_components': n_components,
    'max_iter': 10,
    'learning_method': 'online',
    'random_state': 100,
    'n_jobs': -1,
}
lda_model = LatentDirichletAllocation(**lda_params)

# Fit on the document-term table and keep the per-document topic weights
lda_output = lda_model.fit_transform(df)

In [None]:
# Enable pyLDAvis output inside the notebook
pyLDAvis.enable_notebook()

# Prepare the interactive topic visualisation from the fitted LDA model,
# the document-term matrix X and the fitted CountVectorizer
panel = pyLDAvis.sklearn.prepare(lda_model, X, cv)
# Bare expression: the notebook displays the panel as the cell output
panel

It's clear here that the distribution of topic assignments is reasonable, and pyLDAvis shows that there is not too much overlap between topics.

In [None]:
# Index of the highest-weight topic for every bill
doc_cluster = np.argmax(lda_output, axis=1)
df_cluster = pd.DataFrame(data=doc_cluster, index=df_bills.index)
# Number of bills assigned to each topic (the single column is named 0)
df_cluster[0].value_counts()

In [None]:
# Save the bill metadata; the same file is written again at the end of the
# notebook once the 'topic' column has been added
df_bills.to_csv('bill_details.csv')

Here, I visualize the top 14 words of each topic in order to assign an appropriate name to each topic

In [None]:
# For each topic: column indices of its 14 highest-weight terms, strongest first
model_words = np.argsort(lda_model.components_, axis=1)[:, ::-1][:, :14]

In [None]:
# Translate the term indices of every topic back into the terms themselves
vocabulary = df.columns
words = []
for topic_indices in model_words:
    words.append([vocabulary[idx] for idx in topic_indices])
list(enumerate(words))

In [None]:
# Hand-assigned label for each of the 15 topic indices, chosen from the top
# words listed above. The labels are only meaningful for the model fitted in
# this notebook; refitting with different data or settings may reorder topics.
# NOTE(review): 'farmaceuticals' is presumably a misspelling of
# 'pharmaceuticals'; it is written to bill_details.csv, so check downstream
# consumers before correcting it.
# NOTE(review): the meaning of the trailing '*' in 'regulations*' is not
# explained anywhere in this notebook -- confirm.
topic_dict = {0: 'small business',
              1: 'health care',
              2: 'elections/voting',
              3: 'regulations*',
              4: 'labor/employee rights',
              5: 'border/immigration',
              6: 'education',
              7: 'law enforcement',
              8: 'budgetary',
              9: 'military',
              10: 'farmaceuticals/medicine',
              11: 'Trump response',
              12: 'natural resources',
              13: 'infrastructure',
              14: 'tax code'}

I then assign every bill its dominant (highest-weight) topic, add the topic name to the bill in the DataFrame, and write the DataFrame to a CSV for future use.

In [None]:
def name_topic(n):
    '''
    Return the hand-assigned label for topic index n.
    Raises KeyError when n is not a key of topic_dict.
    '''
    label = topic_dict[n]
    return label


# Label every bill with the name of its assigned topic
df_cluster['names'] = [name_topic(topic_idx) for topic_idx in df_cluster[0]]

In [None]:
# Copy the topic names onto the bill metadata (df_cluster was built on
# df_bills' index, so the rows line up)
df_bills['topic'] = df_cluster['names']

In [None]:
# Write the metadata again, now including the 'topic' column
df_bills.to_csv('bill_details.csv')