In [1]:

## view all outputs
import warnings
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
warnings.filterwarnings("ignore")

## pdf reading
import os
import re

## dataframe
import pandas as pd
import numpy as np  


## preprocessing
import nltk
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from sklearn.feature_extraction.text import CountVectorizer


## add punctuation and some application-specific words
## to stopword list
from nltk.stem.porter import *
porter = PorterStemmer()
from nltk.tokenize import WordPunctTokenizer

## lda
from gensim import corpora
import gensim

# 0. Loading combined data


**Task**: unzip the folder

In [2]:
df = pd.read_pickle("../../intermediate_objects/hearings_raw_dc.pkl")
df.head()

Unnamed: 0,text,month_and_year
April_%283%29_35,OFFICE OF THE STATE SUPERINTENDENT OF EDUCATIO...,April_%283%29_35
April_%284%29_37,District of Columbia \n\nOffice of the State S...,April_%284%29_37
April_2014_0,\n \n \n \n \n \n \n\n \n \n \n \n \n \n \n\...,April_2014_0
April_2014_1,District of Columbia \n\nOffice of the State S...,April_2014_1
April_2014_10,OFFICE OF THE STATE SUPERINTENDENT OF EDUCATIO...,April_2014_10


In [None]:
## extract the month and year

In [70]:

df['month'] = df.month_and_year.astype(str).str.replace("\\_.*", "")
df['year'] = df.month_and_year.astype(str).str.extract(r'(201[4-9])', expand=False)
df['new_id'] = df.month_and_year.astype(str).str.replace(".*\\_", "")
df.month.value_counts()
df.dtypes

June         79
March        78
April        70
May          67
February     67
October      65
August       61
November     60
January      60
July         58
September    57
December     57
Name: month, dtype: int64

text              object
month_and_year    object
month             object
year              object
new_id            object
text_lower        object
case_num_try1     object
casenum_merge     object
dtype: object

In [40]:
case_num_pattern = "(?P<prefix>case\s+no(.?))(:\s+)?(?P<casenum>(\d){4}-.*)"
def get_casenum(one_str, pattern = case_num_pattern):
    
    search_obj = re.search(pattern, one_str)
    if search_obj:
        case_return = search_obj.group('casenum')
    else:
        case_return = np.nan
    return(case_return)
    
    

In [90]:
## try to extract the case number from the full text
df['text_lower'] = df.text.str.lower()

## try to get index of case no
all_cases = [get_casenum(one_case) for one_case in df.text_lower]
df['case_num_try1'] = all_cases

## extract case nums from about half of them 


## try merging with filings 
dc_filings = pd.read_csv("../../intermediate_objects/cleaned_df/dc_filings_wcrdc_nonagg.csv")

## try cleaning up
df['casenum_merge'] = df.case_num_try1.astype(str).str.replace("\\s+|\\(.*|\\&.*", "")
df['is_missing_casenum'] = df.case_num_try1.isnull()
dc_filings['casenum_merge'] = dc_filings.case_no.astype(str).str.replace("\\s+", "")

## look at hearings not in filings
hearings_nofile = set(df.casenum_merge[df.casenum_merge.notnull()]).difference(dc_filings.casenum_merge)
#hearings_nofile

## filter to date range

True     392
False    387
Name: is_missing_casenum, dtype: int64

In [91]:

conditions = [(df.year.isnull()),
                 (df.year == '2014') & 
                 (df.month.isin(["September", "October", "November", "December"])),
                 (df.year == '2018') *
                 (df.month.isin(["January", "February", "March", "April", "May", "June"])),
                df.year.isin(['2015', '2016', '2017', '2018']),
                df.year.isin(['2018', '2019']),
               df.year.isin(['2013', '2014'])]
code_to = ["include; date missing", 
          "include; fall 2014", 
          "include; winter 2018",
          "include; full SY", 
          "exclude; too late",
          "exclude; too early"]
assert len(conditions) == len(code_to)

df['category_include'] = np.select(conditions, code_to, default = 'other')

## subset
#df.category_include.value_counts()

df_trymerge = df[df.category_include.str.contains("include")].copy()

In [134]:
## look at overlap in case numbers

dc_hearings_wfilings = pd.merge(df_trymerge, dc_filings,
               on = 'casenum_merge',
               how = "left",
               indicator = "mergestatus_wfilings")

pd.crosstab(dc_hearings_wfilings.is_missing_casenum, dc_hearings_wfilings.mergestatus_wfilings)

## matched about 78% of those with observed case numbers to filings

## check differential attrition to an order in other script 

mergestatus_wfilings,left_only,both
is_missing_casenum,Unnamed: 1_level_1,Unnamed: 2_level_1
False,76,254
True,228,0


In [145]:
start_pattern = "(background and procedural history|introduction and procedural history|procedural (history|background))"

def find_start(one_text):
    
    if re.search(start_pattern, one_text):
        return True
    else:
        return False


dc_hearings_wfilings['is_startindic'] = [find_start(one_text) for one_text in dc_hearings_wfilings.text_lower]


## next step: take text after that and before signature to
## minimize administrative preface

In [None]:
## text features to extract:
### witnesses for each side
### findings of act
### remove text of exhibits

## figure out how to strip footnotes

In [123]:
dc_hearings_wfilings['is_mentions_cfsa'] = dc_hearings_wfilings.text_lower.str.contains('cfsa|foster')
dc_hearings_wfilings.head()

Unnamed: 0,text,month_and_year,month,year,new_id,text_lower,case_num_try1,casenum_merge,category_include,is_missing_casenum,...,TOTAL_RESTRAINT_SECLUDE,TOTAL_RESTRAINT_SECLUDE_rate,merge_ieprestraints,total_students_disc_data,TOTAL_DISCIPLINE,TOTAL_DISCIPLINE_rate,merge_iepdisc,crdc_status,mergestatus_wfilings,is_mentions_cfsa
0,OFFICE OF THE STATE SUPERINTENDENT OF EDUCATIO...,April_%283%29_35,April,,35,office of the state superintendent of educatio...,2016-0036,2016-0036,include; date missing,False,...,0.0,0.0,both,1696.0,175.0,0.103184,both,both,both,False
1,District of Columbia \n\nOffice of the State S...,April_%284%29_37,April,,37,district of columbia \n\noffice of the state s...,2016-0031,2016-0031,include; date missing,False,...,15.0,0.045455,both,330.0,107.0,0.324242,both,both,both,False
2,OFFICE OF THE STATE SUPERINTENDENT OF EDUCATIO...,April_2015_21,April,2015.0,21,office of the state superintendent of educatio...,,,include; full SY,True,...,,,,,,,,,left_only,True
3,\n\nDISTRICT OF COLUMBIA \n\nOFFICE OF THE ST...,April_2015_22,April,2015.0,22,\n\ndistrict of columbia \n\noffice of the st...,,,include; full SY,True,...,,,,,,,,,left_only,False
4,OFFICE OF THE STATE SUPERINTENDENT OF EDUCATIO...,April_2015_23,April,2015.0,23,office of the state superintendent of educatio...,,,include; full SY,True,...,,,,,,,,,left_only,False


In [124]:
pattern = "(background and procedural history|introduction and procedural history|procedural history)"
test = re.search(pattern, dc_hearings_wfiling.text_lower.iloc[2])
test.start()
test.end()

356

391

In [130]:
### cleaning of text
#for text in dc_hearings_wfiling.text_lower.sample(n = 1, random_state = 91988):
 #   print(text)

In [97]:
phrases_remove = []"district of columbia"

to_remove = "\\\n|district of columbia|office of the state superintendent of education|"

Unnamed: 0,text,month_and_year,month,year,new_id,text_lower,case_num_try1,casenum_merge,category_include,is_missing_casenum,...,total_students_ressec_data,TOTAL_RESTRAINT_SECLUDE,TOTAL_RESTRAINT_SECLUDE_rate,merge_ieprestraints,total_students_disc_data,TOTAL_DISCIPLINE,TOTAL_DISCIPLINE_rate,merge_iepdisc,crdc_status,mergestatus_wfilings
0,OFFICE OF THE STATE SUPERINTENDENT OF EDUCATIO...,April_%283%29_35,April,,35,office of the state superintendent of educatio...,2016-0036,2016-0036,include; date missing,False,...,1696.0,0.0,0.0,both,1696.0,175.0,0.103184,both,both,both
1,District of Columbia \n\nOffice of the State S...,April_%284%29_37,April,,37,district of columbia \n\noffice of the state s...,2016-0031,2016-0031,include; date missing,False,...,330.0,15.0,0.045455,both,330.0,107.0,0.324242,both,both,both
2,OFFICE OF THE STATE SUPERINTENDENT OF EDUCATIO...,April_2015_21,April,2015.0,21,office of the state superintendent of educatio...,,,include; full SY,True,...,,,,,,,,,,left_only
3,\n\nDISTRICT OF COLUMBIA \n\nOFFICE OF THE ST...,April_2015_22,April,2015.0,22,\n\ndistrict of columbia \n\noffice of the st...,,,include; full SY,True,...,,,,,,,,,,left_only
4,OFFICE OF THE STATE SUPERINTENDENT OF EDUCATIO...,April_2015_23,April,2015.0,23,office of the state superintendent of educatio...,,,include; full SY,True,...,,,,,,,,,,left_only


**Task**: we will discuss more preprocessing in the next section but for now, create a new column 'text_lower' that makes all the words in the text column lowercase. For this, it's easiest to use the pandas string method.

In [47]:
store_files_df['text_lower'] = store_files_df.text.str.lower()


wordpunct_tokenize = WordPunctTokenizer().tokenize

**Task**: create a binary variable indicating *whether* (yes or no) the text contains words you think reflect mother. Similarly, create a binary variable indicating *whether* (yes or no) the text contains words you think reflect father. Then, create a third variable indicating whether:

- The complaint mentions a child's mother only
- The complaint mentions a child's father only
- The complaint mentions both parents
- The complain mentions neither parents


Use df.varname.value_counts() to print the distribution of the categories.

In [40]:
store_files_df['any_mother'] = np.where(store_files_df.text_lower.str.contains("mother|mom"), 1, 0)
store_files_df['any_father'] = np.where(store_files_df.text_lower.str.contains("father|dad"), 1, 0)


conditions = [(store_files_df.any_mother.eq(1) & store_files_df.any_father.eq(1)), 
            (store_files_df.any_mother.eq(1) & store_files_df.any_father.eq(0)),
            (store_files_df.any_mother.eq(0) & store_files_df.any_father.eq(1)),
            (store_files_df.any_mother.eq(0) & store_files_df.any_father.eq(0))]
             
choices = ['both_parents', 'mother_only', 'father_only', 'neither_parent']

store_files_df['parent_cat'] = np.select(conditions, choices)
store_files_df.parent_cat.value_counts()


mother_only       372
neither_parent    247
both_parents      124
father_only        36
Name: parent_cat, dtype: int64

**Task**: you wonder if a complaint failing to mention both parents is (1) a reflection of both parents being involved, or (2) more related to longer complaints having more opportunities to discuss each of the parent's roles. To investigate this, we need to move to tools outside the pandas.varname.str.operation toolbox. Note that since we have not yet preprocessed the data, this count will be high, and will get much lower as we implement various preprocessing steps. 

For now, run the below code, and then we'll transition to nltk so you can learn more what it means. As a note, these counts will be very high because we have not yet removed punctuation!

Then, use the df.groupby('grouping_varname')['continuous_varname'].mean() command to contrast the mean across each of the four levels of the factor variable you created in the previous task. What do you notice?

In [48]:
def count_words(one_complaint):
    
    complaint_tokens = wordpunct_tokenize(one_complaint)
    n_tokens = len(set(complaint_tokens))
    return(n_tokens)


In [49]:
store_files_df['unique_words_punct'] = store_files_df.text_lower.apply(count_words)
store_files_df.head()

store_files_df.groupby('parent_cat')['unique_words_punct'].mean()

Unnamed: 0,text,month_and_year,month,year,id,text_lower,any_mother,any_father,parent_cat,unique_words_punct
April_%283%29_35,OFFICE OF THE STATE SUPERINTENDENT OF EDUCATIO...,April_%283%29_35,April,,35,office of the state superintendent of educatio...,1,0,mother_only,1345
April_%284%29_37,District of Columbia \n\nOffice of the State S...,April_%284%29_37,April,,37,district of columbia \n\noffice of the state s...,1,1,both_parents,1307
April_2014_0,\n \n \n \n \n \n \n\n \n \n \n \n \n \n \n\...,April_2014_0,April,2014.0,0,\n \n \n \n \n \n \n\n \n \n \n \n \n \n \n\...,1,0,mother_only,1114
April_2014_1,District of Columbia \n\nOffice of the State S...,April_2014_1,April,2014.0,1,district of columbia \n\noffice of the state s...,1,0,mother_only,1006
April_2014_10,OFFICE OF THE STATE SUPERINTENDENT OF EDUCATIO...,April_2014_10,April,2014.0,10,office of the state superintendent of educatio...,0,0,neither_parent,1045


parent_cat
both_parents      1318.403226
father_only       1261.722222
mother_only       1187.685484
neither_parent    1192.947368
Name: unique_words_punct, dtype: float64

# 2. Using ntlk for text preprocessing

You probably noticed a few shortcomings of how we approached the previous task. In particular:
    
- We searched for a concept we already had in mind (gender of parent who's filing the complaint). What if we want to more inductively learn themes in the text data, without searching for pre-specified concepts?
- Even if searching for a pre-specified concept, we were lucky in that mother and father have a limited number of words that can describe them. What if we wanted to investigate something where we're not really able to exhaustively enumerate the ways it can be described?

That moves us to topic modeling, or ways to represent each complaint as having words that are drawn from latent themes/topics. The first step in topic modeling is pre-processing.  You probably noticed when reading through the texts that there are a lot of extra things--punctuation; words like "of"--that are not informative for learning themes in the text. so we're going to focus on that for the next part of the activity, using nltk, and then work on implementing a topic model.

**Task**: first, remove the stopwords using the built-in English stopwords within nltk (print to get a sense). Store the result in a new column: text_nostop_standard

Compare the text in a couple of the documents


In [50]:
stopwords_standard = set(stopwords.words('english'))
#stop_words

def remove_stop(row, colname, stopword_dict):
    
    string_of_col = str(row[colname])
    try:
        processed_string = " ".join([i for i in wordpunct_tokenize(string_of_col) if 
                        i not in stopword_dict])  ## removed numeric
        return(processed_string)
    except:
        processed_string = "" # to handle data errors where not actually text
        return(processed_string)

In [52]:
store_files_df['text_nostop_standard'] = store_files_df.apply(remove_stop,
                                    axis = 1,
                                   args = ["text_lower", stopwords_standard])
print(store_files_df.text_lower[0][0:100])
print(store_files_df.text_nostop_standard[0][0:100])


office of the state superintendent of education

district of columbia

office of dispute resolution

office state superintendent education district columbia office dispute resolution 810 first street ,


**Task**: check what type of object the standard stopwords are (type(object)). Update the standard stopwords to include some words specific to this context that we don't want to include, provided for you below in the additional_words_toadd list.
    

In [53]:
additional_words_toadd = ["decision", "office", "petitioner", "dcps"]
stopwords_standard.update(additional_words_toadd)

**Task**: returning to the original text_lower column, remove stopwords using the updated stopword list. Store it as a column text_nostop. 

In [54]:
store_files_df['text_nostop'] = store_files_df.apply(remove_stop,
                                    axis = 1,
                                   args = ["text_lower", stopwords_standard])

**Task**: removing stopwords deletes entire words from the corpus. But we also want to preprocess the remaining words (1) to remove digits and punctuation (which we've decided are not relevant), and (2) reduce similar words to a common stem.

Using the text_nostop column, perform the following additional preprocessing steps.
- Stem using the porter stemmer
- Only keep words where all the characters are letters (so removes punctuation and numbers)
- Only keep words that are 3 characters or longer

Store the preprocessed text in a new column: text_preprocess

*Optional*: Rerun the above function to count the number of words in the processed text for each row and compare the distribution of count of words before preprocessing (text_lower) to the distribution of count of words after preprocessing.

In [55]:
def processtext(row, colname):
    
    string_of_col = str(row[colname])
    try:
        processed_string = " ".join([porter.stem(i.lower()) for i in wordpunct_tokenize(string_of_col) if 
                        i.lower().isalpha() and len(i) >=3])  
        return(processed_string)
    except:
        processed_string = "" # to handle data errors where not actually text
        return(processed_string)

store_files_df['text_preprocess'] = store_files_df.apply(processtext,
                                    axis = 1,
                                   args = ["text_nostop"])

**Task**: the package structural topic models we'll work on next works best with text that is still stored in a dataset with a column. Write the data with the following cols to csv--data_preprocess.csv---to use in STM:
        
- month
- year
- hearing ID
- parent_cat
- text_preprocess

In [56]:
## write version to csv to read into R
store_files_df[['month', 'year', 'id', 'parent_cat',
               'text_preprocess']].to_csv("/Users/raj2/Dropbox/dph_hearing_decisions/data/dc/intermediate/hearings_preprocessed.csv",
                                  index = False)

In [25]:
## plot parents
data_wparents = pd.read_csv("../intermediate_objects/data_preprocess.csv")
data_wparents.head()

parent_prop = pd.DataFrame(data_wparents.parent_cat.value_counts(normalize = True))
parent_prop['category'] = parent_prop.index
parent_prop['category_clean'] = np.where(parent_prop.category == "mother_only",
                                        "Only mother",
                                np.where(parent_prop.category == "father_only",
                                        "Only father",
                                np.where(parent_prop.category == "both_parents",
                                        "Both parents",
                                        "Neither parent")))


from plotnine import *
parent_prop['category_reorder'] = parent_prop.category_clean.astype('category').cat.reorder_categories(['Only mother', 'Neither parent',
                                                     'Both parents',
                                                     "Only father"], inplace=False)
parent_graph = (ggplot(parent_prop, aes(x = 'category_reorder', y = 'parent_cat')) +
geom_bar(stat = "identity", fill = "firebrick", color = "black") +
geom_label(aes(x = 'category_reorder', y = 'parent_cat', label = 'round(parent_cat, 2)')) +
theme_classic() +
xlab("Who does hearing\nmention?") +
ylab("Percentage of hearings\n(DC: SY 2017-2018; 2018-2019)") +
theme(axis_text = element_text(color = "black")))

ggsave(parent_graph,file="../parent_breakdown.png")



Unnamed: 0,month,year,id,parent_cat,text_preprocess
0,April,2018,0,mother_only,state superintend educ review complianc disput...
1,April,2018,1,mother_only,state superintend educ district columbia dispu...
2,April,2018,10,mother_only,state superintend educ district columbia dispu...
3,April,2018,2,mother_only,state superintend educ district columbia dispu...
4,April,2018,3,mother_only,state superintend educ review complianc disput...


# 3. Using nltk to create a unigram document-term matrix

**Task**: use the function below to create two document-term matrices. First create one with the text before preprocessing (text_lower); then create one with the preprocessed text (text_preprocess). 

The first argument of the function is the column of text you want to convert. The second argument is a dataframe with document metadata--ior information that you might be using as exploratory variables and that you want to append to the dtm. In this case, attach the following columns:

- The month column
- The year column
- The id column
- The column indicating whether it was filed by mother, father, both parents, neither

Compare the dimensions of the two using df.shape.



In [72]:
def create_dtm(list_of_strings, metadata):
    vectorizer = CountVectorizer(lowercase = True)
    dtm_sparse = vectorizer.fit_transform(list_of_strings)
    dtm_dense_named = pd.DataFrame(dtm_sparse.todense(), columns=vectorizer.get_feature_names())
    dtm_dense_named_withid = pd.concat([metadata.reset_index(), dtm_dense_named], axis = 1)
    return(dtm_dense_named_withid)

In [73]:
dtm_nopreprocess = create_dtm(list_of_strings = store_files_df.text_lower,
                              metadata = store_files_df[['id', 'month', 'year',
                                                        'parent_cat']])

In [74]:
print("The dimensions of the dtm before preprocessing are: " + str(dtm_nopreprocess.shape))

The dimensions of the dtm before preprocessing are: (182, 12467)


In [76]:
dtm_preprocess = create_dtm(list_of_strings = store_files_df.text_preprocess,
                              metadata = store_files_df[['id', 'month', 'year',
                                                        'parent_cat']])

In [77]:
print("The dimensions of the dtm after preprocessing are: " + str(dtm_preprocess.shape))

The dimensions of the dtm after preprocessing are: (182, 5937)


In [78]:
dtm_preprocess.head()

Unnamed: 0,index,id,month,year,parent_cat,aac,aba,abandon,abat,abbl,...,ÿÿÿÿÿ,ÿÿÿÿÿÿxxnÿÿ,ÿÿÿÿÿÿÿÿabcdefgÿiejekÿ,ÿÿÿÿÿÿÿÿÿ,ÿÿÿÿÿÿÿÿÿÿÿ,ÿÿÿÿÿÿÿÿÿÿÿbÿ,ÿÿÿÿÿÿÿÿÿÿÿÿl,ÿÿÿÿÿÿÿÿÿÿÿÿÿ,ÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿ,ÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿ
0,April_2018_0,0,April,2018,mother_only,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,April_2018_1,1,April,2018,mother_only,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,April_2018_10,10,April,2018,mother_only,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,April_2018_2,2,April,2018,mother_only,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,April_2018_3,3,April,2018,mother_only,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# 4.  Using gensim to estimate a basic topic model

**Task**: it's useful to create the DTM to have a sense of how the topic model you estimate treats the data. But ultimately, you can feed the preprocessed text column to functions within the gensim package and it will create that representation as part of estimation. Store your text_preprocess column as all_stemmed_text and feed it to the code below

In [91]:
## re-tokenize and store in list
all_stemmed_text = store_files_df.text_preprocess
text_preprocess_tokens = [wordpunct_tokenize(one_row) for one_row 
                         in all_stemmed_text]


## get lower and upper thres for n documents a word should appear in
lower_thres = 10
upper_thres = dtm_preprocess.shape[0]-10

In [92]:
## set up for estimation

### create dictionary (all unique words and counts)
dictionary = corpora.Dictionary(text_preprocess_tokens)

### filter out words that are in almost none or almost all documents
dictionary.filter_extremes(no_below=lower_thres, no_above=upper_thres)

### use the dictionary to create the corpus
corpus = [dictionary.doc2bow(text) for text in text_preprocess_tokens]
num_topics = 10


In [93]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = num_topics, id2word=dictionary, passes=10,
                                          alpha = 'auto',
                                          per_word_topics = True)

In [162]:
## store
ldamodel.save('../intermediate_objects/model_10topics.gensim')


In [94]:
## print topics
topics = ldamodel.print_topics(num_words = 20)
for topic in topics:
    print(topic)

(0, '0.090*"student" + 0.029*"school" + 0.021*"iep" + 0.019*"educ" + 0.011*"hear" + 0.009*"appropri" + 0.009*"provid" + 0.009*"servic" + 0.009*"behavior" + 0.008*"special" + 0.008*"exhibit" + 0.008*"meet" + 0.007*"offic" + 0.006*"placement" + 0.006*"team" + 0.005*"evid" + 0.005*"evalu" + 0.005*"child" + 0.005*"time" + 0.005*"determin"')
(1, '0.043*"student" + 0.040*"evalu" + 0.019*"hear" + 0.019*"educ" + 0.017*"parent" + 0.014*"school" + 0.013*"determin" + 0.012*"case" + 0.011*"offic" + 0.011*"psycholog" + 0.010*"provid" + 0.010*"elig" + 0.009*"comprehens" + 0.009*"process" + 0.009*"request" + 0.009*"conduct" + 0.008*"psychologist" + 0.008*"assess" + 0.008*"due" + 0.008*"counsel"')
(2, '0.062*"student" + 0.023*"school" + 0.017*"educ" + 0.015*"iep" + 0.014*"wit" + 0.010*"servic" + 0.010*"respond" + 0.009*"provid" + 0.009*"behavior" + 0.008*"district" + 0.008*"year" + 0.008*"hear" + 0.008*"issu" + 0.007*"testimoni" + 0.007*"special" + 0.006*"also" + 0.006*"child" + 0.006*"evalu" + 0.006*

In [95]:
## visualize
## move the lambda parameter down from 1 to see
## words more unique to a topic
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
lda_display = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

## Way more examples of how to visualize: https://www.machinelearningplus.com/nlp/topic-modeling-visualization-how-to-present-results-lda-models/