In [1]:

## view all outputs
import warnings
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
warnings.filterwarnings("ignore")

## pdf reading
import zipfile
import os
import glob
import pdfminer
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from sklearn.feature_extraction.text import CountVectorizer
import re
import string
import io

## dataframe
import pandas as pd
import numpy as np  


## preprocessing
import nltk
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from sklearn.feature_extraction.text import CountVectorizer


## add punctuation and some application-specific words
## to stopword list
from nltk.stem.porter import *
porter = PorterStemmer()
from nltk.tokenize import WordPunctTokenizer

## lda
from gensim import corpora
import gensim

In [2]:
def convert_pdf_to_txt(path):
    """Extract the text of a PDF file with pdfminer.

    Parameters
    ----------
    path : str
        Path of the PDF file to read.

    Returns
    -------
    str
        The text that pdfminer's TextConverter wrote for the processed pages.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    codec = 'utf-8'
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()

    # The with-block and try/finally release the file handle, the text buffer
    # and the converter even when pdfminer raises part-way through a PDF;
    # previously an exception left all three open.
    with open(path, 'rb') as fp, io.StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                          password=password,
                                          caching=caching,
                                          check_extractable=True):
                interpreter.process_page(page)
            # read the buffer before it is closed on leaving the with-block
            return retstr.getvalue()
        finally:
            device.close()

# 0. Loading data


**Task**: unzip the folder

In [4]:
## store path to the folder that holds the zip file and the data
## NOTE: absolute, machine-specific path -- change it before running elsewhere
base_path = "/Users/raj2/Dropbox/dph_hearing_decisions/data/texas/" 

## unzip at that location if have not yet unzipped

**Task**: to build the first part of your loop, extract two strings from the filenames (stored in hearing_filenames)

1. The month
2. The year

Below provides some code for one file to get you started.

In [5]:
## read in the texas filings table (texas_filings_wnces.csv)
## from the intermediate/ folder under base_path
texas_wnces = pd.read_csv(base_path + "intermediate/texas_filings_wnces.csv")

In [6]:
## clean the docket number: strip all whitespace so it can serve as a merge key
## regex=True is required -- since pandas 2.0 str.replace treats the pattern
## as a literal string by default, which would leave the whitespace in place
texas_wnces['clean_docket'] = texas_wnces['nan_docket_#'].astype(str).str.replace("\\s+", "", regex=True)

## can then merge with frpl later

In [16]:
## get list of pathnames of the hearing pdfs
## (the working directory is changed to the hearings folder so the glob
## pattern below can be relative; the names returned have no directory part)
path_hearings = base_path  + "hearings"
os.chdir(path_hearings)
hearing_filenames = glob.glob("*pdf*")
hearing_filenames

## example for one file: drop everything from the first underscore onwards,
## leaving the part of the filename before it
docket_pattern = re.sub("\\_.*", "", hearing_filenames[0])
docket_pattern

['001-SE-0916_North%20East%20ISD.pdf',
 '002-SE-0917_El%20Paso%20ISD.pdf',
 '006-SE-0913_Leander.pdf',
 '009-SE-0916_Huntsville%20ISD.pdf',
 '009-SE-0917_Riesel%20ISD.pdf',
 '010-SE-0915_Edinburg.pdf',
 '011-SE-0914_WestOrangeCove.pdf',
 '012-SE-0913_SpringHill.pdf',
 '014-SE-0916_Killeen%20ISD.pdf',
 '015-SE-0914_GrapevineColleyville.pdf',
 '016-SE-0916_Abilene%20ISD.pdf',
 '017-SE-0915_Pflugerville.pdf',
 '021-SE-1016_Kirbyville%20CISD.pdf',
 '024-SE-0915_Gregory-Portland.pdf',
 '024-SE-1016_Northside.pdf',
 '027-SE-1017_Conroe%20ISD.pdf',
 '028-SE-0914_Canyon.pdf',
 '030-SE-1016_Cedar%20Hill%20ISD.pdf',
 '032-SE-0914_Robstown.pdf',
 '035-SE-1017_Leander%20ISD.pdf',
 '039-SE-1013_Lewisville.pdf',
 '040-SE-1017_Huntsville%20ISD.pdf',
 '041-SE-1013_Mesquite.pdf',
 '047-SE-1014_SanAntonio.pdf',
 '050-SE-1014_Lamar.pdf',
 '052-SE-1014_Dallas.pdf',
 '056-SE-1116_Round%20Rock%20ISD.pdf',
 '062-SE-1116_Georgetown%20ISD.pdf',
 '062-SE-1118_Kirbyville%20CISD.pdf',
 '066-SE-1013_Forney.pdf',
 

'001-SE-0916'

**Task**: now that you've built the two building blocks -- a given hearing pdf's month and year -- iterate through the first five of the hearing pdfs, read each one in, and store it in a dictionary where the key is formatted as: "[nameofmonth]_[year]_i", where i is the index of the element in the list (otherwise, Python would overwrite the value each time two hearings have the same month/year).

Note: this takes some time to run due to the pdf conversion; so test the loop with the first five and in the next task, you'll read in data that already has it stored

In [31]:
def read_hearing_pdf(one_path):
    """Convert one hearing PDF to text and recover its file name.

    Parameters
    ----------
    one_path : str
        Path to the hearing PDF.

    Returns
    -------
    tuple
        ``(name, text)``: ``name`` is ``one_path`` with everything up to and
        including "hearings/" removed, and ``text`` is whatever
        ``convert_pdf_to_txt`` returns for the file.
    """
    hearing_text = convert_pdf_to_txt(one_path)
    # strip the directory prefix that ends in "hearings/"
    docket_file = re.compile(".*hearings/").sub("", one_path)
    return (docket_file, hearing_text)

In [32]:
## set up for the pdf-to-text conversion:
## store_files is an (initially empty) dictionary, not a list;
## hearings_withpath holds the full path of every hearing pdf
store_files = dict()
hearings_withpath = [path_hearings + "/" + one_filename for one_filename in hearing_filenames]


['/Users/raj2/Dropbox/dph_hearing_decisions/data/texas/hearings/001-SE-0916_North%20East%20ISD.pdf',
 '/Users/raj2/Dropbox/dph_hearing_decisions/data/texas/hearings/002-SE-0917_El%20Paso%20ISD.pdf',
 '/Users/raj2/Dropbox/dph_hearing_decisions/data/texas/hearings/006-SE-0913_Leander.pdf',
 '/Users/raj2/Dropbox/dph_hearing_decisions/data/texas/hearings/009-SE-0916_Huntsville%20ISD.pdf']

'001-SE-0916_North%20East%20ISD.pdf'

In [33]:
## run external script

**Task**: Read in the data: data_raw.pkl using the pd.read_pickle command. Parse the month_year column into one column for month and another for year. Finally, add an ID to each hearing (order doesn't matter).

In [8]:
## read the raw hearing texts; reuse base_path instead of repeating the
## absolute path so the notebook has a single place to change the data folder
store_files_df_init = pd.read_pickle(base_path + "intermediate/hearings_raw.pkl")

## docket number = the file name up to (not including) the first underscore
## regex=True is required -- since pandas 2.0 str.replace treats the pattern
## as a literal string by default, which would leave docket_num unchanged
## and make the merge below match nothing
store_files_df_init['docket_num_clean'] = store_files_df_init.docket_num.astype(str).str.replace("\\_.*", "", regex=True)

## merge with nces ids
store_files_df = pd.merge(store_files_df_init,
                             texas_wnces,
                             left_on = "docket_num_clean",
                             right_on = "clean_docket",
                             how = "left")


## then later match with nces dem after preprocessing
store_files_df.head()

Unnamed: 0,text,docket_num,docket_num_clean,(adv./tot.)_issues,date_hearina,date_last_order,due_date_decision,failed_extract_isd,isd_init,nan_decision_id,...,DISTRICT,DISTRICT_C,NCES_DISTR,COLOR,Shape_area,Shape_len,Shape__Area,Shape__Length,statelevel_id,clean_docket
0,\n\n \n \n \n\n \n \n \n \n \n\nPetitioner \n...,001-SE-0916_North%20East%20ISD.pdf,001-SE-0916,0/1,1917-05-03 00:00:00,1917-07-13 00:00:00,1916-11-14 00:00:00,0.0,15910.0,11963,...,015-910,15910.0,4832940.0,3.0,342770000.0,113966.846833,456334900.0,131484.820323,15910.0,001-SE-0916
1,"Petitioner \n\nSTUDENT, B/N/F PARENT, \n \n \n...",002-SE-0917_El%20Paso%20ISD.pdf,002-SE-0917,2/7,1918-03-28 00:00:00,1918-05-22 00:00:00,1917-12-16 00:00:00,0.0,71902.0,13035,...,071-902,71902.0,4818300.0,5.0,547564600.0,149222.911954,764604600.0,176412.992072,71902.0,002-SE-0917
2,\n \n \n \n \n \n \n \n \n \n\nSTUDENT \n \n ...,006-SE-0913_Leander.pdf,006-SE-0913,0/3,1913-12-16 00:00:00,1914-01-13 00:00:00,1913-11-19 00:00:00,0.0,246913.0,8953,...,246-913,246913.0,4827030.0,6.0,515860900.0,153901.806353,700272300.0,179284.138511,246913.0,006-SE-0913
3,\n\n \n \n \n\nStatement of the Case \n\n \n ...,009-SE-0916_Huntsville%20ISD.pdf,009-SE-0916,0/1,1916-12-14 00:00:00,1917-01-13 00:00:00,1916-11-27 00:00:00,0.0,236902.0,11981,...,236-902,236902.0,4824030.0,4.0,1540681000.0,236179.714272,2102015000.0,275948.88698,236902.0,009-SE-0916
4,DOCKET NO. 009-SE-0917 \n\nPetitioner \n\nSTUD...,009-SE-0917_Riesel%20ISD.pdf,009-SE-0917,0/1,1917-10-18 00:00:00,1917-10-30 00:00:00,1917-11-02 00:00:00,0.0,161912.0,13063,...,161-912,161912.0,4837110.0,4.0,151596700.0,79558.750571,209980500.0,93589.855445,161912.0,009-SE-0917


# 1. Using basic pandas string manipulation for text mining

You already did some text mining to figure out one type of basic descriptive--- what month and year a parent filed a complaint.

We're now going to have you do some text mining for a second type of descriptive information---are fathers or mothers mentioned more in the text? This relates to sociological work on gender inequalities in caregiving.

I'll provide you with some example code, and then you'll generalize to the full corpus

**Task**: look at a few of the texts. To print the full text, you can wrap it in print(df.textcolumn[0]), print(df.textcolumn[1]), etc.. Scroll through to find examples of it mentioning mothers or fathers. 

In [106]:
#print(store_files_df.text[0])

**Task**: we will discuss more preprocessing in the next section but for now, create a new column 'text_lower' that makes all the words in the text column lowercase. For this, it's easiest to use the pandas string method.

In [9]:
## lowercase every hearing text so later string searches ignore case
store_files_df['text_lower'] = store_files_df.text.str.lower()


## bound tokenize method of nltk's WordPunctTokenizer; the helper functions
## in later cells (count_words, remove_stop, processtext) call it by this name
wordpunct_tokenize = WordPunctTokenizer().tokenize

**Task**: create a binary variable indicating *whether* (yes or no) the text contains words you think reflect mother. Similarly, create a binary variable indicating *whether* (yes or no) the text contains words you think reflect father. Then, create a third variable indicating whether:

- The complaint mentions a child's mother only
- The complaint mentions a child's father only
- The complaint mentions both parents
- The complaint mentions neither parent


Use df.varname.value_counts() to print the distribution of the categories.

In [10]:
## 0/1 flags for whether a text mentions a mother / father term
## na=False: a missing text counts as "no mention"; without it str.contains
## returns NaN for that row, which np.where treats as truthy (flag = 1)
## NOTE: these are substring matches, so "mom" also hits e.g. "moment" and
## "father" also hits "grandfather"
store_files_df['any_mother'] = np.where(store_files_df.text_lower.str.contains("mother|mom", na=False), 1, 0)
store_files_df['any_father'] = np.where(store_files_df.text_lower.str.contains("father|dad", na=False), 1, 0)


## the four combinations of the two 0/1 flags, in the same order as `choices`
conditions = [(store_files_df.any_mother.eq(1) & store_files_df.any_father.eq(1)),
            (store_files_df.any_mother.eq(1) & store_files_df.any_father.eq(0)),
            (store_files_df.any_mother.eq(0) & store_files_df.any_father.eq(1)),
            (store_files_df.any_mother.eq(0) & store_files_df.any_father.eq(0))]

choices = ['both_parents', 'mother_only', 'father_only', 'neither_parent']

## an explicit string default is needed: np.select's implicit default is the
## integer 0, which recent NumPy versions refuse to combine with string
## choices (TypeError). The four conditions cover every 0/1 combination, so
## the default is never actually assigned.
store_files_df['parent_cat'] = np.select(conditions, choices, default="unknown")
store_files_df.parent_cat.value_counts()


mother_only       45
neither_parent    39
both_parents      32
father_only        3
Name: parent_cat, dtype: int64

**Task**: you wonder whether a complaint mentioning both parents is (1) a reflection of both parents being involved, or (2) more related to longer complaints having more opportunities to discuss each parent's role. To investigate this, we need to move to tools outside the pandas.varname.str.operation toolbox. Note that since we have not yet preprocessed the data, this count will be high, and will get much lower as we implement various preprocessing steps.

For now, run the below code, and then we'll transition to nltk so you can learn more what it means. As a note, these counts will be very high because we have not yet removed punctuation!

Then, use the df.groupby('grouping_varname')['continuous_varname'].mean() command to contrast the mean across each of the four levels of the factor variable you created in the previous task. What do you notice?

In [11]:
def count_words(one_complaint):
    """Count the distinct tokens in one complaint.

    The text is split with ``wordpunct_tokenize`` and repeated tokens are
    collapsed, so the result is the number of unique tokens, not the total
    number of tokens.

    Parameters
    ----------
    one_complaint : str
        Text of a single complaint.

    Returns
    -------
    int
        Number of unique tokens.
    """
    distinct_tokens = {token for token in wordpunct_tokenize(one_complaint)}
    return len(distinct_tokens)


In [12]:
## number of unique tokens per hearing, counted on the lowercased text
## (punctuation tokens are still included at this stage)
store_files_df['unique_words_punct'] = store_files_df.text_lower.apply(count_words)
store_files_df.head()

## average unique-token count within each parent_cat level
store_files_df.groupby('parent_cat')['unique_words_punct'].mean()

Unnamed: 0,text,docket_num,docket_num_clean,(adv./tot.)_issues,date_hearina,date_last_order,due_date_decision,failed_extract_isd,isd_init,nan_decision_id,...,Shape_len,Shape__Area,Shape__Length,statelevel_id,clean_docket,text_lower,any_mother,any_father,parent_cat,unique_words_punct
0,\n\n \n \n \n\n \n \n \n \n \n\nPetitioner \n...,001-SE-0916_North%20East%20ISD.pdf,001-SE-0916,0/1,1917-05-03 00:00:00,1917-07-13 00:00:00,1916-11-14 00:00:00,0.0,15910.0,11963,...,113966.846833,456334900.0,131484.820323,15910.0,001-SE-0916,\n\n \n \n \n\n \n \n \n \n \n\npetitioner \n...,1,1,both_parents,1535
1,"Petitioner \n\nSTUDENT, B/N/F PARENT, \n \n \n...",002-SE-0917_El%20Paso%20ISD.pdf,002-SE-0917,2/7,1918-03-28 00:00:00,1918-05-22 00:00:00,1917-12-16 00:00:00,0.0,71902.0,13035,...,149222.911954,764604600.0,176412.992072,71902.0,002-SE-0917,"petitioner \n\nstudent, b/n/f parent, \n \n \n...",0,1,father_only,1409
2,\n \n \n \n \n \n \n \n \n \n\nSTUDENT \n \n ...,006-SE-0913_Leander.pdf,006-SE-0913,0/3,1913-12-16 00:00:00,1914-01-13 00:00:00,1913-11-19 00:00:00,0.0,246913.0,8953,...,153901.806353,700272300.0,179284.138511,246913.0,006-SE-0913,\n \n \n \n \n \n \n \n \n \n\nstudent \n \n ...,1,1,both_parents,1011
3,\n\n \n \n \n\nStatement of the Case \n\n \n ...,009-SE-0916_Huntsville%20ISD.pdf,009-SE-0916,0/1,1916-12-14 00:00:00,1917-01-13 00:00:00,1916-11-27 00:00:00,0.0,236902.0,11981,...,236179.714272,2102015000.0,275948.88698,236902.0,009-SE-0916,\n\n \n \n \n\nstatement of the case \n\n \n ...,0,0,neither_parent,571
4,DOCKET NO. 009-SE-0917 \n\nPetitioner \n\nSTUD...,009-SE-0917_Riesel%20ISD.pdf,009-SE-0917,0/1,1917-10-18 00:00:00,1917-10-30 00:00:00,1917-11-02 00:00:00,0.0,161912.0,13063,...,79558.750571,209980500.0,93589.855445,161912.0,009-SE-0917,docket no. 009-se-0917 \n\npetitioner \n\nstud...,1,0,mother_only,865


parent_cat
both_parents      1910.625000
father_only       1240.666667
mother_only       1609.111111
neither_parent    1175.769231
Name: unique_words_punct, dtype: float64

# 2. Using nltk for text preprocessing

You probably noticed a few shortcomings of how we approached the previous task. In particular:
    
- We searched for a concept we already had in mind (gender of parent who's filing the complaint). What if we want to more inductively learn themes in the text data, without searching for pre-specified concepts?
- Even if searching for a pre-specified concept, we were lucky in that mother and father have a limited number of words that can describe them. What if we wanted to investigate something where we're not really able to exhaustively enumerate the ways it can be described?

That moves us to topic modeling, or ways to represent each complaint as having words that are drawn from latent themes/topics. The first step in topic modeling is pre-processing. You probably noticed when reading through the texts that there are a lot of extra things -- punctuation, words like "of" -- that are not informative for learning themes in the text. So we're going to focus on that for the next part of the activity, using nltk, and then work on implementing a topic model.

**Task**: first, remove the stopwords using the built-in English stopwords within nltk (print to get a sense). Store the result in a new column: text_nostop_standard

Compare the text in a couple of the documents


In [13]:
## nltk's built-in English stopword list, held in a set
stopwords_standard = set(stopwords.words('english'))
#stop_words

def remove_stop(row, colname, stopword_dict):
    """Drop stopwords from the text held in one column of a row.

    Intended for ``df.apply(remove_stop, axis=1, args=[colname, stopwords])``.

    Parameters
    ----------
    row : mapping-like
        A row that can be indexed with ``colname`` (e.g. a pandas Series).
    colname : str
        Name of the column holding the text.
    stopword_dict : container
        Collection of tokens to remove; membership is tested with ``in``.

    Returns
    -------
    str
        The tokens produced by ``wordpunct_tokenize`` that are not in
        ``stopword_dict``, joined by single spaces. Punctuation and numeric
        tokens are kept unless they are listed in ``stopword_dict``.
        An empty string is returned when the value is not text.
    """
    raw_value = row[colname]

    # Handle data errors where the value is not actually text (NaN, None,
    # numbers). Previously the value was passed through str() first, so a
    # missing text became the literal token "nan" and the bare except that
    # was meant to catch this case could never fire.
    if not isinstance(raw_value, str):
        return ""

    kept_tokens = [token for token in wordpunct_tokenize(raw_value)
                   if token not in stopword_dict]
    return " ".join(kept_tokens)

In [14]:
## row-wise: remove the standard stopwords from text_lower
store_files_df['text_nostop_standard'] = store_files_df.apply(remove_stop,
                                    axis = 1,
                                   args = ["text_lower", stopwords_standard])
## compare the first 100 characters of the first hearing before and after
print(store_files_df.text_lower[0][0:100])
print(store_files_df.text_nostop_standard[0][0:100])


 

 
 
 

 
 
 
 
 

petitioner 
 
 
 

student b/n/f parent and    
parent, 
 
 
 
 
 
v.  
 
 
 
n
petitioner student b / n / f parent parent , v . north east independent school district , respondent


**Task**: check what type of object the standard stopwords are (type(object)). Update the standard stopwords to include some words specific to this context that we don't want to include, provided for you below in the additional_words_toadd list.
    

In [15]:
## context-specific words to treat as stopwords
additional_words_toadd = ["decision", "office", "petitioner", "texas"]
## set.update changes stopwords_standard in place, so every cell run after
## this one -- including a re-run of an earlier cell -- sees the extended set
stopwords_standard.update(additional_words_toadd)

**Task**: returning to the original text_lower column, remove stopwords using the updated stopword list. Store it as a column text_nostop. 

In [16]:
## same removal as before, starting again from text_lower, but now with the
## stopword set that includes the additional context-specific words
store_files_df['text_nostop'] = store_files_df.apply(remove_stop,
                                    axis = 1,
                                   args = ["text_lower", stopwords_standard])

**Task**: removing stopwords deletes entire words from the corpus. But we also want to preprocess the remaining words (1) to remove digits and punctuation (which we've decided are not relevant), and (2) reduce similar words to a common stem.

Using the text_nostop column, perform the following additional preprocessing steps.
- Stem using the porter stemmer
- Only keep words where all the characters are letters (so removes punctuation and numbers)
- Only keep words that are 3 characters or longer

Store the preprocessed text in a new column: text_preprocess

*Optional*: Rerun the above function to count the number of words in the processed text for each row and compare the distribution of count of words before preprocessing (text_lower) to the distribution of count of words after preprocessing.

In [17]:
def processtext(row, colname):
    """Stem the text in one column of a row, keeping only alphabetic tokens.

    Intended for ``df.apply(processtext, axis=1, args=[colname])``.

    Parameters
    ----------
    row : mapping-like
        A row that can be indexed with ``colname`` (e.g. a pandas Series).
    colname : str
        Name of the column holding the text.

    Returns
    -------
    str
        Porter stems of the lowercased tokens that consist only of letters
        and are at least 3 characters long, joined by single spaces.
        An empty string is returned when the value is not text.
    """
    raw_value = row[colname]

    # Handle data errors where the value is not actually text (NaN, None,
    # numbers). Previously the value was passed through str() first, so a
    # missing text became the stem "nan" and the bare except that was meant
    # to catch this case could never fire.
    if not isinstance(raw_value, str):
        return ""

    stems = []
    for token in wordpunct_tokenize(raw_value):
        lowered = token.lower()
        # the length filter applies to the token before stemming, so a stem
        # may end up shorter than 3 characters
        if lowered.isalpha() and len(token) >= 3:
            stems.append(porter.stem(lowered))
    return " ".join(stems)

## row-wise: stem and filter the stopword-free text (text_nostop)
store_files_df['text_preprocess'] = store_files_df.apply(processtext,
                                    axis = 1,
                                   args = ["text_nostop"])

**Task**: the package structural topic models we'll work on next works best with text that is still stored in a dataset with a column. Write the data with the following cols to csv--data_preprocess.csv---to use in STM:
        
- month
- year
- hearing ID
- parent_cat
- text_preprocess

In [58]:
## display the data folder path
base_path

'/Users/raj2/Dropbox/dph_hearing_decisions/data/texas/'

In [18]:
store_files_df.head()
dem_char = pd.read_csv(base_path +"cleaned/filings_withdem.csv")

## keep only the merge key and the demographic rate; .copy() makes this an
## independent frame so adding a column below is not a chained assignment
## on a slice of dem_char
## NOTE(review): if filings_withdem.csv has several rows per nces_id, the
## left merge below repeats hearings -- confirm the key is unique
dem_char_tomerge = dem_char[['nces_id', 'frpl_eligible_rate']].copy()
dem_char_tomerge['nces_id_tomerge'] = dem_char_tomerge.nces_id.astype(str)

## NCES_DISTR prints like "4832940.0": drop the first "." and what follows
## regex=True is required -- since pandas 2.0 str.replace treats the pattern
## as a literal string by default, which would leave the ".0" suffix in place
store_files_df['nces_id'] = store_files_df.NCES_DISTR.astype(str).str.replace("\\..*", "", regex=True)
store_files_df_wdem = pd.merge(store_files_df,
                              dem_char_tomerge,
                              left_on = "nces_id",
                            right_on = "nces_id_tomerge",
                              how = "left")

## columns written out for the topic model
cols_touse = ['docket_num_clean', 'year_request', 'parent_cat',
              'text_preprocess',
              'nces_id_tomerge',
              'frpl_eligible_rate']


Unnamed: 0,text,docket_num,docket_num_clean,(adv./tot.)_issues,date_hearina,date_last_order,due_date_decision,failed_extract_isd,isd_init,nan_decision_id,...,statelevel_id,clean_docket,text_lower,any_mother,any_father,parent_cat,unique_words_punct,text_nostop_standard,text_nostop,text_preprocess
0,\n\n \n \n \n\n \n \n \n \n \n\nPetitioner \n...,001-SE-0916_North%20East%20ISD.pdf,001-SE-0916,0/1,1917-05-03 00:00:00,1917-07-13 00:00:00,1916-11-14 00:00:00,0.0,15910.0,11963,...,15910.0,001-SE-0916,\n\n \n \n \n\n \n \n \n \n \n\npetitioner \n...,1,1,both_parents,1535,"petitioner student b / n / f parent parent , v...","student b / n / f parent parent , v . north ea...",student parent parent north east independ scho...
1,"Petitioner \n\nSTUDENT, B/N/F PARENT, \n \n \n...",002-SE-0917_El%20Paso%20ISD.pdf,002-SE-0917,2/7,1918-03-28 00:00:00,1918-05-22 00:00:00,1917-12-16 00:00:00,0.0,71902.0,13035,...,71902.0,002-SE-0917,"petitioner \n\nstudent, b/n/f parent, \n \n \n...",0,1,father_only,1409,"petitioner student , b / n / f parent , v . el...","student , b / n / f parent , v . el paso indep...",student parent paso independ school district r...
2,\n \n \n \n \n \n \n \n \n \n\nSTUDENT \n \n ...,006-SE-0913_Leander.pdf,006-SE-0913,0/3,1913-12-16 00:00:00,1914-01-13 00:00:00,1913-11-19 00:00:00,0.0,246913.0,8953,...,246913.0,006-SE-0913,\n \n \n \n \n \n \n \n \n \n\nstudent \n \n ...,1,1,both_parents,1011,student vs . leander . . . docket . 006 - se -...,student vs . leander . . . docket . 006 - se -...,student leander docket special educ hear offic...
3,\n\n \n \n \n\nStatement of the Case \n\n \n ...,009-SE-0916_Huntsville%20ISD.pdf,009-SE-0916,0/1,1916-12-14 00:00:00,1917-01-13 00:00:00,1916-11-27 00:00:00,0.0,236902.0,11981,...,236902.0,009-SE-0916,\n\n \n \n \n\nstatement of the case \n\n \n ...,0,0,neither_parent,571,statement case § § § § § § § hearing officer d...,statement case § § § § § § § hearing officer d...,statement case hear offic docket state special...
4,DOCKET NO. 009-SE-0917 \n\nPetitioner \n\nSTUD...,009-SE-0917_Riesel%20ISD.pdf,009-SE-0917,0/1,1917-10-18 00:00:00,1917-10-30 00:00:00,1917-11-02 00:00:00,0.0,161912.0,13063,...,161912.0,009-SE-0917,docket no. 009-se-0917 \n\npetitioner \n\nstud...,1,0,mother_only,865,"docket . 009 - se - 0917 petitioner student , ...","docket . 009 - se - 0917 student , v . riesel ...",docket student riesel independ school district...


In [19]:
## write version to csv to read into R
## built from base_path (same location as before) so that changing the data
## folder in one place also moves this output
store_files_df_wdem[cols_touse].to_csv(base_path + "intermediate/hearings_preprocessed.csv",
            index = False)