# Clean and link filings



## 0. Imports and functions

In [251]:
from tabula import read_pdf
import os
import pandas as pd
import pickle
import re
import numpy as np
pd.set_option('display.float_format', lambda x: '%.3f' % x)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

## profiling
import time



## first, clean case type
def process_type(one_row):
    """Re-join case-type words that were split by stray whitespace.

    The value is converted with ``str()`` first because some cells hold
    dates rather than text. Whitespace directly after the fragments
    ``Exped``/``Expedi``/``Expedit``/``Expedite``, ``Discip`` and
    ``Aga``/``Agai``/``Again`` is removed; everything else is untouched.
    """
    expedited_gap = r'(Exped(i)?(t)?(e)?|Discip)\s+'
    against_gap = r'(Aga(i)?(n)?)\s+'

    text = str(one_row)

    ## two passes: a word broken in two places (e.g. "Expedi te d")
    ## needs the second pass to close the second gap
    for _ in range(2):
        text = re.sub(expedited_gap, r'\1', text)

    ## same idea for "Against ..." (LEA/SEA) case types
    return re.sub(against_gap, r'\1', text)

def process_schoolname(one_name):
    """Upper-case a school name and repair common extraction artifacts.

    The value is converted with ``str()`` and upper-cased, then a fixed
    sequence of regex rewrites is applied in order: whitespace after the
    fragments SCHOO / MIDD / EDUCAT is removed, single whitespace
    characters inside MIDDLE, ELEMENTARY, CAMPUS and EDUCATION are
    removed, and the misread "SCHOOI" becomes "SCHOOL".
    """
    ## order matters: fragment joins run before the whole-word repairs
    rewrite_steps = (
        (r'(SCHOO)\s+', r'\1'),
        (r'(MIDD)\s+', r'\1'),
        (r'(EDUCAT)\s+', r'\1'),
        (r'M(\s)?I(\s)?D(\s)?D(\s)?L(\s)?E', r"MIDDLE"),
        (r'E(\s)?L(\s)?E(\s)?M(\s)?E(\s)?N(\s)?T(\s)?A(\s)?R(\s)?Y', r"ELEMENTARY"),
        (r'SCHOOI', "SCHOOL"),
        (r'C(\s)?A(\s)?M(\s)?P(\s)?U(\s)?S', r"CAMPUS"),
        (r'E(\s)?D(\s)?U(\s)?C(\s)?A(\s)?T(\s)?I(\s)?O(\s)?N', r"EDUCATION"),
    )

    cleaned = str(one_name).upper()
    for pattern, replacement in rewrite_steps:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned

def replace_schooltype(one_string):
    """Expand a trailing school-type abbreviation to its full form.

    Only a whole trailing word is rewritten:
    ES / ELEMENTARY -> ELEMENTARY SCHOOL, EC -> ELEMENTARY CAMPUS,
    MS / MIDDLE -> MIDDLE SCHOOL, HS / HIGH -> HIGH SCHOOL.

    The ``\\b`` anchor keeps names that merely end in those letters intact
    (e.g. "ADAMS" or "HAYES"), which an un-anchored pattern would mangle
    into "ADAMIDDLE SCHOOL" / "HAYELEMENTARY SCHOOL".

    Parameters: one_string -- an (upper-cased) school name string.
    Returns: the name with the trailing type expanded, or unchanged.
    """
    es = re.sub(r'\b(?:ES|ELEMENTARY)$', r'ELEMENTARY SCHOOL', one_string)
    ec = re.sub(r'\bEC$', r'ELEMENTARY CAMPUS', es)
    ms = re.sub(r'\b(?:MS|MIDDLE)$', r'MIDDLE SCHOOL', ec)
    hs = re.sub(r'\b(?:HS|HIGH)$', r'HIGH SCHOOL', ms)

    return(hs)


    

def ngrams(string, n=3):
    """Return the overlapping character n-grams of a lightly cleaned string.

    Commas, hyphens, periods, slashes and any " BD" token (whitespace
    followed by BD) are stripped first. A cleaned string shorter than
    ``n`` yields an empty list.
    """
    stripped = re.sub(r'[,-./]|\sBD',r'', string)
    ## zip the string against itself shifted by 0..n-1 characters
    shifted_copies = (stripped[offset:] for offset in range(n))
    return list(map(''.join, zip(*shifted_copies)))

from sklearn.feature_extraction.text import TfidfVectorizer


def find_fuzzy_namematches(one_name: str, all_names: list, 
                           score_cutoff):
    """Look up the single best fuzzy match for one name.

    Calls ``process.extractBests`` with ``limit = 1`` and the given
    ``score_cutoff`` and returns the result as a dataframe with columns
    matched_name, score and original_name. When nothing clears the
    cutoff the dataframe has no rows.
    """
    best_hits = process.extractBests(one_name, all_names, limit = 1,
                                     score_cutoff = score_cutoff)

    ## names without a match simply contribute zero rows
    hits_df = pd.DataFrame(list(best_hits), columns = ['matched_name', 'score'])
    return hits_df.assign(original_name = one_name)

## resource-- package installation issues: https://bergvca.github.io/2017/10/14/super-fast-string-matching.html



In [None]:
## TODO: redo the parsing so that it also captures
## the date each case was closed

## 1. Load and do prelim cleaning of filings data

In [164]:
## load the filings table produced by the earlier processing step
dc_filings_init = pd.read_csv("../data/dc/intermediate/processed_filings.csv")

## flag rows in which every column equals the first column
## (treated here as a row that failed to parse into separate fields)
dc_filings_init['failed_parse'] = np.where(dc_filings_init.eq(dc_filings_init.iloc[:, 0], 
                                axis=0).all(1), 1, 0)


## get row number of those that failed parse to reprocess
rownums_failedparse = pd.DataFrame({'missing_info':
                dc_filings_init.loc[dc_filings_init.failed_parse == 1].index.tolist()})

## write those and go back to process tables, pulling all cols for those rows
rownums_failedparse.to_pickle("../data/dc/intermediate/rownums_failedparse.pickle")


## subset to ones that parsed
dc_filings = dc_filings_init.loc[dc_filings_init.failed_parse == 0, ].copy()

## bare expression: the notebook echoes the before/after row counts
"""After removing those that failed to parse, go from {} filings to {} filings.
""".format(dc_filings_init.shape[0],
          dc_filings.shape[0])



'After removing those that failed to parse, go from 7949 filings to 7752 filings.\n'

In [167]:
## repair split words in the raw case type
dc_filings['casetype_clean_init'] = [process_type(one_type) for one_type in dc_filings.casetype.tolist()]
## flag case types that start with digits (re.match anchors at the start of the string)
dc_filings['casetype_isdigits'] = ["digits" if re.match(r'[0-9]+', one_str) is not None  else "no_digits" 
        for one_str in dc_filings.casetype_clean_init]

## the digit flag is not used below: the year is taken from case_no instead
## collapse the cleaned type into four categories plus a catch-all;
## the nested np.where calls are evaluated in order, so the first matching rule wins
dc_filings['casetype_final'] = np.where((dc_filings.casetype_clean_init.str.contains("Discip")) |
                                        (dc_filings.casetype_clean_init.str.contains("Expedited")),
                                        "Expedited Discipline",
                                np.where((dc_filings.casetype_clean_init.str.contains("LEA")) & 
                                         (dc_filings.casetype_clean_init != "By LEA"), "Against LEA",
                                np.where(dc_filings.casetype_clean_init == "By LEA", "By LEA",
                                np.where(dc_filings.casetype_clean_init.str.contains("Against SE"),
                                        "Against SEA",
                                        "Other/failed to parse"))))


## write the failed to parse ones
## write those and go back to process tables, pulling the rows manually
dc_filings[dc_filings.casetype_final == "Other/failed to parse"].to_csv("../data/dc/intermediate/missing_casetype.csv")


## get range of dates of the filings
## keep only a leading four-digit year (2010-2029 pattern) from the case number;
## values that do not start with one are left unchanged and fail the isin check below
dc_filings['year_init'] = [re.sub(r'^(20[1-2][0-9]).*', r'\1', str(one_string)) for one_string in 
                      dc_filings.case_no]
## accepted years: 2012 through 2019 inclusive (np.arange excludes the stop value)
year_range = [str(i) for i in np.arange(2012, 2020).tolist()]
dc_filings['year'] = np.where(dc_filings.year_init.isin(year_range), dc_filings.year_init,
                             'failed_toparse')
dc_filings.year.value_counts() # half the year in 2019

## 

2012              1651
2013              1459
2014              1023
2015              1001
2018               834
2017               702
2016               626
2019               269
failed_toparse     187
Name: year, dtype: int64

In [129]:
## list the columns available on the cleaned filings table
dc_filings.columns

Index(['attending_school', 'case_no', 'casetype', 'dcps_school_against',
       'home_school', 'failed_parse', 'casetype_clean_init',
       'casetype_isdigits', 'casetype_final', 'year_init', 'year'],
      dtype='object')

## 2. Load district demographic data

### 2.1 Create name-nces ID crosswalk

In [93]:
## load the school-level file used below to build the name-to-NCES-ID crosswalk
cc_data = pd.read_csv("../data/dc/intermediate/dc_ccd.csv")

In [252]:
## normalise column names: whitespace runs and square brackets become "_", then upper-case
cc_cleancols = [re.sub("\\s+|\\[|\\]", "_", x).upper() for x in cc_data.columns]
cc_data.columns = cc_cleancols

## create crosswalk to do matching
cc_crosswalk = cc_data[['SCHOOL_NAME', 
                        'SCHOOL_ID_-_NCES_ASSIGNED__PUBLIC_SCHOOL__LATEST_AVAILABLE_YEAR']].copy()
cc_crosswalk.head()




## expand trailing school-type abbreviations so names are comparable with the filings
cc_crosswalk['name_tocompare'] = [replace_schooltype(one_school) for one_school in cc_crosswalk.SCHOOL_NAME]


Unnamed: 0,SCHOOL_NAME,SCHOOL_ID_-_NCES_ASSIGNED__PUBLIC_SCHOOL__LATEST_AVAILABLE_YEAR
0,ACADEMY OF HOPE ADULT PCS,110009000000.0
1,ACHIEVEMENT PREPARATORY PCS ELEMENTARY,110007000000.0
2,ACHIEVEMENT PREPARATORY PCS MIDDLE SCHOOL,110007000000.0
3,ADAMS ELEMENTARY SCHOOL ...,110003000000.0
4,ADVANCED PATH ACADEMY,110003000000.0


### 2.2 Clean school names and fuzzy-match them to NCES IDs

In [264]:
## preprocess school to clean
## and do fuzzy matching
dc_filings_tomatch = dc_filings[['case_no', 'dcps_school_against', 'year']].drop_duplicates(subset = 
                                                ['case_no',
                                                'dcps_school_against'])



dc_filings_tomatch['school_against_cleaned_1'] = [process_schoolname(one_name) 
                                                for one_name in dc_filings_tomatch.dcps_school_against]
dc_filings_tomatch['school_against_cleaned'] = [replace_schooltype(one_name)
                                               for one_name in dc_filings_tomatch.school_against_cleaned_1]



## generate tf-idf representation
filings_crosswalk = dc_filings_tomatch[['school_against_cleaned']].drop_duplicates()
filings_crosswalk['id'] = filings_crosswalk.index+1

## 

subset_names = filings_crosswalk.school_against_cleaned.sample(20)


## write to intermediate
cc_crosswalk.to_csv("../data/dc/intermediate/nces_schoolnames.csv")
filings_crosswalk.to_csv("../data/dc/intermediate/filings_names.csv")



all_names = cc_crosswalk.name_tocompare.tolist()




In [260]:
## start time for the fuzzy-matching run timed below
t0 = time.time()


In [258]:
## spot check: how does LECKIE appear among the candidate names?
[school for school in all_names if "LECKIE" in school]

['LECKIE ELEMENTARY CAMPUS']

In [261]:
## best match per sampled name, keeping only scores of at least 90
fuzzymatch_results_list = [find_fuzzy_namematches(name, all_names, 90) 
                           for name in subset_names]
## t0 is set in an earlier cell, so the elapsed time includes anything run in between
t1 = time.time()
print("Fuzzy matching took " + str(t1-t0) + " seconds to run")



Fuzzy matching took 9.110193252563477 seconds to run


In [262]:
## stack the per-name results (names without a match contribute no rows)
fuzzymatch_results_df = pd.concat(fuzzymatch_results_list)
fuzzymatch_results_df

## sampled names that got no match above the cutoff
[name for name in subset_names 
 if name not in fuzzymatch_results_df.original_name.tolist()]


## test
#test_name = 
#split_space =  

Unnamed: 0,matched_name,score,original_name
0,RANDLE HIGHLANDS ELEMENTARY SCHOOL,97,RANDL E-HIGH LANDS ELEMENTARY SCHOOL
0,HEARST ELEMENTARY SCHOOL,100,HEARST ELEMENTARY SCHOOL
0,EATON ELEMENTARY SCHOOL,100,EATON ELEMENTARY SCHOOL
0,KRAMER MIDDLE SCHOOL,98,KRAME R MIDDLE SCHOOL
0,PHELPS ARCHITECTURE CONSTRUCTION AND ENGINEERI...,97,"PHELPS ARCHITECTURE, CONSTRUCTION, AND ENG INE..."
0,ELIOT HINE MIDDLE SCHOOL,95,HINE MIDDLE SCHOOL
0,POWELL ELEMENTARY SCHOOL,98,POWE LL ELEMENTARY SCHOOL


['CHILDRENS GUIL D - BALTIMORE',
 '07/18/2014',
 'FRANCIS-STEVENSEDUCATION CENTER',
 'HARDY M S @ HAMILTON SCHOOL',
 'BELL M ULTIC ULT URAL SENIOR HIGH SCHOOL',
 'WATK INS ELEMENTARY SCHOOL',
 'W ILLIAM E. DOAR, JR. PCS FOR THE PERFORMING ARTS',
 'SCHOOL-WITHIN-SC HOOL@ PEABODY',
 'ADV ANCE PATH',
 '03/26/ 2019',
 'FRANCIS-STEVENS EDUCATIONC',
 'NAT IONA L COLLEGIATE PCS',
 'TAKOMA EDUCATIONAL']

In [1]:
import shogun
from shogun import StringCharFeatures, RAWBYTE
from shogun import BinaryLabels
from shogun import SubsequenceStringKernel
from shogun import LibSVM
import spacy
!python -m spacy download en
import io
import os
import glob
import numpy as np  
import pdfminer
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from sklearn.feature_extraction.text import CountVectorizer
import re
import string
import difflib
from difflib import SequenceMatcher
from heapq import nlargest as _nlargest
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## string cleaning
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import wordpunct_tokenize

from fuzzywuzzy import process, fuzz

## sentiment analysis
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')

## link processing
import urllib
from bs4 import BeautifulSoup


[33mYou are using pip version 9.0.1, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m

[93m    Linking successful[0m
    /anaconda3/lib/python3.6/site-packages/en_core_web_sm -->
    /anaconda3/lib/python3.6/site-packages/spacy/data/en

    You can now load the model via spacy.load('en')





[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/raj2/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
## marker that the import cell finished
print("Loaded imports successfully")

Loaded imports successfully


### general-purpose string/text functions

In [19]:
## convert_pdf_to_txt (adapted from Stack Overflow)
## takes in:
## @path: path to a PDF file
## returns:
## the extracted text as a single string
def convert_pdf_to_txt(path):
    """Extract the text of a PDF file with pdfminer.

    Parameters: path -- path of the PDF to read (opened in binary mode).
    Returns: the text written by pdfminer's TextConverter, as one string.

    The file handle, converter and buffer are released even when a page
    fails to parse; the exception itself still propagates to the caller.
    """
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())

    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        with open(path, 'rb') as fp:
            ## same settings as before: empty page set, maxpages=0, empty password
            for page in PDFPage.get_pages(fp, set(), maxpages=0,
                                          password="",
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)

        ## read the buffer before it is closed in the finally block
        return retstr.getvalue()
    finally:
        device.close()
        retstr.close()

def lower_removepunct(string_toclean):
    """Lower-case a string and delete every character in string.punctuation."""
    ## a translation table that maps each punctuation character to "deleted"
    drop_punctuation = str.maketrans('', '', string.punctuation)
    return string_toclean.lower().translate(drop_punctuation)

def occurrences(s, lst):
    """Return a generator of the positions in ``lst`` whose element contains ``s``."""
    return (position for position, element in enumerate(lst) if s in element)

def get_close_matches_indexes(word, possibilities, n=3, cutoff=0.6):
    """Return the positions of the closest matches to ``word``.

    Each entry of ``possibilities`` is scored against ``word`` with
    SequenceMatcher; entries scoring below ``cutoff`` are ignored.

    word          -- sequence to match (typically a string)
    possibilities -- sequences to compare against (typically strings)
    n             -- maximum number of positions returned; must be > 0
    cutoff        -- minimum similarity, a float in [0, 1]

    Returns the positions (indexes into ``possibilities``) of at most
    ``n`` entries, best score first. Raises ValueError for an invalid
    ``n`` or ``cutoff``.
    """
    if not n >  0:
        raise ValueError("n must be > 0: %r" % (n,))
    if not 0.0 <= cutoff <= 1.0:
        raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,))

    matcher = SequenceMatcher()
    matcher.set_seq2(word)

    scored_positions = []
    for position, candidate in enumerate(possibilities):
        matcher.set_seq1(candidate)
        ## run the two quick checks first; the full ratio is only
        ## computed when both pass, as in the original condition
        if matcher.real_quick_ratio() < cutoff:
            continue
        if matcher.quick_ratio() < cutoff:
            continue
        similarity = matcher.ratio()
        if similarity >= cutoff:
            scored_positions.append((similarity, position))

    ## keep the n best (score, position) pairs and drop the scores
    return [position for _, position in _nlargest(n, scored_positions)]

## function for dtm representation
def create_nonmasked_dtm(list_of_strings, metadata):
    """Build a dense document-term matrix with one column per vocabulary term.

    Parameters:
    list_of_strings -- documents passed to CountVectorizer(lowercase=True)
    metadata        -- accepted for interface compatibility but not used
                       in the result. NOTE(review): the previous version
                       built a metadata + DTM frame and then discarded it;
                       confirm whether callers actually want the ids attached.

    Returns: DataFrame of term counts, columns named by term.
    """
    vectorizer = CountVectorizer(lowercase = True)
    dtm_sparse = vectorizer.fit_transform(list_of_strings)

    ## get_feature_names() no longer exists in recent scikit-learn releases;
    ## prefer its replacement and fall back for older installations
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    dtm_dense_named = pd.DataFrame(dtm_sparse.toarray(), columns=feature_names)
    return(dtm_dense_named)

def create_sentiment_dataframe(list_ofsentiment_dicts, metadata):
    """Turn a list of sentiment dicts into prefixed columns next to the metadata.

    Missing keys are filled with 0, every sentiment column is renamed to
    ``sentimentsummary_<key>``, and the result is placed to the right of
    ``metadata`` (column-wise concat, aligned on the index).
    """
    scores = pd.DataFrame.from_records(list_ofsentiment_dicts).fillna(0)
    scores.columns = [f'sentimentsummary_{score_name}' for score_name in scores.columns]
    return pd.concat([metadata, scores], axis = 1)

def create_distance_matrix(full_essays, ids):
    """Compute a subsequence string-kernel matrix over the given texts.

    The texts are wrapped as shogun StringCharFeatures (RAWBYTE) and a
    SubsequenceStringKernel is evaluated of the texts against themselves.
    The kernel matrix is returned as a DataFrame whose columns are named
    ``id_<id>`` from ``ids``; the row index is left as the default.
    """
    char_features = StringCharFeatures(full_essays, RAWBYTE)
    ## NOTE(review): 3 and 0.5 are passed straight to the kernel constructor;
    ## presumably subsequence length and decay -- confirm against the shogun docs
    kernel = SubsequenceStringKernel(char_features, char_features, 3, 0.5)
    kernel_df = pd.DataFrame(kernel.get_kernel_matrix())
    kernel_df.columns = [f'id_{one_id}' for one_id in ids]
    return kernel_df
    

### functions for case processing

In [20]:
def return_links_todocs(url, prefix = None):
    """Collect the href targets of all links on a web page.

    Parameters:
    url    -- page to fetch
    prefix -- if given, only links starting with this string are returned

    Returns: list of href strings in document order. Anchors without an
    href attribute are skipped (indexing them raised KeyError before).
    """
    ## a bare `import urllib` does not guarantee the request submodule is loaded
    import urllib.request

    ## call link and read it, closing the connection afterwards
    with urllib.request.urlopen(url) as response:
        opened_page = response.read()

    ## parse page and keep only anchors that carry an href
    parsed_page = BeautifulSoup(opened_page, "lxml")
    all_links = [a['href'] for a in parsed_page.find_all('a', href=True)]

    ## if there's a prefix to subset by, apply it
    if prefix is None:
        return all_links

    return [link for link in all_links if link.startswith(prefix)]
    
## for each case, create dataframe
def get_metadata_andtext(parsed_page):
    """Build a one-row dataframe of case metadata plus the case text.

    Parameters: parsed_page -- parsed HTML page exposing ``find_all`` and
    ``find`` (a BeautifulSoup object in this notebook).

    Returns: DataFrame with index [0] and columns docket, decided,
    caption, judge, summary, full_text. A field whose <meta> tag is absent
    is None; full_text is the text of the first <p>, or None if there is
    no <p>.

    Each value is stored under the tag's own name. Pairing the contents
    with the field list by position mislabelled fields whenever a tag was
    missing or the tags appeared in a different order.
    """
    what_to_extract = ['docket', 'decided', 'caption', 'judge', 'summary']

    ## start with every wanted field present so the columns are always the same
    case_dict = dict.fromkeys(what_to_extract)
    for tag in parsed_page.find_all('meta', attrs = {'name': True}):
        field = tag['name']
        ## keep the first occurrence if a name is repeated
        if field in case_dict and case_dict[field] is None:
            case_dict[field] = tag.get('content')

    first_paragraph = parsed_page.find('p')
    case_dict['full_text'] = (first_paragraph.getText()
                              if first_paragraph is not None else None)

    return pd.DataFrame(case_dict, index = [0])
    
def split_caption(data, nameof_captioncol, delimiter = 'v.'):
    """Split each caption once into a plaintiff part and a defendant part.

    Parameters:
    data              -- dataframe holding the captions
    nameof_captioncol -- name of the caption column
    delimiter         -- pattern to split on, at most once per caption

    Returns: (plaintiff, defendant), two lists aligned with the rows of
    ``data``. 'Bad split' marks a missing caption (both lists) or a
    caption without the delimiter (defendant only).

    NOTE(review): pandas treats a multi-character pattern such as 'v.' as
    a regular expression by default, so the '.' matches any character --
    confirm whether a literal match is wanted.
    """
    ## n is passed by keyword: recent pandas no longer accepts it positionally
    caption_split = data[nameof_captioncol].str.split(delimiter, n = 1).tolist()
    plaintiff = [item[0] if isinstance(item, list) else 'Bad split' for item in caption_split]
    defendant = [item[1] if isinstance(item, list) and len(item) > 1 else 'Bad split' for item in caption_split]
    return(plaintiff, defendant)

def extract_clean_defendants(row):
    """Return up to three cleaned district names from a row's defendant text.

    Always returns a 3-tuple, padded with None.

    * ``row.count_districts_defendant == 0``: the defendant text with its
      whitespace collapsed, then None, None.
    * otherwise: for every space-separated token containing "BOARD", the
      (up to) five tokens before it plus the token itself are kept, the
      tokens 'EDUCATION,', 'AND', 'EDUCATION', '' and ',' are dropped, and
      "OF EDUCATION" is appended. The first three names found are returned.
    """
    if row.count_districts_defendant == 0:
        return ' '.join(row.defendant.split()), None, None

    tokens = row.defendant.split(" ")
    dropped_tokens = ('EDUCATION,', "AND", "EDUCATION", "", ",")

    districts = []
    for board_idx in occurrences("BOARD", tokens):
        ## window of up to five tokens before BOARD, clipped at the start of the text
        window = tokens[max(board_idx - 5, 0):board_idx + 1]
        kept = [token for token in window if token not in dropped_tokens]
        districts.append(' '.join(" ".join(kept + ["OF EDUCATION"]).split()))

    ## pad to three slots; anything beyond the third district is ignored
    first, second, third = (districts + [None, None, None])[:3]
    return first, second, third
        
def extract_clean_plaintiffs(row):
    """Return a one-element list naming the plaintiff of a row.

    * ``row.count_districts_plaintiff == 0``: ``['parent']``.
    * otherwise: the first district found in the plaintiff text, built
      from the (up to) five tokens before the first token containing
      "BOARD" plus that token, with the tokens 'EDUCATION,', 'AND',
      'EDUCATION', '' and ',' dropped and "OF EDUCATION" appended.
    * ``[None]`` when no token contains "BOARD". This path used to fall
      off the end of the function and return a bare None, which breaks
      callers that index the result.
    """
    if row.count_districts_plaintiff == 0:
        return ['parent']

    tokens = row.plaintiff.split(" ")
    dropped_tokens = ('EDUCATION,', "AND", "EDUCATION", "", ",")

    for board_idx, token in enumerate(tokens):
        if "BOARD" not in token:
            continue

        ## window of up to five tokens before BOARD, clipped at the start of the text
        window = tokens[max(board_idx - 5, 0):board_idx + 1]
        kept = [one_token for one_token in window if one_token not in dropped_tokens]
        district = ' '.join(" ".join(kept + ["OF EDUCATION"]).split())

        ## only the first district is used for plaintiffs
        return [district]

    return [None]
            




## Step one: read in data from scraping/cleaning step

In [15]:
## load the case-level dataframe written by the scraping/cleaning step
all_cases_df = pd.read_pickle('/Users/raj2/Dropbox/dph_hearing_decisions/data/newjersey/intermediate/all_cases_df_1026.p')

In [16]:
## frequency of each raw defendant string (including the 'Bad split' marker)
all_cases_df.defendant.value_counts()

Bad split                                                                                             22
  WASHINGTON TOWNSHIPBOARD OF EDUCATION,                                                               5
  JERSEY CITY BOARD OF EDUCATION                                                                       4
  WILLINGBORO TOWNSHIP BOARD OF  EDUCATION AND BURLINGTON COUNTY  SPECIAL SERVICES SCHOOL DISTRICT     4
  EAST ORANGE BOARD OF EDUCATION                                                                       4
 MIDDLETOWN TOWNSHIP  BOARD OF EDUCATION                                                               4
  PAULSBORO BOARD OF EDUCATION ANDJERSEY CITY BOARD OF EDUCATION,                                      3
  J.T.,                                                                                                3
  MILLBURN TOWNSHIP BOARD OF EDUCATION                                                                 3
  South Brunswick Board of Education                   

## Step two: cleaning

### 2.1: clean bad caption splits

In [17]:
## for non-NA caption
pd.set_option('display.max_colwidth', -1)
all_cases_df['why_bad_split'] = np.where(all_cases_df.caption.isnull(), 'no_caption', 
                                np.where((~all_cases_df.caption.isnull()) & 
                                        (all_cases_df.defendant == 'Bad split'), 'caption_badsplit',
                                        'caption'))

## for ones that have a caption and bad split, manually code
all_cases_df_goodsplit = all_cases_df.loc[all_cases_df.why_bad_split == 'caption'].copy()
all_cases_df_havecaption_badsplit = all_cases_df.loc[all_cases_df.why_bad_split == 'caption_badsplit'].copy()
all_cases_df_havecaption_badsplit.defendant[all_cases_df_havecaption_badsplit.docket == 'eds02189-02_1']  = 'TEWKSBURY TOWNSHIP BOARD OF EDUCATION'
all_cases_df_havecaption_badsplit.plaintiff[all_cases_df_havecaption_badsplit.docket == 'eds02189-02_1']  == 'G.R. O/B/O L.R.'
all_cases_df_havecaption_badsplit.plaintiff[all_cases_df_havecaption_badsplit.docket == 'eds03684-04_1']  = 'UNION TOWNSHIP BOARD OF EDUCATION'
all_cases_df_havecaption_badsplit.defendant[all_cases_df_havecaption_badsplit.docket == 'eds03684-04_1']  == 'L. J. O/B/O N. M.'
all_cases_df_havecaption_badsplit.why_bad_split[all_cases_df_havecaption_badsplit.docket == 'eds05088-05_1']  == 'no_caption'
all_cases_df_havecaption_badsplit.defendant[all_cases_df_havecaption_badsplit.docket == 'eds08699-03_1']  = 'NORTH PLAINFIELD BOARD OF EDUCATION'
all_cases_df_havecaption_badsplit.plaintiff[all_cases_df_havecaption_badsplit.docket == 'eds08699-03_1']  = 'Unknown O/B/O'
all_cases_df_havecaption_badsplit.why_bad_split[all_cases_df_havecaption_badsplit.docket == 'eds1528-01_1']  = 'not_IDEA_case'
all_cases_df_havecaption_badsplit.plaintiff[all_cases_df_havecaption_badsplit.docket == 'eds5858-97']  = 'STILLWATER BOARD OF EDUCATION'
all_cases_df_havecaption_badsplit.defendant[all_cases_df_havecaption_badsplit.docket == 'eds5858-97']  = 'Unknown O/B/O'
all_cases_df_havecaption_badsplit.plaintiff[all_cases_df_havecaption_badsplit.docket == 'eds5858-97']  = 'STILLWATER BOARD OF EDUCATION'
all_cases_df_havecaption_badsplit.plaintiff[all_cases_df_havecaption_badsplit.docket == 'eds10747-14_1']  = 'R.S. AND K.S. ON BEHALF OF J.S.'
all_cases_df_havecaption_badsplit.defendant[all_cases_df_havecaption_badsplit.docket == 'eds10747-14_1']  = 'MANALAPAN-ENGLISHTOWN REGIONAL BOARD OF EDUCATION'

## re-merge
all_cases_df_havecaption_badsplit_tobind = all_cases_df_havecaption_badsplit.loc[~all_cases_df_havecaption_badsplit.why_bad_split.isin(['no_caption',
                                                                                                        'not_IDEA_case'])]

## set of cases
all_cases_df_analytic = pd.concat([all_cases_df_goodsplit, 
                                  all_cases_df_havecaption_badsplit_tobind],
                                 axis = 0)



0    False
Name: plaintiff, dtype: bool

0    False
Name: defendant, dtype: bool

0    False
Name: why_bad_split, dtype: bool

### 2.2 clean multiple defendants 


In [21]:
## make everything all caps and then check if has multiple board of education
all_cases_df_analytic.plaintiff = all_cases_df_analytic.plaintiff.str.upper()
all_cases_df_analytic.defendant = all_cases_df_analytic.defendant.str.upper()
all_cases_df_analytic['count_districts_plaintiff'] = all_cases_df_analytic.plaintiff.str.count('BOARD(\\s)?(OF)?(\\s)?EDUCATION')
all_cases_df_analytic['count_districts_defendant'] = all_cases_df_analytic.defendant.str.count('BOARD(\\s)?(OF)?(\\s)?EDUCATION')

cleaned_defendants = all_cases_df_analytic.apply(extract_clean_defendants, 
                                                                    axis = 1)
cleaned_plaintiffs = all_cases_df_analytic.apply(extract_clean_plaintiffs, 
                                                                    axis = 1)
cleaned_defendants_list = cleaned_defendants.tolist()
cleaned_plaintiffs_list = cleaned_plaintiffs.tolist()

In [22]:
## unpack the per-row results into one column per defendant slot, plus the plaintiff
all_cases_df_analytic['clean_defendant_1'] = [i[0] for i in cleaned_defendants_list]
all_cases_df_analytic['clean_defendant_2'] = [i[1] for i in cleaned_defendants_list]
all_cases_df_analytic['clean_defendant_3'] = [i[2] for i in cleaned_defendants_list]
all_cases_df_analytic['clean_plaintiff_1'] = [i[0] for i in cleaned_plaintiffs_list]


### 2.3: merge defendants and case captions with complaint metadata based on caption and case



In [23]:
## what to load: 
## 1. raw captions + 
## 2. cleaned data
case_metadata_raw = pd.read_csv('/Users/raj2/Dropbox/dph_hearing_decisions/data/newjersey/cleaned/cleaned_casecaptions_20181211.csv')
district_dem_raw =  pd.read_csv('/Users/raj2/Dropbox/dph_hearing_decisions/data/newjersey/cleaned/dist_analytic_4spatialproj.csv')
district_dem_raw[['nces_name', 'doe_name', 'AgencyName']].head()

## look at intersection with case metadata
case_metadata_names = set(case_metadata_raw.match_in_df.unique())
nces_names = set(district_dem_raw.nces_name.unique())
district_universe = list(set(case_metadata_names).union(nces_names))


Unnamed: 0,nces_name,doe_name,AgencyName
0,SCH DIST OF THE CHATHAMS,SCH DIST OF THE CHATHAMS,THE SCHOOL DISTRICT OF THE CHATHAMS
1,GREAT MEADOWS REGIONAL,GREAT MEADOWS REGIONAL,GREAT MEADOWS REGIONAL SCHOOL DISTRICT
2,SOMERSET HILLS REGIONAL,SOMERSET HILLS REGIONAL,SOMERSET HILLS REGIONAL SCHOOL DISTRICT
3,OCEANSIDE CHARTER SCHOOL,OCEANSIDE CS,OCEANSIDE CHARTER SCHOOL
4,PLEASANTECH ACADEMY CHART,PLEASANTECH ACADEMY CS,PLEASANTECH ACADEMY CHARTER


In [24]:
## rename the metadata columns positionally with a metadata_ prefix
## (requires the file to have exactly these nine columns; order is assumed -- verify against the csv)
case_metadata_raw.columns = ['metadata_docket_noprefix', 'metadata_caption', 'metadata_status', 'metadata_decision', 
                             'metadata_springschoolyear_opened',
                            'metadata_springschoolyear_closed', 
                            'metadata_district_name', 
                            'metadata_LEAID', 
                            'metadata_parentdefendant']



In [25]:
## keep only the identifying columns and the cleaned parties needed for matching
all_cases_df_analytic_tomatch = all_cases_df_analytic[['caption', 'summary', 'docket',
                                                       'count_districts_defendant',
                                                      'clean_defendant_1', 
                                                      'clean_defendant_2',
                                                      'clean_defendant_3',
                                                      'clean_plaintiff_1']].copy()

## 931 to match

In [26]:
## different approach: treat metadata district names as universe of districts and do partial match
### use different universe: union of case_metadata districts and districts in analytic sample

## for remaining non-parent defendants, find closest match
## strip "BOARD OF EDUCATION" / "SCHOOL DISTRICT" and trailing whitespace so that
## only the place name is compared against the district universe
dist_pattern_toremove = "BOARD(OF)?(\\s+)?(OF)?(\\s+)?EDUCATION(\\s+)?|SCHOOL(\\s+)?DISTRICT(\\s+)?|\\s+$"
## regex=True is required: without it recent pandas treats the pattern as a literal string
all_cases_df_analytic_tomatch['clean_defendant_1_formatch'] = all_cases_df_analytic_tomatch.clean_defendant_1.str.replace(dist_pattern_toremove, "", regex = True)
all_cases_df_analytic_tomatch['clean_defendant_2_formatch'] = all_cases_df_analytic_tomatch.clean_defendant_2.str.replace(dist_pattern_toremove, "", regex = True)
all_cases_df_analytic_tomatch['clean_defendant_3_formatch'] = all_cases_df_analytic_tomatch.clean_defendant_3.str.replace(dist_pattern_toremove, "", regex = True)
all_cases_df_analytic_tomatch['clean_plaintiff_formatch'] = all_cases_df_analytic_tomatch.clean_plaintiff_1.str.replace(dist_pattern_toremove, "", regex = True)




In [27]:
## shorter handles for the four name columns that feed the fuzzy matching below
districts_fromcases_clean = all_cases_df_analytic_tomatch.clean_defendant_1_formatch
districts_fromcases_clean_def2 = all_cases_df_analytic_tomatch.clean_defendant_2_formatch
districts_fromcases_clean_def3 = all_cases_df_analytic_tomatch.clean_defendant_3_formatch
districts_fromcases_clean_plaint = all_cases_df_analytic_tomatch.clean_plaintiff_formatch


In [28]:
## find closest matches
district_match = [process.extractBests(x, district_universe, 
                scorer = fuzz.partial_token_sort_ratio, score_cutoff = 30) for x in districts_fromcases_clean]





In [29]:
## second defendant: match only where a district was extracted, else use a placeholder.
## pd.notnull is used instead of `is not None` because a missing value may
## surface as NaN rather than None after the .str.replace step
district_match_def2 = [process.extractBests(x, district_universe, 
                scorer = fuzz.partial_token_sort_ratio, score_cutoff = 30)
                if pd.notnull(x) 
                else [('None', 'None')]
                for x in districts_fromcases_clean_def2]


In [30]:
## third defendant: match only where a district was extracted, else use a placeholder.
## pd.notnull is used instead of `is not None` because a missing value may
## surface as NaN rather than None after the .str.replace step
district_match_def3 = [process.extractBests(x, district_universe, 
                scorer = fuzz.partial_token_sort_ratio, score_cutoff = 30) 
                if pd.notnull(x) 
                else [('None', 'None')]
                for x in districts_fromcases_clean_def3]

In [31]:
## plaintiff: parents are not districts, so they get a placeholder instead of a fuzzy match.
## The comparison must be by value (!=): `is not "parent"` tests object identity
## and is not a reliable equality check for strings
district_match_plaintiff = [process.extractBests(x, district_universe, 
                scorer = fuzz.partial_token_sort_ratio, score_cutoff = 30) 
                if x != "parent" 
                else [('parent', 'parent')] 
                for x in districts_fromcases_clean_plaint]




In [32]:
## pull the matched name out of each (name, score) pair; each rank is guarded by
## the number of candidates actually returned, so a short result list gives None
## rather than an IndexError
first_match = [item[0][0] if len(item) > 0 else None for item in district_match]
second_match = [item[1][0] if len(item) > 1 else None for item in district_match]
third_match = [item[2][0] if len(item) > 2 else None for item in district_match]
first_match_def2 = [item[0][0] if len(item) > 0 else None for item in district_match_def2]
first_match_def3 = [item[0][0] if len(item) > 0 else None for item in district_match_def3]
first_match_plaintiff = [item[0][0] if len(item) > 0 else None for item in district_match_plaintiff]


## bind into a dataframe
districts_withmatches = pd.DataFrame({'original_district' : districts_fromcases_clean,
 'first_match' : first_match,
 'second_match': second_match,
'third_match': third_match,
'first_match_def2': first_match_def2,
'first_match_def3': first_match_def3,
'first_match_plaintiff': first_match_plaintiff,                           
  })


## drop exact duplicate rows before merging back onto the cases
districts_withmatches_tomerge = districts_withmatches.drop_duplicates()


## write to intermediate file to check


In [33]:
## merge with original
districts_withmatches_tocheck_moreinfo = pd.merge(districts_withmatches_tomerge[['original_district', 
                                                                                'first_match',
                                                                                'second_match',
                                                                                'third_match',
                                                                                'first_match_def2',
                                                                                'first_match_def3',
                                                                                'first_match_plaintiff']],
                                                 all_cases_df_analytic_tomatch[['caption',
                                                                                'docket',
                                                                    'clean_defendant_1_formatch',
                                                                    'clean_defendant_1',
                                                                    'clean_defendant_2_formatch',
                                                                    'clean_defendant_3_formatch',
                                                                    'clean_plaintiff_formatch']],
                                                 left_on = 'original_district',
                                                 right_on = 'clean_defendant_1_formatch',
                                                 how = 'inner')


districts_withmatches_tocheck_moreinfo.shape


(965, 14)

#### 2.3.1: clean the matches for the primary district defendant


In [34]:
## #districts_withmatches_tocheck_moreinfo.to_csv('/Users/raj2/Dropbox/dph_hearing_decisions/data/newjersey/intermediate/hearingmatches_tocheck.csv')


## read in
## read the edited version of the matches file (the to_csv that wrote the original is commented out above)
cleaned_matches = pd.read_csv('/Users/raj2/Dropbox/dph_hearing_decisions/data/newjersey/intermediate/hearingmatches_tocheck_rjedits20181212.csv',
                            encoding='latin1')
cleaned_matches.shape
## best_match selects which candidate to keep: 1 -> first, 2 -> second, anything else -> third
cleaned_matches['final_defendant_1'] = np.where(cleaned_matches.best_match == 1,
                                            cleaned_matches.first_match,
                                    np.where(cleaned_matches.best_match == 2, 
                                            cleaned_matches.second_match,
                                            cleaned_matches.third_match))

## keep only the keys and the final primary defendant for merging
cleaned_matches_tomerge = cleaned_matches[['docket', 'caption', 'final_defendant_1']].copy()

## next steps:
## 1. code to best match (np.where etc)
## 2. merge with case text on the basis of docket number


(635, 10)

#### 2.3.2: clean the matches for the other districts

In [35]:
## next steps
## 1. subset to ones with non-empty extra defendants and non parent plaintiffs
## rows with a second/third defendant or a non-parent plaintiff.
## .copy() makes this an independent frame, so the column assignments below
## do not act on a slice of the merged table (SettingWithCopyWarning)
districts_withmatches_tocheck_additional = districts_withmatches_tocheck_moreinfo.loc[(districts_withmatches_tocheck_moreinfo.first_match_def2 != 'None') |
                                                        (districts_withmatches_tocheck_moreinfo.first_match_def3 != 'None') |
                                                    (districts_withmatches_tocheck_moreinfo.clean_plaintiff_formatch != 'parent'),
                                                    ['caption', 'clean_defendant_2_formatch',
                                                     'first_match_def2',
                                                        'clean_defendant_3_formatch',
                                                    'first_match_def3',
                                                        'clean_plaintiff_formatch',
                                                    'first_match_plaintiff',
                                                    'docket']].copy()
## start the "final" columns from the top fuzzy match; they are then edited by hand in the csv
districts_withmatches_tocheck_additional['final_defendant_2'] = districts_withmatches_tocheck_additional['first_match_def2']
districts_withmatches_tocheck_additional['final_defendant_3'] = districts_withmatches_tocheck_additional['first_match_def3']
districts_withmatches_tocheck_additional['final_plaintiff'] = districts_withmatches_tocheck_additional['first_match_plaintiff']

#districts_withmatches_tocheck_additional.to_csv('/Users/raj2/Dropbox/dph_hearing_decisions/data/newjersey/intermediate/additionalmatches_tocheck_rjedits20181212.csv',
 #                                              index = False)

## read in manually checked districts
cleaned_matches_more = pd.read_csv('/Users/raj2/Dropbox/dph_hearing_decisions/data/newjersey/intermediate/additionalmatches_tocheck_rjedits20181212.csv',
                            encoding='latin1')


In [36]:
## keep the identifying columns plus the hand-checked party columns,
## then drop exact duplicate rows
cleaned_matches_more_tomerge = cleaned_matches_more.loc[:, ['docket', 'caption',
                                                            'final_defendant_2', 'final_defendant_3',
                                                            'final_plaintiff']]
cleaned_matches_more_tomerge = cleaned_matches_more_tomerge.drop_duplicates()

cleaned_matches_more_tomerge.head()

## next steps: subset to final defendant_2, 3, and plaintiff
## and then merge with the other cleaned ones after coding

Unnamed: 0,docket,caption,final_defendant_2,final_defendant_3,final_plaintiff
0,eds00021-05_1,"B.G. AND M.J. O/B/O D.G ., v. SOMERSET HILLS REGIONAL BOARD OF EDUCATION AND BEDMINSTER TOWNSHIP BOARD OF EDUCATION",BEDMINSTER TOWNSHIP,,parent
1,eds00727-04_1,"R.K. O/B/O S.K., v. SOMERSET HILLS REGIONAL SCHOOL DISTRICT",,,parent
2,eds00250-04_1,"WASHINGTON TOWNSHIP BOARD OF EDUCATION, v. H.B. ON BEHALF OF H.B.",,,WASHINGTON TOWNSHIP
3,eds02036-05_1,"WASHINGTON TOWNSHIP BOARD OF EDUCATION, v. H.B. ON BEHALF OF H.B.",,,WASHINGTON TOWNSHIP
4,eds00592-04_1,"OCEAN TOWNSHIP BOARD OF EDUCATION, v. J. E. AND T.B. ON BEHALF OF J. E.",,,OCEAN TOWNSHIP


#### 2.3.3: merge

In [37]:
## dockets present in each cleaned table, and the ones shared by both
docket_onedefendant = cleaned_matches_tomerge.docket.tolist()
docket_multipledefendants = cleaned_matches_more_tomerge.docket.tolist()
docket_inboth = list(set(docket_onedefendant) & set(docket_multipledefendants))

## row masks; docket eds00727-04_1 is always routed with the "in both" group
more_in_both = (cleaned_matches_more_tomerge.docket.isin(docket_inboth) |
                (cleaned_matches_more_tomerge.docket == 'eds00727-04_1'))
one_in_both = cleaned_matches_tomerge.docket.isin(docket_inboth)

## ones with district as plaintiff
cleaned_matches_more_notinboth = (cleaned_matches_more_tomerge.loc[~more_in_both]
                                  .copy()
                                  .assign(final_defendant_1 = 'parent'))

## ones with district as defendant but multiple districts
cleaned_matches_more_inboth = cleaned_matches_more_tomerge.loc[more_in_both].copy()

cleaned_matches_tomerge_inboth = cleaned_matches_tomerge[one_in_both].copy()

## merge the ones that are in both (pandas' default inner join on docket)
cleaned_matches_districtdefendants_torbind = cleaned_matches_more_inboth.merge(
    cleaned_matches_tomerge_inboth[['docket', 'final_defendant_1']],
    on = 'docket')

## for the ones that are not in both, add extra columns
cleaned_matches_tomerge_notinboth = (cleaned_matches_tomerge[~one_in_both]
                                     .copy()
                                     .assign(final_defendant_2 = 'None',
                                             final_defendant_3 = 'None',
                                             final_plaintiff = 'parent'))

## display the size of each piece before stacking
cleaned_matches_more_notinboth.shape
cleaned_matches_tomerge_notinboth.shape
cleaned_matches_districtdefendants_torbind.shape

## stack the three pieces and drop exact duplicate rows
pieces_to_stack = [cleaned_matches_more_notinboth,
                   cleaned_matches_tomerge_notinboth,
                   cleaned_matches_districtdefendants_torbind]
cleaned_matches_init = pd.concat(pieces_to_stack).drop_duplicates()
cleaned_matches_init.head()


(195, 6)

(607, 6)

(28, 6)

Unnamed: 0,caption,docket,final_defendant_1,final_defendant_2,final_defendant_3,final_plaintiff
2,"WASHINGTON TOWNSHIP BOARD OF EDUCATION, v. H.B. ON BEHALF OF H.B.",eds00250-04_1,parent,,,WASHINGTON TOWNSHIP
3,"WASHINGTON TOWNSHIP BOARD OF EDUCATION, v. H.B. ON BEHALF OF H.B.",eds02036-05_1,parent,,,WASHINGTON TOWNSHIP
4,"OCEAN TOWNSHIP BOARD OF EDUCATION, v. J. E. AND T.B. ON BEHALF OF J. E.",eds00592-04_1,parent,,,OCEAN TOWNSHIP
5,GLOUCESTER CITY BOARD OF EDUCATION v. D.H. O/B/O J.H.,eds00699-03_1,parent,,,GLOUCESTER CITY
6,GLOUCESTER CITY BOARD OF EDUCATION v. D.H. and T.H. o/b/o D.H.,eds00724-03_1,parent,,,GLOUCESTER CITY


In [38]:
## see which ids are missing from final dataset
missing_docket = set(all_cases_df_analytic.docket.tolist()).difference(cleaned_matches_init.docket.tolist())
missing_df = all_cases_df_analytic[all_cases_df_analytic.docket.isin(missing_docket)].copy()

## deal with those and the 100-ish still missing
## match
## flag plaintiffs whose text mentions a board / township / district;
## na=False so a missing plaintiff is coded 0 (without it, str.contains returns
## NaN for missing values and np.where treats NaN as truthy, i.e. codes it 1)
missing_df['count_districts_plaintiff'] = np.where(
    missing_df.plaintiff.str.contains("BOARD|TOWNSHIP|DISTRICT", na = False), 1, 0)
cleaned_plaintiffs = missing_df.apply(extract_clean_plaintiffs, axis = 1)
cleaned_plaintiffs_list = cleaned_plaintiffs.tolist()
## rows where the extractor returned None are treated as a parent plaintiff
cleaned_plaintiffs_list_updated = [['parent'] if x is None else x for x in cleaned_plaintiffs_list]
## keep the first extracted plaintiff for each row
missing_df['clean_plaintiff_1'] = [i[0] for i in cleaned_plaintiffs_list_updated]
#missing_df[['caption', 'plaintiff', 'defendant', 'clean_defendant_1', 
 #           'clean_plaintiff_1', 'count_districts_plaintiff']]

## fuzzy matching
## regex=True is spelled out because the default of Series.str.replace changed
## from regex to literal matching in pandas 2.0; the pattern must keep being
## applied as a regular expression
missing_df['clean_defendant_1_formatch'] = missing_df.clean_defendant_1.str.replace(dist_pattern_toremove, "", regex = True)
missing_df['clean_plaintiff_formatch'] = missing_df.clean_plaintiff_1.str.replace(dist_pattern_toremove, "", regex = True)

## candidate districts for each cleaned defendant string (score cutoff 30)
missing_district_match = [process.extractBests(x, district_universe, 
                scorer = fuzz.partial_token_sort_ratio, score_cutoff = 30) for x in missing_df.clean_defendant_1_formatch.tolist()]



In [39]:
## candidate districts for each cleaned plaintiff string; plaintiffs already
## coded as 'parent' skip fuzzy matching and get a placeholder match instead.
## The comparison uses != (value equality): the previous `is not "parent"`
## compared object identity, which is not reliable for strings pulled out of a
## DataFrame, so 'parent' rows were being fuzzy-matched against districts.
missing_plaintiff_match = [process.extractBests(x, district_universe, 
                scorer = fuzz.partial_token_sort_ratio, score_cutoff = 30) 
                if x != "parent" 
                else [('parent', 'parent')] 
                for x in missing_df.clean_plaintiff_formatch]

## top-ranked candidate name for each row, or None when nothing cleared the cutoff
first_match = [item[0][0] if item else None for item in missing_district_match]
first_plaintiff_match = [item[0][0] if item else None for item in missing_plaintiff_match]



In [40]:
## attach the top fuzzy match for each side of the case
missing_df['preliminary_plaintiff'] = first_plaintiff_match
missing_df['preliminary_defendant'] = first_match

## whenever the plaintiff is anything other than 'parent', record the
## defendant as 'parent'; otherwise keep the matched defendant
plaintiff_not_parent = missing_df['preliminary_plaintiff'] != "parent"
missing_df['preliminary_defendant'] = np.where(plaintiff_not_parent,
                                               "parent",
                                               missing_df['preliminary_defendant'])

## columns exported for manual checking
missing_df_tomerge = missing_df.loc[:, ['caption', 'docket',
                                        'preliminary_plaintiff', 'preliminary_defendant']].copy()
#missing_df_tomerge.to_csv('/Users/raj2/Dropbox/dph_hearing_decisions/data/newjersey/intermediate/missingmatches_tocheck.csv',
 #                        index = False)

## read the manually edited file back in and relabel its columns
missing_df_clean = pd.read_csv('/Users/raj2/Dropbox/dph_hearing_decisions/data/newjersey/intermediate/missingmatches_tocheck_rjedits20181212.csv')
missing_df_clean.columns = ['caption', 'docket',
                            'final_plaintiff',
                            'final_defendant_1',
                            'final_defendant_2',
                            'final_defendant_3']
missing_df_clean.head()

Unnamed: 0,caption,docket,final_plaintiff,final_defendant_1,final_defendant_2,final_defendant_3
0,"D.C. AND J.P. O/B/O K.C., v. LAWRENCE TOWNSHIP BOARD OF EDUCATION",eds00050-04_1,parent,LAWRENCE TOWNSHIP,,
1,"D.B. AND M.B. O/B/O C.B., v. BERNARDS TOWNSHIP BOARD OF EDUCATION",eds00412-06_1,parent,BERNARDS TOWNSHIP,,
2,"L.U. O/B/O A.P., v. TOWNSHIP OF PEMBERTON BOARD OF EDUCATION",eds00566-05_1,parent,PEMBERTON TOWNSHIP,,
3,"L.U. O/B/O A.P., v. TOWNSHIP OF PEMBERTON BOARD OF EDUCATION",eds00566-05_3,parent,PEMBERTON TOWNSHIP,,
4,"R.K. O/B/O S.K., v. SOMERSET HILLS REGIONAL SCHOOL DISTRICT",eds00727-04_1,parent,SOMERSET HILLS REGIONAL,,


In [41]:
## stack the manually corrected missing cases with the rest of the cleaned captions
all_cleaned_captions = pd.concat([missing_df_clean, cleaned_matches_init])

## preview before attaching the case-level information
all_cleaned_captions.head()

## attach text, decision date, and summary by docket; the left join keeps
## every caption row even when no case information is found
otherinfo_tomerge = all_cases_df_analytic[['full_text', 'decided', 'summary', 'docket']]
all_cleaned_df = all_cleaned_captions.merge(otherinfo_tomerge, how = 'left', on = 'docket')

## save as pickle
import pickle
all_cleaned_df.to_pickle("/Users/raj2/Dropbox/dph_hearing_decisions/data/newjersey/intermediate/cleaned_case_meta.p")


Unnamed: 0,caption,docket,final_defendant_1,final_defendant_2,final_defendant_3,final_plaintiff
0,"D.C. AND J.P. O/B/O K.C., v. LAWRENCE TOWNSHIP BOARD OF EDUCATION",eds00050-04_1,LAWRENCE TOWNSHIP,,,parent
1,"D.B. AND M.B. O/B/O C.B., v. BERNARDS TOWNSHIP BOARD OF EDUCATION",eds00412-06_1,BERNARDS TOWNSHIP,,,parent
2,"L.U. O/B/O A.P., v. TOWNSHIP OF PEMBERTON BOARD OF EDUCATION",eds00566-05_1,PEMBERTON TOWNSHIP,,,parent
3,"L.U. O/B/O A.P., v. TOWNSHIP OF PEMBERTON BOARD OF EDUCATION",eds00566-05_3,PEMBERTON TOWNSHIP,,,parent
4,"R.K. O/B/O S.K., v. SOMERSET HILLS REGIONAL SCHOOL DISTRICT",eds00727-04_1,SOMERSET HILLS REGIONAL,,,parent


#### 2.3.4: clean dates 


In [42]:
## parse the decision date once (year-first), then derive year and month from it
decided_parsed = pd.to_datetime(all_cases_df['decided'], yearfirst = True)
all_cases_df['decided_datetime'] = decided_parsed
all_cases_df['year_decided'] = decided_parsed.dt.year
all_cases_df['month_decided'] = decided_parsed.dt.month



#### 2.3.5 clean text

In [58]:
## load updated text data
## new_fulltext: the 12/18 update file; its docket and full_text columns are used in the merge below
new_fulltext = pd.read_pickle('/Users/raj2/Dropbox/dph_hearing_decisions/data/newjersey/intermediate/all_cases_df_1218_updates.p')
## NOTE: this rebinds all_cases_df to the cleaned_case_meta.p pickle written in section 2.3.3
all_cases_df = pd.read_pickle('/Users/raj2/Dropbox/dph_hearing_decisions/data/newjersey/intermediate/cleaned_case_meta.p')



In [59]:
## swap in the updated text: drop the old full_text column from the case
## metadata, then left-join the new full_text on docket
new_fulltext_and_docket = new_fulltext[['docket', 'full_text']]
select_columns = [name for name in all_cases_df.columns if not name == 'full_text']
all_cases_df_newtext = all_cases_df[select_columns].merge(new_fulltext_and_docket,
                                                          how = "left",
                                                          on = "docket")



In [84]:
## show the notebook's current working directory
os.getcwd()

'/Users/raj2/Dropbox/dph_hearing_decisions/code'

In [85]:
## get number of words in each
all_case_text = all_cases_df_newtext.full_text.tolist()
## full_text comes from a left join, so a docket without updated text can carry
## NaN (a float) here; count any non-string entry as 0 words instead of
## failing on .split()
n_words = [len(text.split()) if isinstance(text, str) else 0 for text in all_case_text]
all_cases_df_newtext['n_words'] = n_words

## write the text to intermediate
all_cases_df_newtext.to_csv("/Users/raj2/Dropbox/dph_hearing_decisions/data/newjersey/intermediate/cases_fulltextdataframe_20181218.csv", 
                            index = False)



## Step three: model and summarize (LDA)

In [None]:
## Step three: topic modeling

In [109]:
## add an indicator for whether it contains the phrase emergent relief
## (case-sensitive match; na=False so a row with missing text is coded 0 --
## otherwise str.contains returns NaN there and np.where treats NaN as truthy)
all_cases_df_newtext['emergent_relief'] = np.where(all_cases_df_newtext.full_text.str.contains('emergent relief', na = False), 
                                                  1, 0)
all_cases_df_newtext.emergent_relief.value_counts()

## look at mean number of words by whether it's emergent relief
mean_words = all_cases_df_newtext[['n_words', 'emergent_relief']].groupby(['emergent_relief']).agg('mean')
mean_words

## build the stop word list used when cleaning text for the topic model
stop_words = set(stopwords.words('english'))
## add punctuation and some application-specific words
## to stopword list
stop_words.update(['.', ',', '"', "'", '?', '!', ':', ';', 
                   '(', ')', '[', ']', '{', '}', 'education','iep', 'student', 'school', 'district', 'special',
                  'parent', 'parents', 'petitioner', 'petitioners', 'petition', 'relief', 'respondent', 'board', 'request',
                  'would']) # 

## stemmer and topic-model libraries used by the LDA cells below
from nltk.stem import PorterStemmer
porter = PorterStemmer()
import warnings
## show each distinct warning only once
warnings.filterwarnings(action='once')
from gensim import corpora
import gensim

0    629
1    320
Name: emergent_relief, dtype: int64

Unnamed: 0_level_0,n_words
emergent_relief,Unnamed: 1_level_1
0,3843.349762
1,2394.03125


  return concat([self.open(f).read() for f in fileids])


{'!',
 '"',
 "'",
 '(',
 ')',
 ',',
 '.',
 ':',
 ';',
 '?',
 '[',
 ']',
 'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'board',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'district',
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'education',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'iep',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',

In [102]:
## function to estimate LDA with certain # of topics
def clean_text_estimate_lda(text_list, n_topics, passes=10, random_state=None):
    """Clean a collection of documents and fit a gensim LDA model on them.

    Each document is tokenized with ``wordpunct_tokenize``, lower-cased,
    filtered, and Porter-stemmed; the stemmed documents are turned into a
    bag-of-words corpus and passed to ``LdaModel``.

    Relies on names defined at notebook level: ``wordpunct_tokenize``,
    ``stop_words``, ``porter``, ``corpora`` and ``gensim``.

    Parameters
    ----------
    text_list : iterable of str
        One raw text per document.
    n_topics : int
        Number of topics to estimate.
    passes : int, optional
        Number of passes over the corpus (default 10, the value that was
        previously hard-coded).
    random_state : optional
        Forwarded to ``LdaModel``; the default ``None`` leaves gensim's own
        default in place. Pass a fixed value for reproducible topics.

    Returns
    -------
    gensim.models.ldamodel.LdaModel
        The fitted model.
    """
    ## clean text
    all_stemmed_text = []
    for one_case_orig in text_list:
        lowered_tokens = [token.lower() for token in wordpunct_tokenize(one_case_orig)]
        ## keep alphabetic tokens longer than 3 characters that are not stop words;
        ## the filter runs BEFORE stemming, so a stem that coincides with a
        ## stop word can still end up in the vocabulary
        kept_tokens = [token for token in lowered_tokens
                       if token not in stop_words and len(token) > 3 and token.isalpha()]
        all_stemmed_text.append([porter.stem(token) for token in kept_tokens])

    ## estimate LDA
    dictionary = corpora.Dictionary(all_stemmed_text)
    corpus = [dictionary.doc2bow(text) for text in all_stemmed_text]
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = n_topics, id2word = dictionary,
                                               passes = passes,
                                               alpha = 'auto',
                                               per_word_topics = True,
                                               random_state = random_state)
    return ldamodel
    

In [110]:
## estimate with:
## > 100 words
## separately for parents and districts
## iterate through number of topics: 5, 10, 15, 30
over_100_words = all_cases_df_newtext.n_words > 100
parent_is_plaintiff = all_cases_df_newtext.final_plaintiff == "parent"

## district_defendant: texts of cases where the plaintiff is 'parent'
## parent_defendant: texts of cases where the plaintiff is anything else
district_defendant = all_cases_df_newtext.full_text[parent_is_plaintiff & over_100_words].tolist()
parent_defendant = all_cases_df_newtext.full_text[~parent_is_plaintiff & over_100_words].tolist()

## run and iterate over all topics
## (the parent model is fit before the district model for each topic count)
n_topics = [5, 10, 15, 30]
store_district_models, store_parent_models = [], []
for n in n_topics:
    one_parent_model = clean_text_estimate_lda(parent_defendant, n_topics = n)
    one_district_model = clean_text_estimate_lda(district_defendant, n_topics = n)
    store_parent_models.append(one_parent_model)
    store_district_models.append(one_district_model)

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)


In [118]:
## export for use in STM in R

LdaModel(num_terms=6251, num_topics=5, decay=0.5, chunksize=2000)
(0, '0.016*"evalu" + 0.010*"child" + 0.008*"order" + 0.008*"placement" + 0.008*"program" + 0.008*"hear" + 0.007*"servic" + 0.007*"decis" + 0.006*"educ" + 0.006*"state"')
(1, '0.016*"behavior" + 0.012*"program" + 0.009*"teacher" + 0.009*"placement" + 0.008*"provid" + 0.008*"appropri" + 0.007*"classroom" + 0.007*"class" + 0.007*"educ" + 0.007*"servic"')
(2, '0.017*"brookfield" + 0.017*"threat" + 0.013*"drug" + 0.010*"placement" + 0.010*"distribut" + 0.010*"staff" + 0.009*"behavior" + 0.008*"alleg" + 0.008*"novemb" + 0.007*"kiernan"')
(3, '0.009*"child" + 0.008*"class" + 0.008*"placement" + 0.007*"need" + 0.007*"teacher" + 0.006*"bassman" + 0.006*"kitler" + 0.006*"student" + 0.006*"classroom" + 0.005*"program"')
(4, '0.011*"evalu" + 0.009*"provid" + 0.009*"educ" + 0.009*"author" + 0.008*"program" + 0.007*"servic" + 0.006*"read" + 0.006*"disabl" + 0.006*"order" + 0.006*"appropri"')
LdaModel(num_terms=6251, num_topics=10, dec

In [107]:
## print the top 10 words of each topic in the model bound to `test`
## NOTE(review): `test` is not assigned in the visible part of this notebook --
## presumably one of the stored LDA models; confirm before re-running
topics = test.print_topics(num_words = 10)
for topic in topics:
    print(topic)
    

(0, '0.008*"jersey" + 0.008*"matter" + 0.008*"decis" + 0.008*"hear" + 0.008*"parent" + 0.007*"placement" + 0.007*"date" + 0.007*"order" + 0.006*"program" + 0.006*"state"')
(1, '0.019*"program" + 0.008*"provid" + 0.008*"class" + 0.007*"year" + 0.007*"behavior" + 0.007*"teacher" + 0.007*"also" + 0.007*"skill" + 0.006*"appropri" + 0.006*"time"')
(2, '0.011*"grade" + 0.009*"year" + 0.008*"read" + 0.007*"class" + 0.007*"teacher" + 0.007*"parent" + 0.007*"evalu" + 0.006*"time" + 0.006*"report" + 0.006*"program"')
(3, '0.015*"provid" + 0.011*"program" + 0.011*"servic" + 0.009*"child" + 0.009*"educ" + 0.008*"placement" + 0.007*"requir" + 0.007*"parent" + 0.007*"state" + 0.007*"appropri"')
(4, '0.009*"date" + 0.008*"state" + 0.008*"provid" + 0.008*"servic" + 0.007*"placement" + 0.007*"educ" + 0.006*"meet" + 0.006*"child" + 0.006*"sinai" + 0.006*"parent"')


In [92]:
## save the fitted model to the current working directory, then print the
## top 20 words of each topic
## NOTE(review): in the visible code `ldamodel` is only assigned inside
## clean_text_estimate_lda; this cell needs a notebook-level `ldamodel`
## (presumably from an earlier run) -- confirm before re-running
ldamodel.save('model_10topics_new1218.gensim')
topics = ldamodel.print_topics(num_words = 20)
for topic in topics:
    print(topic)

(0, '0.010*"program" + 0.009*"placement" + 0.008*"home" + 0.007*"emerg" + 0.007*"date" + 0.006*"provid" + 0.006*"jersey" + 0.006*"matter" + 0.006*"parent" + 0.006*"year" + 0.006*"time" + 0.006*"order" + 0.005*"servic" + 0.005*"state" + 0.005*"child" + 0.005*"decis" + 0.005*"hear" + 0.004*"case" + 0.004*"offic" + 0.004*"parti"')
(1, '0.009*"educ" + 0.008*"placement" + 0.008*"state" + 0.007*"provid" + 0.006*"program" + 0.006*"behavior" + 0.006*"year" + 0.006*"class" + 0.006*"servic" + 0.006*"disabl" + 0.005*"appropri" + 0.005*"child" + 0.005*"grade" + 0.005*"read" + 0.005*"requir" + 0.004*"meet" + 0.004*"student" + 0.004*"need" + 0.004*"also" + 0.004*"evalu"')
(2, '0.012*"evalu" + 0.012*"provid" + 0.010*"child" + 0.010*"program" + 0.010*"parent" + 0.010*"servic" + 0.009*"educ" + 0.008*"placement" + 0.008*"appropri" + 0.007*"year" + 0.007*"disabl" + 0.006*"hear" + 0.006*"decis" + 0.006*"state" + 0.006*"requir" + 0.006*"meet" + 0.005*"need" + 0.005*"determin" + 0.005*"matter" + 0.005*"team

In [93]:
### visualize
## load the gensim adapter for pyLDAvis and turn on inline notebook display
import pyLDAvis.gensim
pyLDAvis.enable_notebook()

  return f(*args, **kwds)
  return f(*args, **kwds)


In [95]:
## build the pyLDAvis data for the model and display it;
## sort_topics=False is passed so pyLDAvis does not reorder the topics
## NOTE(review): `ldamodel`, `corpus` and `dictionary` are only assigned inside
## clean_text_estimate_lda in the visible code; notebook-level copies are
## needed for this cell to run -- confirm where they come from
lda_display = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)


  return matrix(data, dtype=dtype, copy=False)


KeyboardInterrupt: 