In [1]:
import lxml.html, io
import datetime
import re
import utils
import pandas as pd
import numpy as np
import urllib

In [2]:
df_bioguide = pd.read_csv("results/bioguide.csv")

# Functions

## Functions to extract information from individual biographies

In [3]:
def extract_birth_place(string):
    
    """
    Takes in a biography text and outputs 2 strings: birth county and state.
    """
    
    if type(string) is not str:
        
        return None
    
    else:

        # Exception for semi-colons
        string = string.replace("born in Cresskill, Bergen County, N. J.; April", "born April")
        string = string.replace("FOSTER, A. Lawrence, a Representative from New York; September 17, 1802;", "born September 17, 1802")
        string = string.replace("CAO, Anh (Joseph), a Representative from Louisiana; born in Ho Chi Minh City, Vietnam; March 13, 1967", "born March 13, 1967")
        string = string.replace("CRITZ, Mark S., a Representative from Pennsylvania; born in Irwin, Westmoreland County, Pa.; January 5, 1962;", "born January 5, 1962")
        string = string.replace("SCHIFF, Steven Harvey, a Representative from New Mexico; born in Chicago, Ill.; March 18, 1947", "born March 18, 1947")
        string = string.replace('KRATOVIL, Frank, M. Jr., a Representative from Maryland; born in Lanham, Prince George\u2019s County, Md.; May 29, 1968', "born May 29, 1968")

        # Exception for 'born at, born near, born on'
        string = string.replace(' at ', ' in ')
        string = string.replace(' near ', ' in ')
        string = string.replace(' on ', ' in ')

        # Exception for New York City
        string = string.replace('New York City', 'New York City, N.Y.,')

        # Extract pattern based on birthdate
        pattern = r"born [^;]*?((?:January|February|March|April|May|June|July|August|September|October|November|December),? \d{1,2},? \d{4})"
        match = re.search(pattern, string, re.I) 
        if not match or not match.group(1):
          # specifically detect cases that we can't handle to avoid unnecessary warnings
          if re.search("birth dates? unknown|date of birth is unknown", string, re.I): return "UNKNOWN"
          if re.search("born [^;]*?(?:in|about|before )?(?:(?:January|February|March|April|May|June|July|August|September|October|November|December) )?\d{4}", string, re.I): return "UNKNOWN"
          return None
        sentence = match.group(0).strip()

        # Extract county information from sentence
        try:
            state = sentence.split(',')[-3].strip()

            county = sentence.split(',')[0].split(' in ')[-1].strip()
            county = re.sub(r'[^\w\s]','',county)

            # if contains now or formerly...
            if ' now ' in county:
                county = county.split(' now ')[1]
            elif ' formerly ' in county:
                county = county.split(' formerly ')[0]

            # remove words starting with lower case (for example removing 'now part of' in 'now part of New York City, N.Y.')
            county_sentence = county.split(' ')
            for word in county_sentence:
                if word[0].islower():
                    county_sentence.remove(word)
            ## concatenate back the string
            county = ' '.join(county_sentence)

            return county, state  
        
        except Exception as e:
            return None

In [4]:
def extract_lived_places(string):
    
    """
    Takes in a bioguide text and extract the places that the legislator lived preceeded by the phrase 'moved to'
    """
    
    if type(string) is not str:
        
        return None
    
    else:
    
        pattern = "moved to"
        string_list = string.split(';')
        matches = [string.strip() if pattern in string else None for string in string_list]
        matches = list(filter(None.__ne__, matches))
        places = []

        for match in matches:
            # dealing with cases containing the pattern "city, state.,"
            match = match.split('moved to')[1].strip()

            match = match.split('.,')[0].strip()

            match = match.split(' ')

            words = []
            for word in match:
                if not word.islower():
                    words.append(word)
                else:
                    break

            place = ' '.join(words)
            places.append(place)
        return places

In [5]:
def extract_secondary_schools(string, max_schools_allowed=3):
    
    """
    Takes in a piece of biography text and extracts secondary school data
    """

    clean_sentences = []
    public_school = 0
    
    # exception
    if type(string) is not str:
        
        clean_sentences.append(None)
        public_school = 0
        
    else:
    
        string = string.replace(' in ', ', ')
        
        # Exceptions
        patterns = ['graduated', 'attended', 'High School', 'public schools']
        excludes = ['College', 'University', 'Medical School', 'Law School', 'Divinity School', 
                    'Military Academy', 'Army', 'Naval Academy', 'Navy', 'Airforce Academy', 'Airforce', 'Conference',
                    'Institute of Technology', 'Nursing School', 'Military Institute', 'Graduate School', 'School of Mines',
                    'School of Engineering', 'Seminary', 'Polytechnic Institute', 'Universities', 'Universite', 'School of Law',
                    'School of Accounting', 'Finance',]

        string_list = string.split(';')

        # list to collect sentences that contains those patterns
        sentences = []

        # search for sentences in full bio
        for string in string_list:
            if any(x in string for x in patterns) and not any(y in string for y in excludes):
                sentences.append(string.strip())

        # process the string
        ## remove leading 'graduated from' and trailing year and comma
        for sentence in sentences:
            ## save public schools information
            if len(sentence.split('public schools')) > 1:
                public_school = 1

            ## remove leading 'graduated from'
            if 'graduated from' in sentence:
                sentence = sentence.split('graduated from')[1].strip()
            ## remove leading 'graduated'
            elif 'graduated' in sentence:
                sentence = sentence.split('graduated')[1].strip()
            ## remove leading 'attended'
            elif 'attended' in sentence:
                sentence = sentence.split('attended')[1].strip()

            ## remove years (numerics)
            sentence = ''.join([i for i in sentence if not i.isdigit()])

            ## remove trailing white spaces and comma
            sentence = sentence.strip().strip(',')

            ## remove any forward trailing words with lower case characters
            sentence = remove_forward_lower_case_words(sentence)

            ## remove 'public schools'
            sentence = sentence.split('public schools')[0].strip()

            ## split on 'and'
            sentence = sentence.split(' and ')

            for i in range(min(max_schools_allowed, len(sentence))):
                clean_sentences.append(sentence[i])

        # if found none matching sentences, append None
        if len(sentences) == 0:
            clean_sentences.append(None)

    return clean_sentences, public_school

sample = "graduated College of Great Falls 1978, farmer"

In [6]:
def remove_forward_lower_case_words(string):
    
    """
    helper method to remove trailing lower case words, used in extracting secondary school information
    """
    
    words = string.split(' ')
    pattern = []
    for word in words:
        if word.islower():
            pattern.append(word)
        else:
            break
    pattern = ' '.join(pattern)
    
    # if the entire string is lower case
    if string == pattern:
        string = ''
    elif len(pattern) > 0:
        string = ''.join(string.split(pattern)).strip()
    return string

## Functions taking in lists of bios and outputting lists

In [7]:
def main_birth_places(bios):

    """
    Takes in a list of bios and outputs a list of birthplaces and a list of lived places
    """
    
    birth_place_list = []
    birth_place_exception_list = []
    lived_place_list = []
    
    for bio in bios:

        birth_place = extract_birth_place(bio)
        birth_place_list.append(birth_place)
        if not birth_place:
            birth_place_exception_list.append(bio)

        lived_place = extract_lived_places(bio)
        lived_place_list.append(lived_place)

    return birth_place_list, lived_place_list 

In [8]:
def main_secondary_school(bios, max_schools_allowed=3):

    """
    Takes in a list of bios and outputs a list of secondary schools and a list of public school indicators
    """
    
    secondary_schools = []
    public_schools = []
    
    for bio in bios:

        secondary_school, public_school_indicator = extract_secondary_schools(bio, max_schools_allowed)

        if secondary_school[0] is not None and secondary_school[0] != '':
            if len(secondary_school) > 1:
                selected_school = None
                # prioritize the first mention containing 'High School': 
                for i in range(len(secondary_school)):
                    if 'High School' in secondary_school[i]:
                        selected_school = secondary_school[i]
                        break
                secondary_schools.append(selected_school)
            else:
                secondary_schools.append(secondary_school[0])
        else:
            secondary_schools.append(None)

        public_schools.append(public_school_indicator)
    
    return secondary_schools, public_schools

# Applying Functions

In [9]:
bios = df_bioguide["profileText"].to_numpy()
birth_places, lived_places = main_birth_places(bios)
secondary_schools, public_schools = main_secondary_school(bios)

  matches = list(filter(None.__ne__, matches))


## Converting into Dataframes

In [11]:
## put birth place data into dataframe
clean_birth_place_list = []
for birth_place in birth_places:
    clean_birth_place = []
    if birth_place is None or type(birth_place) == str:
        clean_birth_place.append(float('nan'))
        clean_birth_place.append(float('nan'))
    else:
        clean_birth_place = list(birth_place)
    clean_birth_place_list.append(clean_birth_place)
    
colnames = ['city/county', 'state/country']
birth_place_df = pd.DataFrame(clean_birth_place_list, columns = colnames)

In [12]:
birth_place_df.head()

Unnamed: 0,city/county,state/country
0,Fairfield,Iowa
1,Selma,Ala.
2,Omaha,Nebr.
3,Todd County,Ky.
4,Johnstown,Pa.


In [13]:
## put secondary school info into dataframe
copy_secondary_schools = list(secondary_schools)
secondary_schools_new = []
for school in copy_secondary_schools:
    if not school or len(school) == 0:
        secondary_schools_new.append(None)
    else:
        secondary_schools_new.append(school)
        
secondary_schools_df = pd.DataFrame(np.array([secondary_schools_new, public_schools]).T, columns=['secondary_school', 'public'])

In [14]:
secondary_schools_df.head()

Unnamed: 0,secondary_school,public
0,,0
1,"Selma High School, Selma, Ala.",0
2,,1
3,,0
4,,0


In [15]:
SCHOOL_KEYWORDS = ['School', 'Academy', 'Institute']

# Process secondary school list to extract school name, city, state
def split_secondary_school_information(secondary_schools):
    school_names = []
    cities = []
    states = []
    for string in secondary_schools:
        if type(string) is str:
            words = string.split(',')
            # Case: City, State
            if len(words) == 2 and not any(x in words[0] for x in SCHOOL_KEYWORDS):
                cities.append(words[0])
                states.append(words[1])
                school_names.append(None)
            # Case: School, City
            elif len(words) == 2 and any(x in words[0] for x in SCHOOL_KEYWORDS):
                school_names.append(words[0])
                cities.append(words[1])
                states.append(words[1])
            # Case: Only School, City, or State
            elif len(words) == 1:
                # Sub-Case: The only word is school
                if any(x in words[0] for x in SCHOOL_KEYWORDS):
                    school_names.append(words[0])
                    states.append(None)
                    cities.append(None)
                # Sub-Case: The only word is not school (so city or state)
                # TODO: add code to differentiate between City and States, i.e. to detect States
                else:
                    school_names.append(None)
                    states.append(words[0])
                    cities.append(words[0])
            # Easy case: School name, City, State (hopefully)
            else:
                school_names.append(words[0])
                states.append(words[-1])
                cities.append(words[-2])
        else:
            school_names.append(None)
            cities.append(None)
            states.append(None)
#     return pd.DataFrame([school_names, cities, states], columns=['school name', 'school city/county', 'school state/country'])
    school_names = [school.strip() if school else None for school in school_names]
    cities = [city.strip() if city else None for city in cities]
    states = [state.strip() if state else None for state in states]
    return pd.DataFrame(np.vstack([school_names, cities, states]).T, columns=['school name', 'school city/county', 'school state/country'])

secondary_schools_split_df = split_secondary_school_information(secondary_schools_df['secondary_school'])
secondary_schools_split_df['public school'] = secondary_schools_df['public']

In [16]:
colnames = ['city/county', 'state/country']
birth_place_df = pd.DataFrame(clean_birth_place_list, columns = colnames)

In [17]:
for city, state in zip(birth_place_df['city/county'], birth_place_df['state/country']):
    if type(city) == str and city == 'family':
        print(state)

Tenn.
Md.
Va. (now West Virginia)
N.C.
Md.


In [18]:
df_places = pd.concat([birth_place_df, secondary_schools_split_df], axis=1)

In [19]:
df_places

Unnamed: 0,city/county,state/country,school name,school city/county,school state/country,public school
0,Fairfield,Iowa,,,,0
1,Selma,Ala.,Selma High School,Selma,Ala.,0
2,Omaha,Nebr.,,,,1
3,Todd County,Ky.,,,,0
4,Johnstown,Pa.,,,,0
...,...,...,...,...,...,...
12962,Albemarle County,Va.,,,,0
12963,New London,Conn.,,,,0
12964,Brooklyn,N.Y.,,,,0
12965,Columbia County,Ga.,,,,0


In [20]:
# clean lower case in county/city
def clean_trailing_lower_cases(string):
    words = list(string.split(' '))
    for word in string.split(' '):
        if word.islower():
            words.remove(word)
        else:
            break
            
    return ' '.join(words)

    
# clean trailing quotes, commas, periods for lived_places
from string import punctuation

def clean(s):
    return clean_trailing_lower_cases(s).strip(punctuation)
    
clean_county = [clean(s) if type(s) is str else s for s in df_places['city/county']]
clean_states = [clean(s) if type(s) is str else s for s in df_places['state/country']]

df_places['city/county'] = clean_county
df_places['state/country'] = clean_states

In [21]:
df_new = pd.concat([df_bioguide, df_places], axis=1)

In [22]:
df_new.to_csv('results/bioguide_birth_places_schools.csv', index=False)