In [42]:
import lxml.html, io
import datetime
import re
import utils
import pandas as pd
import numpy as np
import urllib

In [43]:
df_bios = pd.read_csv("bioguide.csv").iloc[:100]
# filename = "bioguide.csv"

In [44]:
# copied from old file
# Takes in a bioguide text and extract birth place of the legislator
def extract_birth_place(string):    
    # Exception for semi-colons
    string = string.replace("born in Cresskill, Bergen County, N. J.; April", "born April")
    string = string.replace("FOSTER, A. Lawrence, a Representative from New York; September 17, 1802;", "born September 17, 1802")
    string = string.replace("CAO, Anh (Joseph), a Representative from Louisiana; born in Ho Chi Minh City, Vietnam; March 13, 1967", "born March 13, 1967")
    string = string.replace("CRITZ, Mark S., a Representative from Pennsylvania; born in Irwin, Westmoreland County, Pa.; January 5, 1962;", "born January 5, 1962")
    string = string.replace("SCHIFF, Steven Harvey, a Representative from New Mexico; born in Chicago, Ill.; March 18, 1947", "born March 18, 1947")
    string = string.replace('KRATOVIL, Frank, M. Jr., a Representative from Maryland; born in Lanham, Prince George\u2019s County, Md.; May 29, 1968', "born May 29, 1968")
    
    # Exception for 'born at, born near, born on'
    string = string.replace(' at ', ' in ')
    string = string.replace(' near ', ' in ')
    string = string.replace(' on ', ' in ')
    
    # Exception for New York City
    string = string.replace('New York City', 'New York City, N.Y.,')
    
    # Extract pattern based on birthdate
    pattern = r"born [^;]*?((?:January|February|March|April|May|June|July|August|September|October|November|December),? \d{1,2},? \d{4})"
    match = re.search(pattern, string, re.I) 
    if not match or not match.group(1):
      # specifically detect cases that we can't handle to avoid unnecessary warnings
      if re.search("birth dates? unknown|date of birth is unknown", string, re.I): return "UNKNOWN"
      if re.search("born [^;]*?(?:in|about|before )?(?:(?:January|February|March|April|May|June|July|August|September|October|November|December) )?\d{4}", string, re.I): return "UNKNOWN"
      return None
    sentence = match.group(0).strip()
    
    # Extract county information from sentence
    try:
        state = sentence.split(',')[-3].strip()
    
        county = sentence.split(',')[0].split(' in ')[-1].strip()
        county = re.sub(r'[^\w\s]','',county)

        # if contains now or formerly...
        if ' now ' in county:
            county = county.split(' now ')[1]
        elif ' formerly ' in county:
            county = county.split(' formerly ')[0]
        
        # remove words starting with lower case (for example removing 'now part of' in 'now part of New York City, N.Y.')
        county_sentence = county.split(' ')
        for word in county_sentence:
            if word[0].islower():
                county_sentence.remove(word)
        ## concatenate back the string
        county = ' '.join(county_sentence)
        
        return county, state  
        
    except Exception as e:
        return None

In [45]:
# copied from old file
# Takes in a bioguide text and extract the places that the legislator lived preceeded by the phrase 'moved to'
def extract_lived_places(string):
    pattern = "moved to"
    string_list = string.split(';')
    matches = [string.strip() if pattern in string else None for string in string_list]
    matches = list(filter(None.__ne__, matches))
    places = []
    
    for match in matches:
        # dealing with cases containing the pattern "city, state.,"
        match = match.split('moved to')[1].strip()
        
        match = match.split('.,')[0].strip()
        
        match = match.split(' ')
        
        words = []
        for word in match:
            if not word.islower():
                words.append(word)
            else:
                break
        #########################################################
        
        place = ' '.join(words)
        places.append(place)
    return places

In [46]:
# copied from old file
# Takes in a bioguide text and extracts secondary school data
def extract_secondary_schools(string, max_schools_allowed=3):
    # Exceptions
    string = string.replace(' in ', ', ')
    
    patterns = ['graduated', 'attended', 'High School', 'public schools']
    excludes = ['College', 'University', 'Medical School', 'Law School', 'Divinity School', 
                'Military Academy', 'Army', 'Naval Academy', 'Navy', 'Airforce Academy', 'Airforce', 'Conference',
                'Institute of Technology', 'Nursing School', 'Military Institute', 'Graduate School', 'School of Mines',
                'School of Engineering', 'Seminary', 'Polytechnic Institute', 'Universities', 'Universite', 'School of Law',
                'School of Accounting', 'Finance',]

    string_list = string.split(';')
    
    # list to collect sentences that contains those patterns
    sentences = []
    
    # search for sentences in full bio
    for string in string_list:
        if any(x in string for x in patterns) and not any(y in string for y in excludes):
            sentences.append(string.strip())
    
    # process the string
    clean_sentences = []
    public_school = 0
    ## remove leading 'graduated from' and trailing year and comma
    for sentence in sentences:
        ## save public schools information
        if len(sentence.split('public schools')) > 1:
            public_school = 1
        
        ## remove leading 'graduated from'
        if 'graduated from' in sentence:
            sentence = sentence.split('graduated from')[1].strip()
        ## remove leading 'graduated'
        elif 'graduated' in sentence:
            sentence = sentence.split('graduated')[1].strip()
        ## remove leading 'attended'
        elif 'attended' in sentence:
            sentence = sentence.split('attended')[1].strip()
        
        ## remove years (numerics)
        sentence = ''.join([i for i in sentence if not i.isdigit()])
        
        ## remove trailing white spaces and comma
        sentence = sentence.strip().strip(',')
        
        ## remove any forward trailing words with lower case characters
        sentence = remove_forward_lower_case_words(sentence)
            
        ## remove 'public schools'
        sentence = sentence.split('public schools')[0].strip()
        
        ## split on 'and'
        sentence = sentence.split(' and ')
        
        for i in range(min(max_schools_allowed, len(sentence))):
            clean_sentences.append(sentence[i])
        
    # if found none matching sentences, append None
    if len(sentences) == 0:
        clean_sentences.append(None)

    return clean_sentences, public_school

sample = "graduated College of Great Falls 1978, farmer"

In [47]:
# copied from old file
# helper method to remove trailing lower case words, used in extracting secondary school information
def remove_forward_lower_case_words(string):
    words = string.split(' ')
    pattern = []
    for word in words:
        if word.islower():
            pattern.append(word)
        else:
            break
    pattern = ' '.join(pattern)
    
    # if the entire string is lower case
    if string == pattern:
        string = ''
    elif len(pattern) > 0:
        string = ''.join(string.split(pattern)).strip()
    return string

In [48]:
bios = df_bios["profileText"]
birth_place_list = []
for bio in bios:
    birth_place = extract_birth_place(bio)
    birth_place_list.append(birth_place)
birth_place_list

[('Fairfield', 'Iowa'),
 ('Selma', 'Ala.'),
 ('Omaha', 'Nebr.'),
 ('Todd County', 'Ky.'),
 ('Johnstown', 'Pa.'),
 None,
 ('Mecca Township', 'Ohio'),
 ('Warrenton', 'Mo.'),
 ('Columbus', 'Ohio'),
 ('Greenfield', 'Va.'),
 ('Albuquerque', 'N. Mex.'),
 ('Peoria', 'Ill.'),
 ('Rockingham County', 'Va.'),
 ('McKees Rocks', 'Pa.'),
 ('Georgetown', 'Del.'),
 ('Los Angeles', 'Calif.'),
 ('Brosville', 'Va.'),
 ('East Haddam', 'Conn.'),
 ('New Haven', 'Conn.'),
 ('Hallidays Cove', 'Pa.'),
 ('Charlton', 'N.Y.'),
 ('Petersburg', 'Ind.'),
 ('Worcester', 'Mass.'),
 ('Midland City', 'Ala.'),
 ('Chicago', 'Ill.'),
 ('Colchester', 'N.Y.'),
 None,
 ('Wye Hall', 'Md.'),
 ('Evansville', 'February'),
 ('Lebanon', 'Conn.'),
 ('New Haven', 'Conn.'),
 ('Cedartown', 'Ga.'),
 ('Portsmouth', 'N.H.'),
 ('Shelby', 'N.C.'),
 ('Franklin', 'Tenn.'),
 ('Nelson County', 'Va.'),
 ('Kingstree', 'S.C.'),
 ('Woodland', 'Pa.'),
 ('Point Peninsula', 'N.Y.'),
 ('Harrodsburg', 'Ky.'),
 ('Walnut Hill', 'Pa.'),
 ('Butte', 'Mont.')

In [49]:
def main_birth_places():    
    
#     df_bios = pd.read_csv(filename)
    bios = df_bios["profileText"]
    
    birth_place_list = []
    birth_place_exception_list = []
    lived_place_list = []
    
    for bio in bios:

        bio = bio.strip().replace("\n", " ").replace("\r", " ")

        ####################### Extract county information #######################################################

        birth_place = extract_birth_place(bio)
        birth_place_list.append(birth_place)
        if not birth_place:
            birth_place_exception_list.append(bio)

        lived_place = extract_lived_places(bio)
        lived_place_list.append(lived_place)

    return birth_place_list, lived_place_list 

In [53]:
birth_places, lived_places = main_birth_places()

  matches = list(filter(None.__ne__, matches))


[['Jewell County, Kans'],
 [],
 ['Grand Rapids, Mich'],
 ['Texas'],
 [],
 ['Vallonia, Ind'],
 ['Shreveport, La.'],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 ['Georgetown, Del'],
 [],
 [],
 ['Saginaw, Mich'],
 [],
 ['Ohio', 'Tuscarawas County'],
 ['Cleveland, Ohio,'],
 [],
 ['Worcester'],
 [],
 [],
 ['Green Lake County, Wis', 'Oshkosh, Wis'],
 [],
 [],
 [],
 ['Fall River, Mass'],
 [],
 [],
 ['Newburyport, Mass', 'Boston, Mass', 'Tewksbury, Middlesex County, Mass'],
 [],
 [],
 ['Springfield, Ohio,', 'Kalispell, Mont'],
 [],
 ['Clearfield, Pa'],
 ['Wisconsin'],
 ['New York City'],
 [],
 [],
 [],
 [],
 ['Blue Springs, Gage County, Nebr'],
 [],
 ['New Orleans', 'San Francisco, Calif'],
 ['Carrollton, Ky', 'Baltimore'],
 [],
 [],
 ['Bellows Falls, Vt'],
 [],
 [],
 ['Erie, Pa'],
 [],
 ['Philadelphia, Pa'],
 [],
 [],
 [],
 ['Delaware County, N.Y'],
 ['New York', 'Saratoga Springs, Saratoga County, N.Y'],
 [],
 [],
 [],
 [],
 [],
 [],
 ['Brookville, Ind', 'Cambridge City, Ind'],
 ['Scottsbluff, Nebr'