This file contains a collection of functions that clean up the most important columns in the master, in order to make matching easier. The functions may also be useful when adding new files to the master.

In [2]:
import numpy as np
import pandas as pd
import re
import os

In [131]:
import textdistance

In [9]:
# Load the master table from the Excel workbook (path is relative to the working directory).
master = pd.read_excel("current_master_1_26_21.xlsx")

In [10]:
# Preview the first rows of the loaded table.
master.head()

Unnamed: 0,unique_id,source,Census.Year,State/Province,County,Place,unsure_ids,Household Joint ID,Joint ID for Matched Records,Last.Name,...,Male birth,Female birth,Schoolm,Schoolf,Notes,Year of this Record,Last Name MATCH,First Name Match,Census Year Match,Total of Matches
0,1,,1861.0,Ontario,Toronto,Toronto - St Patricks Ward,[],30.0,,Wilson,...,,,,,,,,,,
1,2,9247,1900.0,ME,WASHINGTON,MACHIAS,[],,1.0,ADDISON,...,,,,,,,0.0,0.0,0.0,0.0
2,3,,1864.0,CanadaWest,Essex,Windsor,[],,1.0,Addison,...,,,,,,,0.0,0.0,0.0,0.0
3,4,1880 IPUMS 100% sample,1880.0,MI,WAYNE,DETROIT,[],133.0,2.0,WEEKS,...,,,,,,,,,,
4,5,,1864.0,CanadaWest,Essex,Windsor,[],133.0,2.0,Weeks,...,,,,,,,,,,


In [20]:
# helper functions

# returns true if number or a type that can be coerced into a number, like a string of "5"
def is_number(num):
    """Return True if ``num`` is a number or can be coerced into one by ``float``.

    None and missing values (as detected by ``pd.isna``) are not numbers.
    Anything ``float`` rejects -- unparseable strings, but also lists, dicts,
    datetimes and other objects -- yields False instead of raising.
    Note that strings such as "nan" or "inf" are accepted by ``float`` and
    therefore count as numbers here.
    """
    if num is None:
        return False
    try:
        # pd.isna returns an array for list-likes; using that array in a
        # boolean context can raise ValueError, so keep the check inside
        # the try block.
        if pd.isna(num):
            return False
        float(num)
    except (TypeError, ValueError):
        return False
    return True

In [231]:
# use for census year and birth year

def proc_year(year):
    """Extract a single four-digit year (starting with 1 or 2) from ``year``.

    The value is converted to a string and searched for stand-alone
    four-digit numbers, so floats such as 1861.0 and text such as
    "abt 1850" both work.

    Returns the year as an int when exactly one candidate is found.
    Returns np.nan when the input is missing/empty, when no candidate is
    found, or when the text is ambiguous (more than one candidate).
    """
    if year is None or pd.isna(year) or len(str(year)) == 0:
        return np.nan
    # The look-behind and look-ahead together require the four digits to
    # stand alone: without the look-behind, the tail of a longer digit run
    # (e.g. "1861" inside "31861") would be accepted as a year.
    candidates = re.findall(r"(?<![0-9])[12][0-9]{3}(?![0-9])", str(year))
    if len(candidates) == 1:
        return int(candidates[0])
    return np.nan

In [119]:
# master['Census.Year'] = master['Census.Year'].apply(proc_year)
# master['CalculatedBirthYear'] = master['CalculatedBirthYear'].apply(proc_year)

In [143]:
# use for state/province

"""since it seems like abbreviations were more common to begin with, I transform 
all states and provinces using
dictionaries from https://gist.github.com/rogerallen/1583593 and 
https://gist.github.com/jakeloredo/36f98fbbfa7d1b123269e99265d0ceca into abbreviations"""

# Full name -> postal abbreviation. Despite the name, the table also holds
# Canadian provinces/territories (second section). Insertion order matters
# to code that scans this table sequentially, so keep the ordering stable.
us_state_abbrev = {
    # --- United States: states, district and territories ---
    "Alabama": "AL",
    "Alaska": "AK",
    "American Samoa": "AS",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "District of Columbia": "DC",
    "Florida": "FL",
    "Georgia": "GA",
    "Guam": "GU",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Northern Mariana Islands": "MP",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Puerto Rico": "PR",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virgin Islands": "VI",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",

    # --- Canada: provinces and territories ---
    # "Canada West" is mapped to the same code as Ontario.
    "Canada West": "ON",
    "Alberta": "AB",
    "British Columbia": "BC",
    "Manitoba": "MB",
    "New Brunswick": "NB",
    "Newfoundland and Labrador": "NL",
    "Northwest Territories": "NT",
    "Nova Scotia": "NS",
    "Nunavut": "NU",
    "Ontario": "ON",
    "Prince Edward Island": "PE",
    "Quebec": "QC",
    "Saskatchewan": "SK",
    "Yukon": "YT",
}

def proc_state(state):
    """Normalize a state/province value to its abbreviation.

    Rules, in order:
      * missing, empty or whitespace-only input -> np.nan
      * values of one or two characters are assumed to already be
        abbreviations and are returned unchanged (after trimming)
      * exact names found in ``us_state_abbrev`` -> their abbreviation
      * anything containing both "Canada" and "West" (e.g. "CanadaWest")
        -> "ON"
      * otherwise the closest name in ``us_state_abbrev`` by
        ``textdistance.levenshtein`` is used, provided it is at most two
        edits away (to absorb small typos)
      * if nothing matches, the trimmed string is returned as-is

    Matching is case-sensitive.
    """
    if state is None or pd.isna(state) or len(str(state)) == 0:
        return np.nan

    # Spreadsheet cells often carry stray padding; without trimming,
    # " MI" would be too long to be recognised as an abbreviation.
    state = str(state).strip()
    if not state:
        return np.nan

    if len(state) <= 2:
        return state
    if state in us_state_abbrev:
        return us_state_abbrev[state]
    if 'Canada' in state and 'West' in state:
        return 'ON'

    # Account for some typos: pick the *closest* known name, not merely the
    # first one within range. Several names lie within two edits of each
    # other's misspellings (e.g. "Alasma" is 2 edits from "Alabama" but only
    # 1 from "Alaska"). Ties keep the earlier entry in the table.
    max_typos = 2
    best_abbrev = None
    best_distance = max_typos + 1
    for name, abbrev in us_state_abbrev.items():
        distance = textdistance.levenshtein(state, name)
        if distance < best_distance:
            best_abbrev, best_distance = abbrev, distance

    if best_abbrev is not None:
        return best_abbrev
    return state

In [144]:
# master['State/Province'] = master['State/Province'].apply(proc_state)

array(['ON', 'ME', 'MI', 'MA', 'VT', 'PA', 'IL', nan, 'OH', 'DC', 'NY',
       'WI', 'MD', 'IN', 'KS', 'MN', 'AL', 'CA', 'VA', 'KY', 'NE', 'NH',
       'CT', 'TX', 'DE', 'NJ', 'MO', 'WA', 'NC', 'AR', 'RI', 'GA', 'CO',
       'WV', 'NV', 'FL', 'IA', 'MS', 'MT', 'SD', 'TN', 'OR', 'LA', 'ID',
       'AZ', 'SC', 'HI', 'OK', 'NM', 'WY', 'PANA', 'ND', 'VI', 'toledo',
       'MIL', 'AK', 'UT', 'PR', 'ITER', 'NB', 'BC'], dtype=object)

In [159]:
# general function for removing special characters and numbers + making everything lowercase
# useful for county, place, names 
def proc_word(word):
    """Normalize free text (county, place, names) for matching.

    The value is converted to a string, lower-cased, stripped of a leading
    ward marker of the form "<digits>-wd" (e.g. "3-wdboston" -> "boston"),
    and reduced to the letters a-z and single spaces. Any other character
    (digits, punctuation, accented letters) is dropped.

    Returns np.nan for missing or empty input; otherwise a string, which
    may be empty if nothing but dropped characters was present.
    """
    if word is None or pd.isna(word) or len(str(word)) == 0:
        return np.nan

    word = str(word).lower()
    # specifically there is a pattern of "3-wdboston" for example, so the
    # "<digits>-wd" part is removed here
    word = re.sub(r"[0-9]+-wd", "", word)
    word = re.sub(r"[^a-z ]", "", word)
    # Dropping punctuation can leave doubled or dangling spaces
    # ("toronto - st patricks" -> "toronto  st patricks"); collapse and trim
    # them so equal names compare equal.
    return " ".join(word.split())

In [None]:
# master['County'] = master['County'].apply(proc_word)
# master['Place'] = master['Place'].apply(proc_word)
# master['First.Name'] = master['First.Name'].apply(proc_word)
# master['Last.Name'] = master['Last.Name'].apply(proc_word)

In [187]:
# age: verify that it is a number, and turn things in terms of months or weeks into a numeric value
def proc_age(age):
    """Convert an age value to a number of years (float).

    * Numeric input (or a numeric string) is taken to be years.
    * Text mentioning "week"/"weeks" or "month"/"months" and containing
      exactly one number (integer or decimal, e.g. "1.5 months") is
      converted to a fraction of a year using 52 weeks or 12 months.
    * Results outside the range 0-110 years are rejected.

    Returns np.nan for missing, empty, unparseable or out-of-range input.
    """
    if age is None or pd.isna(age) or len(str(age)) == 0:
        return np.nan

    try:
        years = float(age)
    except (TypeError, ValueError):
        # Not directly numeric: look for a week/month quantity in the text.
        text = str(age).lower()
        if "week" in text:
            units_per_year = 52
        elif "month" in text:
            units_per_year = 12
        else:
            return np.nan
        # Accept decimals as one number; a bare "[0-9]+" would split
        # "1.5" into two matches and the value would be discarded.
        amounts = re.findall(r"[0-9]+(?:\.[0-9]+)?", text)
        if len(amounts) != 1:
            return np.nan
        years = float(amounts[0]) / units_per_year

    # Same plausibility window for every input form.
    if years > 110 or years < 0:
        return np.nan
    return years
            

In [None]:
#master['Age'] = master['Age'].apply(proc_age)

In [196]:
def proc_gender(gender):
    """Map a free-text sex/gender value to "Female" or "Male".

    The value is cleaned with ``proc_word`` and trimmed, then compared
    against a short list of known spellings (including the misspellings
    "femail" and "mail"). Missing, empty or unrecognised input yields
    np.nan.
    """
    if gender is None or pd.isna(gender) or len(str(gender)) == 0:
        return np.nan

    cleaned = proc_word(gender)
    if not isinstance(cleaned, str):
        # proc_word signals "nothing usable" with NaN.
        return np.nan
    # Padded cells such as "F " must still be recognised.
    cleaned = cleaned.strip()

    if cleaned in ("f", "female", "femail", "woman"):
        return "Female"
    if cleaned in ("m", "male", "man", "mail"):
        return "Male"
    return np.nan
        
# also see Gender.Rmd in github for assigning gender if unknown 

In [197]:
#master['Sex'] = master['Sex'].apply(proc_gender)

array(['F', 'M', nan], dtype=object)

In [217]:
# encode race with M for mulatto, W for white, and B for black
# Keys are spellings seen in the data; they are matched after normalisation
# (see _race_key), so punctuation and spacing in a key are irrelevant.
race_dict = {
    'mulatto(blackandwhite)': 'M',
    'm(wonancestry.com)': 'M',
    'mulatto': 'M',
    'mullato': 'M',
    'm': 'M',
    'm(winancestry.com)': 'M',
    'black': 'B',
    'b': 'B',
    'blk': 'B',
    'african': 'B',
    'dark': 'B',
    'drk': 'B',
    'african (black)': 'B',
    '“negro”': 'B',
    'negro': 'B',
    'blacj': 'B',
    'bkj': 'B',
    'white': 'W',
    'w': 'W',
    '[w]': 'W',
    'white in black household': 'W',
    'white but passing': 'W',
    'ancestrysaysw': 'W',
}


def _race_key(text):
    """Reduce ``text`` to its lowercase letters a-z only (no spaces or punctuation)."""
    return re.sub(r"[^a-z]", "", str(text).lower())


# race_dict re-keyed by _race_key. The input is cleaned by proc_word before
# lookup, which removes punctuation; raw keys such as 'm(wonancestry.com)'
# or '[w]' could therefore never match unless both sides are normalised
# the same way.
_race_lookup = {_race_key(raw): code for raw, code in race_dict.items()}


def proc_race(race):
    """Encode a free-text race value as "M", "W" or "B".

    The value is cleaned with ``proc_word`` and first looked up in
    ``race_dict`` (ignoring spacing and punctuation). If that fails,
    substring rules are applied in this order: mulatto/mixed spellings or
    both "black" and "white" -> "M"; "white" -> "W"; black/coloured/negro/
    african spellings or a lone "c" -> "B".

    Returns np.nan for missing or empty input, and the cleaned text itself
    when no rule applies, so unmapped values stay visible.
    """
    if race is None or pd.isna(race) or len(str(race)) == 0:
        return np.nan

    cleaned = proc_word(race)
    if not isinstance(cleaned, str):
        # proc_word signals "nothing usable" with NaN.
        return np.nan

    key = _race_key(cleaned)
    if key in _race_lookup:
        return _race_lookup[key]

    mixed_terms = ("mulatto", "mullatto", "mullato", "mixed")
    if any(term in cleaned for term in mixed_terms) or ("black" in cleaned and "white" in cleaned):
        return "M"
    if "white" in cleaned:
        return "W"
    black_terms = ("black", "blk", "blac", "col", "negro", "african")
    if any(term in cleaned for term in black_terms) or key == "c":
        return "B"
    return cleaned

In [None]:
# Recode the race column by applying proc_race to every value and writing the result back.
master['Color..Race.or.Ethnicity'] = master['Color..Race.or.Ethnicity'].apply(proc_race)

In [None]:
# place of birth: extend the state function (proc_state) to also cover countries; this might not be necessary

In [None]:
# code for filling in census year

In [None]:
# middle name

In [None]:
# filling in "unknown"