# 1. Dataframe Preprocessing

In [None]:
import pandas as pd
!pip3 install recordlinkage
import recordlinkage as rl
from recordlinkage.index import Block
from recordlinkage.preprocessing import clean
from recordlinkage.preprocessing import phonetic

# Regular expression operations
import re

In [None]:
# mount drive
# Makes Google Drive reachable under /content/drive so the workbook paths
# used later in the notebook can be read and written.
# NOTE(review): google.colab ties this notebook to the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the master workbook from the project folder on Drive.
# NOTE(review): `root` is relative, so it only resolves when the working
# directory is the parent of the `drive` mount point (/content on Colab).
root = "drive/My Drive/Colab Notebooks/DFG Arenson"
master_workbook = root + "/current_master_1_26_21.xlsx"
df = pd.read_excel(master_workbook)

In [None]:
# current match numerations
# NOTE(review): none of these three values is read anywhere in the visible
# notebook. Presumably they record the last family / individual / unique IDs
# already assigned in the master sheet -- confirm before relying on them.
lm_family = 134
lm_ind = 2466
im_uid = 60734

In [None]:
# Map the spreadsheet's original headers onto snake_case names.
# Columns that are not listed here keep their original header.
column_renames = {
    # Names
    'Last.Name': 'last_name',
    'First.Name': 'first_name',
    # Places / geography
    'State/Province': 'state_or_province',
    'County': 'county',
    'Place': 'place',
    'WARD': 'ward',
    'STREET': 'street',
    'PLACEOFBIRTH': 'place_of_birth',
    'ROLL or Sheet#': 'roll_or_sheet',
    # Years
    'Census.Year': 'census_year',
    'CalculatedBirthYear': 'calculated_birth_year',
    # Personal attributes
    'Sex': 'sex',
    'Color..Race.or.Ethnicity': 'race',
    'MARITAL': 'marital_status',
    'PROFESSION': 'profession',
    'Notable': 'notable',
    'RELIGION': 'religion',
}

df = df.rename(columns=column_renames)

In [None]:
# master info
# Column names, non-null counts and dtypes of the renamed master frame.
df.info()

In [None]:
# chosen columns
# Columns projected out of df for the record-linkage step (df[chosen_columns]).
# 'phonetic_name' is not in the spreadsheet; it is derived from the name
# columns in the standardization section before this list is used.
# NOTE(review): this list asks for 'unique_id', while the later "unsures"
# parsing reads df.uniqueid -- one of the two spellings is presumably wrong;
# confirm against the spreadsheet header.
# NOTE(review): 'lat' and 'long' are not produced by the rename above, so they
# are assumed to exist under exactly these names in the raw sheet.
chosen_columns = ['last_name', 
                  'first_name',
                  'state_or_province',
                  'county',
                  'place',
                  'ward',
                  'street',
                  'place_of_birth',
                  'roll_or_sheet',
                  'lat',
                  'long',
                  'census_year',
                  'calculated_birth_year',
                  'sex',
                  'race',
                  'marital_status',
                  'profession',
                  'religion',
                  'unique_id',
                  'phonetic_name']

# 2. Standardization/normalization

In [None]:
import numpy as np

In [None]:
# Names: lower-case both name columns, then build the blocking key from
# their soundex codes (last name code followed by first name code).
# NOTE(review): str() turns a missing name into the literal text 'nan', so
# records without a name still receive a soundex code.
column_to_parse = ['last_name', 'first_name']

for name_col in column_to_parse:
  df[name_col] = df[name_col].map(lambda value: str(value).lower())

last_name_code = phonetic(df['last_name'], 'soundex')
first_name_code = phonetic(df['first_name'], 'soundex')
df["phonetic_name"] = last_name_code + first_name_code

In [None]:
# race
# Normalised census race label -> coarse category used for linkage.
# Keys are compared after lower-casing and after removing non-breaking
# spaces and curly quotes, which is why several keys contain no spaces.
_RACE_CATEGORIES = {
    'mulatto(blackandwhite)': 'MIXED',
    'm(wonancestry.com)': 'MIXED',
    'mulatto': 'MIXED',
    'mullato': 'MIXED',
    'm': 'MIXED',
    'm(winancestry.com)': 'MIXED',
    'black': 'BLACK',
    'b': 'BLACK',
    'blk': 'BLACK',
    'brown': 'BLACK',
    'african': 'BLACK',
    'dark': 'BLACK',
    'drk': 'BLACK',
    'african (black)': 'BLACK',
    'negro': 'BLACK',
    'blacj': 'BLACK',
    'bkj': 'BLACK',
    'light': 'WHITE',
    'white': 'WHITE',
    'w': 'WHITE',
    '[w]': 'WHITE',
    'seems to be white': 'WHITE',
    'white in black household': 'WHITE',
    'white but passing': 'WHITE',
    'ancestrysaysw': 'WHITE',
    'swarthy': 'BLACK',
    'd. brown': 'BLACK',
    'col.d': 'BLACK',
    'col\'d': 'BLACK',
}


def transform_race(row):
    """
    Collapse the free-text race entry of one record into a coarse category.

    Parameters
    ----------
    row : mapping (pandas Series or dict) with a 'race' entry.

    Returns
    -------
    'MIXED', 'BLACK' or 'WHITE' when the cleaned label is in the lookup
    table, 'OTHERS' for any other non-null value, and the original null
    value (NaN/None) unchanged when the entry is missing.

    The row itself is never modified. Non-string values are converted with
    str() before cleaning instead of raising on `.lower()`.
    """
    raw = row['race']
    if pd.isnull(raw):
        # Missing stays missing so a later fillna() can label it.
        return raw

    label = str(raw).lower()
    # Strip characters that only differ between transcriptions.
    for noise in ('\xa0', '“', '”'):
        label = label.replace(noise, '')

    return _RACE_CATEGORIES.get(label, 'OTHERS')

# Recode the race column row by row.
df['race'] = df.apply(transform_race, axis=1)

In [None]:
# Sanity check: list the distinct values left in the race column.
df['race'].unique()

array(['WHITE', 'BLACK', 'OTHERS', 'MIXED', nan], dtype=object)

In [None]:
# locations (nameparser, fuzzy matching)
def transform_state(row):
    """
    Abbreviate the state/province of one record.

    Looks up row['state_or_province'] in a fixed table of historical or
    spelled-out names and returns the matching abbreviation. A value with
    no entry in the table (including missing values and names that are
    already abbreviated) is returned unchanged.

    Note that every "Canada West" spelling is converted to ON (Ontario).
    Unknown values: PANA, ITER, MIL.
    """
    # NOTE(review): 'Massachussetts' is kept exactly as spelled in the
    # lookup; presumably it mirrors the source data -- confirm whether the
    # standard spelling should be mapped as well.
    abbreviations = {
        # Ontario and its historical names
        'CanadaWest': 'ON',
        'Canada West': 'ON',
        'Canada West (Ontario)': 'ON',
        'Ontario': 'ON',
        # United States
        'Alabama': 'AL',
        'District of Columbia': 'DC',
        'Illinois': 'IL',
        'Massachussetts': 'MA',
        'Michigan': 'MI',
        'New York': 'NY',
        'Ohio': 'OH',
        'Pennsylvania': 'PA',
        'Vermont': 'VT',
        'Virginia': 'VA',
        'Wisconsin': 'WI',
        # A city recorded in the state column
        'toledo': 'OH',
    }

    recorded = row['state_or_province']
    return abbreviations.get(recorded, recorded)

# Abbreviate the state/province column row by row.
df['state_or_province'] = df.apply(transform_state, axis=1)

In [None]:
# dates
import datetime
def prc_year(row, year_col):
    """
    Parse the year stored in `row[year_col]` into a float.

    Parameters
    ----------
    row : mapping (pandas Series or dict) holding the record.
    year_col : key of the year field to parse (used for both the birth-year
        and the census-year columns).

    Returns
    -------
    float
        * the year of a date-like value (datetime.date, datetime.datetime,
          pandas.Timestamp);
        * float(value) when the value is numeric or a numeric string;
        * the smallest 4-digit number found in any other text
          (e.g. '1850 or 1852' -> 1850.0);
        * NaN for missing values, for the placeholders '-', 'F', '#VALUE!',
          and for anything else that cannot be interpreted.

    The row itself is never modified.
    """
    value = row[year_col]

    # Null check comes first: pd.NaT is itself a datetime instance.
    if pd.isnull(value):
        return np.nan

    # isinstance (not an exact type check) so that pandas.Timestamp and plain
    # dates are recognised too; previously those fell through to float() and
    # raised an uncaught TypeError.
    if isinstance(value, datetime.date):
        return float(value.year)

    if value in ('-', 'F', '#VALUE!'):
        return np.nan

    try:
        return float(value)
    except (TypeError, ValueError):
        pass

    # Free text: take the earliest 4-digit number mentioned.
    years = re.findall(r'\d{4}', str(value))
    if years:
        return min(float(year) for year in years)
    return np.nan

# Parse both year columns with the same routine, birth year first.
for year_col in ('calculated_birth_year', 'census_year'):
    df[year_col] = df.apply(prc_year, axis=1, args=(year_col,))

In [None]:
# clean place of birth
def transform_birthplace(row):
    """
    Clean the place-of-birth text of one record.

    The value is converted to a string, lower-cased, and stripped of
    non-breaking spaces, curly quotes, parentheses and square brackets.
    No lookup table is involved; ordinary spaces are kept.

    Parameters
    ----------
    row : mapping (pandas Series or dict) with a 'place_of_birth' entry.

    Returns
    -------
    The cleaned string, or the original null value (NaN/None) unchanged
    when the entry is missing. The row itself is never modified.
    """
    raw = row['place_of_birth']
    if pd.isnull(raw):
        # Return the null explicitly rather than falling off the end, so a
        # NaN stays NaN like in the other column cleaners.
        return raw

    noise = str.maketrans('', '', '\xa0“”()[]')
    return str(raw).lower().translate(noise)

# Clean the place-of-birth column row by row.
df['place_of_birth'] = df.apply(transform_birthplace, axis=1)

In [None]:
# Sanity check: list the distinct cleaned birthplaces.
df['place_of_birth'].unique()

In [None]:
# Replace missing values: text columns get the label 'unknown',
# numeric location columns get 0.
columns_to_impute_with_unknown = [
    'state_or_province', 'county', 'place', 'ward', 'street',
    'place_of_birth',
    'sex', 'race', 'marital_status', 'profession', 'religion',
]
columns_to_impute_with_zero = ['roll_or_sheet', 'lat', 'long']

imputation_plan = (
    ('unknown', columns_to_impute_with_unknown),
    (0, columns_to_impute_with_zero),
)
for filler, columns in imputation_plan:
    for column in columns:
        df[column] = df[column].fillna(filler)

In [None]:
# dealing with the 'unknowns'
# Alphabet the 20-character random placeholder strings are drawn from.
chrs = 'abcdefghijklmnopqrstuvwxyz0123456789 '

# Columns inspected (and overwritten on the row object) by the loop below.
location_columns = ['county', 'place', 'place_of_birth', 'state_or_province']

# NOTE(review): never read or updated in the visible notebook.
count = 0

# NOTE(review): as written this loop leaves df untouched. Per the pandas
# documentation, iterrows() yields copies of the rows, so `row[l] = c` below
# never reaches the frame.
# NOTE(review): the branch also looks inverted. Rows whose location columns
# are ALL 'unknown' are skipped, and every other row has all four location
# values replaced by random text. Presumably the intent was the opposite
# (randomise the all-unknown rows so 'unknown' never matches 'unknown' during
# linkage) -- confirm before making the assignment effective.
# NOTE(review): np.random is not seeded, so results would not be reproducible.
for i, row in df.iterrows():
    # The comprehension's own `i` (a column name) shadows the outer row label.
    if all([row[i] == 'unknown' for i in location_columns]):
      continue
    else:
      for l in location_columns:
        # 20 characters drawn with replacement from `chrs`.
        c = "".join(np.random.choice(list(chrs), 20))
        row[l] = c

# 3. Linkage

In [None]:
# Self-linkage: both sides of the comparison are the same projection of df.
df1 = df.loc[:, chosen_columns]
df2 = df.loc[:, chosen_columns]

# Block on the phonetic full name: only records that share the same
# soundex key become candidate pairs.
phonetic_block = Block('phonetic_name', 'phonetic_name')
indexer = rl.Index()
indexer.add(phonetic_block)
record_links = indexer.index(df1, df2)


In [None]:
# Shape of the subset of records that have no joint match ID yet.
df[pd.isnull(df['Joint ID for Matched Records'])].shape


(50714, 86)

In [None]:
# unmatched VS matched
# df_withmatch = df[pd.notnull(df['Joint ID for Matched Records'])]
# df_nomatch = df[pd.isnull(df['Joint ID for Matched Records'])]
# chosen_columns.append("phonetic_name")

# df_withmatch = df_withmatch[chosen_columns]
# df_nomatch = df_nomatch[chosen_columns]

# indexer = rl.Index()
# indexer.add(Block('phonetic_name', 'phonetic_name'))
# record_links = indexer.index(df_withmatch, df_nomatch)

In [None]:
comparer = rl.Compare()

# (column, Jaro-Winkler threshold) pairs, listed in the order in which the
# comparison columns are produced. Each score is binarised by its threshold.
string_rules = [
    ('first_name', 0.85),
    ('last_name', 0.85),
    ('state_or_province', 0.5),
    ('county', 0.35),
    ('place', 0.35),
    ('place_of_birth', 0.6),
    ('race', 0.7),
    ('sex', 0.7),
]
for column, cutoff in string_rules:
    comparer.string(column, column, method='jarowinkler', threshold=cutoff, label=column)

# Birth years are compared on a continuous (gaussian) similarity scale.
comparer.numeric('calculated_birth_year', 'calculated_birth_year',
                 method='gauss',
                 offset=1,
                 scale=1,
                 label='calculated_birth_year')

<Compare>

# 4. Output Processing

In [None]:
import sys
# Raise Python's recursion limit to one million frames.
# NOTE(review): the reason is not visible in this notebook; presumably it
# works around a RecursionError in the comparison step -- confirm it is still
# needed, since a limit this high can crash the interpreter instead.
sys.setrecursionlimit(1000000)

In [None]:
# Score every candidate pair in record_links with the configured comparer.
compare_vectors_rl = comparer.compute(record_links, df1, df2)

In [None]:
# Keep the candidate pairs that
#   * agree on first AND last name,
#   * agree on at least one location field,
#   * agree on sex and race, and
#   * have an above-average birth-year similarity (or no score at all).
names_agree = ((compare_vectors_rl['first_name'] == 1.0) &
               (compare_vectors_rl['last_name'] == 1.0))

location_agrees = ((compare_vectors_rl['state_or_province'] == 1.0) |
                   (compare_vectors_rl['county'] == 1.0) |
                   (compare_vectors_rl['place'] == 1.0) |
                   (compare_vectors_rl['place_of_birth'] == 1.0))

demographics_agree = ((compare_vectors_rl['sex'] == 1.0) &
                      (compare_vectors_rl['race'] == 1.0))

birth_year_sim = compare_vectors_rl['calculated_birth_year']
# `x == np.nan` is always False, so a missing score must be tested with
# isna(). The parentheses are required because `|` binds tighter than `>`.
birth_year_ok = (birth_year_sim > birth_year_sim.mean()) | birth_year_sim.isna()

# reset_index() turns the pair MultiIndex into the level_0 / level_1 columns.
result_rl = compare_vectors_rl[names_agree &
                               location_agrees &
                               demographics_agree &
                               birth_year_ok].reset_index()

# Drop the trivial pairs in which a record is compared with itself.
result_rl = result_rl[result_rl['level_0'] != result_rl['level_1']].reset_index(drop=True)

In [None]:
def create_indexid(row):
    """
    Build an order-independent key for a candidate pair.

    Parameters
    ----------
    row : mapping (pandas Series or dict) with 'level_0' and 'level_1',
        the two record indices of the pair.

    Returns
    -------
    str
        '<smaller>-<larger>', identical for (a, b) and (b, a).

    The separator is essential: joining the two numbers directly made
    distinct pairs collide (1 & 23 and 12 & 3 both became '123'), so a
    de-duplication on this key could drop a genuine pair.
    """
    low, high = sorted(int(row[side]) for side in ('level_0', 'level_1'))
    return "-".join([str(low), str(high)])

# Tag each pair with its order-independent key, then keep one row per key
# so that (a, b) and (b, a) are not both reported.
result_rl['indexid'] = result_rl.apply(create_indexid, axis=1)
result_rl = result_rl.drop_duplicates(subset='indexid')

In [None]:
# parse unsures into a dictionary
# Maps each record's uniqueid to the set of ids (kept as strings) listed in
# its `unsures` cell.
# assumes each cell holds a bracketed, comma-separated list such as
# "[12, 57]" -- TODO confirm against the sheet.
# NOTE(review): this reads df.uniqueid while the linkage column list names
# 'unique_id'; confirm which header the sheet really uses.
already_seen_pairs = dict()

for uid, unsures_text in zip(df.uniqueid, df.unsures):
  if isinstance(unsures_text, str):
    # Strip the surrounding brackets, then drop empty tokens so that "[]"
    # gives an empty set rather than {''}.
    tokens = (token.strip() for token in unsures_text[1:-1].split(","))
    already_seen_pairs[uid] = {token for token in tokens if token}
  else:
    # Missing cell (NaN/None): nothing has been marked unsure for this id.
    already_seen_pairs[uid] = set()

In [None]:
# Collect the two rows of every candidate pair that still needs review.
# A pair is skipped when it is an already-inputted match (both records carry
# the same non-null joint ID) or when either record already lists the other
# among its unsures.
# NOTE(review): the previous condition used invalid `||` operators and mixed
# `not in` with `in`; this follows the intent stated in its own comment.
# Confirm that skipping in both cases is what is wanted.
joint_id_col = "Joint ID for Matched Records"
review_positions = []

for left_pos, right_pos in zip(result_rl['level_0'], result_rl['level_1']):
  first = df.iloc[left_pos]
  second = df.iloc[right_pos]

  first_joint = first[joint_id_col]
  second_joint = second[joint_id_col]
  # NaN never equals NaN, so two unmatched records are not treated as matched.
  already_matched = pd.notnull(first_joint) and first_joint == second_joint

  # The stored unsure ids are strings, hence the str() on the lookup side.
  # NOTE(review): assumes uniqueid prints the same way it appears in the
  # unsures text (e.g. not as '12.0') -- confirm.
  marked_unsure = (
      str(first.uniqueid) in already_seen_pairs.get(second.uniqueid, ())
      or str(second.uniqueid) in already_seen_pairs.get(first.uniqueid, ())
  )

  if already_matched or marked_unsure:
    continue
  review_positions.extend([left_pos, right_pos])

# Build the frame in one step: DataFrame.append was removed in pandas 2.0 and
# growing a frame row by row is quadratic. Paired rows stay adjacent.
df_result_rl = df.iloc[review_positions].copy()

In [None]:
# Renumber the rows 0..n-1, discarding the old index instead of keeping it
# as a column.
df_result_rl = df_result_rl.reset_index(drop=True)
df_result_rl.head()
# Only this last expression is displayed by the cell.
df_result_rl.shape

(4108, 87)

In [None]:
# Number the pairs 0, 0, 1, 1, 2, 2, ... so that the two consecutive rows of
# each candidate pair share one 'dup_pair' id.
pair_total = int(df_result_rl.shape[0]/2)
num_list = [pair_no for pair_no in range(pair_total) for _ in range(2)]
# The Series is aligned on the index, so this relies on df_result_rl having
# a fresh 0..n-1 index.
df_result_rl['dup_pair'] = pd.Series(num_list)

In [None]:
# Put the pair id first, followed by every column of the master frame in its
# original order, then write the review sheet next to the master workbook.
col_order = ['dup_pair'] + df.columns.tolist()
df_result_rl = df_result_rl[col_order]

# NOTE(review): the output name carries the date 01-03-21 while the input
# workbook is dated 1_26_21 -- confirm the file name is still the intended one.
output = root + "/newmatches-01-03-21.xlsx"
df_result_rl.to_excel(output)

In [None]:
# Preview the first filtered candidate pairs and their comparison scores.
result_rl.head()