In [1]:
# # 1/12/20: editing this function so that it can handle different column names 
# # and match from data to the real file

# this script has 2 sections. first, can detect and drop duplicates from the master. second, can take in a list 
# of matches marked as duplicates and drop them from the master. when dropping it combines data from both records
# so that no info is lost 

In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
import recordlinkage as rl
from recordlinkage.index import Block
from recordlinkage.preprocessing import clean

In [3]:
df = pd.read_excel("current_master_3_21_2021.xlsx")

In [4]:
df_filtered = df

In [5]:
df.head()

Unnamed: 0,unique_id,source,Census.Year,State.Province,County,Place,unsure_ids,no_ids,Household.Joint.ID,Joint.ID.for.Matched.Records,...,Height,Complex.,Father.of.Foreign.Birth,Mother.of.Foreign.Birth,Township,Sheet.Number,Noonan.Page.Number,Employment,Whitef,full_name
0,2,9247,1900.0,Maine,Washington,machias,[],[],,1,...,,,,,,,,,,albertjaddison
1,3,,1864.0,Canada West,Essex,windsor,[],[],,1,...,,,,,,,,,,jamesaddison
2,4,1880 IPUMS 100% sample,1880.0,Michigan,Wayne,detroit,[],[],133.0,2,...,,,,,,,,,,markweeks
3,47806,,1864.0,Canada West,Essex,windsor,[],[],133.0,2,...,,,,,,,,,,markweeks
4,5,,1864.0,Canada West,Essex,windsor,[],[],133.0,2,...,,,,,,,,,,markweeks


In [6]:
# some had sources like "1872 census" but did not have a census year
def process_census(i):
    source = df_filtered.loc[i, 'source']
    census_year = df_filtered.loc[i, 'Census.Year']
    if np.isnan(census_year):
        year = re.findall("[1-2][0-9][0-9][0-9]", source)
        if len(year) == 1:
            return(year)
        else:
            return(np.nan)
    else:
        return(census_year)

In [7]:
for i in range(0, len(df_filtered.index)):
    df_filtered.loc[i, 'Census.Year'] = process_census(i)

In [8]:
# create full name column 
full_names = df_filtered['First.Name'] + ' ' + df_filtered['Last.Name']
full_names = full_names.str.lower()
full_names = full_names.str.replace(" ", "")
full_names = full_names.str.replace('[^A-Za-z]+', "")
df_filtered['full_name'] = full_names

In [9]:

# preprocess out weird things like "3-wdbloomington" or "boston;10-wd"
def prc_place(place):
    place = place.lower()
    re.sub("^wd", "", place)
    place = re.sub("[0-9]+-wd", "", place)
    place = re.sub('[^A-Za-z]+', "", place) # take out all special characters
    place = place.replace(" ", "")
    return(place)

df_filtered['Place'] = df_filtered['Place'].map(prc_place, na_action = 'ignore')

In [10]:
def prc_year(row, year_col):
    """
    Processing birth year.
    If birth year is not a number, not in a date format, or is noncompliant,
    it will be converted to NaN.
    """
    
    # Helper functions
    def is_number(num):
        try:
            float(num)
            return True
        except ValueError:
            return False
    
    non_compliant_values = ['-', 'F', '#VALUE!']
    if row[year_col] in non_compliant_values or pd.isnull(row[year_col]):
        return np.nan
    elif is_number(row[year_col]):
        return float(row[year_col])
    elif re.findall('\d{4}', row[year_col]):
        return min([float(i) for i in re.findall('\d{4}', row[year_col])])
    else:
        return np.nan
    
df_filtered['CalculatedBirthYear'] = df_filtered.apply(lambda row: prc_year(row, 'CalculatedBirthYear'),
                                                         axis = 1)
df_filtered['Census.Year'] = df_filtered.apply(lambda row: prc_year(row, 'Census.Year'),
                                                         axis = 1)

In [51]:
# convert age to numeric when possible 
df_filtered['Age'] = pd.to_numeric(df_filtered['Age'], errors = 'coerce')

In [52]:
  df1 = df_filtered
df2 = df_filtered

In [53]:
indexer = rl.Index()
indexer.add(Block('full_name', 'full_name')) # blocks on full name being the same 
record_links = indexer.index(df1, df2)

In [54]:
comparer = rl.Compare()
#comparer.string('First.Name', 'first_name', method = 'jarowinkler', threshold = 0.9, label = 'first_name')
#comparer.string('last_name', 'last_name', method = 'jarowinkler', threshold = 0.9, label = 'last_name')
#comparer.string('state_or_province', 'state_or_province', method = 'jarowinkler', threshold = 0.9, label = 'state')
#comparer.string('county', 'county', method = 'jarowinkler', threshold = 0.9, label = 'county')
comparer.string('Place', 'Place', method = 'jarowinkler', threshold = 0.9, label = 'place', missing_value = 2)
#comparer.exact('race', 'race', label = 'race')
#comparer.exact('sex', 'sex', label = 'sex')
comparer.numeric('CalculatedBirthYear', 'CalculatedBirthYear', label = 'CalculatedBirthYear', missing_value = 2)
comparer.numeric('Age', 'Age', label = 'Age', missing_value = 2)

comparer.string('source', 'source', method = 'jarowinkler', threshold = 0.95, label = 'source', missing_value = 2)
comparer.exact('Census.Year', 'Census.Year', label = 'Census.Year', missing_value = 2)

<Compare>

In [28]:
# # most strict, definite matches
# compare_vectors_rl = comparer.compute(record_links, df1, df2)
# result_rl = compare_vectors_rl[(compare_vectors_rl['place'] == 1.0) & 
#                                ((compare_vectors_rl['CalculatedBirthYear'] == 1) | (compare_vectors_rl['Age'] == 1)) &
#                                 (compare_vectors_rl['source'] == 1) &
#                                 (compare_vectors_rl['Census.Year'] == 1)].\
#                                 reset_index()
# results_rl = compare_vectors_rl.reset_index()

# def create_indexid(row):
#     return "".join(sorted([str(int(i)) for i in [row['level_0'], row['level_1']]]))
# result_rl['indexid'] = result_rl.apply(lambda row: create_indexid(row), axis = 1)
# result_rl = result_rl.drop_duplicates('indexid')

# result_rl = result_rl[result_rl['level_0'] != result_rl['level_1']].reset_index()

In [16]:
# df_result_rl = pd.DataFrame()


# for i in set(zip(result_rl['level_0'], result_rl['level_1'])):
#     df_result_rl = df_result_rl.append(df_filtered.iloc[i[0]])
#     df_result_rl = df_result_rl.append(df_filtered.iloc[i[1]])   

# df_result_rl = df_result_rl.reindex(df_filtered.columns, axis=1)

# df_result_rl

Unnamed: 0,unique_id,source,Census.Year,State.Province,County,Place,unsure_ids,no_ids,Household.Joint.ID,Joint.ID.for.Matched.Records,...,Height,Complex.,Father.of.Foreign.Birth,Mother.of.Foreign.Birth,Township,Sheet.Number,Noonan.Page.Number,Employment,Whitef,full_name
33453,60650.0,1901 census Victoria,1901.0,British Columbia,Victoria,tolmie,[59652],[],142.0,,...,,,,,,,,,,charleswatson
33454,60649.0,1901 census Victoria,1901.0,British Columbia,Victoria,tolmie,[59698],[],142.0,,...,,,,,,,,,,charleswatson
13244,60582.0,1901 census Victoria,1901.0,British Columbia,Victoria,victoriacity,[],[],154.0,,...,,,,,,,,,,mabelcarter
14311,60581.0,1901 census Victoria,1901.0,British Columbia,Victoria,victoriacity,[],[],140.0,,...,,,,,,,,,,mabelcarter


In [None]:
## df_result_rl.to_csv('definite_duplications_3_4_21.csv')

In [56]:
# less strict matches, feel free to play around with thresholds 

compare_vectors_rl = comparer.compute(record_links, df1, df2)
result_rl = compare_vectors_rl[(compare_vectors_rl['place'] == 1) & # place matches 
                               ((compare_vectors_rl['CalculatedBirthYear'].isin([.5, 1])) | (compare_vectors_rl['Age'].isin([.5, 1]))) &
                               ( # this allows the calculated birth year and age to be a year off 
                                   # source matches and census is either matching or missing 
                               ((compare_vectors_rl['source'] == 1) & (compare_vectors_rl['Census.Year'].isin([1, 2])))
                                   # source doesn't match but census year does
                                   | (compare_vectors_rl['Census.Year'] == 1))].reset_index()
result_rl['indexid'] = result_rl.apply(lambda row: create_indexid(row), axis = 1)
result_rl = result_rl.drop_duplicates('indexid')
result_rl = result_rl[result_rl['level_0'] != result_rl['level_1']].reset_index()

df_result_rl = pd.DataFrame()

for i in zip(result_rl['level_0'], result_rl['level_1']):
    df_result_rl = df_result_rl.append(df_filtered.iloc[i[0]])
    df_result_rl = df_result_rl.append(df_filtered.iloc[i[1]])   

df_result_rl = df_result_rl.reindex(df_filtered.columns, axis=1)

#df_result_rl

In [57]:
df_result_rl = df_result_rl.reset_index()

In [58]:
len(df_result_rl.index)

156

In [59]:
df_result_rl.head()

Unnamed: 0,index,unique_id,source,Census.Year,State.Province,County,Place,unsure_ids,no_ids,Household.Joint.ID,...,Height,Complex.,Father.of.Foreign.Birth,Mother.of.Foreign.Birth,Township,Sheet.Number,Noonan.Page.Number,Employment,Whitef,full_name
0,502,484.0,,1864.0,Canada West,Essex,windsor,[],[],60.0,...,,,,,,,,,,jameslawrence
1,503,483.0,,1864.0,Canada West,Essex,windsor,[],[],60.0,...,,,,,,,,,,jameslawrence
2,32866,44186.0,1880 IPUMS 100% sample,1880.0,New York,,newyork,[44448],[],,...,,,,,,,,,,williamtaylor
3,34234,44185.0,1880 IPUMS 100% sample,1880.0,New York,,newyork,[],[],,...,,,,,,,,,,williamtaylor
4,14584,21096.0,1880 IPUMS 100% sample,1880.0,Massachusetts,Suffolk,boston,"[21089, 21096, 20874, 20825, 20827]","[20825, 21089, 20827]",,...,,,,,,,,,,williamhharris


In [60]:
df_result_rl.to_excel('duplications_4_8_21.xlsx')

In [61]:
test_df = df

In [62]:
def parse_list(x):
    if x == "[]":
        return []
    else:
        x = re.sub("\[", "", x)
        x = re.sub("\]", "", x)
        mylist = x.split(",")

        return([int(i) for i in mylist])

In [63]:
# actually remove from master

# combine the two records so we're not losing any info

for i in range(0, len(df_result_rl.index), 2):
    unique_id_1 = df_result_rl.loc[i, 'unique_id']
    unique_id_2 = df_result_rl.loc[i + 1, 'unique_id']

    row_1 = test_df[test_df['unique_id'] == unique_id_1]
    row_2 = test_df[test_df['unique_id'] == unique_id_2]
    
    index_1 = row_1.index.values[0]
    index_2 = row_2.index.values[0]

    for col in df.columns:
        val_1 = row_1[col].values[0]
        val_2 = row_2[col].values[0]
        
        # only reassign if there is a mismatch
        if val_1 != val_2:
            
            # these two columns are lists
            if col in ['unsure_ids', 'no_ids']:
                if val_1 == "[]" and val_2 == "[]":
                    pass
                else:
                    test_df.loc[index_1, col] = str(list(set(parse_list(val_1) + parse_list(val_2))))
            
            # if there is a mismatch in these columns it needs to be manually examined and fixed
            elif col in ['Household Joint ID', 'Joint ID for Matched Records']:
                if pd.isna(val_1) and pd.isna(val_2):
                    pass
                else:
                    test_df.loc[index_1, col] = "EXAMINE"
                    
            # for all other columns, fill in as necessary 
            else:

                # if both nan or second is nan, do nothing
                if pd.isna(val_1) and pd.isna(val_2):
                    pass
                elif pd.isna(val_2):
                    pass

                # if first is nan but second exists, replace first with second 
                elif pd.isna(val_1):
                    test_df.loc[index_1, col] = val_2

                # else replace with longer cell-tends to be more informative
                elif len(str(val_1)) < len(str(val_2)):
                    test_df.loc[index_1, col] = val_2
    test_df = test_df.drop(index_2)

In [65]:
test_df.to_excel("current_master_4_8_21.xlsx")

In [None]:
######### section 2: if given list of dups

In [205]:
dups = pd.read_excel("manual_dups_3_4_21.xlsx")
dups.head()

Unnamed: 0,Arenson,unique_id
0,dup,36235
1,keep,36234
2,dup,35573
3,keep,35574
4,dup,32062


In [38]:
deleted_ids = []
for i in df['unique_id'].unique():
    if i not in test_df['unique_id']:
        deleted_ids.append(i)

In [39]:
len(deleted_ids)

4338

In [44]:
matches = pd.read_excel("AIA on newmatches-3-17-21.xlsx")
matches.head()

Unnamed: 0,AIA decision,Unnamed: 1,dup_pair,unique_id,source,census_year,state_or_province,county,place,unsure_ids,...,Height,Complex.,Father.of.Foreign.Birth,Mother.of.Foreign.Birth,Township,Sheet.Number,Noonan.Page.Number,Employment,Whitef,phonetic_name
0,unsure,0,0,48688,1880 IPUMS 100% sample,1880.0,unknown,unknown,unknown,"[49221, 49093]",...,,,,,,,,,,W452G620
1,unsure,1,0,49094,13040,1910.0,Illinois,Will,Joliet,"[1756, 49093, 49221, 48632, 49093]",...,,,,,,,,,,W452G620
2,unsure,2,1,49093,4475,1900.0,Missouri,Jackson,Kansascity,"[1756, 49094, 49221, 49094, 48688]",...,,,,,,,,,,W452G620
3,unsure,3,1,49094,13040,1910.0,Illinois,Will,Joliet,"[1756, 49093, 49221, 48632, 49093]",...,,,,,,,,,,W452G620
4,dup,4,2,3468,1900 IPUMS 5% sample,1900.0,unknown,unknown,unknown,[],...,,,,,,,,,,A632E420


In [116]:
for i in range(0, len(df_result_rl.index), 2):
    is_dup = False
    unique_id_1 = df_result_rl.loc[i, 'unique_id']
    unique_id_2 = df_result_rl.loc[i + 1, 'unique_id']

    if unique_id_1 in matches['unique_id'].unique():
        row = matches[matches['unique_id'] == unique_id_1]
        pair_number = [int(i) for i in row['dup_pair']]
        for this_pair_number in pair_number:
            matched_row = matches[matches['dup_pair'] == this_pair_number]
            matched_row = matched_row[matches['unique_id'] == unique_id_2]
            if len(matched_row.index) > 0:
                is_dup = True
                idx_1 = row.index
                idx_2 = matched_row.index
    elif unique_id_2 in matches['unique_id'].unique():
        row = matches[matches['unique_id'] == unique_id_2]
        pair_number = [int(i) for i in row['dup_pair']]
        for this_pair_number in pair_number:
            matched_row = matches[matches['dup_pair'] == this_pair_number]
            matched_row = matched_row[matches['unique_id'] == unique_id_1]    
            if len(matched_row.index) > 0:
                is_dup = True
                idx_1 = row.index
                idx_2 = matched_row.index

    if is_dup:
        matches = matches.drop(idx_1)
        matches = matches.drop(idx_2)

  # This is added back by InteractiveShellApp.init_path()


In [117]:
len(matches.index)

1498

In [119]:
matches.to_excel("deduped_matches_3_18_21.xlsx")