### 1. Setting Up

In [1]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
import sys
sys.setrecursionlimit(1000000)

In [3]:
import pandas as pd
import numpy as np
import os

# Record Linkage
import recordlinkage as rl
from recordlinkage.index import Block
from recordlinkage.preprocessing import clean
from recordlinkage.preprocessing import phonetic

# Regular expression operations
import re

### 2. Preprocessing Data


In [4]:
df_with_matches = pd.read_excel("current_master_5_31_2020.xlsx")

In [5]:
df_with_matches

Unnamed: 0,ID,Census.Year,State/Province_x,County_x,Place_x,Household Joint ID,Joint ID for Matched Records,Last.Name,First.Name,CalculatedBirthYear,...,Rank (Military),Enlistment Date,Enlistment Place,Date Mustered Out,Year of this Record,Last Name MATCH,First Name Match,Census Year Match,Total of Matches,unique_id
0,236,1860,PA,PHILADELPHIA,PHILADELPHIA,,,---,42761860PRISO,1836,...,,,,,,0,0,0,FALSE,2407.0
1,1880 IPUMS 100% sample,1880,,,,,,---,---,1800,...,1000,101,,,,0,0,0,FALSE,2388.0
2,1880 IPUMS 100% sample,1880,,,,,,---,---,1808,...,5900,1230,,,,0,0,0,FALSE,2389.0
3,1880 IPUMS 100% sample,1880,,,,,,---,ELLA,1817,...,1000,101,,,,0,0,0,FALSE,2400.0
4,1880 IPUMS 100% sample,1880,,,,,,---,---,1818,...,5900,1230,,,,0,0,0,FALSE,2390.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50338,7719,1900,NJ,MIDDLESEX,NEWBRUNSWICK,,,ZONIE,HARVES,1883,...,,,,,,,,,,50806.0
50339,1880 IPUMS 100% sample,1880,,,,,,ZORCHEY,CHARLES,1862,...,5900,1230,,,,,,,,50807.0
50340,10315,1910,OR,CLATSOP,ASTORIA,,,ZUNTOLIAS,LOGAN,1885,...,,,,,,,,,,50808.0
50341,1880 IPUMS 100% sample,1880,,,,,,ZUVER,A.E.,1862,...,5310,1202,,,,,,,,50809.0


In [6]:
df_with_matches.shape

(50343, 46)

In [7]:
print("There are {} columns in df.".format(len(df_with_matches.columns)))
df_with_matches.columns

There are 46 columns in df.


Index(['ID', 'Census.Year', 'State/Province_x', 'County_x', 'Place_x',
       'Household Joint ID', 'Joint ID for Matched Records', 'Last.Name',
       'First.Name', 'CalculatedBirthYear', 'Age', 'Sex_x',
       'Color..Race.or.Ethnicity', 'lat_x', 'long_x', 'address', 'MARITAL',
       'WARD', 'ROLL or Sheet#', 'PROFESSION_x', 'Notable_x', 'STREET',
       'PLACEOFBIRTH_x', 'RELIGION_x',
       'NOTE these only apply to narrative answers', 'LIVING W MALE FAMILY?',
       'LIVING W FEMALE FAMILY?', 'LIVING W MALE NONFAMILY?',
       'LIVING W FEMALE NONFAMILY?', 'Cannot Read', 'Cannot Write', 'Sick',
       'Relation to Head of Household',
       'Year of Immigration to Canada if an Immigrant', 'Date of Death',
       'Cause of Death', 'Rank (Military)', 'Enlistment Date',
       'Enlistment Place', 'Date Mustered Out', 'Year of this Record',
       'Last Name MATCH', 'First Name Match', 'Census Year Match',
       'Total of Matches', 'unique_id'],
      dtype='object')

In [8]:
df_with_matches.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50343 entries, 0 to 50342
Data columns (total 46 columns):
ID                                               43056 non-null object
Census.Year                                      50343 non-null object
State/Province_x                                 38399 non-null object
County_x                                         26413 non-null object
Place_x                                          38214 non-null object
Household Joint ID                               497 non-null object
Joint ID for Matched Records                     2875 non-null object
Last.Name                                        50342 non-null object
First.Name                                       50310 non-null object
CalculatedBirthYear                              50331 non-null object
Age                                              49497 non-null object
Sex_x                                            47637 non-null object
Color..Race.or.Ethnicity                  

In [9]:
df_with_matches['unique_id'] = df_with_matches.index + 1

In [10]:
df = df_with_matches.rename(columns = {# Personal Information
                                       'Last.Name': 'last_name',
                                       'First.Name': 'first_name', 
                                       # Places or Geography
                                       'State/Province_x': 'state_or_province', 
                                       'County_x': 'county', 
                                       'Place_x': 'place', 
                                       'WARD': 'ward', 
                                       'STREET': 'street', 
                                       'PLACEOFBIRTH_x': 'place_of_birth',
                                       'ROLL or Sheet#': 'roll_or_sheet',
                                       'lat_x': 'lat',
                                       'long_x': 'long',
                                       # Years
                                       'Census.Year': 'census_year', 
                                       'CalculatedBirthYear': 'calculated_birth_year', 
                                       # Personal information
                                       'Sex_x': 'sex', 
                                       'Color..Race.or.Ethnicity': 'race',
                                       'MARITAL': 'marital_status',
                                       'PROFESSION_x': 'profession',
                                       'Notable_x': 'notable',
                                       'RELIGION_x': 'religion'}) 

In [11]:
chosen_columns = ['last_name', 
                  'first_name',
                  'state_or_province',
                  'county',
                  'place',
                  'ward',
                  'street',
                  'place_of_birth',
                  'roll_or_sheet',
                  'lat',
                  'long',
                  'census_year',
                  'calculated_birth_year',
                  'sex',
                  'race',
                  'marital_status',
                  'profession',
                  'religion',
                  'unique_id']

In [12]:
len(chosen_columns)

19

In [13]:
df = df[chosen_columns]

### 3. Processing Years

In [14]:
def prc_year(row, year_col):
    """
    Processing birth year.
    If birth year is not a number, not in a date format, or is noncompliant,
    it will be converted to NaN.
    """
    
    # Helper functions
    def is_number(num):
        try:
            float(num)
            return True
        except ValueError:
            return False
    
    non_compliant_values = ['-', 'F', '#VALUE!']
    if row[year_col] in non_compliant_values or pd.isnull(row[year_col]):
        return np.nan
    elif is_number(row[year_col]):
        return float(row[year_col])
    elif re.findall('\d{4}', row[year_col]):
        return min([float(i) for i in re.findall('\d{4}', row[year_col])])
    else:
        return np.nan

In [15]:
df['calculated_birth_year'] = df.apply(lambda row: prc_year(row, 'calculated_birth_year'),
                                       axis = 1)

In [16]:
df['census_year'] = df.apply(lambda row: prc_year(row, 'census_year'),
                             axis = 1)

### 4. Preprocessing State or Province

In [17]:
def transform_state(row):    
    """
    This function uses a dictionary
    to covert historical or non-abbreviated states or provinces
    into abbreviated form. 
    An important notice is that Canada West will all be converted into ON (Ontario).
    Unknown values: PANA, ITER, MIL.
    """
    state_dict = {'CanadaWest': 'ON',
                  'Ontario': 'ON',
                  'Canada West': 'ON',
                  'Pennsylvania': 'PA',
                  'Illinois': 'IL',
                  'Wisconsin': 'WI',
                  'District of Columbia': 'DC',
                  'Alabama': 'AL',
                  'Vermont': 'VT',
                  'Michigan': 'MI',
                  'Ohio': 'OH',
                  'Massachussetts': 'MA',
                  'Virginia': 'VA',
                  'Canada West (Ontario)': 'ON',
                  'New York': 'NY',
                  'toledo': 'OH'}
    
    if row['state_or_province'] in state_dict.keys():
        return state_dict[row['state_or_province']]
    else:
        return row['state_or_province']

df['state_or_province'] = df.apply(lambda row: transform_state(row), axis = 1) 

### 5. Race

In [18]:
def transform_race(row):    
    
    if pd.notnull(row['race']):
        row['race'] = row['race'].lower()
        row['race'] = row['race'].replace('\xa0', '')
    
    race_dict = {'mulatto(blackandwhite)': 'MIXED',
                 'm(wonancestry.com)': 'MIXED',
                 'mulatto': 'MIXED',
                 'mullato': 'MIXED',
                 'm': 'MIXED',
                 'm(winancestry.com)': 'MIXED',
                 'black': 'BLACK',
                 'b': 'BLACK',
                 'blk': 'BLACK',
                 'african': 'BLACK',
                 'dark': 'BLACK',
                 'drk': 'BLACK',
                 'african (black)': 'BLACK',
                 '“negro”': 'BLACK',
                 'negro': 'BLACK',
                 'blacj': 'BLACK', 
                 'bkj': 'BLACK', 
                 'white': 'WHITE',
                 'w': 'WHITE',
                 '[w]': 'WHITE',
                 'white': 'WHITE',
                 'white in black household': 'WHITE',
                 'white but passing': 'WHITE',
                 'ancestrysaysw': 'WHITE'}
    
    if row['race'] in race_dict.keys():
        return race_dict[row['race']]
    elif pd.notnull(row['race']):
        return 'OTHERS'
    else:
        return np.nan

df['race'] = df.apply(lambda row: transform_race(row), axis = 1)

### 6. Create Full Name

In [19]:
df['full_name'] = df['first_name'] + '' + df['last_name']

### 7. Columns to Clean

In [20]:
column_to_clean = ['last_name',
                   'first_name',
                   'state_or_province',
                   'county',
                   'place',
                   'ward',
                   'street',
                   'place_of_birth',
                   'marital_status',
                   'race',
                   'sex',
                   'place_of_birth',
                   'marital_status',
                   'profession',
                   'religion']

for i in column_to_clean:
    df[i] = clean(df[i])

In [21]:
column_for_phonetic = ['last_name',
                       'first_name']

for i in column_for_phonetic:
    df[i] = phonetic(df[i], 'soundex')

### 8. Imputation

In [22]:
columns_to_impute_with_unknown = ['state_or_province',
                                  'county',
                                  'place',
                                  'ward',
                                  'street',
                                  'place_of_birth',
                                  'sex',
                                  'race',
                                  'marital_status',
                                  'profession',
                                  'religion']

for i in columns_to_impute_with_unknown:
    df[i] = df[i].fillna('unknown')

In [23]:
columns_to_impute_with_zero = ['roll_or_sheet',
                               'lat',
                               'long',]

for i in columns_to_impute_with_zero:
    df[i] = df[i].fillna(0)

### 9. Linkage - Block by First Name and Last Name

In [24]:
df1 = df
df2 = df

In [25]:
indexer = rl.Index()
indexer.add(Block('full_name', 'full_name'))
record_links = indexer.index(df1, df2)

  verify_integrity=False)


In [26]:
print(len(record_links))

98181


### 10. Compare

In [27]:
comparer = rl.Compare()
comparer.string('first_name', 'first_name', method = 'jarowinkler', threshold = 0.8, label = 'first_name')
comparer.string('full_name', 'full_name', method = 'jarowinkler', threshold = 0.5, label = 'full_name')
comparer.string('last_name', 'last_name', method = 'jarowinkler', threshold = 0.8, label = 'last_name')
comparer.string('state_or_province', 'state_or_province', method = 'jarowinkler', threshold = 0.5, 
                label = 'state_or_province')
comparer.string('county', 'county', method = 'jarowinkler', threshold = 0.3, label = 'county')
comparer.string('place', 'place', method = 'jarowinkler', threshold = 0.3, label = 'place')
comparer.string('place_of_birth', 'place_of_birth', method = 'jarowinkler', threshold = 0.6, label = 'place_of_birth')
comparer.string('race', 'race', method = 'jarowinkler', threshold = 0.7, label = 'race')
comparer.string('sex', 'sex', method = 'jarowinkler', threshold = 0.7, label = 'sex')

comparer.numeric('calculated_birth_year', 'calculated_birth_year', 
                 method = 'gauss', 
                 offset = 1, 
                 scale = 1, 
                 label = 'calculated_birth_year')

<Compare>

In [28]:
compare_vectors_rl = comparer.compute(record_links, df1, df2)
compare_vectors_rl[0:20]

Unnamed: 0,Unnamed: 1,first_name,full_name,last_name,state_or_province,county,place,place_of_birth,race,sex,calculated_birth_year
0,0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,1,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,2,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.776357e-15
1,4,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0053819999999999e-87
1,5,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,2.926048e-98
1,7,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0
1,10,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0
1,12,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0
1,13,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0
1,28,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0


In [29]:
result_rl = compare_vectors_rl[((compare_vectors_rl['first_name'] == 1.0) | 
                                (compare_vectors_rl['last_name'] == 1.0) |
                                (compare_vectors_rl['full_name'] == 1.0)) &
                               ((compare_vectors_rl['state_or_province'] == 1.0) |
                                (compare_vectors_rl['county'] == 1.0) |
                                (compare_vectors_rl['place'] == 1.0) |
                                (compare_vectors_rl['place_of_birth'] == 1.0)) &
                               (compare_vectors_rl['sex'] == 1.0) &
                               (compare_vectors_rl['race'] == 1.0) &
                               (compare_vectors_rl['calculated_birth_year'] > compare_vectors_rl['calculated_birth_year'].mean())].\
                               reset_index()

result_rl = result_rl[result_rl['level_0'] != result_rl['level_1']].reset_index()
result_rl.drop('index', axis = 1, inplace = True)

### 11. Final Processing

In [30]:
def create_indexid(row):
    return "".join(sorted([str(int(i)) for i in [row['level_0'], row['level_1']]]))

result_rl['indexid'] = result_rl.apply(lambda row: create_indexid(row), axis = 1)
result_rl = result_rl.drop_duplicates('indexid')

In [31]:
df_result_rl = pd.DataFrame()

for i in zip(result_rl['level_0'], result_rl['level_1']):
    df_result_rl = df_result_rl.append(df.iloc[i[0]])
    df_result_rl = df_result_rl.append(df.iloc[i[1]])   

In [32]:
df_result_rl.reset_index(inplace = True)
df_result_rl = df_result_rl.drop('index', axis = 1)

In [33]:
num_list = []
for i in range(int(df_result_rl.shape[0]/2)):
    num_list.append(i)
    num_list.append(i)
df_result_rl['dup_pair'] = pd.Series(num_list)

In [34]:
df_result_rl

Unnamed: 0,calculated_birth_year,census_year,county,first_name,full_name,last_name,lat,long,marital_status,place,...,profession,race,religion,roll_or_sheet,sex,state_or_province,street,unique_id,ward,dup_pair
0,1895.0,1920.0,penobscot,E652,ERNEST???,,44.8012,-68.7778,unknown,bangor,...,unknown,black,unknown,0,m,me,unknown,96.0,unknown,0
1,1895.0,1920.0,penobscot,E652,ERNEST???,,44.8016,-68.7712,unknown,bangor,...,unknown,black,unknown,0,m,me,unknown,97.0,unknown,0
2,1845.0,1870.0,warren,A416,ALBERTADAMS,A352,39.4884,-84.0273,unknown,massietwp,...,unknown,mixed,unknown,0,m,oh,unknown,185.0,unknown,1
3,1845.0,1880.0,warren,A416,ALBERTADAMS,A352,39.4884,-84.0273,unknown,massietwp,...,laborer,mixed,unknown,0,m,oh,unknown,200.0,unknown,1
4,1877.0,1910.0,erie,J500,JANEALEXANDER,A425,42.8864,-78.8784,unknown,buffalo,...,unknown,mixed,unknown,0,f,ny,unknown,414.0,unknown,2
5,1878.0,1920.0,wayne,J500,JANEALEXANDER,A425,42.3314,-83.0458,unknown,detroit,...,unknown,mixed,unknown,0,f,mi,unknown,425.0,unknown,2
6,1824.0,1860.0,ontario,S540,SAMUELALLEN,A450,42.8875,-77.2817,unknown,canadaigua,...,unknown,black,unknown,0,m,ny,unknown,457.0,unknown,3
7,1823.0,1870.0,ontario,S540,SAMUELALLEN,A450,42.8875,-77.2817,unknown,canandaigua,...,unknown,black,unknown,0,m,ny,unknown,465.0,unknown,3
8,1824.0,1860.0,ontario,S540,SAMUELALLEN,A450,42.8875,-77.2817,unknown,canadaigua,...,unknown,black,unknown,0,m,ny,unknown,457.0,unknown,4
9,1824.0,1880.0,ontario,S540,SAMUELALLEN,A450,42.8875,-77.2817,unknown,canandaigua,...,barber,black,unknown,0,m,ny,unknown,475.0,unknown,4


In [35]:
df_result_rl.to_csv('20200609_result_1.csv')

### 12. Compare - More Linkage

In [36]:
df1 = df
df2 = df

In [37]:
indexer = rl.Index()
indexer.add(Block('full_name', 'full_name'))
record_links = indexer.index(df1, df2)

In [38]:
comparer = rl.Compare()
comparer.string('first_name', 'first_name', method = 'jarowinkler', threshold = 0.8, label = 'first_name')
comparer.string('full_name', 'full_name', method = 'jarowinkler', threshold = 0.5, label = 'full_name')
comparer.string('last_name', 'last_name', method = 'jarowinkler', threshold = 0.8, label = 'last_name')
comparer.string('state_or_province', 'state_or_province', method = 'jarowinkler', threshold = 0.5, 
                label = 'state_or_province')
comparer.string('county', 'county', method = 'jarowinkler', threshold = 0.3, label = 'county')
comparer.string('place', 'place', method = 'jarowinkler', threshold = 0.3, label = 'place')
comparer.string('place_of_birth', 'place_of_birth', method = 'jarowinkler', threshold = 0.6, label = 'place_of_birth')
comparer.string('race', 'race', method = 'jarowinkler', threshold = 0.7, label = 'race')
comparer.string('sex', 'sex', method = 'jarowinkler', threshold = 0.7, label = 'sex')
comparer.string('profession', 'profession', method = 'jarowinkler', 
                threshold = 0.5, label = 'profession')
comparer.string('religion', 'religion', method = 'jarowinkler', 
                threshold = 0.5, label = 'religion')
comparer.string('marital_status', 'marital_status', method = 'jarowinkler', 
                 threshold = 0.5, label = 'marital_status')

comparer.numeric('calculated_birth_year', 'calculated_birth_year', 
                 method = 'gauss', 
                 offset = 1, 
                 scale = 1, 
                 label = 'calculated_birth_year')

<Compare>

In [39]:
compare_vectors_rl = comparer.compute(record_links, df1, df2)
compare_vectors_rl[0:20]

Unnamed: 0,Unnamed: 1,first_name,full_name,last_name,state_or_province,county,place,place_of_birth,race,sex,profession,religion,marital_status,calculated_birth_year
0,0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,1,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0
1,2,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.776357e-15
1,4,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0053819999999999e-87
1,5,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,2.926048e-98
1,7,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
1,10,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
1,12,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
1,13,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0
1,28,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0


In [40]:
result_rl = compare_vectors_rl[((compare_vectors_rl['first_name'] == 1.0) | 
                                (compare_vectors_rl['last_name'] == 1.0) |
                                (compare_vectors_rl['full_name'] == 1.0)) &
                               ((compare_vectors_rl['state_or_province'] == 1.0) |
                                (compare_vectors_rl['county'] == 1.0) |
                                (compare_vectors_rl['place'] == 1.0) |
                                (compare_vectors_rl['place_of_birth'] == 1.0)) &
                               ((compare_vectors_rl['sex'] == 1.0) |
                                (compare_vectors_rl['profession'] == 1.0) |
                                (compare_vectors_rl['religion'] == 1.0) |
                                (compare_vectors_rl['marital_status'] == 1.0)|
                                (compare_vectors_rl['race'] == 1.0)) &
                               (compare_vectors_rl['calculated_birth_year'] > compare_vectors_rl['calculated_birth_year'].mean())].\
                               reset_index()

result_rl = result_rl[result_rl['level_0'] != result_rl['level_1']].reset_index()
result_rl.drop('index', axis = 1, inplace = True)

In [41]:
result_rl['indexid'] = result_rl.apply(lambda row: create_indexid(row), axis = 1)
result_rl = result_rl.drop_duplicates('indexid')

In [42]:
df_result_rl = pd.DataFrame()

for i in zip(result_rl['level_0'], result_rl['level_1']):
    df_result_rl = df_result_rl.append(df.iloc[i[0]])
    df_result_rl = df_result_rl.append(df.iloc[i[1]])   

In [43]:
df_result_rl.reset_index(inplace = True)
df_result_rl = df_result_rl.drop('index', axis = 1)

In [44]:
num_list = []
for i in range(int(df_result_rl.shape[0]/2)):
    num_list.append(i)
    num_list.append(i)
df_result_rl['dup_pair'] = pd.Series(num_list)

In [45]:
df_result_rl

Unnamed: 0,calculated_birth_year,census_year,county,first_name,full_name,last_name,lat,long,marital_status,place,...,profession,race,religion,roll_or_sheet,sex,state_or_province,street,unique_id,ward,dup_pair
0,1818.0,1880.0,unknown,,------,,0,3.04651e+09,unknown,unknown,...,,white,unknown,9,m,unknown,unknown,5.0,unknown,0
1,1819.0,1880.0,unknown,,------,,0,3.04651e+09,unknown,unknown,...,,black,unknown,9,m,unknown,unknown,6.0,unknown,0
2,1857.0,1880.0,unknown,,------,,0,4.1124e+09,unknown,unknown,...,laborer,mixed,unknown,2,m,unknown,unknown,29.0,unknown,1
3,1857.0,1880.0,unknown,,------,,0,3.04651e+09,unknown,unknown,...,worksonfarm,black,unknown,9,m,unknown,unknown,30.0,unknown,1
4,1857.0,1880.0,unknown,,------,,0,4.1124e+09,unknown,unknown,...,laborer,mixed,unknown,2,m,unknown,unknown,29.0,unknown,2
5,1858.0,1880.0,unknown,,------,,0,3.04651e+09,unknown,unknown,...,,white,unknown,9,m,unknown,unknown,32.0,unknown,2
6,1857.0,1880.0,unknown,,------,,0,3.04651e+09,unknown,unknown,...,worksonfarm,black,unknown,9,m,unknown,unknown,30.0,unknown,3
7,1858.0,1880.0,unknown,,------,,0,3.04651e+09,unknown,unknown,...,,white,unknown,9,m,unknown,unknown,32.0,unknown,3
8,1895.0,1920.0,penobscot,E652,ERNEST???,,44.8012,-68.7778,unknown,bangor,...,unknown,black,unknown,0,m,me,unknown,96.0,unknown,4
9,1895.0,1920.0,penobscot,E652,ERNEST???,,44.8016,-68.7712,unknown,bangor,...,unknown,black,unknown,0,m,me,unknown,97.0,unknown,4


In [46]:
df_result_rl.to_csv('20200609_result_2.csv')

### 13. Compare - Even More Linkage

In [47]:
df1 = df
df2 = df

In [48]:
indexer = rl.Index()
indexer.add(Block('full_name', 'full_name'))
record_links = indexer.index(df1, df2)

In [49]:
comparer = rl.Compare()
comparer.string('first_name', 'first_name', method = 'jarowinkler', threshold = 0.8, label = 'first_name')
comparer.string('full_name', 'full_name', method = 'jarowinkler', threshold = 0.5, label = 'full_name')
comparer.string('last_name', 'last_name', method = 'jarowinkler', threshold = 0.8, label = 'last_name')
comparer.string('state_or_province', 'state_or_province', method = 'jarowinkler', threshold = 0.5, 
                label = 'state_or_province')
comparer.string('county', 'county', method = 'jarowinkler', threshold = 0.3, label = 'county')
comparer.string('place', 'place', method = 'jarowinkler', threshold = 0.3, label = 'place')
comparer.string('place_of_birth', 'place_of_birth', method = 'jarowinkler', threshold = 0.6, label = 'place_of_birth')
comparer.string('ward', 'ward', method = 'jarowinkler', threshold = 0.3, label = 'ward')
comparer.string('street', 'street', method = 'jarowinkler', threshold = 0.3, label = 'street')
comparer.string('race', 'race', method = 'jarowinkler', threshold = 0.7, label = 'race')
comparer.string('sex', 'sex', method = 'jarowinkler', threshold = 0.7, label = 'sex')
comparer.string('profession', 'profession', method = 'jarowinkler', 
                threshold = 0.5, label = 'profession')
comparer.string('religion', 'religion', method = 'jarowinkler', 
                threshold = 0.5, label = 'religion')
comparer.string('marital_status', 'marital_status', method = 'jarowinkler', 
                 threshold = 0.5, label = 'marital_status')

comparer.numeric('calculated_birth_year', 'calculated_birth_year', 
                 method = 'gauss', 
                 offset = 1, 
                 scale = 1, 
                 label = 'calculated_birth_year')

<Compare>

In [50]:
compare_vectors_rl = comparer.compute(record_links, df1, df2)
compare_vectors_rl[0:20]

Unnamed: 0,Unnamed: 1,first_name,full_name,last_name,state_or_province,county,place,place_of_birth,ward,street,race,sex,profession,religion,marital_status,calculated_birth_year
0,0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,1,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0
1,2,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.776357e-15
1,4,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0053819999999999e-87
1,5,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,2.926048e-98
1,7,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
1,10,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
1,12,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
1,13,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0
1,28,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0


In [51]:
result_rl = compare_vectors_rl[((compare_vectors_rl['first_name'] == 1.0) | 
                                (compare_vectors_rl['last_name'] == 1.0) |
                                (compare_vectors_rl['full_name'] == 1.0)) &
                               ((compare_vectors_rl['state_or_province'] == 1.0) |
                                (compare_vectors_rl['county'] == 1.0) |
                                (compare_vectors_rl['place'] == 1.0) |
                                (compare_vectors_rl['place_of_birth'] == 1.0) |
                                (compare_vectors_rl['street'] == 1.0) |
                                (compare_vectors_rl['ward'] == 1.0)) &
                               ((compare_vectors_rl['sex'] == 1.0) |
                                (compare_vectors_rl['profession'] == 1.0) |
                                (compare_vectors_rl['religion'] == 1.0) |
                                (compare_vectors_rl['marital_status'] == 1.0)|
                                (compare_vectors_rl['race'] == 1.0)) &
                               (compare_vectors_rl['calculated_birth_year'] > compare_vectors_rl['calculated_birth_year'].mean())].\
                               reset_index()

result_rl = result_rl[result_rl['level_0'] != result_rl['level_1']].reset_index()
result_rl.drop('index', axis = 1, inplace = True)

In [52]:
result_rl['indexid'] = result_rl.apply(lambda row: create_indexid(row), axis = 1)
result_rl = result_rl.drop_duplicates('indexid')

In [53]:
df_result_rl = pd.DataFrame()

for i in zip(result_rl['level_0'], result_rl['level_1']):
    df_result_rl = df_result_rl.append(df.iloc[i[0]])
    df_result_rl = df_result_rl.append(df.iloc[i[1]])   

In [54]:
df_result_rl.reset_index(inplace = True)
df_result_rl = df_result_rl.drop('index', axis = 1)

In [55]:
num_list = []
for i in range(int(df_result_rl.shape[0]/2)):
    num_list.append(i)
    num_list.append(i)
df_result_rl['dup_pair'] = pd.Series(num_list)

In [56]:
df_result_rl

Unnamed: 0,calculated_birth_year,census_year,county,first_name,full_name,last_name,lat,long,marital_status,place,...,profession,race,religion,roll_or_sheet,sex,state_or_province,street,unique_id,ward,dup_pair
0,1818.0,1880.0,unknown,,------,,0,3.04651e+09,unknown,unknown,...,,white,unknown,9,m,unknown,unknown,5.0,unknown,0
1,1819.0,1880.0,unknown,,------,,0,3.04651e+09,unknown,unknown,...,,black,unknown,9,m,unknown,unknown,6.0,unknown,0
2,1857.0,1880.0,unknown,,------,,0,4.1124e+09,unknown,unknown,...,laborer,mixed,unknown,2,m,unknown,unknown,29.0,unknown,1
3,1857.0,1880.0,unknown,,------,,0,3.04651e+09,unknown,unknown,...,worksonfarm,black,unknown,9,m,unknown,unknown,30.0,unknown,1
4,1857.0,1880.0,unknown,,------,,0,4.1124e+09,unknown,unknown,...,laborer,mixed,unknown,2,m,unknown,unknown,29.0,unknown,2
5,1858.0,1880.0,unknown,,------,,0,3.04651e+09,unknown,unknown,...,,white,unknown,9,m,unknown,unknown,32.0,unknown,2
6,1857.0,1880.0,unknown,,------,,0,3.04651e+09,unknown,unknown,...,worksonfarm,black,unknown,9,m,unknown,unknown,30.0,unknown,3
7,1858.0,1880.0,unknown,,------,,0,3.04651e+09,unknown,unknown,...,,white,unknown,9,m,unknown,unknown,32.0,unknown,3
8,1895.0,1920.0,penobscot,E652,ERNEST???,,44.8012,-68.7778,unknown,bangor,...,unknown,black,unknown,0,m,me,unknown,96.0,unknown,4
9,1895.0,1920.0,penobscot,E652,ERNEST???,,44.8016,-68.7712,unknown,bangor,...,unknown,black,unknown,0,m,me,unknown,97.0,unknown,4


In [57]:
df_result_rl.to_csv('20200609_result.csv')