### 1. Setting Up

This is a preliminary work to identify records that belong to each person using the recordlinkage package.

In [1]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
!pip install recordlinkage

Collecting recordlinkage
[?25l  Downloading https://files.pythonhosted.org/packages/db/26/babbca39d74824e8bc17428a8eb04951a1d63318af7d02beeb2106a1ec26/recordlinkage-0.14-py3-none-any.whl (944kB)
[K     |████████████████████████████████| 952kB 2.9MB/s eta 0:00:01
[?25hCollecting jellyfish>=0.5.4 (from recordlinkage)
[?25l  Downloading https://files.pythonhosted.org/packages/3f/80/bcacc7affb47be7279d7d35225e1a932416ed051b315a7f9df20acf04cbe/jellyfish-0.7.2.tar.gz (133kB)
[K     |████████████████████████████████| 143kB 6.2MB/s eta 0:00:01
Building wheels for collected packages: jellyfish
  Building wheel for jellyfish (setup.py) ... [?25ldone
[?25h  Created wheel for jellyfish: filename=jellyfish-0.7.2-cp37-cp37m-macosx_10_9_x86_64.whl size=23521 sha256=5135d4aeaea7c6a289a3b7bf619858a4b4045a73fe732209fdd52f0f55b443ed
  Stored in directory: /Users/huynguyen/Library/Caches/pip/wheels/e8/fe/99/d8fa8f2ef7b82a625b0b77a84d319b0b50693659823c4effb4
Successfully built jellyfish
Installing c

In [2]:
import pandas as pd
import numpy as np
import os

# Record Linkage
import recordlinkage as rl
from recordlinkage.index import Block
from recordlinkage.index import SortedNeighbourhood
from recordlinkage.preprocessing import clean

# Regular expression operations
import re

In [3]:
# Helper functions
def is_number(num):
    try:
        float(num)
        return True
    except ValueError:
        return False

In [4]:
os.chdir("/Users/huynguyen/Desktop/DSI_DFG")
df = pd.read_excel(os.getcwd() + 
                   "/1860-1930 database for family tracking Adam manual AFAM CANADA current.xlsx")
df.head(5)

Unnamed: 0,CalculatedBirthYear,ID,Census.Year,State/Province,County,Place,Last.Name,First.Name,Age,Sex,...,Enlistment Place,Date Mustered Out,Year of this Record,Unnamed: 41,Unnamed: 42,Last Name MATCH,First Name Match,Census Year Match,Unnamed: 46,Total of Matches
0,1757,,1850,LA,,New Orleans,Jeffries,?Ucky,93,M,...,,,,,,,,,,
1,1757,358.0,1860,MN,PINE,CHENGWATANA,LUSSENE,JOSEPH,103,M,...,,,,,,,,,,
2,1759,,1850,NJ,,Newark,Freeman,Flora,91,F,...,,,,,,,,,,
3,1759,,1864,CanadaWest,Essex,Windsor,LYNCH,MINTIE,105,F,...,,,,,,,,,,
4,1760,,1850,,,,Gorett,Margaret,90,F,...,,,,,,,,,,


In [5]:
df.shape

(50858, 48)

In [6]:
df = df.dropna(how = 'all')

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50849 entries, 0 to 50848
Data columns (total 48 columns):
CalculatedBirthYear                              50276 non-null object
ID                                               43561 non-null object
Census.Year                                      50849 non-null object
State/Province                                   38866 non-null object
County                                           26463 non-null object
Place                                            38691 non-null object
Last.Name                                        50848 non-null object
First.Name                                       50816 non-null object
Age                                              50004 non-null object
Sex                                              48143 non-null object
Color..Race.or.Ethnicity                         50531 non-null object
lat                                              23325 non-null object
long                                   

### 2. Choose Columns to Include for Linkage

In [8]:
df.columns

Index(['CalculatedBirthYear', 'ID', 'Census.Year', 'State/Province', 'County',
       'Place', 'Last.Name', 'First.Name', 'Age', 'Sex',
       'Color..Race.or.Ethnicity', 'lat', 'long', 'address', 'MARITAL',
       'Unnamed: 15', 'WARD', 'ROLL or Sheet#', 'PROFESSION', 'Notable',
       'STREET', 'PLACEOFBIRTH', 'RELIGION',
       'NOTE these only apply to narrative answers', 'Unnamed: 24',
       'LIVING W MALE FAMILY?', 'LIVING W FEMALE FAMILY?',
       'LIVING W MALE NONFAMILY?', 'LIVING W FEMALE NONFAMILY?', 'Cannot Read',
       'Cannot Write', 'Sick', 'Relation to Head of Household',
       'Year of Immigration to Canada if an Immigrant', 'Date of Death',
       'Cause of Death', 'Rank (Military)', 'Enlistment Date',
       'Enlistment Place', 'Date Mustered Out', 'Year of this Record',
       'Unnamed: 41', 'Unnamed: 42', 'Last Name MATCH', 'First Name Match',
       'Census Year Match', 'Unnamed: 46', 'Total of Matches'],
      dtype='object')

Check the percentage of NaNs in all columns:
* Columns with no NaN values: Census.Year
* Columns with only few NaN values (less than 25%): Last.Name, First.Name, Color..Race.or.Ethnicity, CalculatedBirthYear, Age, long, address, ID, State/Province, Place.
* Columns with more than 25% but less than 50% missing values: RELIGION, PLACEOFBIRTH, ROLL or Sheet#, NOTE these only apply to narrative answers, WARD, STREET, MARITAL, County, LIVING W FEMALE FAMILY?, Sick, etc.

In [9]:
(df.isnull().sum(axis = 0)/df.shape[0]).sort_values()

Census.Year                                      0.000000
Last.Name                                        0.000020
First.Name                                       0.000649
Color..Race.or.Ethnicity                         0.006254
CalculatedBirthYear                              0.011269
Age                                              0.016618
Sex                                              0.053216
long                                             0.079766
address                                          0.099294
ID                                               0.143326
State/Province                                   0.235659
Place                                            0.239100
RELIGION                                         0.433047
PLACEOFBIRTH                                     0.433204
ROLL or Sheet#                                   0.437767
NOTE these only apply to narrative answers       0.450235
WARD                                             0.471140
STREET        

We will choose the following columns as they have few missing values and relatively constant during the life splan of the person:

1. Calculated Birth Year
2. Census Year
3. State/Province
4. County
5. Place
6. Last.Name
7. First.Name
9. Sex
10. Color..Race.or.Ethnicity
15. PLACEOFBIRTH
16. RELIGION

In [10]:
chosen_columns = ['CalculatedBirthYear', 
                  'Census.Year',
                  'State/Province',
                  'County',
                  'Place',
                  'Last.Name',
                  'First.Name',
                  'Sex',
                  'Color..Race.or.Ethnicity',
                  'PLACEOFBIRTH',
                  'RELIGION']

In [11]:
df_filtered = df[chosen_columns].rename(columns = {'CalculatedBirthYear': 'birth_year',
                                                   'Census.Year': 'census_year',
                                                   'State/Province': 'state_or_province',
                                                   'County': 'county',
                                                   'Place': 'place',
                                                   'Last.Name': 'last_name',
                                                   'First.Name': 'first_name',
                                                   'Sex': 'sex',
                                                   'Color..Race.or.Ethnicity': 'race',
                                                   'PLACEOFBIRTH': 'place_of_birth',
                                                   'RELIGION': 'religion'})

### 3. Preprocessing each column

### 3.1. Years (Birth Year and Census Year)

Transform all birth year into numeric values.

In [12]:
df_filtered['birth_year'].unique()

array([1757, 1759, 1760, 1762, 1764, 1765, 1770, 1771, 1772, 1773, 1774,
       1775, 1776, 1777, 1778, 1779, 1780, 1781, 1782, 1783, 1784, 1785,
       1786, 1787, 1788, 1789, 1790, 1791, 1792, 1793, 1794, 1795, 1796,
       1797, 1798, 1799, 1800, 1801, 1802, 1803, 1804, 1805, 1806, 1807,
       1808, 1809, 1810, 1811, 1812, 1813, 1814, 1815, 1816, 1817, 1818,
       1819, 1820, 1821, 1822, 1823, 1824, 1825, 1826, 1827, 1828, 1829,
       1830, 1831, 1832, 1833, 1834, 1835, 1836, 1837, 1838, 1839, 1840,
       1841, 1842, 1843, 1844, 1845, 1846, 1847, 1848, 1849, 1850, 1851,
       1852, 1853, 1854, 1855, 1856, 1857, 1858, 1859, 1860, 1861, 1862,
       1863, 1863.5, 1864, 1865, 1866, 1867, 1868, 1869, 1870, 1871, 1872,
       1873, 1874, 1875, 1876, 1877, 1878, 1879, 1880, 1881, 1882, 1883,
       1884, 1885, 1886, 1887, 1888, 1889, 1890, 1891, 1892, 1893, 1894,
       1895, 1896, 1897, 1898, 1899, 1900, 1901, 1902, 1903, 1904, 1905,
       1906, 1907, 1908, 1909, 1910, 1911, 1912, 

In [13]:
def prc_census_year(row, year_col):
    "Processing census year"
    non_compliant_values = ['-', 'F']
    if row[year_col] in non_compliant_values or pd.isnull(row[year_col]):
        return np.nan
    elif is_number(row[year_col]):
        return float(row[year_col])
    elif re.findall('\d{4}', row[year_col]):
        return min([float(i) for i in re.findall('\d{4}', row[year_col])])
    else:
        return np.nan

df_filtered['birth_year'] = df_filtered.apply(lambda row: prc_census_year(row, 'birth_year'), 
                                              axis = 1)
df_filtered['census_year'] = df_filtered.apply(lambda row: prc_census_year(row, 'census_year'), 
                                               axis = 1)

In [14]:
df_filtered['birth_year'].unique()

array([1757. , 1759. , 1760. , 1762. , 1764. , 1765. , 1770. , 1771. ,
       1772. , 1773. , 1774. , 1775. , 1776. , 1777. , 1778. , 1779. ,
       1780. , 1781. , 1782. , 1783. , 1784. , 1785. , 1786. , 1787. ,
       1788. , 1789. , 1790. , 1791. , 1792. , 1793. , 1794. , 1795. ,
       1796. , 1797. , 1798. , 1799. , 1800. , 1801. , 1802. , 1803. ,
       1804. , 1805. , 1806. , 1807. , 1808. , 1809. , 1810. , 1811. ,
       1812. , 1813. , 1814. , 1815. , 1816. , 1817. , 1818. , 1819. ,
       1820. , 1821. , 1822. , 1823. , 1824. , 1825. , 1826. , 1827. ,
       1828. , 1829. , 1830. , 1831. , 1832. , 1833. , 1834. , 1835. ,
       1836. , 1837. , 1838. , 1839. , 1840. , 1841. , 1842. , 1843. ,
       1844. , 1845. , 1846. , 1847. , 1848. , 1849. , 1850. , 1851. ,
       1852. , 1853. , 1854. , 1855. , 1856. , 1857. , 1858. , 1859. ,
       1860. , 1861. , 1862. , 1863. , 1863.5, 1864. , 1865. , 1866. ,
       1867. , 1868. , 1869. , 1870. , 1871. , 1872. , 1873. , 1874. ,
      

In [15]:
df_filtered['census_year'].unique()

array([1850., 1860., 1864., 1869., 1861., 1880., 1870., 1900., 1871.,
       1891., 1910., 1851., 1899., 1881., 1920., 1855., 1912., 1911.,
       1865., 1867.,   nan, 1930., 1872., 1878., 1901., 1892., 1819.])

### 3.2. States or Provinces

In [16]:
df_filtered['state_or_province'].unique()

array(['LA', 'MN', 'NJ', 'CanadaWest', nan, 'Ontario', 'NY', 'MA', 'PA',
       'MS', 'VT', 'ME', 'MI', 'OH', 'IN', 'TN', 'VA', 'IL', 'WA', 'WI',
       'CA', 'AR', 'AL', 'MO', 'RI', 'DC', 'NH', 'MD', 'CT', 'KS', 'IA',
       'TX', 'UT', 'Canada West', 'NE', 'Pennsylvania', 'OR', 'Illinois',
       'Wisconsin', 'KY', 'District of Columbia', 'Alabama', 'NC',
       'Vermont', 'GA', 'Michigan', 'Ohio', 'DE', 'WV', 'FL',
       'Massachussetts', 'NV', 'SC', 'MT', 'OK', 'CO', 'Virginia', 'NM',
       'ID', 'HI', 'SD', 'Canada West (Ontario)', 'ND', 'AZ', 'WY',
       'toledo', 'New York', 'MIL', 'VI', 'AK', 'PR', 'ITER', 'PANA'],
      dtype=object)

In [17]:
def transform_state(row):    
    
    state_dict = {'CanadaWest': 'ON',
                  'Ontario': 'ON',
                  'Canada West': 'ON',
                  'Pennsylvania': 'PA',
                  'Illinois': 'IL',
                  'Wisconsin': 'WI',
                  'District of Columbia': 'DC',
                  'Alabama': 'AL',
                  'Vermont': 'VT',
                  'Michigan': 'MI',
                  'Ohio': 'OH',
                  'Massachussetts': 'MA',
                  'Virginia': 'VA',
                  'Canada West (Ontario)': 'ON',
                  'New York': 'NY',
                  'toledo': 'OH'}
    
    if row['state_or_province'] in state_dict.keys():
        return state_dict[row['state_or_province']]
    else:
        return row['state_or_province']

df_filtered['state_or_province'] = df_filtered\
                    .apply(lambda row: transform_state(row), axis = 1) 

In [18]:
df_filtered['state_or_province'].unique()

array(['LA', 'MN', 'NJ', 'ON', nan, 'NY', 'MA', 'PA', 'MS', 'VT', 'ME',
       'MI', 'OH', 'IN', 'TN', 'VA', 'IL', 'WA', 'WI', 'CA', 'AR', 'AL',
       'MO', 'RI', 'DC', 'NH', 'MD', 'CT', 'KS', 'IA', 'TX', 'UT', 'NE',
       'OR', 'KY', 'NC', 'GA', 'DE', 'WV', 'FL', 'NV', 'SC', 'MT', 'OK',
       'CO', 'NM', 'ID', 'HI', 'SD', 'ND', 'AZ', 'WY', 'MIL', 'VI', 'AK',
       'PR', 'ITER', 'PANA'], dtype=object)

### 3.3. Transform Races

In [19]:
def transform_race(row):    
    
    if pd.notnull(row['race']):
        row['race'] = row['race'].lower()
        row['race'] = row['race'].replace('\xa0', '')
    
    race_dict = {'mulatto(blackandwhite)': 'MIXED',
                 'm(wonancestry.com)': 'MIXED',
                 'mulatto': 'MIXED',
                 'mullato': 'MIXED',
                 'm': 'MIXED',
                 'm(winancestry.com)': 'MIXED',
                 'black': 'BLACK',
                 'b': 'BLACK',
                 'blk': 'BLACK',
                 'african': 'BLACK',
                 'dark': 'BLACK',
                 'drk': 'BLACK',
                 'african (black)': 'BLACK',
                 '“negro”': 'BLACK',
                 'negro': 'BLACK',
                 'blacj': 'BLACK', 
                 'bkj': 'BLACK', 
                 'white': 'WHITE',
                 'w': 'WHITE',
                 '[w]': 'WHITE',
                 'white': 'WHITE',
                 'white in black household': 'WHITE',
                 'white but passing': 'WHITE',
                 'ancestrysaysw': 'WHITE'}
    
    if row['race'] in race_dict.keys():
        return race_dict[row['race']]
    elif pd.notnull(row['race']):
        return 'OTHERS'
    else:
        return np.nan

df_filtered['race'] = df_filtered.apply(lambda row: transform_race(row), axis = 1)

In [20]:
df_filtered['race'].unique()

array(['MIXED', 'BLACK', 'OTHERS', nan, 'WHITE'], dtype=object)

### 3.4. Processing Text Fields

Text fields include:
1. County
2. Place
3. Last Name
4. First Name
5. Place of Birth
6. Religion

In [21]:
column_to_clean = ['county',
                   'place',
                   'last_name',
                   'first_name',
                   'race',
                   'sex',
                   'place_of_birth',
                   'religion']
for i in column_to_clean:
    df_filtered[i] = clean(df_filtered[i])

### 3.5. Create Full Name field

In [22]:
df_filtered['full_name'] = df_filtered['first_name'] + df_filtered['last_name']

### 3.6. Summary of types

In [23]:
df_filtered.dtypes

birth_year           float64
census_year          float64
state_or_province     object
county                object
place                 object
last_name             object
first_name            object
sex                   object
race                  object
place_of_birth        object
religion              object
full_name             object
dtype: object

### 4. Linkage - Block by First Name and Last Name

In [24]:
df1 = df_filtered
df2 = df_filtered

In [25]:
indexer = rl.Index()
indexer.add(Block('full_name', 'full_name'))
record_links = indexer.index(df1, df2)

In [26]:
print(len(record_links))

121618


#### 4.1. Define comparison criteria:

1. First Name and Last Name: 75% (based on the Jaro-Winkler algorithm)
2. Place of birth: 75% (based on the Jaro-Winkler algorithm)
3. State: 50% (based on the Jaro-Winkler algorithm)
4. County and Place: 30% (based on the Jaro-Winkler algorithm)
5. Exact gender match
6. Birth year: assuming Gaussian score distribution with offset & scale = 1.

In [27]:
comparer = rl.Compare()
comparer.string('first_name', 'first_name', method = 'jarowinkler', threshold = 0.8, label = 'first_name')
comparer.string('last_name', 'last_name', method = 'jarowinkler', threshold = 0.8, label = 'last_name')
comparer.string('state_or_province', 'state_or_province', method = 'jarowinkler', threshold = 0.5, label = 'state')
comparer.string('county', 'county', method = 'jarowinkler', threshold = 0.3, label = 'county')
comparer.string('place', 'place', method = 'jarowinkler', threshold = 0.3, label = 'place')
comparer.string('place_of_birth', 'place_of_birth', method = 'jarowinkler', threshold = 0.6, label = 'place_of_birth')
comparer.exact('race', 'race', label = 'race')
comparer.exact('sex', 'sex', label = 'sex')
comparer.numeric('birth_year', 'birth_year', 
                 method = 'gauss', 
                 offset = 1, 
                 scale = 1, 
                 label = 'birth_year')

<Compare>

In [28]:
compare_vectors_rl = comparer.compute(record_links, df1, df2)
compare_vectors_rl[0:20]

Unnamed: 0,Unnamed: 1,first_name,last_name,state,county,place,place_of_birth,race,sex,birth_year
0,0,1.0,1.0,1.0,0.0,1.0,1.0,1,1,1.0
1,1,1.0,1.0,1.0,1.0,1.0,0.0,1,1,1.0
2,2,1.0,1.0,1.0,0.0,1.0,1.0,1,1,1.0
3,3,1.0,1.0,1.0,1.0,1.0,1.0,1,1,1.0
4,4,1.0,1.0,0.0,0.0,0.0,1.0,1,1,1.0
5,5,1.0,1.0,1.0,1.0,1.0,1.0,1,1,1.0
6,6,1.0,1.0,0.0,0.0,0.0,1.0,1,1,1.0
6,5201,1.0,1.0,0.0,0.0,0.0,0.0,1,1,0.0
6,6528,1.0,1.0,0.0,0.0,0.0,0.0,0,1,0.0
6,6529,1.0,1.0,0.0,0.0,0.0,0.0,0,1,0.0


#### 4.2. Define selection criteria

1. First Name and Last Name match
2. Place of birth match
3. State match
4. Exact gender match
5. Birth year match

In [29]:
result_rl = compare_vectors_rl[((compare_vectors_rl['first_name'] == 1.0) | 
                                (compare_vectors_rl['last_name'] == 1.0)) &
                                (compare_vectors_rl['sex'] == 1) &
                                (compare_vectors_rl['race'] == 1) &
                                (compare_vectors_rl['state'] == 1.0) &
                                (compare_vectors_rl['birth_year'] > compare_vectors_rl['birth_year'].mean())].\
                                reset_index()
result_rl = result_rl[result_rl['level_0'] != result_rl['level_1']].reset_index()
result_rl.drop('index', axis = 1, inplace = True)

In [30]:
result_rl

Unnamed: 0,level_0,level_1,first_name,last_name,state,county,place,place_of_birth,race,sex,birth_year
0,24451,25502,1.0,1.0,1.0,1.0,1.0,0.0,1,1,1.0
1,25502,24451,1.0,1.0,1.0,1.0,1.0,0.0,1,1,1.0
2,25503,25504,1.0,1.0,1.0,0.0,1.0,0.0,1,1,1.0
3,25504,25503,1.0,1.0,1.0,0.0,1.0,0.0,1,1,1.0
4,35410,36261,1.0,1.0,1.0,1.0,1.0,0.0,1,1,1.0
...,...,...,...,...,...,...,...,...,...,...,...
2287,49813,49812,1.0,1.0,1.0,1.0,1.0,0.0,1,1,1.0
2288,50040,50041,1.0,1.0,1.0,1.0,1.0,0.0,1,1,1.0
2289,50041,50040,1.0,1.0,1.0,1.0,1.0,0.0,1,1,1.0
2290,50211,50212,1.0,1.0,1.0,1.0,1.0,0.0,1,1,1.0


#### 4.3. Final Processing

* Create a combined score (sum of all scores).
* Rank the matched records by this combined score, grouped by each person.
* Only select the best match for each person.

In [31]:
result_rl['combined_score'] = result_rl.iloc[:, 2:].sum(axis = 1)
result_rl['combined_score_rank'] = result_rl.groupby('level_0')['combined_score'].rank("dense", ascending = False)
result_rl = result_rl[result_rl['combined_score_rank'] == 1.0]

In [32]:
df_result_rl = pd.DataFrame()
for i in zip(result_rl['level_0'], result_rl['level_1']):
    df_result_rl = df_result_rl.append(df_filtered.iloc[i[0]])
    df_result_rl = df_result_rl.append(df_filtered.iloc[i[1]])   
df_result_rl.reset_index(inplace = True)
df_result_rl = df_result_rl.drop('index', axis = 1)

In [33]:
num_list = []
for i in range(int(df_result_rl.shape[0]/2)):
    num_list.append(i)
    num_list.append(i)
df_result_rl['matched_pair'] = pd.Series(num_list)

In [34]:
df_result_rl

Unnamed: 0,birth_year,census_year,county,first_name,full_name,last_name,place,place_of_birth,race,religion,sex,state_or_province,matched_pair
0,1857.0,1920.0,lake,sarah,sarahbrown,brown,painsville,,black,,f,OH,0
1,1858.0,1910.0,ashtabula,sarah,sarahbrown,brown,geneva,,black,,f,OH,0
2,1858.0,1910.0,ashtabula,sarah,sarahbrown,brown,geneva,,black,,f,OH,1
3,1857.0,1920.0,lake,sarah,sarahbrown,brown,painsville,,black,,f,OH,1
4,1858.0,1900.0,worcester,sarah,sarahbrown,brown,worcester,,black,,f,MA,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4463,1910.0,1910.0,franklin,manuel,manuelleitchfield,leitchfield,frankfort,,black,,m,KY,2231
4464,1911.0,1920.0,penobscot,arvilla,arvillamcintire,mcintire,bangor,,black,,f,ME,2232
4465,1911.0,1920.0,penobscot,arvilla,arvillamcintire,mcintire,bangor,,black,,f,ME,2232
4466,1911.0,1920.0,penobscot,arvilla,arvillamcintire,mcintire,bangor,,black,,f,ME,2233


In [35]:
df_result_rl.to_csv('result_fullname_linkage.csv')

### 5. Linkage - SortedNeighbourhood

In [36]:
indexer = rl.Index()
indexer.add(SortedNeighbourhood('full_name', 'full_name'))
record_links = indexer.index(df1, df2)

In [37]:
print(len(record_links))

256520


#### 5.1. Comparison Criteria

In [38]:
comparer = rl.Compare()
comparer.string('first_name', 'first_name', method = 'jarowinkler', threshold = 0.8, label = 'first_name')
comparer.string('last_name', 'last_name', method = 'jarowinkler', threshold = 0.8, label = 'last_name')
comparer.string('state_or_province', 'state_or_province', method = 'jarowinkler', threshold = 0.5, label = 'state')
comparer.string('county', 'county', method = 'jarowinkler', threshold = 0.3, label = 'county')
comparer.string('place', 'place', method = 'jarowinkler', threshold = 0.3, label = 'place')
comparer.string('place_of_birth', 'place_of_birth', method = 'jarowinkler', threshold = 0.6, label = 'place_of_birth')
comparer.exact('race', 'race', label = 'race')
comparer.exact('sex', 'sex', label = 'sex')

comparer.numeric('birth_year', 'birth_year', 
                 method = 'gauss', 
                 offset = 1, 
                 scale = 1, 
                 label = 'birth_year')

<Compare>

In [39]:
compare_vectors_sn = comparer.compute(record_links, df1, df2)
compare_vectors_sn[0:20]

Unnamed: 0,Unnamed: 1,first_name,last_name,state,county,place,place_of_birth,race,sex,birth_year
0,41902,0.0,0.0,0.0,0.0,1.0,0.0,1,0,0.0
1,50171,1.0,0.0,1.0,1.0,1.0,0.0,0,1,0.0
2,37255,1.0,0.0,1.0,0.0,1.0,1.0,1,1,0.0
3,2887,1.0,0.0,0.0,0.0,0.0,1.0,0,1,0.0
4,39779,1.0,0.0,0.0,0.0,0.0,0.0,0,1,0.0
5,18281,1.0,0.0,0.0,0.0,1.0,1.0,0,0,0.0
6,15152,1.0,1.0,0.0,0.0,0.0,0.0,0,1,0.0
6,24621,1.0,1.0,0.0,0.0,0.0,0.0,1,1,0.0
5201,15152,1.0,1.0,1.0,0.0,1.0,1.0,0,1,1.0053819999999999e-87
5201,24621,1.0,1.0,0.0,0.0,0.0,1.0,1,1,9.828412999999999e-237


#### 5.2. Results

In [40]:
result_sn = compare_vectors_sn[((compare_vectors_sn['first_name'] == 1.0) | (compare_vectors_sn['last_name'] == 1.0)) &
                               (compare_vectors_sn['race'] == 1) &
                               (compare_vectors_sn['sex'] == 1) &
                               (compare_vectors_sn['state'] == 1.0) &
                               (compare_vectors_sn['birth_year'] > compare_vectors_sn['birth_year'].mean())].\
                               reset_index()
result_sn = result_sn[result_sn['level_0'] != result_sn['level_1']].reset_index()
result_sn.drop('index', axis = 1, inplace = True)

In [41]:
result_sn

Unnamed: 0,level_0,level_1,first_name,last_name,state,county,place,place_of_birth,race,sex,birth_year
0,358,359,1.0,0.0,1.0,0.0,1.0,0.0,1,1,1.0
1,564,616,1.0,1.0,1.0,1.0,0.0,0.0,1,1,1.0
2,11597,11598,1.0,0.0,1.0,1.0,1.0,0.0,1,1,1.0
3,682,683,1.0,0.0,1.0,1.0,1.0,0.0,1,1,1.0
4,684,685,1.0,1.0,1.0,1.0,1.0,0.0,1,1,1.0
...,...,...,...,...,...,...,...,...,...,...,...
3809,48229,48496,1.0,1.0,1.0,0.0,1.0,0.0,1,1,0.5
3810,49259,49518,1.0,1.0,1.0,1.0,1.0,0.0,1,1,1.0
3811,49369,49810,1.0,0.0,1.0,1.0,1.0,0.0,1,1,0.5
3812,50053,49815,1.0,0.0,1.0,0.0,1.0,0.0,1,1,1.0


#### 5.3. Final Processing

In [42]:
result_sn['combined_score'] = result_sn.iloc[:, 2:].sum(axis = 1)
result_sn['combined_score_rank'] = result_sn.groupby('level_0')['combined_score'].rank("dense", ascending = False)
result_sn = result_sn[result_sn['combined_score_rank'] == 1.0]

In [43]:
df_result_sn = pd.DataFrame()
for i in zip(result_sn['level_0'], result_sn['level_1']):
    df_result_sn = df_result_sn.append(df_filtered.iloc[i[0]])
    df_result_sn = df_result_sn.append(df_filtered.iloc[i[1]])   
df_result_sn.reset_index(inplace = True)
df_result_sn = df_result_sn.drop('index', axis = 1)

In [44]:
num_list = []
for i in range(int(df_result_sn.shape[0]/2)):
    num_list.append(i)
    num_list.append(i)
df_result_sn['matched_pair'] = pd.Series(num_list)

In [45]:
df_result_sn

Unnamed: 0,birth_year,census_year,county,first_name,full_name,last_name,place,place_of_birth,race,religion,sex,state_or_province,matched_pair
0,1798.0,1850.0,,diana,dianagears,gears,new york,canada,black,,f,NY,0
1,1798.0,1850.0,,diana,dianagelloson,gelloson,new york,usa,black,,f,NY,0
2,1802.0,1860.0,aroostook,thomas,thomasbarnet,barnet,houlton,,black,,m,ME,1
3,1803.0,1870.0,aroostook,thomas,thomasbarnett,barnett,linneus,,black,,m,ME,1
4,1841.0,1870.0,wayne,mary,marymoore,moore,6 wddetroit,,mixed,,f,MI,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7103,1909.0,1910.0,kings,elizabeth,elizabethkearner,kearner,brooklyn,,mixed,,f,NY,3551
7104,1910.0,1910.0,wayne,edward,edwardmadison,madison,detroit,,black,,m,MI,3552
7105,1909.0,1910.0,suffolk,edward,edwardlynch,lynch,boston,,black,,m,MA,3552
7106,1910.0,1910.0,middlesex,johnj,johnjshephard,shephard,malden,,black,,m,MA,3553


In [46]:
df_result_sn.to_csv('result_fullname_linkage_sortedneighbourhood.csv')

### 6. Unsupervised Learning

### 6.1. Processing data for predictions

In [47]:
output_columns = ['id1',
                  'id2',
                  'first_name', 
                  'last_name', 
                  'state', 
                  'race', 
                  'county', 
                  'place', 
                  'place_of_birth', 
                  'sex', 
                  'birth_year']

In [48]:
result_rl_for_preds = compare_vectors_sn.reset_index().rename(columns = {'level_0': 'id1',
                                                                         'level_1': 'id2'})[output_columns]
result_rl_for_preds = result_rl_for_preds.set_index(['id1', 'id2'])

In [49]:
result_sn_for_preds = compare_vectors_rl.reset_index().rename(columns = {'level_0': 'id1',
                                                                         'level_1': 'id2'})[output_columns]
result_sn_for_preds = result_sn_for_preds.set_index(['id1', 'id2'])

### 6.2. K-means

In [50]:
kmeans = rl.KMeansClassifier()
result_kmeans_rl = kmeans.fit_predict(result_rl_for_preds)

In [51]:
len(result_kmeans_rl)

122344

In [None]:
df_result_rl_kmeans = pd.DataFrame()
for i in result_kmeans_rl:
    df_result_rl_kmeans = df_result_rl_kmeans.append(df_filtered.iloc[i[0]])
    df_result_rl_kmeans = df_result_rl_kmeans.append(df_filtered.iloc[i[1]])   
df_result_rl_kmeans.reset_index(inplace = True)
df_result_rl_kmeans = df_result_rl_kmeans.drop('index', axis = 1)

num_list = []
for i in range(int(df_result_rl_kmeans.shape[0]/2)):
    num_list.append(i)
    num_list.append(i)
df_result_rl_kmeans['matched_pair'] = pd.Series(num_list)

df_result_rl_kmeans.to_csv('result_fullname_linkage_kmeans.csv')

In [52]:
kmeans = rl.KMeansClassifier()
result_kmeans_sn = kmeans.fit_predict(result_sn_for_preds)

In [53]:
len(result_kmeans_sn)

66768

In [None]:
df_result_sn_kmeans = pd.DataFrame()
for i in result_kmeans_sn:
    df_result_sn_kmeans = df_result_sn_kmeans.append(df_filtered.iloc[i[0]])
    df_result_sn_kmeans = df_result_sn_kmeans.append(df_filtered.iloc[i[1]])   
df_result_sn_kmeans.reset_index(inplace = True)
df_result_sn_kmeans = df_result_sn_kmeans.drop('index', axis = 1)

num_list = []
for i in range(int(df_result_sn_kmeans.shape[0]/2)):
    num_list.append(i)
    num_list.append(i)
df_result_sn_kmeans['matched_pair'] = pd.Series(num_list)

df_result_rl_kmeans.to_csv('result_fullname_linkage_kmeans_sortedneighbourhood.csv')

### 6.3. Expectation/Conditional Maximization Algorithm

In [54]:
ecm = rl.ECMClassifier(binarize = 0.8)
result_ecm_rl = ecm.fit_predict(result_rl_for_preds)

In [55]:
len(result_ecm_rl)

54203

In [None]:
df_result_rl_ecm = pd.DataFrame()
for i in result_ecm_rl:
    df_result_rl_ecm = df_result_rl_ecm.append(df_filtered.iloc[i[0]])
    df_result_rl_ecm = df_result_rl_ecm.append(df_filtered.iloc[i[1]])   
df_result_rl_ecm.reset_index(inplace = True)
df_result_rl_ecm = df_result_rl_ecm.drop('index', axis = 1)

num_list = []
for i in range(int(df_result_rl_ecm.shape[0]/2)):
    num_list.append(i)
    num_list.append(i)
df_result_rl_ecm['matched_pair'] = pd.Series(num_list)

df_result_rl_ecm.to_csv('result_fullname_linkage_ecm.csv')

In [56]:
ecm = rl.ECMClassifier(binarize = 0.8)
result_ecm_sn = ecm.fit_predict(result_sn_for_preds)

In [57]:
len(result_ecm_sn)

53352

In [None]:
df_result_sn_ecm = pd.DataFrame()
for i in result_ecm_sn:
    df_result_sn_ecm = df_result_sn_ecm.append(df_filtered.iloc[i[0]])
    df_result_sn_ecm = df_result_sn_ecm.append(df_filtered.iloc[i[1]])   
df_result_sn_ecm.reset_index(inplace = True)
df_result_sn_ecm = df_result_sn_ecm.drop('index', axis = 1)

num_list = []
for i in range(int(df_result_sn_ecm.shape[0]/2)):
    num_list.append(i)
    num_list.append(i)
df_result_sn_ecm['matched_pair'] = pd.Series(num_list)

df_result_sn_ecm.to_csv('result_fullname_linkage_ecm_sortedneighbourhood.csv')