### 1. Setting Up

This is a preliminary work to identify records that belong to each person using the recordlinkage package.

In [1]:
import pandas as pd
import numpy as np

# Record Linkage
import recordlinkage as rl
from recordlinkage.index import Block
from recordlinkage.preprocessing import clean

# Regular expression operations
import re

In [2]:
# Helper functions
def is_number(num):
    try:
        float(num)
        return True
    except ValueError:
        return False

In [3]:
df = pd.read_excel("1860-1930 database for family tracking Adam manual AFAM CANADA current.xlsx")
df.head(5)

Unnamed: 0,CalculatedBirthYear,ID,Census.Year,State/Province,County,Place,Last.Name,First.Name,Age,Sex,...,Enlistment Place,Date Mustered Out,Year of this Record,Unnamed: 41,Unnamed: 42,Last Name MATCH,First Name Match,Census Year Match,Unnamed: 46,Total of Matches
0,1757,,1850,LA,,New Orleans,Jeffries,?Ucky,93,M,...,,,,,,,,,,
1,1757,358.0,1860,MN,PINE,CHENGWATANA,LUSSENE,JOSEPH,103,M,...,,,,,,,,,,
2,1759,,1850,NJ,,Newark,Freeman,Flora,91,F,...,,,,,,,,,,
3,1759,,1864,CanadaWest,Essex,Windsor,LYNCH,MINTIE,105,F,...,,,,,,,,,,
4,1760,,1850,,,,Gorett,Margaret,90,F,...,,,,,,,,,,


In [4]:
df.shape

(50858, 48)

In [5]:
df = df.dropna(how = 'all')

### 2. Choose Columns to Include for Linkage

In [6]:
df.columns

Index(['CalculatedBirthYear', 'ID', 'Census.Year', 'State/Province', 'County',
       'Place', 'Last.Name', 'First.Name', 'Age', 'Sex',
       'Color..Race.or.Ethnicity', 'lat', 'long', 'address', 'MARITAL',
       'Unnamed: 15', 'WARD', 'ROLL or Sheet#', 'PROFESSION', 'Notable',
       'STREET', 'PLACEOFBIRTH', 'RELIGION',
       'NOTE these only apply to narrative answers', 'Unnamed: 24',
       'LIVING W MALE FAMILY?', 'LIVING W FEMALE FAMILY?',
       'LIVING W MALE NONFAMILY?', 'LIVING W FEMALE NONFAMILY?', 'Cannot Read',
       'Cannot Write', 'Sick', 'Relation to Head of Household',
       'Year of Immigration to Canada if an Immigrant', 'Date of Death',
       'Cause of Death', 'Rank (Military)', 'Enlistment Date',
       'Enlistment Place', 'Date Mustered Out', 'Year of this Record',
       'Unnamed: 41', 'Unnamed: 42', 'Last Name MATCH', 'First Name Match',
       'Census Year Match', 'Unnamed: 46', 'Total of Matches'],
      dtype='object')

Check the percentage of NaNs in all columns:
* Columns with no NaN values: Census.Year
* Columns with only few NaN values (less than 25%): Last.Name, First.Name, Color..Race.or.Ethnicity, CalculatedBirthYear, Age, long, address, ID, State/Province, Place.
* Columns with more than 25% but less than 50% missing values: RELIGION, PLACEOFBIRTH, ROLL or Sheet#, NOTE these only apply to narrative answers, WARD, STREET, MARITAL, County, LIVING W FEMALE FAMILY?, Sick, etc.

In [7]:
(df.isnull().sum(axis = 0)/df.shape[0]).sort_values()

Census.Year                                      0.000000
Last.Name                                        0.000020
First.Name                                       0.000649
Color..Race.or.Ethnicity                         0.006254
CalculatedBirthYear                              0.011269
Age                                              0.016618
Sex                                              0.053216
long                                             0.079766
address                                          0.099294
ID                                               0.143326
State/Province                                   0.235659
Place                                            0.239100
RELIGION                                         0.433047
PLACEOFBIRTH                                     0.433204
ROLL or Sheet#                                   0.437767
NOTE these only apply to narrative answers       0.450235
WARD                                             0.471140
STREET        

We will choose the following columns as they have few missing values and relatively constant during the life splan of the person:

1. Calculated Birth Year
3. State/Province
4. County
5. Place
6. Last.Name
7. First.Name
9. Sex
10. Color..Race.or.Ethnicity
15. PLACEOFBIRTH
16. RELIGION

In [8]:
df_filtered = df[['CalculatedBirthYear', 
                  'State/Province',
                  'County',
                  'Place',
                  'Last.Name',
                  'First.Name',
                  'Sex',
                  'Color..Race.or.Ethnicity',
                  'PLACEOFBIRTH',
                  'RELIGION']].rename(columns = {'CalculatedBirthYear': 'birth_year',
                                                 'State/Province': 'state_or_province',
                                                 'County': 'county',
                                                 'Place': 'place',
                                                 'Last.Name': 'last_name',
                                                 'First.Name': 'first_name',
                                                 'Sex': 'sex',
                                                 'Color..Race.or.Ethnicity': 'race',
                                                 'PLACEOFBIRTH': 'place_of_birth',
                                                 'RELIGION': 'religion'})

### 3. Preprocessing each column

### 3.1. Birth Year

Transform all birth year into numeric values.

In [9]:
df_filtered['birth_year'].unique()

array([1757, 1759, 1760, 1762, 1764, 1765, 1770, 1771, 1772, 1773, 1774,
       1775, 1776, 1777, 1778, 1779, 1780, 1781, 1782, 1783, 1784, 1785,
       1786, 1787, 1788, 1789, 1790, 1791, 1792, 1793, 1794, 1795, 1796,
       1797, 1798, 1799, 1800, 1801, 1802, 1803, 1804, 1805, 1806, 1807,
       1808, 1809, 1810, 1811, 1812, 1813, 1814, 1815, 1816, 1817, 1818,
       1819, 1820, 1821, 1822, 1823, 1824, 1825, 1826, 1827, 1828, 1829,
       1830, 1831, 1832, 1833, 1834, 1835, 1836, 1837, 1838, 1839, 1840,
       1841, 1842, 1843, 1844, 1845, 1846, 1847, 1848, 1849, 1850, 1851,
       1852, 1853, 1854, 1855, 1856, 1857, 1858, 1859, 1860, 1861, 1862,
       1863, 1863.5, 1864, 1865, 1866, 1867, 1868, 1869, 1870, 1871, 1872,
       1873, 1874, 1875, 1876, 1877, 1878, 1879, 1880, 1881, 1882, 1883,
       1884, 1885, 1886, 1887, 1888, 1889, 1890, 1891, 1892, 1893, 1894,
       1895, 1896, 1897, 1898, 1899, 1900, 1901, 1902, 1903, 1904, 1905,
       1906, 1907, 1908, 1909, 1910, 1911, 1912, 

In [14]:
def transform_birth_year(row):
    
    non_compliant_values = ['-', 'F']
    
    if row['birth_year'] in non_compliant_values or pd.isnull(row['birth_year']):
        return np.nan
    elif is_number(row['birth_year']):
        return float(row['birth_year'])
    else: 
        return float(re.findall('\d{4}$', row['birth_year'])[0])

df_filtered['birth_year'] = df_filtered.apply(lambda row: transform_birth_year(row), axis = 1)

In [15]:
df_filtered['birth_year'].unique()

array([1757. , 1759. , 1760. , 1762. , 1764. , 1765. , 1770. , 1771. ,
       1772. , 1773. , 1774. , 1775. , 1776. , 1777. , 1778. , 1779. ,
       1780. , 1781. , 1782. , 1783. , 1784. , 1785. , 1786. , 1787. ,
       1788. , 1789. , 1790. , 1791. , 1792. , 1793. , 1794. , 1795. ,
       1796. , 1797. , 1798. , 1799. , 1800. , 1801. , 1802. , 1803. ,
       1804. , 1805. , 1806. , 1807. , 1808. , 1809. , 1810. , 1811. ,
       1812. , 1813. , 1814. , 1815. , 1816. , 1817. , 1818. , 1819. ,
       1820. , 1821. , 1822. , 1823. , 1824. , 1825. , 1826. , 1827. ,
       1828. , 1829. , 1830. , 1831. , 1832. , 1833. , 1834. , 1835. ,
       1836. , 1837. , 1838. , 1839. , 1840. , 1841. , 1842. , 1843. ,
       1844. , 1845. , 1846. , 1847. , 1848. , 1849. , 1850. , 1851. ,
       1852. , 1853. , 1854. , 1855. , 1856. , 1857. , 1858. , 1859. ,
       1860. , 1861. , 1862. , 1863. , 1863.5, 1864. , 1865. , 1866. ,
       1867. , 1868. , 1869. , 1870. , 1871. , 1872. , 1873. , 1874. ,
      

### 3.2 Text-type columns

#### 3.4.1. State or Province

In [16]:
df_filtered['state_or_province'].unique()

array(['LA', 'MN', 'NJ', 'CanadaWest', nan, 'Ontario', 'NY', 'MA', 'PA',
       'MS', 'VT', 'ME', 'MI', 'OH', 'IN', 'TN', 'VA', 'IL', 'WA', 'WI',
       'CA', 'AR', 'AL', 'MO', 'RI', 'DC', 'NH', 'MD', 'CT', 'KS', 'IA',
       'TX', 'UT', 'Canada West', 'NE', 'Pennsylvania', 'OR', 'Illinois',
       'Wisconsin', 'KY', 'District of Columbia', 'Alabama', 'NC',
       'Vermont', 'GA', 'Michigan', 'Ohio', 'DE', 'WV', 'FL',
       'Massachussetts', 'NV', 'SC', 'MT', 'OK', 'CO', 'Virginia', 'NM',
       'ID', 'HI', 'SD', 'Canada West (Ontario)', 'ND', 'AZ', 'WY',
       'toledo', 'New York', 'MIL', 'VI', 'AK', 'PR', 'ITER', 'PANA'],
      dtype=object)

In [17]:
def transform_state(row):    
    
    state_dict = {'CanadaWest': 'ON',
                  'Ontario': 'ON',
                  'Canada West': 'ON',
                  'Pennsylvania': 'PA',
                  'Illinois': 'IL',
                  'Wisconsin': 'WI',
                  'District of Columbia': 'DC',
                  'Alabama': 'AL',
                  'Vermont': 'VT',
                  'Michigan': 'MI',
                  'Ohio': 'OH',
                  'Massachussetts': 'MA',
                  'Virginia': 'VA',
                  'Canada West (Ontario)': 'ON',
                  'New York': 'NY',
                  'toledo': 'OH'}
    
    if row['state_or_province'] in state_dict.keys():
        return state_dict[row['state_or_province']]
    else:
        return row['state_or_province']

df_filtered['state_or_province'] = df_filtered.apply(lambda row: transform_state(row), axis = 1) 

In [18]:
df_filtered['state_or_province'].unique()[0:20]

array(['LA', 'MN', 'NJ', 'ON', nan, 'NY', 'MA', 'PA', 'MS', 'VT', 'ME',
       'MI', 'OH', 'IN', 'TN', 'VA', 'IL', 'WA', 'WI', 'CA'], dtype=object)

In [19]:
df_filtered["county"].unique()[0:20]

array([nan, 'PINE', 'Essex', 'Kent', 'Kingston', 'MIDDLESEX', 'LANCASTER',
       'ADAMS', 'WAYNE', 'ESSEX', 'Hamilton', 'LIVINGSTON', 'MACKINAC',
       'London', 'SUFFOLK', 'NEWYORK', 'KINGS', 'NORFOLK', 'DAVIDSON',
       'CUMBERLAND'], dtype=object)

In [20]:
df_filtered["place"].unique()[0:20]

array(['New Orleans', 'CHENGWATANA', 'Newark', 'Windsor', nan,
       'Township of Chatham and Gore', 'Rochester', 'Victoria Ward',
       'New York', 'CAMBRIDGE', 'COLUMBIA', 'NATCHEZPO', 'Philadephia',
       'Burlington', 'Bangor', 'DETROIT', 'Portland', 'New Bedford',
       'LYNN', 'Lynn'], dtype=object)

In [21]:
df_filtered["place_of_birth"].unique()[0:20]

array(['Canada', nan, 'USA', 'SOUTHCAROLINA', 'Virginia', 'UC',
       'UnitedKingdom,n.s.', 'OtherCaribbeanandn.s.', 99999, 'VIRGINIA',
       'WIN', 'VA', 'Ireland', 22060, 'NY', 'CW', 'U.S.', 24990,
       'U. States', 19999], dtype=object)

In [22]:
df_filtered["sex"].unique()

array(['M', 'F', nan, 'F ', 'W', 'G', 'K', 'H', 'I', 'f?', 'light', 'L',
       'fair'], dtype=object)

In [25]:
column_to_clean = ['state_or_province',
                   'county',
                   'place',
                   'last_name',
                   'first_name',
                   'sex',
                   'race',
                   'place_of_birth', 
                   'religion']
for i in column_to_clean:
    df_filtered[i] = clean(df_filtered[i])

### 3.3. Summary of types

In [26]:
df_filtered.dtypes

birth_year           float64
state_or_province     object
county                object
place                 object
last_name             object
first_name            object
sex                   object
race                  object
place_of_birth        object
religion              object
dtype: object

### 4. Linkage

In [27]:
df1 = df_filtered
df2 = df_filtered

In [28]:
indexer = rl.Index()
indexer.add(Block('first_name', 'last_name'))
record_links = indexer.index(df1, df2)

  verify_integrity=False)


In [29]:
print(len(record_links))

743289


#### 4.1. Define comparison criteria:

1. First Name and Last Name: 75% (based on the Jaro-Winkler algorithm)
2. Place of birth: 75% (based on the Jaro-Winkler algorithm)
3. State: 50% (based on the Jaro-Winkler algorithm)
4. County and Place: 30% (based on the Jaro-Winkler algorithm)
5. Exact gender match
6. Birth year: assuming Gaussian score distribution with offset & scale = 1.

In [30]:
comparer = rl.Compare()
comparer.string('first_name', 'first_name', method = 'jarowinkler', threshold = 0.75, label = 'first_name')
comparer.string('last_name', 'last_name', method = 'jarowinkler', threshold = 0.75, label = 'last_name')
comparer.string('state_or_province', 'state_or_province', method = 'jarowinkler', threshold = 0.5, label = 'state')
comparer.string('county', 'county', method = 'jarowinkler', threshold = 0.3, label = 'county')
comparer.string('place', 'place', method = 'jarowinkler', threshold = 0.3, label = 'place')
comparer.string('place_of_birth', 'place_of_birth', method = 'jarowinkler', threshold = 0.75, label = 'place_of_birth')
comparer.exact('sex', 'sex', label = 'sex')
comparer.numeric('birth_year', 'birth_year', 
                 method = 'gauss', 
                 offset = 1, 
                 scale = 1, 
                 label = 'birth_year')

<Compare>

In [31]:
compare_vectors = comparer.compute(record_links, df1, df2)
compare_vectors[0:20]

Unnamed: 0,Unnamed: 1,first_name,last_name,state,county,place,place_of_birth,sex,birth_year
1,5611,0.0,0.0,1.0,0.0,1.0,0.0,1,0.0
1,6093,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0
1,9182,0.0,0.0,1.0,0.0,1.0,0.0,0,0.0
1,12229,0.0,0.0,1.0,0.0,1.0,0.0,1,0.0
1,20093,0.0,0.0,0.0,1.0,1.0,0.0,0,0.0
1,20094,0.0,0.0,0.0,0.0,1.0,0.0,0,0.0
1,24870,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0
1,25974,0.0,0.0,0.0,1.0,1.0,0.0,1,0.0
1,50583,0.0,0.0,1.0,0.0,1.0,0.0,0,0.0
58,5611,0.0,0.0,0.0,1.0,1.0,0.0,0,0.0


#### 4.2. Define selection criteria

1. First Name and Last Name match
2. Place of birth match
3. State match
4. Exact gender match
5. Birth year match

In [32]:
result = compare_vectors[((compare_vectors['first_name'] == 1.0) | (compare_vectors['last_name'] == 1.0)) &
                         (compare_vectors['sex'] == 1) &
                         (compare_vectors['state'] == 1.0) &
                         (compare_vectors['birth_year'] > compare_vectors['birth_year'].mean())].reset_index()
result = result[result['level_0'] != result['level_1']].reset_index()
result.drop('index', axis = 1, inplace = True)

In [33]:
result

Unnamed: 0,level_0,level_1,first_name,last_name,state,county,place,place_of_birth,sex,birth_year
0,4964,4645,0.0,1.0,1.0,0.0,1.0,0.0,1,1.0000
1,42251,41453,0.0,1.0,1.0,1.0,1.0,0.0,1,1.0000
2,43041,41453,0.0,1.0,1.0,1.0,1.0,0.0,1,0.5000
3,14434,14438,0.0,1.0,1.0,0.0,1.0,1.0,1,1.0000
4,29261,27049,0.0,1.0,1.0,1.0,1.0,0.0,1,0.5000
5,29261,28157,0.0,1.0,1.0,1.0,1.0,0.0,1,1.0000
6,29261,30914,0.0,1.0,1.0,1.0,1.0,0.0,1,0.5000
7,29261,31641,0.0,1.0,1.0,1.0,1.0,0.0,1,0.0625
8,29261,31643,0.0,1.0,1.0,1.0,1.0,0.0,1,0.0625
9,29261,31644,0.0,1.0,1.0,1.0,1.0,0.0,1,0.0625


#### 4.3. Final Processing

* Create a combined score (sum of all scores).
* Rank the matched records by this combined score, grouped by each person.
* Only select the best match for each person.

In [34]:
result['combined_score'] = result.iloc[:, 2:].sum(axis = 1)
result['combined_score_rank'] = result.groupby('level_0')['combined_score'].rank("dense", ascending = False)
result = result[result['combined_score_rank'] == 1.0]

In [35]:
df_result = pd.DataFrame()

for i in zip(result['level_0'], result['level_1']):
    
    df_result = df_result.append(df_filtered.iloc[i[0]])
    df_result = df_result.append(df_filtered.iloc[i[1]])   

df_result.reset_index(inplace = True)
df_result = df_result.drop('index', axis = 1)

In [36]:
num_list = []

for i in range(int(df_result.shape[0]/2)):
    num_list.append(i)
    num_list.append(i)
    
df_result['matched_pair'] = pd.Series(num_list)

In [38]:
df_result

Unnamed: 0,birth_year,county,first_name,last_name,place,place_of_birth,race,religion,sex,state_or_province,matched_pair
0,1827.0,,john,johnson,chicago,usa,black,,m,il,0
1,1826.0,russell,allen,john,girard,,b,,m,al,0
2,1878.0,wayne,john,johnson,detroit,usa,black,,m,mi,1
3,1877.0,panola,borooks,john,batesville,,b,,m,ms,1
4,1879.0,wayne,john,jones,detroit,,b,,m,mi,2
5,1877.0,panola,borooks,john,batesville,,b,,m,ms,2
6,1845.0,,james,jakson,erie,usa,mulatto,,m,pa,3
7,1845.0,,george,james,allegheny,usa,black,,m,pa,3
8,1861.0,wayne,james,jones,detroit,usa,black,,m,mi,4
9,1860.0,washtenaw,chasa,james,ypsilanti,,b,,m,mi,4


In [39]:
df_result.to_csv('prelim_result.csv')