In [None]:
# Author: Sophie Bair
# Date: 1/4/20

# Description: now that we have marked matches to use as a training dataset, see if
# a supervised learning model could be effective

# Method
# 1. import and clean data
# 2. create train and test datasets from identified matches where each row is a pair of two records, the x variables
#    are the similarity between the two records and the y variable is a 1 for an identified 
#    match and a 0 for a non-match
# 3. train a model on training data (in this case, used logistic regression)
# 4. evaluate model on test data
# 5. predict new matches by fitting the model onto data that does not have an already 
#    identified match 
# 6. visually inspect matches to re-evaluate model 

# Next steps 
# -expand training data to all of the pairs (only used 500 for the sake of time)
# -make code more efficient (creating the matrix of the data takes a while to run)
# -try different machine learning models 
# -do variable selection to determine which variables we really need in the model (I used a limited selection here)
# -figure out best way to deal with missing values, especially birth year missing values
# -incorporate more sophisticated analysis of the name similarity (eg giving more weight to rare
#  names, inputting a database of common nicknames, etc.)


In [3]:
import numpy as np
import pandas as pd
import recordlinkage as rl

In [91]:
data = pd.read_csv("full_data_with_match_ids_SB_1_5_20.csv")

In [95]:
# Data cleaning
#turn strings into lowercase and make sure things that represent years are only numbers
from recordlinkage.preprocessing import clean
for col in ['State/Province', 'Count', 'Place', 'Last.Name', 'First.Name', 'Sex']:
    data[col] = clean(data[col],replace_by_whitespace = '[^a-zA-Z]')
    data[col] = data[col].apply(lambda x: x if pd.isna(x) else str(x).strip())
    
data['CalculatedBirthYear'] = clean(data['CalculatedBirthYear'], replace_by_whitespace = '[^\\-\\0-9]') 
data['CalculatedBirthYear'] = data['CalculatedBirthYear'].apply(lambda x: int(x) if str(x)[0:4].isdigit() else x)

In [107]:
# replace missing values and strings in birth year with the average of the years (maybe change later)
sum = 0 
count = 0
for year in data['CalculatedBirthYear']:
    if isinstance(year, int):
        sum = sum + year
        count = count + 1
year_mean = sum/count

data['CalculatedBirthYear'] = data['CalculatedBirthYear'].fillna(year_mean)
data['CalculatedBirthYear'] = data['CalculatedBirthYear'].apply(lambda x: year_mean if isinstance(x, str) else int(x))

In [115]:
data.head()

Unnamed: 0.1,Unnamed: 0,Joint ID for Matched Records,Where is Match from?,CalculatedBirthYear,ID,Census.Year,State/Province,Count,Place,Last.Name,...,Unnamed: 236,Unnamed: 237,Unnamed: 238,Unnamed: 239,Unnamed: 240,Unnamed: 241,Unnamed: 242,Unnamed: 243,Unnamed: 244,Unnamed: 248
0,31306,1.0,Original match set (AA),1864,9247,1900,me,washington,machias,addison,...,,,,,,,,,,
1,31307,1.0,Original match set (AA),1864,,1864,canadawest,essex,windsor,addison,...,,,,,,,,,,
2,32004,2.0,Original match set (AA),1864,,1864,canadawest,essex,windsor,weeks,...,,,,,,,,,,
3,32005,2.0,Original match set (AA),1864,1880 IPUMS 100% sample,1880,mi,wayne,detroit,weeks,...,,,,,,,,,,
4,64,3.0,Original match set (AA),1782,,1864,canadawest,essex,windsor,green,...,,,,,,,,,,


In [235]:
import textdistance

In [236]:
# Loop through all possible matches for the first 500 records to create training dataset

# for each variable, compute string or integer distance and put into new matrix. This matrix
# will be fed into the logistic regression model 

unmarked_match_data = pd.DataFrame(
    np.nan, index = range(124750), columns = 
    ['Index1', 'Index2', 'Last.Name', 'First.Name', 'CalculatedBirthYear', 'State/Province', 'Count', 'Place', 'Sex', 'Match'])

count = 0 
for i in range(500):
    for j in range(i + 1, 500):
        data_1 = data.iloc[i, :]
        data_2 = data.iloc[j, :]
        
        new_match_data.loc[count, 'Index1'] = i
        new_match_data.loc[count, 'Index2'] = j
        
        # last name
        if pd.isna(data_1['Last.Name']) | pd.isna(data_2['Last.Name']):
            new_match_data.loc[count, 'Last.Name'] = 0
        else:
            new_match_data.loc[count, 'Last.Name'] = textdistance.ratcliff_obershelp(data_1['Last.Name'], data_2['Last.Name'])
            
        # first name        
        if pd.isna(data_1['First.Name']) | pd.isna(data_2['First.Name']):
            new_match_data.loc[count, 'First.Name'] = 0
        else:
            new_match_data.loc[count, 'First.Name'] = textdistance.ratcliff_obershelp(data_1['First.Name'], data_2['First.Name'])
            
        # calculated birth year
        new_match_data.loc[count, 'CalculatedBirthYear'] = abs(data_1['CalculatedBirthYear'] - data_2['CalculatedBirthYear'])
    
        # state      
        if pd.isna(data_1['State/Province']) | pd.isna(data_2['State/Province']):
            new_match_data.loc[count, 'State/Province'] = 0
        else:
            new_match_data.loc[count, 'State/Province'] = int(data_1['State/Province'] == data_2['State/Province'])
            
        # county        
        if pd.isna(data_1['Count']) | pd.isna(data_2['Count']):
            new_match_data.loc[count, 'Count'] = 0
        else:
            new_match_data.loc[count, 'Count'] = textdistance.ratcliff_obershelp(data_1['Count'], data_2['Count'])
        
        # place
        if pd.isna(data_1['Place']) | pd.isna(data_2['Place']):
            new_match_data.loc[count, 'Place'] = 0
        else:
            new_match_data.loc[count, 'Place'] = textdistance.ratcliff_obershelp(data_1['Place'], data_2['Place'])
        
        # sex      
        if pd.isna(data_1['Sex']) | pd.isna(data_2['Sex']):
            new_match_data.loc[count, 'Sex'] = 0
        else:
            new_match_data.loc[count, 'Sex'] = int(data_1['Sex'] == data_2['Sex']) 
        
        # match
        if data_1['Joint ID for Matched Records'] == data_2['Joint ID for Matched Records']:
            new_match_data.loc[count, 'Match'] = 1
        else:
            new_match_data.loc[count, 'Match'] = 0
           
                                                   
        count = count + 1

In [239]:
new_match_data.tail()

Unnamed: 0,Index1,Index2,Last.Name,First.Name,CalculatedBirthYear,State/Province,Count,Place,Sex,Match
124745,496.0,498.0,0.333333,0.307692,0.0,0.0,1.0,1.0,0.0,0.0
124746,496.0,499.0,0.333333,0.307692,0.0,1.0,0.0,0.333333,0.0,0.0
124747,497.0,498.0,0.333333,0.444444,1.0,1.0,1.0,1.0,1.0,0.0
124748,497.0,499.0,0.333333,0.444444,1.0,0.0,0.0,0.333333,1.0,0.0
124749,498.0,499.0,1.0,1.0,0.0,0.0,0.0,0.333333,1.0,1.0


In [251]:
# split into test and training sets

from sklearn import model_selection
x_train, x_test, y_train, y_test = model_selection.train_test_split(new_match_data.loc[:,'Last.Name':'Sex'], new_match_data.loc[:, 'Match'], test_size = .2)

In [253]:
# perform logistic regression
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(x_train, y_train)
y_pred = logreg.predict(x_test)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [290]:
# evaluate model 
from sklearn.metrics import average_precision_score
average_precision = average_precision_score(y_test, y_pred)
print('Average precision/recall score: ' + str(average_precision))

Average precision/recall score: 0.8042820524770472


In [287]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test, y_pred)
print(str(confusion_matrix[0][0]) + " true negatives, " + str(confusion_matrix[1][1]) + 
      " true positives, " + str(confusion_matrix[1][0]) + " false negatives, and " + 
      str(confusion_matrix[0][1]) + " false positives.")

24861 true negatives, 72 true positives, 14 false negatives, and 3 false positives.


In [274]:
# now, use same log fit to predict matches from rows that do not have a confirmed match

# process 200 rows of the unmarked data
unmarked_match_data = pd.DataFrame(
    np.nan, index = range(19900), columns = 
    ['Index1', 'Index2', 'Last.Name', 'First.Name', 'CalculatedBirthYear', 'State/Province', 'Count', 'Place', 'Sex', 'Match'])


count = 0 
for i in range(925, 1125):
    for j in range(i + 1, 1125):
        data_1 = data.iloc[i, :]
        data_2 = data.iloc[j, :]
        
        unmarked_match_data.loc[count, 'Index1'] = i
        unmarked_match_data.loc[count, 'Index2'] = j
        
        # last name
        if pd.isna(data_1['Last.Name']) | pd.isna(data_2['Last.Name']):
            unmarked_match_data.loc[count, 'Last.Name'] = 0
        else:
            unmarked_match_data.loc[count, 'Last.Name'] = textdistance.ratcliff_obershelp(data_1['Last.Name'], data_2['Last.Name'])
            
        # first name        
        if pd.isna(data_1['First.Name']) | pd.isna(data_2['First.Name']):
            unmarked_match_data.loc[count, 'First.Name'] = 0
        else:
            unmarked_match_data.loc[count, 'First.Name'] = textdistance.ratcliff_obershelp(data_1['First.Name'], data_2['First.Name'])
            
        # calculated birth year
        unmarked_match_data.loc[count, 'CalculatedBirthYear'] = abs(data_1['CalculatedBirthYear'] - data_2['CalculatedBirthYear'])
    
        # state      
        if pd.isna(data_1['State/Province']) | pd.isna(data_2['State/Province']):
            unmarked_match_data.loc[count, 'State/Province'] = 0
        else:
            unmarked_match_data.loc[count, 'State/Province'] = int(data_1['State/Province'] == data_2['State/Province'])
            
        # county        
        if pd.isna(data_1['Count']) | pd.isna(data_2['Count']):
            unmarked_match_data.loc[count, 'Count'] = 0
        else:
            unmarked_match_data.loc[count, 'Count'] = textdistance.ratcliff_obershelp(data_1['Count'], data_2['Count'])
        
        # place
        if pd.isna(data_1['Place']) | pd.isna(data_2['Place']):
            unmarked_match_data.loc[count, 'Place'] = 0
        else:
            unmarked_match_data.loc[count, 'Place'] = textdistance.ratcliff_obershelp(data_1['Place'], data_2['Place'])
        
        # sex      
        if pd.isna(data_1['Sex']) | pd.isna(data_2['Sex']):
            unmarked_match_data.loc[count, 'Sex'] = 0
        else:
            unmarked_match_data.loc[count, 'Sex'] = int(data_1['Sex'] == data_2['Sex']) 
        
        # match
        if data_1['Joint ID for Matched Records'] == data_2['Joint ID for Matched Records']:
            unmarked_match_data.loc[count, 'Match'] = 1
        else:
            unmarked_match_data.loc[count, 'Match'] = 0
           
                                                   
        count = count + 1

In [275]:
unmarked_match_data.head()

Unnamed: 0,Index1,Index2,Last.Name,First.Name,CalculatedBirthYear,State/Province,Count,Place,Sex,Match
10,925.0,936.0,0.545455,0.0,3.0,0.0,0.307692,0.2,1.0,0.0
11,925.0,937.0,0.545455,0.0,3.0,0.0,0.166667,0.166667,1.0,0.0
12,925.0,938.0,0.027473,0.307692,1.0,0.0,0.0,0.0,1.0,0.0
13,925.0,939.0,0.666667,0.222222,8.0,0.0,0.4,0.181818,1.0,0.0
14,925.0,940.0,0.375,0.222222,65.0,0.0,0.4,0.181818,0.0,0.0


In [276]:
unmarked_match_data_test = unmarked_match_data.loc[:,'Last.Name':'Sex']

In [277]:
unmarked_match_data_test.head()

Unnamed: 0,Last.Name,First.Name,CalculatedBirthYear,State/Province,Count,Place,Sex
0,0.8,0.181818,39.0,0.0,0.266667,0.0,0.0
1,0.545455,0.285714,36.0,0.0,0.4,0.222222,1.0
2,0.545455,0.444444,14.0,0.0,0.4,0.307692,1.0
3,0.545455,0.363636,0.0,0.0,0.0,0.153846,0.0
4,0.545455,0.2,31.0,0.0,0.333333,0.181818,0.0


In [278]:
unmarked_pred = logreg.predict(unmarked_match_data_test)

In [279]:
# extract the data from the pairs marked as matches, in order to visually confirm
index_list = unmarked_match_data_test.index[unmarked_pred == 1].tolist()
visualize_matches = pd.DataFrame(
    np.nan, index = range(len(index_list)), columns = 
    ['Index1', 'Index2','Last.Name1', 'First.Name1', 'State/Province1', 'Count1', 
     'Place1', 'Sex1', 'Last.Name2', 'First.Name2', 'State/Province2', 'Count2', 
     'Place2', 'Sex2',])
count = 0
for index in index_list:
    index_1 = unmarked_match_data.iloc[index, 0]
    index_2 = unmarked_match_data.iloc[index, 1]
    
    visualize_matches.loc[count, 'Index1'] = index_1
    visualize_matches.loc[count, 'Index2'] = index_2
    
    visualize_matches.loc[count, 'Last.Name1':'Sex1'] = [val for val in data.loc[index_1, ['Last.Name', 'First.Name', 'State/Province', 'Count', 
     'Place', 'Sex',]]]
    visualize_matches.loc[count, 'Last.Name2':'Sex2'] = [val for val in data.loc[index_2, ['Last.Name', 'First.Name', 'State/Province', 'Count', 
     'Place', 'Sex',]]]
    
    count = count + 1

In [291]:
visualize_matches.head()

Unnamed: 0,Index1,Index2,Last.Name1,First.Name1,State/Province1,Count1,Place1,Sex1,Last.Name2,First.Name2,State/Province2,Count2,Place2,Sex2
0,936.0,937.0,abbott,wh,ny,columbia,hudson,m,abbott,wmh,ma,suffolk,wdboston,m
1,957.0,971.0,adames,charles,mi,wayne,wddetroit,m,adams,charles,il,cook,wdchicago,m
2,957.0,1062.0,adames,charles,mi,wayne,wddetroit,m,addams,charles,ny,,new york,m
3,962.0,963.0,adams,albert,pa,allegheny,allegheny,m,adams,albert,,,,m
4,962.0,964.0,adams,albert,pa,allegheny,allegheny,m,adams,albert,oh,warren,massietwp,m


In [282]:
visualize_matches.to_csv(r'/Users/seb2244/Desktop/migration_project/output_1_6_20.csv')