This notebook includes some cleaning of the names that I extracted from notes files, matching between files, and matching to the master database. For the actual name extraction, see NER_from_notes.ipynb 

In [11]:
import en_core_web_sm
nlp = en_core_web_sm.load()

from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize
st = StanfordNERTagger('/Users/seb2244/Downloads/stanford-ner-4.0.0/classifiers/english.all.3class.distsim.crf.ser.gz',
               '/Users/seb2244/Downloads/stanford-ner-4.0.0/stanford-ner.jar') 

from itertools import groupby

In [3]:
import os
import re

import numpy as np
import pandas as pd
import textdistance

os.chdir('/Users/seb2244/Desktop/migration_project/Research Notes for name (and date,place_) extraction')

In [None]:
# for reading docx files
import docx2txt

In [324]:
# Main problems to clean up:
# 1. abbreviations attached before or after a name
# 2. dashes and periods changing how a name is written, so true duplicates don't look identical
# 3. ordinary (exact) duplicates
# 4. several names joined by commas (e.g. "john johnson, jane smith") being counted as one name

In [None]:
# Load the extracted-name table; the cells below read its "file_name" and "name" columns.
data = pd.read_excel("ner_8_9_20.xlsx")

In [345]:
# Renumber rows 0..n-1: the loops below address rows as data.loc[i, ...] for i in range(len(data.index)).
data.reset_index(drop = True, inplace = True)

In [346]:
# 4. Split up entries that hold 3 or more comma-separated names (i.e. at least 2 commas).
# (Splitting on a single comma is unsafe: "du Bois, William E" could be one person.)
#
# The new rows are collected in a list and added with a single concat:
# DataFrame.append was removed in pandas 2.0 and re-copied the frame on every call.
rows_to_drop = []  # labels of the combined rows; they are dropped in the next cell
split_rows = []    # one record per individual name taken out of a combined row
for i in range(len(data.index)):
    file_name = data.loc[i, "file_name"]
    name = data.loc[i, "name"]
    if pd.isna(name) or len(re.findall(",", name)) <= 1:
        continue
    rows_to_drop.append(i)
    for new_name in name.split(","):
        # "a, b, c".split(",") leaves leading spaces, and a trailing comma leaves an
        # empty piece; strip the former and skip the latter.
        new_name = new_name.strip()
        if new_name:
            split_rows.append({"file_name": file_name, "name": new_name})

if split_rows:
    # ignore_index keeps the 0..n-1 labelling, so rows_to_drop still points at the
    # original combined rows and the new rows sit after them.
    data = pd.concat([data, pd.DataFrame(split_rows)], ignore_index=True)

In [347]:
# Remove the original combined rows now that each of their names has its own row.
# NOTE(review): this rebinds `data`, so re-running the cell on its own will not give the
# same result (the labels in rows_to_drop are gone or point at other rows after a reset).
data = data.drop(rows_to_drop)

In [308]:
# now, loop through dataframe, get rid of special characters and abbreviations we don't want
# and re-parse the names into first middle and last

In [348]:
# Renumber rows 0..n-1 again after the drop, so that data.loc[i, ...] works for every i in range(len(data.index)).
data.reset_index(drop = True, inplace = True)

In [401]:
# NOTE(review): this reload replaces the `data` built by the cells above with a previously
# saved workbook, so the comma-splitting done earlier is only carried forward if that file
# already contains it — confirm which file is the intended input for the parsing below.
data = pd.read_excel("ner_deduplicated_8_13_20.xlsx")

In [None]:
from nameparser import HumanName

In [397]:
# Strip periods, dashes, acronyms and other common non-name words from each name, then
# parse what is left into title / first / middle / last / suffix with nameparser.

# NOTE(review): parsing starts at row 2500, not row 0. This looks like a resume point
# left over from an interrupted run; set it to 0 to parse every row on a fresh kernel.
PARSE_START_ROW = 2500

# Written as a raw string so that "\." is a regex escape rather than an invalid Python
# string escape, and compiled once because the pattern does not change inside the loop.
# The trailing "$" belongs only to the final state/province-code alternative.
NAME_NOISE = re.compile(r"[A-Z]{4,}|NYC|AMH|DTW|DRS|STL|STC|ABT|[Aa]ffadavits|( th)$|( then)$|( rd)$|( nd)$|( widow)$|daughter|ONT|photoed|scanned|papers|Papers|\.|-|(?-i:A[LKSZRAEP]|BC|C[AOT]|D[EC]|F[LM]|G[AU]|HI|I[ADLN]|K[SY]|LA|M[AEHINOPST]|N[CDEHJMVY]|O[HKR]|P[ARW]|RI|S[CD]|T[NX]|UT|V[AIT]|W[AIVY])$")

for i in range(PARSE_START_ROW, len(data.index)):
    raw_name = data.loc[i, "name"]
    if pd.isna(raw_name):
        continue

    new_name = NAME_NOISE.sub("", raw_name).strip()

    # A one-word name can't be classified as first or last, and a name that is empty
    # after cleaning has nothing to parse; leave the parsed columns blank for both.
    if len(new_name.split()) < 2:
        continue

    # use the nameparser package to extract the parts of the name
    parsed = HumanName(new_name)
    data.loc[i, 'title'] = parsed.title
    data.loc[i, 'first_name'] = parsed.first
    data.loc[i, 'middle_name'] = parsed.middle
    data.loc[i, 'last_name'] = parsed.last
    data.loc[i, 'suffix'] = parsed.suffix

    # nameparser reads "JD Smith" as first name "JD", last name "Smith". When the first
    # name starts with 2-3 capitals and no middle name was found, treat the first letter
    # as the first initial and the remainder as the middle initial(s).
    if re.match("[A-Z]{2,3}", parsed.first) and not parsed.middle:
        data.loc[i, 'first_name'] = parsed.first[0]
        data.loc[i, 'middle_name'] = parsed.first[1:]

In [None]:
# Remove a trailing all-caps abbreviation (e.g. "John Smith NYC" -> "John Smith"),
# unless the name ends with one of the suffixes we want to keep (SR, JR, PHD, MD).
for row_label, name in data['name'].items():
    if pd.isna(name):
        continue
    # only names that end in a separate token of at least two capital letters
    if not re.match("[A-Za-z ]+ [A-Z]{2,}$", name):
        continue
    # Anchored to the end of the name: an unanchored search would also "protect" any
    # name that merely contains SR/JR/PHD/MD somewhere in the middle.
    if re.search(r"\b(?:SR|JR|PHD|MD)$", name):
        continue
    # Write back by the row's own label rather than a hand-maintained counter, so the
    # update lands on the right row even when the index is not 0..n-1.
    data.loc[row_label, "name"] = re.sub("[ A-Z]{2,}$", "", name)

In [None]:
### matching within these files
# within a file: same first and last name where one has a middle name and one doesn't,
# or one has a prefix and one doesn't, or middle initial is the same as middle name
# OR same first name and last initial or vice versa
# OR same first name and last name differ at most by a letter
# OR first name matches and middle name in one is the same as last name in other (like a maiden name)
# note: more lenient because within a file it's a lot more likely that J Smith and John Smith are the same person
# if they're both referred to in a file

# across files: 
# if first initial + full last name, a match can be made if the name is rare 
# or if there's a matching middle initial
# if full first name + last initial, not enough unless we have a middle match 
# if full first and last names, depends on the commonality of the names

In [218]:
# The two Broyld "Borderland Blacks" files are almost identical, which causes problems
# when matching. Names that appear in both get a single combined file name instead.
notes_file = "Broyld, Borderland Blacks Rochester and St. Catharines 1850-1860 note-s.txt"
revisions_file = "Broyld, Borderland Blacks Rochester and St. Catharines notes including revisions.docx"
combined_label = "In both Broyld, Borderland Blacks Rochester and St. Catharines 1850-1860 note-s.txt and Broyld, Borderland Blacks Rochester and St. Catharines notes including revisions.docx"

names_in_notes = set(data.loc[data['file_name'] == notes_file, 'name'])
names_in_revisions = set(data.loc[data['file_name'] == revisions_file, 'name'])

# names found in both files
overlap = list(names_in_notes & names_in_revisions)

for i in range(len(data.index)):
    row_file = data.loc[i, 'file_name']
    row_name = data.loc[i, 'name']

    from_broyld_file = row_file in [notes_file, revisions_file]
    if from_broyld_file and row_name in overlap:
        data.loc[i, 'file_name'] = combined_label

In [220]:
# Deduplicate: within one file, keep only the first row for each identical
# (first, middle, last, suffix) combination.
name_key = ['file_name', 'first_name', 'middle_name', 'last_name', 'suffix']

# Rows without a parsed first name are left out of the comparison, because their
# missing values would otherwise all match each other.
parsed_rows = data.loc[data['first_name'].notna()]

# True for every repeat of a key after its first occurrence
repeat_flags = parsed_rows.duplicated(subset=name_key)
data = data.drop(index=repeat_flags.index[repeat_flags])

In [None]:
# Load the master database; later cells read its 'First.Name', 'Last.Name', 'unique_id' and 'source' columns.
master = pd.read_excel("current_master_11_4_20.xlsx")

In [15]:
# Classify first names as common (> 60 records), medium (3-60) or rare (< 3), using the
# master file because it has more data. Each result is a Series indexed by name.
first_name_counts = master['First.Name'].value_counts()
common_first_names = first_name_counts[first_name_counts > 60]
medium_first_names = first_name_counts[(first_name_counts >= 3) & (first_name_counts <= 60)]
rare_first_names = first_name_counts[first_name_counts < 3]

In [16]:
# Same classification for last names, with a higher "common" cut-off:
# common (> 100 records), medium (3-100), rare (< 3). Each result is a Series indexed by name.
last_name_counts = master['Last.Name'].value_counts()
common_last_names = last_name_counts[last_name_counts > 100]
medium_last_names = last_name_counts[(last_name_counts >= 3) & (last_name_counts <= 100)]
rare_last_names = last_name_counts[last_name_counts < 3]

In [256]:
# Match parsed names against each other, within a notes file and across notes files.
#
# Every pair of rows (i, j) with i < j is tested once. Row i's name decides which rule
# applies (initials vs. full name) and how common the name is. Each hit becomes one row
# of `new_data`.
#   * same file      -> lenient rules: two mentions in one file are likely the same person
#   * different file -> stricter rules that also weigh how common the name is in the master
#
# Fixes relative to the earlier version of this cell:
#   * rows are read by position, so the cell no longer fails with KeyError on labels
#     removed by the de-duplication step;
#   * first-name commonality is looked up in the first-name tables (it used the last name);
#   * the Jr/Sr exclusion compares against separate suffix spellings (it was one long string);
#   * "either middle name is an initial" is written with `or` (`a == 1 | b == 1` parsed as
#     a chained comparison and was almost never true);
#   * blank strings count as missing, like NaN;
#   * matches are collected in a list (DataFrame.append was removed in pandas 2.0).

MATCH_COLUMNS = ['File Name 1', 'File Name 2', 'Name 1', 'Name 2', 'ID 1', 'ID 2']


def _present(value):
    """Return a name part as stripped text, or None when it is NaN or blank."""
    if pd.isna(value):
        return None
    text = str(value).strip()
    return text or None


def _commonality(name, common_names, medium_names):
    """Classify a name as "common", "medium" or "rare" from the master-file count tables."""
    # `in` on a Series tests its index, which for value_counts() output holds the names.
    if name in common_names:
        return "common"
    if name in medium_names:
        return "medium"
    return "rare"


def _is_jr_or_sr(suffix):
    """True for a Jr/Sr suffix in any capitalisation, with or without a trailing period."""
    text = _present(suffix)
    return text is not None and text.lower().rstrip(".") in {"jr", "sr"}


def _middles_agree(middle_a, middle_b, max_distance):
    """Compare two present middle names: by first letter if either is an initial, else by edit distance."""
    if len(middle_a) == 1 or len(middle_b) == 1:
        return middle_a[0] == middle_b[0]
    return textdistance.levenshtein(middle_a, middle_b) <= max_distance


def _same_file_number(file_a, file_b):
    """True when both file names contain digits and the digit strings are identical."""
    digits_a = re.sub("[^0-9]", "", file_a)
    return len(digits_a) > 0 and digits_a == re.sub("[^0-9]", "", file_b)


def _match_within_file(a, b):
    """Lenient rules for two names that come from the same file."""
    first, middle, last = a["first"], a["middle"], a["last"]
    first2, middle2, last2 = b["first"], b["middle"], b["last"]
    first_distance = textdistance.levenshtein(first, first2)
    last_distance = textdistance.levenshtein(last, last2)

    match = False
    if len(first) == 1 and len(last) == 1:
        # initials only (S. B.): not enough to make a match
        pass
    elif len(first) == 1:
        # first initial + last name (S. Bair)
        match = first2.startswith(first) and (last_distance <= 1 or last.startswith(last2))
    elif len(last) == 1:
        # first name + last initial (Sophie B.)
        match = last2.startswith(last) and (first_distance <= 1 or first.startswith(first2))
    elif first_distance <= 1 and last_distance <= 1:
        # similar full names, unless either one carries a Jr/Sr suffix
        match = not (_is_jr_or_sr(a["suffix"]) or _is_jr_or_sr(b["suffix"]))
    elif first_distance <= 1:
        # maiden-name check: one person's middle name is the other's last name
        match = (middle == last2) or (middle2 == last)

    # when both have a middle name, the middle names must also agree
    if match and middle is not None and middle2 is not None:
        match = _middles_agree(middle, middle2, max_distance=1)
    return match


def _match_across_files(a, b, first_common, last_common):
    """Stricter rules for names from different files; uses the commonality of a's name."""
    first, middle, last = a["first"], a["middle"], a["last"]
    first2, middle2, last2 = b["first"], b["middle"], b["last"]
    both_middles = middle is not None and middle2 is not None

    # initials only (S. B.): not enough to make a match
    if len(first) == 1 and len(last) == 1:
        return False

    # first initial + full last name: needs a rare last name or agreeing middle names
    if len(first) == 1:
        if not (first2.startswith(first) and last == last2):
            return False
        if last_common == "rare":
            return True
        return both_middles and _middles_agree(middle, middle2, max_distance=0)

    # full first name + last initial: not enough without agreeing middle names
    if len(last) == 1:
        if not (last2.startswith(last) and first == first2):
            return False
        return both_middles and _middles_agree(middle, middle2, max_distance=0)

    # full names on both sides
    first_distance = textdistance.levenshtein(first, first2)
    last_distance = textdistance.levenshtein(last, last2)
    if first_distance > 1 or last_distance > 1:
        return False
    if first_common == "rare" or last_common == "rare":
        return True

    exact_middle = both_middles and middle == middle2
    if first_common == "common" and last_common == "common":
        # first and last names must be a perfect match
        if first_distance != 0 or last_distance != 0:
            return False
        # middle names, if both present, must be identical; with a middle name missing,
        # the number in the two file names must be the same
        if both_middles:
            return exact_middle
        return _same_file_number(a["file"], b["file"])

    # medium commonality: identical middle names or the same number in the file names
    return exact_middle or _same_file_number(a["file"], b["file"])


# Positional snapshot of the rows; name parts are normalised once here.
people = [
    {
        "file": row['file_name'],
        "name": row['name'],
        "id": row['ID #'],
        "first": _present(row['first_name']),
        "middle": _present(row['middle_name']),
        "last": _present(row['last_name']),
        "suffix": row['suffix'],
    }
    for row in data.to_dict("records")
]

matches = []
for i in range(len(people)):
    a = people[i]
    # a blank first or last name is not enough to make a match
    if a["first"] is None or a["last"] is None:
        continue

    # commonality depends only on row i, so it is computed once per outer row
    first_common = _commonality(a["first"], common_first_names, medium_first_names)
    last_common = _commonality(a["last"], common_last_names, medium_last_names)

    for j in range(i + 1, len(people)):
        b = people[j]
        if b["first"] is None or b["last"] is None:
            continue

        if a["file"] == b["file"]:
            matched = _match_within_file(a, b)
        else:
            matched = _match_across_files(a, b, first_common, last_common)

        if matched:
            matches.append({'File Name 1': a["file"], 'File Name 2': b["file"],
                            'Name 1': a["name"], 'Name 2': b["name"],
                            'ID 1': a["id"], 'ID 2': b["id"]})

new_data = pd.DataFrame(matches, columns=MATCH_COLUMNS)
            



In [257]:
# Preview the first few matched pairs.
new_data.head()

Unnamed: 0,File Name 1,File Name 2,Name 1,Name 2,ID 1,ID 2
0,Anderson Ruffin Abbott Papers TRL.docx,Globe Democrat from Guylaine.txt,AR Abbott,A.R. Abbott,51756,55891
1,Anderson Ruffin Abbott Papers TRL.docx,Guylaine Petrin notes.docx,AR Abbott,A.R. Abbott,51756,56094
2,Globe Democrat from Guylaine.txt,Guylaine Petrin notes.docx,A.R. Abbott,A.R. Abbott,55891,56094
3,Globe Democrat from Guylaine.txt,Globe Democrat from Guylaine.txt,A.T. Augusta,"A.T. Augusta, L.M.B.",55892,55893
4,Globe Democrat from Guylaine.txt,Arenson notes on Richard Reid Citizens Sojourn...,A.T. Augusta,Alex T. Augusta,55892,51964
5,Globe Democrat from Guylaine.txt,Arenson notes on Richard Reid Citizens Sojourn...,A.T. Augusta,Alexander T. Augusta,55892,51967
6,Globe Democrat from Guylaine.txt,Black Abolitionists Archive UD Mercy.txt,A.T. Augusta,Alexander T Augusta,55892,52676
7,Globe Democrat from Guylaine.txt,Arenson notes on Richard Reid Citizens Sojourn...,"A.T. Augusta, L.M.B.",Alex T. Augusta,55893,51964
8,Globe Democrat from Guylaine.txt,Arenson notes on Richard Reid Citizens Sojourn...,"A.T. Augusta, L.M.B.",Alexander T. Augusta,55893,51967
9,Globe Democrat from Guylaine.txt,Black Abolitionists Archive UD Mercy.txt,"A.T. Augusta, L.M.B.",Alexander T Augusta,55893,52676


In [258]:
# Save the matched pairs to a workbook in the current working directory.
new_data.to_excel("ner_9_4_20_2.xlsx")

In [None]:
#### matching between ner file and master

In [7]:
# First, do name preprocessing for everything because the file is taking forever to run 

def preprocess_data(name):
    """Return `name` unchanged, or NaN when it is missing or blank.

    The previous version assigned NaN to an unused local and returned the input as-is,
    so empty names were never blanked out and survived the later notna() filter.

    Parameters
    ----------
    name : the raw name cell (a string, or NaN for a missing value).

    Returns
    -------
    The original value, or ``np.nan`` if it is NaN, an empty string, or only whitespace.
    """
    if pd.isna(name):
        return np.nan
    if isinstance(name, str) and len(name.strip()) == 0:
        return np.nan
    return name

def _normalize_name_part(part):
    """Lower-case one name part and keep only its alphanumeric characters (NaN if nothing is left)."""
    if pd.isna(part):
        return np.nan
    cleaned = "".join(ch for ch in part if ch.isalnum()).lower()
    return cleaned if cleaned else np.nan


def preprocess_master(first_name, last_name):
    """Parse and normalise one master-file name.

    The two cells are joined and parsed with nameparser's HumanName. Initials that are
    glued onto the first name are then split off into the middle name, and every part
    is reduced to lower-case alphanumerics.

    Parameters
    ----------
    first_name, last_name : raw 'First.Name' / 'Last.Name' cells; NaN is allowed.

    Returns
    -------
    tuple ``(first, middle, last)``; each element is a lower-case string or NaN when
    that part is missing or empty after cleaning.
    """
    parsed = HumanName(str(first_name) + " " + str(last_name))
    firstname2 = parsed.first
    middlename2 = parsed.middle
    lastname2 = parsed.last

    # str(NaN) is "nan", so a missing cell comes back from the parser as the text "nan"
    if firstname2 == "nan":
        firstname2 = np.nan
    if middlename2 == "nan":
        middlename2 = np.nan
    if lastname2 == "nan":
        lastname2 = np.nan

    if pd.notna(firstname2):
        # Initials glued onto the first name are only split off when the parser found
        # no middle name and the name is mixed case (an all-caps name must stay whole).
        # The earlier pattern "[^[^a-z]*$" was a typo for "^[^a-z]*$"; testing for a
        # lower-case letter directly says the same thing without the malformed set.
        has_lowercase = re.search("[a-z]", firstname2) is not None
        if len(firstname2) > 2 and has_lowercase and not parsed.middle:
            if re.match("[A-Z]{2}", firstname2):
                # initial at the beginning, e.g. SElizabeth
                firstname2, middlename2 = parsed.first[0], parsed.first[1:]
            elif firstname2[-1].isupper() and firstname2[-2].isupper():
                # two initials at the end, e.g. ElizabethJE
                firstname2, middlename2 = parsed.first[:-2], parsed.first[-2:]
            elif firstname2[-1].isupper():
                # one initial at the end, e.g. ElizabethJ
                firstname2, middlename2 = parsed.first[:-1], parsed.first[-1:]

        # A period marks an initial: the letter just before the first period starts the
        # middle name ("JohnA." -> "John" + "A."; "A.R." -> "A" + ".R.").
        # A leading period has no letter before it, so such a name is left unsplit.
        dot = firstname2.find(".")
        if dot > 0:
            split_at = dot - 1
            if split_at == 0:
                firstname2, middlename2 = parsed.first[0], parsed.first[1:]
            else:
                firstname2, middlename2 = parsed.first[:split_at], parsed.first[split_at:]

    return (
        _normalize_name_part(firstname2),
        _normalize_name_part(middlename2),
        _normalize_name_part(lastname2),
    )

In [8]:
# Keep only the ID, source and name columns that the matching below uses.
# NOTE(review): `data` must already carry 'unique_id', 'source', 'Last.Name' and 'First.Name'
# here, unlike the notes table used earlier in this notebook (file_name / name / first_name ...).
# Presumably it was reloaded in a cell that is not shown — confirm before a fresh run.
fast_data = data.filter(['unique_id', 'source', 'Last.Name', 'First.Name'])
fast_master = master.filter(['unique_id', 'source', 'Last.Name', 'First.Name'])

In [9]:
# Run both notes-side name columns through preprocess_data before matching.
fast_data['First.Name'] =  fast_data['First.Name'].apply(preprocess_data)
fast_data['Last.Name'] =  fast_data['Last.Name'].apply(preprocess_data)

In [13]:
# Normalise every master name with preprocess_master and store the cleaned first and last
# names back into fast_master (the parsed middle name is not kept).
#
# The rows are walked by position and each column is written once. The earlier
# row-by-row fast_master.loc[i, ...] writes needed unique 0..n-1 labels and were slow.
parsed_master_names = [
    preprocess_master(first, last)
    for first, last in zip(fast_master['First.Name'], fast_master['Last.Name'])
]
fast_master = fast_master.assign(**{
    'First.Name': [parts[0] for parts in parsed_master_names],
    'Last.Name': [parts[2] for parts in parsed_master_names],
})

In [14]:
# Keep only rows that have both a first and a last name, then renumber 0..n-1.
required_names = ['First.Name', 'Last.Name']

fast_data = fast_data.dropna(subset=required_names).reset_index(drop=True)
fast_master = fast_master.dropna(subset=required_names).reset_index(drop=True)

In [16]:
# Lower-case the notes-side names; preprocess_master lower-cases the master side, so the
# edit distances below compare both sides in the same case.
fast_data["First.Name"] = fast_data["First.Name"].str.lower()
fast_data["Last.Name"] = fast_data["Last.Name"].str.lower()

In [27]:
# Match every notes-side name (fast_data) against every master record (fast_master).
# History: format changed 9/4/20; made more lenient 10/21/20; adapted for the victoria files 11/14/20.
#
# Rules:
#   * initials only (S. B.)          -> never a match
#   * first initial + full last name -> match when the master first name starts with the
#                                       initial, the last names are identical and the
#                                       last name is not common
#   * full names                     -> match when first and last are each within one edit
# Middle names are not compared in this version.
#
# Fixes relative to the earlier version of this cell:
#   * the "common last name" lookup is case-insensitive: the notes-side names were
#     lower-cased above, while the commonality tables are indexed by the raw master
#     spellings, so the lookup never reported "common";
#   * the master first name (not the notes one, a second time) is cut to its first word;
#   * work that depends only on the outer row is done once per outer row;
#   * matches are collected in a list (DataFrame.append was removed in pandas 2.0).

common_last_lower = {str(known_name).lower() for known_name in common_last_names.index}

# (unique_id, first word of first name, last name) for every master row, built once
master_people = [
    (master_id, master_first.split(" ")[0], master_last)
    for master_id, master_first, master_last in zip(
        fast_master['unique_id'], fast_master['First.Name'], fast_master['Last.Name'])
]

match_records = []
for id1, raw_first, lastname in zip(
        fast_data['unique_id'], fast_data['First.Name'], fast_data['Last.Name']):
    firstname = raw_first.split(" ")[0]

    # if just initials (S.B.), not enough to make a match
    if len(firstname) == 1 and len(lastname) == 1:
        continue

    first_is_initial = len(firstname) == 1
    last_is_common = lastname in common_last_lower

    for id2, firstname2, lastname2 in master_people:
        if first_is_initial:
            matched = (
                not last_is_common
                and firstname2.startswith(firstname)
                and lastname2 == lastname
            )
        else:
            matched = (
                textdistance.levenshtein(firstname, firstname2) <= 1
                and textdistance.levenshtein(lastname, lastname2) <= 1
            )

        if matched:
            match_records.append({'ID 1': id1, 'ID 2': id2})

new_data = pd.DataFrame(match_records, columns=['ID 1', 'ID 2'])

In [36]:
# Stack the notes-side records under the master records so the next cell can look up
# either side of a match by 'unique_id' in one frame.
# NOTE(review): this rebinds `master`, so re-running the cell appends `data` again; and
# because ignore_index is not set, index labels from the two frames may repeat.
master = pd.concat([master, data])

In [38]:
# Lay the matches out for manual review: for every matched pair, the record(s) for ID 1,
# then the record(s) for ID 2, then one blank separator row.
#
# Rows are collected as plain records and the frame is built once. DataFrame.append was
# removed in pandas 2.0, and appending an empty pd.Series() relied on a deprecated dtype.
review_records = []
for id1, id2 in zip(new_data['ID 1'], new_data['ID 2']):
    review_records.extend(master[master['unique_id'] == id1].to_dict("records"))
    review_records.extend(master[master['unique_id'] == id2].to_dict("records"))
    review_records.append({})  # every column missing -> a blank separator row

final_data = pd.DataFrame(review_records, columns=master.columns)


In [71]:
# NOTE(review): among the cells shown, `victoria` is first assigned further down (by a
# read_excel cell), so this export fails on a fresh top-to-bottom run; it presumably
# belongs after the birth-year filter — confirm.
victoria.to_excel("victoria_11_14_20_3.xlsx")

In [23]:
# Reload a previously saved match table; this replaces the `new_data` built above.
# The next cell reads its 'ID 1' and 'ID 2' columns.
new_data = pd.read_excel("ner_10_24_20.xlsx")

In [47]:
# Build the table used to check matches by hand: one row per (notes name, master record)
# pair, with the notes-side name and file next to the master record's census details.
# 'Match?' is left empty for the reviewer to fill in.
#
# Rows are collected in a list and the frame is built once; DataFrame.append was removed
# in pandas 2.0 and re-copied the whole frame on every call.

# master columns copied unchanged into the review table
MASTER_DETAIL_COLUMNS = ['Census.Year', 'State/Province', 'County',
                         'Place', 'Household Joint ID', 'Joint ID for Matched Records',
                         'CalculatedBirthYear', 'Age', 'Sex',
                         'Color..Race.or.Ethnicity']
REVIEW_COLUMNS = (['ID 1', 'ID 2', 'Name 1', 'First Name 2', 'Last Name 2', 'Source 1', 'Source 2']
                  + MASTER_DETAIL_COLUMNS + ['Match?'])

review_rows = []
for id1, id2 in zip(new_data['ID 1'], new_data['ID 2']):
    # .values[0] takes the first record with that ID; it raises IndexError if the ID is absent
    notes_hit = data[data['ID #'] == id1]
    master_hit = master[master['unique_id'] == id2]

    review_row = {
        'ID 1': id1,
        'ID 2': id2,
        'Name 1': notes_hit['name'].values[0],
        'First Name 2': master_hit['First.Name'].values[0],
        'Last Name 2': master_hit['Last.Name'].values[0],
        'Source 1': notes_hit['file_name'].values[0],
        'Source 2': master_hit['source'].values[0],
    }
    for column in MASTER_DETAIL_COLUMNS:
        review_row[column] = master_hit[column].values[0]
    review_rows.append(review_row)

final_data = pd.DataFrame(review_rows, columns=REVIEW_COLUMNS)

In [48]:
# Preview the first few rows of the review table.
final_data.head()

Unnamed: 0,ID 1,ID 2,Name 1,First Name 2,Last Name 2,Source 1,Source 2,Census.Year,State/Province,County,Place,Household Joint ID,Joint ID for Matched Records,CalculatedBirthYear,Age,Sex,Color..Race.or.Ethnicity,Match?
0,51756,2491,AR Abbott,Anderson,Abbott,Anderson Ruffin Abbott Papers TRL.docx,USCT,1864,,,,118,1311,1842,22,M,,
1,51756,50952,AR Abbott,Anderson R,Abbott,Anderson Ruffin Abbott Papers TRL.docx,1881 Canada,1881,Ontario,,Toronto,118,1317,1877,4,M,,
2,51756,50937,AR Abbott,Anderson Ruffin,Abbott,Anderson Ruffin Abbott Papers TRL.docx,1861 Canada,1861,Ontario,Toronto St Johns Ward,Toronto,118,1311,1838,23,M,,
3,51756,50943,AR Abbott,Anderson Ruffin,Abbott,Anderson Ruffin Abbott Papers TRL.docx,1871 Canada,1871,Ontario,Toronto St James Ward,Toronto,118,1311,1836,35,M,African,
4,51756,2122,AR Abbott,ANDERSONR,ABBOTT,Anderson Ruffin Abbott Papers TRL.docx,16890,1920,NY,BRONX,BRONX,118,1317,1879,41,M,M,
5,51756,2486,AR Abbott,ARON,ABBOTT,Anderson Ruffin Abbott Papers TRL.docx,519,1860,MI,WAYNE,MONGUAGON,,,1829,31,M,B,
6,55891,2491,A.R. Abbott,Anderson,Abbott,Globe Democrat from Guylaine.txt,USCT,1864,,,,118,1311,1842,22,M,,
7,55891,50952,A.R. Abbott,Anderson R,Abbott,Globe Democrat from Guylaine.txt,1881 Canada,1881,Ontario,,Toronto,118,1317,1877,4,M,,
8,55891,50937,A.R. Abbott,Anderson Ruffin,Abbott,Globe Democrat from Guylaine.txt,1861 Canada,1861,Ontario,Toronto St Johns Ward,Toronto,118,1311,1838,23,M,,
9,55891,50943,A.R. Abbott,Anderson Ruffin,Abbott,Globe Democrat from Guylaine.txt,1871 Canada,1871,Ontario,Toronto St James Ward,Toronto,118,1311,1836,35,M,African,


In [49]:
# Save the review table to a workbook in the current working directory.
final_data.to_excel("ner_matches_tocheck_10_29_20.xlsx")

In [41]:
# Load the saved victoria match layout; the next cell reads its 'CalculatedBirthYear' column
# and steps through the rows three at a time.
victoria = pd.read_excel("victoria_11_14_20.xlsx")

In [70]:
# For the victoria files we have birth years, so drop any match whose two birth years
# are too far apart to be the same person.
#
# The rows are read in groups of three: record 1, record 2, then a separator row.
# NOTE(review): this assumes every match occupies exactly three rows — confirm against
# the cell that wrote the file.
#
# The years are converted with pd.to_numeric instead of being tested with
# isinstance(year, int). Values read from a DataFrame are numpy integers or floats, not
# Python ints, so the old test was normally False and no match was ever dropped.
# Non-numeric entries become NaN and are skipped.
MAX_BIRTH_YEAR_GAP = 10   # years
ROWS_PER_MATCH = 3

birth_years = pd.to_numeric(victoria['CalculatedBirthYear'], errors='coerce')

to_drop = []
# stop at len - 1 so that position i + 1 always exists, even if the last group is incomplete
for i in range(0, len(victoria) - 1, ROWS_PER_MATCH):
    year1 = birth_years.iloc[i]
    year2 = birth_years.iloc[i + 1]
    if pd.notna(year1) and pd.notna(year2) and abs(year1 - year2) > MAX_BIRTH_YEAR_GAP:
        # drop both records and their separator (the slice is shorter at the very end)
        to_drop.extend(victoria.index[i:i + ROWS_PER_MATCH])
victoria = victoria.drop(to_drop)