# Importing required libraries

In [None]:
from pymed import PubMed
from string import punctuation
import copy
import datetime
import pandas as pd
from unidecode import unidecode
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from fuzzywuzzy import fuzz
import csv

# PubMed Extraction

### Cell to limit the number of articles extracted from the results

In [None]:
MaxResults= 2000

### Main code for extraction

In [1]:

SearchTerms = 'Acute Myeloid Leukemia'



pubmed = PubMed(tool="MyTool", email="saliugiwaosagie@gmail.com")

results = pubmed.query(SearchTerms, max_results=MaxResults)



results_dict = {}

count = 0
for article in results:
    count +=1
    results_dict.update({article.pubmed_id: {}})
    results_dict[article.pubmed_id]['PMID'] = article.pubmed_id[:9].strip()
    results_dict[article.pubmed_id]['title'] = article.title
    #results_dict[article.pubmed_id]['abstract'] = article.abstract
    results_dict[article.pubmed_id]['authors'] = article.authors
    
author_extraction = []
extraction_publication_dict = {}

prefix = 'extract_'

x = 0

for article in results_dict:
    extraction_publication_dict[results_dict[article]['PMID']] = results_dict[article]['title']
    
    for author in results_dict[article]['authors']:
        if author['lastname'] != None:
            x +=1
            current = []
            first_name_split = None

            current.append(prefix + f'{x:05}')
            if author['firstname'] != None:
                first_name_split = author['firstname'].split(' ')
                if len(first_name_split) == 1:
                    current.append(first_name_split[0])
                    current.append('')
                else:
                    current.append(first_name_split[0])
                    current.append(' '.join(first_name_split[1:]))
            else:
                current.append('')
                current.append('')
            current.append(author['lastname'])
            if author['firstname'] != None:
                current.append(' '.join([author['firstname'], author['lastname']]))
            else:
                current.append(author['lastname'])
            if 'affiliation' in author and author['affiliation'] != None:
                current.append(author['affiliation'])
            else:
                current.append('')
            current.append(results_dict[article]['PMID'][:9].strip())
            author_extraction.append(current)

print(f'Finished extraction. {count} articles. {x} records') 

Finished extraction my g. 2000 articles. 18375 records


### PubMed extraction export

In [303]:
results_list = [['ID', 'First Name', 'Middle Name', 'Last Name', 'Full Name', 'Affiliation', 'Activity ID' ]] + author_extraction
with open('Python Pubmed Extraction.csv','w', newline='', encoding = "UTF-8") as dunno:
    writer = csv.writer(dunno)
    writer.writerows(results_list)

# Defining functions used in tests

### Test for initials

In [3]:
def initial_tester(name_1, name_2):
    
    #Setting variables
    match = False
    first_single_letters = False
    middle_single_letters = False
    first_present = False
    middle_present = False
    short_name = None
    long_name = None
    
    first_length_1 = len(name_1[0])
    first_length_2 = len(name_2[0])
    
    middle_length_1 = len(name_1[1])
    middle_length_2 = len(name_2[1])
    
       
    last_name_1 = name_1[2]
    last_name_2 = name_2[2]
    
    two_letter_and_hyphen = False
    
    #Test to see if first name with hyphens could be the same. Common in Korean names. e.g.,  s-w vs see-woo
    if '-' in name_1[0] and '-' in name_2[0]:
        if len(name_1[0]) == 3 and len(name_2[0]) > 3:
            short_name = name_1
            long_name = name_2
        elif len(name_2[0]) == 3 and len(name_1[0]) > 3:
            short_name = name_2
            long_name = name_1
        
            
    if (short_name != None
        and short_name[0][0] == long_name[0][0]
        and '-' + str(short_name[0][2]) in long_name[0]
        and name_1[2] == name_2[2]):
        
        match = True
        
   #Test to see if two letter initals have been used in place of first name with hypen. e.g, JP  and Jean-Paul
    if '-' in name_1[0] and '-' not in name_2[0] and len(name_2[0]) == 2:
        hyphenated_first_name = name_1[0]
        non_hyphenated_first_name = name_2[0]
        two_letter_and_hyphen = True
    elif '-' in name_2[0] and '-' not in name_1[0] and len(name_1[0]) ==2:
        hyphenated_first_name = name_2[0]
        non_hyphenated_first_name = name_1[0]
        two_letter_and_hyphen = True
        
    if two_letter_and_hyphen == True:
        if (non_hyphenated_first_name[0] == hyphenated_first_name[0]
            and '-' + str(non_hyphenated_first_name[1]) in hyphenated_first_name
            and name_1[2] == name_2[2]):
            match = True


    #Test to check if initials between names are the same
    if (first_length_1 > 0
        and first_length_2 > 0):
        first_present = True
        first_initial_1 = name_1[0][0]
        first_initial_2 = name_2[0][0]
    
    if (middle_length_1 > 0
        and
        middle_length_2 > 0):
        middle_present = True
        middle_initial_1 = name_1[1][0]
        middle_initial_2 = name_2[1][0]  
        if (middle_length_1 == 1
            or
            middle_length_2 == 1):
            middle_single_letters = True  
    
    if (first_length_1 <= 1
        or
        first_length_2 <= 1):
        first_single_letters = True
        
    if first_present == True:
        if middle_present == True:
            if first_single_letters == True and middle_single_letters == True:
                if (first_initial_1 == first_initial_2
                    and
                    middle_initial_1 == middle_initial_2
                    and
                    last_name_1 == last_name_2):
                    match = True
        else:
            if first_single_letters == True:
                if (first_initial_1 == first_initial_2
                    and
                    last_name_1 == last_name_2):            
                    match = True
        
    
    return match

### Test for hyhenated names

In [4]:
def hyphen_tester(name_1, name_2):
    prename_there = False
    other_names_there = False
    other_name_part = []
    concat_name_1 = ' '.join(name_1).strip()
    concat_name_2 = ' '.join(name_2).strip()
    if ('-' in concat_name_1 and '-' in concat_name_2) or ('-' not in concat_name_1 and '-' not in concat_name_2):
        return False
    else:        
        if '-' in concat_name_1:
            hyphenated_name = name_1
            non_hyphenated_name = name_2
        elif '-' in concat_name_2:
            hyphenated_name = name_2
            non_hyphenated_name = name_1
        
        position = -1
        for name in hyphenated_name:
            position +=1
            if '-' in name:
                hyphenated_name_part = name
                final_position = position
            else:
                other_name_part.append(name)
            
        prename = hyphenated_name_part[:hyphenated_name_part.find('-')]
        
        position_2 = -1
        for name in non_hyphenated_name:
            position_2 +=1
            if prename == name:
                prename_there = True
                final_position_2 = position_2
        
        other_name_position_1 = 0
        other_name_position_2 = 0
        for name in other_name_part:
            other_name_position_1 += 1
            for name2 in non_hyphenated_name:
                other_name_position_2 += 1
                if name == name2 and name != '' and len(name) > 1 and other_name_position_1 == other_name_position_2:
                    other_names_there = True
            
    
        if other_names_there == True and prename_there == True and final_position == final_position_2 and len(prename) > 1:
            return True
        else:
            return False
        
        
        

### Function for testing common words between affilaitions

In [7]:
stopwords = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", 'the', "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]

additional_stopwords = ['university', 'department']

stopwords.extend(additional_stopwords)

stopwords = sorted(stopwords)


def custom_affiliation_tester(affiliation_1, affiliation_2):
    if affiliation_1 == '' or affiliation_2 == '':
        return 0
    else:
        affiliation_1 = affiliation_1.lower()
        affiliation_2 = affiliation_2.lower()

        processed = []

        for aff in [affiliation_1,  affiliation_2]:
            yo = copy.deepcopy(aff)
            for punc in punctuation:
                if punc in aff:
                    yo = yo.replace(punc, "")
            processed.append(yo)

        affiliation_1 = processed[0]
        affiliation_2 = processed[1]



        affiliation_1 = sorted(list(set(affiliation_1.split())))
        affiliation_2 = sorted(list(set(affiliation_2.split())))


        processed2 = []

        for aff in [affiliation_1,  affiliation_2]:
            yo = copy.deepcopy(aff)
            for word in aff:
                for stopword in stopwords:
                    if stopword == word:
                        yo.remove(word)
            processed2.append(yo)

        affiliation_1 = processed2[0]
        affiliation_2 = processed2[1]

        affiliation_1 = sorted(list(set(affiliation_1)))
        affiliation_2 = sorted(list(set(affiliation_2)))


        if len(affiliation_1) <= len(affiliation_2):
            shorter_affiliation = affiliation_1
            longer_affiliation = affiliation_2
        else:
            shorter_affiliation = affiliation_2
            longer_affiliation = affiliation_1

        count = 0
        for word in shorter_affiliation:
            for word2 in longer_affiliation:
                if word == word2:
                    count += 1

        final_count = count
        
        if len(shorter_affiliation) != 0:
            final_percentage = count/len(shorter_affiliation)*100
        else:
            final_percentage = 0
        

        return final_percentage


# Setting up data

### Setting variables for indexes

In [2]:
ID_index = 0
first_name_index = 1
middle_name_index = 2
last_name_index = 3
full_name_index = 4
affiliation_index = 5
activity_index = 6
name_uniqueness_index = 7
first_name_uniqueness_index = 8
new_id_index = 9

### Limit how many names are used in the test

In [None]:
start = 0
stretch = 5000
limit = start + stretch

### Data Source 1 - PubMed extraction

#### Run this cell if data to be used is from the above PubMed extraction. Ignore the 'Data Source 2 - External CSV' cell

In [None]:
author_extraction = [x[:7] for x in author_extraction]
if len(author_extraction) < limit:
    names4 = author_extraction
else:
    names4 = author_extraction[start:limit]
    
    
publication_dict = extraction_publication_dict

### Data Source 2 - External CSVs

#### Run this cell if data to be used is from the 'Names.csv' and 'Publication Titles.csv' files. Ignore the 'Data Source 1 - PubMed extraction' cell

In [None]:
with open('Names.csv', encoding = "ISO-8859-1") as f:
    reader = csv.reader(f)
    names3 = list(reader)
    
if len(names3) < limit:
    names4 = names3
else:
    names4 = names3[start:limit]
    
    
with open('Publication Titles.csv', encoding = "utf-8-sig") as f:
    reader = csv.reader(f)
    publication_list = list(reader)
    
    
publication_dict = {}
for x in publication_list:
    publication_dict[x[0]] = x[1]

# Standardisation of author records

### Tests

In [98]:
%%time


#Preprocess names - set to lowercase and remove special characters and accents etc.
for x in names4:
    x[full_name_index] = unidecode(x[full_name_index].lower())
    x[first_name_index] = unidecode(x[first_name_index].lower())
    x[middle_name_index] = unidecode(x[middle_name_index].lower())
    x[last_name_index] = unidecode(x[last_name_index].lower())

#Set name uniqueness
first_last_names4 = [(x[first_name_index], x[last_name_index]) for x in names4 if len(x[first_name_index]) > 1]
first_last_names4 = set(first_last_names4)
first_last_names4 = list(first_last_names4)


for x in names4:
    if x[last_name_index] in [y[1] for y in first_last_names4]:
        x.append([y[1] for y in first_last_names4].count(x[last_name_index])** -1)
    else:
        x.append(1.0)
    
for x in names4:
    if x[first_name_index] in [y[0] for y in first_last_names4]:
        x.append([y[0] for y in first_last_names4].count(x[first_name_index])** -1)
    else:
        x.append(1.0)
    
print('Finished preprocessing...', datetime.datetime.now().time())
#Populate dictionary    
IDs = list(range(len(names4)))
tuple_records = list(zip(IDs, names4))

print('Data set up...', datetime.datetime.now().time())

#######Cosine similarity calculations etc
all_names = [x[1][full_name_index] for x in tuple_records]
all_names_labels = [(y, x[1][full_name_index]) for (y, x) in enumerate(tuple_records)]

yup = TfidfVectorizer(analyzer='char', ngram_range=(2,2))

yup.fit(all_names)

print('Vectorizer fit...', datetime.datetime.now().time())

cosine_similarities = cosine_similarity(yup.fit_transform(all_names))

print('Cosine similarities calculated...', datetime.datetime.now().time())


cos_threshold = 0.3


yikes = np.nonzero(cosine_similarities>cos_threshold)

print('Calculating nonzeros...', datetime.datetime.now().time())

cosine_list = [(i, j) for i, j in zip(yikes[0], yikes[1])]

print('Cosine list made...', datetime.datetime.now().time())


#####Setting affiliation cosine similarities

all_affiliations = [x[1][affiliation_index] for x in tuple_records]


affiliation_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), stop_words='english')
affiliation_vectorizer.fit(all_affiliations)
print('Affiliation vectorizer fit...', datetime.datetime.now().time())

affiliation_cosine_similarities = cosine_similarity(affiliation_vectorizer.fit_transform(all_affiliations))
print('Affiliation cosine similarities calculated...', datetime.datetime.now().time())

affiliation_nonzeroes = np.nonzero(affiliation_cosine_similarities>0.7)
print('Calculating affiliation nonzeroes...', datetime.datetime.now().time())

affiliation_cosine_list = [(i, j) for i, j in zip(affiliation_nonzeroes[0], affiliation_nonzeroes[1])]
print('Affiliation cosine list made...', datetime.datetime.now().time())

### And again for a second vectorizer

affiliation_vectorizer2 = TfidfVectorizer(analyzer='char', ngram_range=(2, 10))
affiliation_vectorizer2.fit(all_affiliations)
print('Affiliation vectorizer 2 fit...', datetime.datetime.now().time())

affiliation_cosine_similarities2 = cosine_similarity(affiliation_vectorizer2.fit_transform(all_affiliations))
print('Affiliation cosine similarities 2 calculated...', datetime.datetime.now().time())

affiliation_nonzeroes2 = np.nonzero(affiliation_cosine_similarities2>0.7)
print('Calculating affiliation 2 nonzeroes...', datetime.datetime.now().time())

affiliation_cosine_list2 = [(i, j) for i, j in zip(affiliation_nonzeroes2[0], affiliation_nonzeroes2[1])]
print('Affiliation cosine list 2 made...', datetime.datetime.now().time())




######Below are all the tests etc

#Thresholds and variables
matches = []
match = []
predicted = {}
actual = {}
name_match_record = {}


affiliation_threshold_1 = 0.1
affiliation_threshold_2 = 0.25

print()


################# The tests after filtering for cosine simialrities above the threshold####
progress = 0
current_i = None
for record in cosine_list:

    i = record[0]
    j = record[1]
    if i != current_i:
        progress += 1
        if (progress/500) == int(progress/500):
            print(f'Done {progress}...', datetime.datetime.now().time())
    current_i = i
    this_match = False
    reason = None
    name_match_reason = None
    affiliation_score = None
    affiliation_score_2 = None
    both_affiliations_present = False
    name_score = None
    space_dash = True
    first_name_score = None
    last_name_score = None
    network_reason = None
    single_letter_initials = True
    name_match_count = 0
    aff_test = False
    additional_name_1 = None
    additional_name_2 = None
    additional_name_score = None
    last_name_used = False
    if (i != j
        and (
            (tuple_records[i][1][activity_index] != tuple_records[j][1][activity_index])
            or (tuple_records[i][1][activity_index] == tuple_records[j][1][activity_index] and (tuple_records[i][1][first_name_index], tuple_records[i][1][last_name_index]) == (tuple_records[j][1][first_name_index], tuple_records[j][1][last_name_index]))
        )):
        predicted.update({(i, j): [False, reason]})
        name_match_record.update({(i, j): [False, name_match_reason]})
        if tuple_records[i][1][ID_index] == tuple_records[j][1][ID_index]:
            actual.update({(i, j): True})
            actual_value = True
        else:
            actual.update({(i, j): False})
            actual_value = False

        ### Preliminary tests to set some variables used in future tests
        if tuple_records[i][1][affiliation_index] != '' and tuple_records[j][1][affiliation_index] != '':
            both_affiliations_present = True

        if len(tuple_records[i][1][first_name_index]) > 1 and len(tuple_records[j][1][first_name_index]) > 1:
            single_letter_initials = False

        if additional_name_1 == tuple_records[i][1][last_name_index] and additional_name_2 == tuple_records[j][1][last_name_index]:
            if (' ' not in additional_name_1
                and '-' not in additional_name_1
                and ' ' not in additional_name_2
                and '-' not in additional_name_2):
                space_dash = False
        
        #Setting affiliation cosine similarity score        
        affiliation_score_1 = affiliation_cosine_similarities[i, j]
        affiliation_score_2 = affiliation_cosine_similarities2[i, j]


        #Exact Match
        if this_match == False and single_letter_initials == False:
            if tuple_records[i][1][full_name_index] == tuple_records[j][1][full_name_index]:
                this_match = True
                reason = 'exact_match'
                network_reason = 'exact_match'

        #First name and Last Name match but different middle name (middle names either completely different or one is missing)
        if this_match == False and single_letter_initials == False and reason == None and aff_test == False:
            if (tuple_records[i][1][first_name_index], tuple_records[i][1][last_name_index]) == (tuple_records[j][1][first_name_index], tuple_records[j][1][last_name_index]):
                if tuple_records[i][1][middle_name_index] == '' or tuple_records[j][1][middle_name_index] == '':
                    if tuple_records[i][1][name_uniqueness_index] <= 1/5 or tuple_records[j][1][name_uniqueness_index] <= 1/5:
                        aff_test = True #Because names are common
                        reason = 'FN_LN_match'
                        network_reason = 'FN_LN_match'                        
                    else:
                        this_match = True
                        reason = 'FN_LN_match'
                        network_reason = 'FN_LN_match'
                else: #both have a middle name
                    if len(tuple_records[i][1][middle_name_index]) > 1 and len(tuple_records[j][1][middle_name_index]) > 1:
                        aff_test = True #because middle names are different
                        reason = 'FN_LN_match'
                        network_reason = 'FN_LN_match'
                    else:#only one has multiple letters
                        if tuple_records[i][1][middle_name_index][0] == tuple_records[j][1][middle_name_index][0]:
                            if tuple_records[i][1][name_uniqueness_index] <= 1/5 or tuple_records[j][1][name_uniqueness_index] <= 1/5:
                                aff_test = True #because middle names are different
                                reason = 'FN_LN_match'
                                network_reason = 'FN_LN_match'  
                            else:
                                this_match = True #only one has multiple letters but first letter of middle name is the same
                                reason = 'FN_LN_match'
                                network_reason = 'FN_LN_match'
                             
                                

        #First and last names match but reversed
        if this_match == False and single_letter_initials == False and reason == None and aff_test == False:
            if tuple_records[i][1][name_uniqueness_index] <= 1/5 or tuple_records[j][1][name_uniqueness_index] <= 1/5:
                if ((tuple_records[i][1][first_name_index], tuple_records[i][1][last_name_index]) == (tuple_records[j][1][last_name_index], tuple_records[j][1][first_name_index])):
                    aff_test = True
                    reason = 'FN_LN_reversed'
                    network_reason = 'FN_LN_reversed'                
            else:
                if ((tuple_records[i][1][first_name_index], tuple_records[i][1][last_name_index]) == (tuple_records[j][1][last_name_index], tuple_records[j][1][first_name_index])):
                    this_match = True
                    reason = 'FN_LN_reversed'
                    network_reason = 'FN_LN_reversed'


        #############Fuzzy check now#############
        if this_match == False and reason == None and aff_test == False:
            name_score = fuzz.token_sort_ratio(tuple_records[i][1][full_name_index], tuple_records[j][1][full_name_index])
            first_name_score = fuzz.partial_ratio(tuple_records[i][1][first_name_index], tuple_records[j][1][first_name_index])
            last_name_score = fuzz.partial_ratio(tuple_records[i][1][last_name_index],tuple_records[j][1][last_name_index])
            first_name_score_whole = fuzz.ratio(tuple_records[i][1][first_name_index], tuple_records[j][1][first_name_index])
            last_name_score_whole = fuzz.ratio(tuple_records[i][1][last_name_index],tuple_records[j][1][last_name_index])



        #Names jumbled
        if this_match == False and reason == None and aff_test == False:
            if name_score == 100:
                this_match = True
                reason = 'names_jumbled'
                network_reason = 'names_jumbled'


        #Fuzzy
        if this_match == False and single_letter_initials == False and reason == None and aff_test == False:
            if name_score >= 95 or (first_name_score_whole >= 85 and last_name_score_whole >= 85):
                aff_test = True
                reason = 'fuzzy_level_1'
                network_reason = 'fuzzy_level_1'
            elif name_score >= 85:
                if (first_name_score + last_name_score >= 170
                    and first_name_score >= 70
                    and last_name_score >= 70):
                    reason = 'fuzzy_level_2'
                    network_reason = 'fuzzy_level_2'
            elif name_score >= 70:
                if (first_name_score + last_name_score >= 150
                    and first_name_score >= 50
                    and last_name_score >= 50):
                    reason = 'fuzzy_level_3'



        #Initials
        if this_match == False and reason == None and aff_test == False:
            if initial_tester(tuple_records[i][1][first_name_index:last_name_index + 1],
                              tuple_records[j][1][first_name_index:last_name_index + 1]) == True:
                if tuple_records[i][1][name_uniqueness_index] >= 0.5:
                    this_match = True
                    reason = 'initials'
                    network_reason = 'initials'
                else:
                    aff_test = True
                    network_reason = 'initials'
                    reason = 'initials'

        #Hyphen in name
        if this_match == False and reason == None and aff_test == False:
            if hyphen_tester(tuple_records[i][1][first_name_index:last_name_index + 1],
                              tuple_records[j][1][first_name_index:last_name_index + 1]) == True:
                aff_test = True
                network_reason = 'hyphen'
                reason = 'hyphen'


        #Some names appear
        name_match_count = 0
        additional_name_score = None
        if this_match == False and single_letter_initials == False and aff_test == False:
            for name in tuple_records[i][1][first_name_index:last_name_index + 1]:
                for name2 in tuple_records[j][1][first_name_index:last_name_index + 1]:
                    if name == name2 and name != '' and len(name) > 1:
                        name_match_count += 1 
            if name_match_count >= 2:
                reason = 'some_names'
                network_reason = 'some_names'
                for add_name in tuple_records[i][1][first_name_index:last_name_index + 1]:
                    if add_name not in tuple_records[j][1][first_name_index:last_name_index + 1]:
                        additional_name_1 = add_name
                for add_name2 in tuple_records[j][1][first_name_index:last_name_index + 1]:
                    if add_name2 not in tuple_records[i][1][first_name_index:last_name_index + 1]:
                        additional_name_2 = add_name2

                additional_name_score = fuzz.partial_ratio(additional_name_1, additional_name_2)

                if additional_name_score == 100 and both_affiliations_present == False:
                    this_match = True
                elif additional_name_score > 80:
                    aff_test = True
                elif additional_name_1 == '' or additional_name_2 == '':
                    aff_test = True



        #Names contained
        if this_match == False and aff_test == False:
            if space_dash == True:
                if (first_name_score == 100 and last_name_score == 100
                    and len(tuple_records[i][1][first_name_index]) >3
                    and len(tuple_records[j][1][first_name_index]) >3
                    and len(tuple_records[i][1][last_name_index])  >3
                    and len(tuple_records[j][1][last_name_index])  >3):
                    aff_test = True
                    reason = 'names_contained'
                    network_reason = 'names_contained'
            else:
                if (first_name_score == 100 and tuple_records[i][1][last_name_index] == tuple_records[j][1][last_name_index]
                    and len(tuple_records[i][1][first_name_index]) >3
                    and len(tuple_records[j][1][first_name_index]) >3
                    and len(tuple_records[i][1][last_name_index])  >3
                    and len(tuple_records[j][1][last_name_index])  >3):
                    aff_test = True
                    reason = 'names_contained'
                    network_reason = 'names_contained'
                



        #Affiliation tests and final decisions now
        if this_match == False and aff_test == True and both_affiliations_present == True:



            if reason == 'FN_LN_match':
                if affiliation_score_1 >= affiliation_threshold_1 or affiliation_score_2 >= affiliation_threshold_2: this_match = True
                    
            if reason == 'FN_LN_reversed':
                if affiliation_score_1 >= affiliation_threshold_1 or affiliation_score_2 >= affiliation_threshold_2: this_match = True

            if reason == 'fuzzy_level_1':
                if affiliation_score_1 >= affiliation_threshold_1 or affiliation_score_2 >= affiliation_threshold_2: this_match = True

            elif reason == 'initials':
                if affiliation_score_1 >= affiliation_threshold_1 or affiliation_score_2 >= affiliation_threshold_2: this_match = True

            elif reason == 'hyphen':
                if affiliation_score_1 >= affiliation_threshold_1 or affiliation_score_2 >= affiliation_threshold_2: this_match = True

            elif reason == 'one_name':
                if affiliation_score_1 >= affiliation_threshold_1 or affiliation_score_2 >= affiliation_threshold_2: this_match = True

            elif reason == 'some_names':
                if affiliation_score_1 >= affiliation_threshold_1 or affiliation_score_2 >= affiliation_threshold_2: this_match = True

            elif reason == 'names_contained':
                if affiliation_score_1 >= affiliation_threshold_1 or affiliation_score_2 >= affiliation_threshold_2: this_match = True
                    
        
        if (this_match == True
            and
            (reason == 'initials' or (reason == 'exact_match' and tuple_records[i][1][name_uniqueness_index] <1/2))
            and
            affiliation_score_1 <= 0.1
            and
            affiliation_score_2 <= 0.1
            and
            both_affiliations_present == True):
            affiliation_score_3 = custom_affiliation_tester(tuple_records[i][1][affiliation_index], 
                                                           tuple_records[j][1][affiliation_index])
            if reason == 'initals':
                if affiliation_score_3 < 16.6:
                    this_match = False
            elif reason == 'exact_match':
                this_match = False
            





        if this_match == True:
            predicted.update({(i, j): [True, reason]})
            match = [tuple_records[i][0], tuple_records[j][0]]
            match.sort()
            match.append(name_score)
            match.append(affiliation_score)
            match.append(actual_value)
            match.append(first_name_score)
            match.append(last_name_score)
            if matches == [] or match not in matches:
                matches.append(match)
        else:
            predicted.update({(i, j): [False, reason]})

        if network_reason != None:
            name_match_record.update({(i, j): [True, network_reason]})
                

            
            
#_________________End of match assessment - the below is just for recording the results.__________________________________

correct_count = 0
for key in actual:
    if actual[key] == predicted[key][0]:
        correct_count = correct_count + 1
        
overall_accuracy = correct_count / len(actual)
   
    
true_positives = 0
for key in actual:
    if actual[key] == predicted[key][0] and actual[key] == True:
        true_positives = true_positives + 1
        
true_total = sum(value == True for value in actual.values())
if true_total > 0:
    true_positive_percentage = true_positives / true_total
else:
    true_positive_percentage = None


false_positives = 0
for key in actual:
    if actual[key] != predicted[key][0] and actual[key] == False:
        false_positives = false_positives + 1     

false_total = sum(value == False for value in actual.values())
if false_total > 0:
    false_positive_percentage = false_positives / false_total
else:
    false_positive_percentage = None
    
    

false_negatives = 0
for key in actual:
    if actual[key] != predicted[key][0] and actual[key] == True:
        false_negatives = false_negatives + 1     

if true_total > 0:
    false_negative_percentage = false_negatives / true_total
else:
    false_negative_percentage = None



print()
print('{} matches from {} records. {} comparisons'.format(len(matches), len(names4), len(actual)), '\n')

# print('Accuracy of {}'.format(overall_accuracy), '\n')
# print('True positives = {}. True positive percentage = {}'.format(true_positives, true_positive_percentage), '\n')
# print('False positives = {}. False positive percentage = {}'.format(false_positives, false_positive_percentage), '\n')
# print('False negatives = {}. False negative percentage = {}'.format(false_negatives, false_negative_percentage), '\n')

12:59:03.320473
Finished preprocessing... 12:59:14.625448
Data set up... 12:59:14.637414
Vectorizer fit... 12:59:14.769063
Cosine similarities calculated... 12:59:15.663816
Calculating nonzeros... 12:59:15.802438
Cosine list made... 12:59:15.904166
Affiliation vectorizer fit... 12:59:16.638535
Affiliation cosine similarities calculated... 12:59:17.732133
Calculating affiliation nonzeroes... 12:59:17.852811
Affiliation cosine list made... 12:59:17.876748
Affiliation vectorizer 2 fit... 12:59:25.487163
Affiliation cosine similarities 2 calculated... 12:59:44.605721
Calculating affiliation 2 nonzeroes... 12:59:44.717421
Affiliation cosine list 2 made... 12:59:44.743351

Done 500... 12:59:47.139947
Done 1000... 12:59:50.300506
Done 1500... 12:59:53.194688
Done 2000... 12:59:55.243243
Done 2500... 12:59:57.460284
Done 3000... 13:00:00.344602
Done 3500... 13:00:02.636447
Done 4000... 13:00:05.160729
Done 4500... 13:00:07.078571
Done 5000... 13:00:09.136773

923 matches from 5000 records. 244

### Standardisation results export

In [408]:

false_positives_export_list = (
                                [['ID 1', 'First Name 1', 'Middle Name 1', 'Last Name 1', 'Affiliation 1', 'Name Uniqueness 1',
                               'ID 2', 'First Name 2', 'Middle Name 2', 'Last Name 2', 'Affiliation 2', 'Name Uniqueness 2',
                               'Reason', 'Affiliation Cosine Similarity', 'Match Status']]
                              )
for key in actual:
    if (actual[key] != predicted[key][0]
        and actual[key] == False):
        false_positives_export_list.append([tuple_records[key[0]][1][0],
                                            tuple_records[key[0]][1][1],
                                            tuple_records[key[0]][1][2],
                                            tuple_records[key[0]][1][3],
                                            tuple_records[key[0]][1][5],
                                            tuple_records[key[0]][1][7],
                                            tuple_records[key[1]][1][0],
                                            tuple_records[key[1]][1][1],
                                            tuple_records[key[1]][1][2],
                                            tuple_records[key[1]][1][3],
                                            tuple_records[key[1]][1][5],
                                            tuple_records[key[1]][1][7],
                                            predicted[key][1],
                                            affiliation_cosine_similarities[key[0], key[1]],
                                            'Match'
                                           ])


for key in actual:
    if (actual[key] == predicted[key][0]
        and actual[key] == False
        and predicted[key][1] != None):
        false_positives_export_list.append([tuple_records[key[0]][1][0],
                                            tuple_records[key[0]][1][1],
                                            tuple_records[key[0]][1][2],
                                            tuple_records[key[0]][1][3],
                                            tuple_records[key[0]][1][5],
                                            tuple_records[key[0]][1][7],
                                            tuple_records[key[1]][1][0],
                                            tuple_records[key[1]][1][1],
                                            tuple_records[key[1]][1][2],
                                            tuple_records[key[1]][1][3],
                                            tuple_records[key[1]][1][5],
                                            tuple_records[key[1]][1][7],
                                            predicted[key][1],
                                            affiliation_cosine_similarities[key[0], key[1]],
                                            'Near Match'
                                           ])


with open('Matching Export.csv','w', newline='', encoding = "UTF-8") as dunno:
    writer = csv.writer(dunno)
    writer.writerows(false_positives_export_list)

# Unification

### Unification of Author IDs

In [108]:
%%time
import copy
person_records = copy.deepcopy(tuple_records)

prefix = 'test_'

IDs_present = True

if IDs_present == False:
    for i in person_records:
        i[1][0] = prefix + i[1][0] 


for key in predicted:
    if predicted[key][0] == True:
        if len(person_records[key[0]][1]) == 9:
            person_records[key[0]][1].append(None)
        if len(person_records[key[1]][1]) == 9:
            person_records[key[1]][1].append(None)

for key in predicted:
    if predicted[key][0] == True:
        if person_records[key[0]][1][9] == None: #or person_records[key[0]][1][9] == person_records[key[0]][1][0]:
            
            if person_records[key[1]][1][9] == None:            
                person_records[key[0]][1][9] = person_records[key[0]][1][0]
                person_records[key[1]][1][9] = person_records[key[0]][1][0]
            else:
                person_records[key[0]][1][9] = person_records[key[1]][1][9]
        elif person_records[key[0]][1][9] != None and person_records[key[1]][1][9] != None:
            person_records[key[0]][1][9] = person_records[key[1]][1][9]
            for person in person_records:
                if len(person[1]) == 10 and person[1][9] == person_records[key[0]][1][9]:
                    person[1][9] = person_records[key[0]][1][9]
        else:
            person_records[key[1]][1][9] = person_records[key[0]][1][9]
            
for x in person_records:
    if len(x[1]) == 9:
        x[1].append(x[1][0])
            
person_records2 = copy.deepcopy(person_records)


Wall time: 5.73 s


### All IDs in the dataset and final IDs export

In [308]:
results_list = [['Original ID', 'First Name', 'Middle Name', 'Last Name', 'Full Name', 'Affiliation', 'Activity ID', 'Name Uniqueness', 'First Name Uniqueness', 'New ID']] + [x[1] for x in person_records]

with open('Names_results_Stage_1_visit.csv','w', newline='', encoding = "ISO-8859-1") as dunno:
    writer = csv.writer(dunno)
    writer.writerows(results_list)

### Attributing publications to Author IDs after unifying

In [110]:
activities_dict = {}
for i in person_records:
    current_activity = i[1][6]
    if current_activity not in activities_dict:
        activities_dict.update({current_activity: []})
        
    activities_dict[current_activity].append(i[1][9])

        

for x in activities_dict:
    activities_dict[x] = list(set(activities_dict[x]))

people = {}

for i in person_records:
    current_ID = i[1][9]
    if current_ID not in people:
        people.update({current_ID: {'numbers': [],
                                    'first_name': i[1][1],
                                    'middle_name': i[1][2],
                                    'last_name': i[1][3],
                                    'full_name': i[1][4],
                                    'original_IDs': [],
                                    'activities': [],
                                    'collaborators': []}
                      })
        
    people[current_ID]['numbers'].append(i[0])
    people[current_ID]['original_IDs'].append(i[1][0])
    people[current_ID]['activities'].append(i[1][6])
    people[current_ID]['collaborators'].extend(activities_dict[i[1][6]])
        
    
for x in people:
    people[x]['collaborators'] = list(set(people[x]['collaborators']))
    people[x]['collaborators'].remove(x)    
    
title_list = []
person_list = []
for person in people:
    title_list.append(people[person]['activities'])
    person_list.append(person)
    

final_title_list = []
for person_articles in title_list:
    building = []
    for article in person_articles:
        for x in publication_dict:
            if article == x:
                building.append(publication_dict[x])
    if building == []:
        final_title_list.append('No Title')
    else:
        final_title_list.append(' '.join(building))
        

# Identifying Similar Authors

### Setting parameters for author cosine simiarities

In [None]:
collab_threshold = 0 #Set to 0 if only cosine simiarities between authors collaborating on 0% of publications should be found
top_number = 5 #Number of 'closest' authors to be included in export
similarity_minimum = 0.2 #Minimum cosine similarity to be included in export

### Getting cosine similarities for author corpi

In [115]:
%%time
#######Cosine similarity calculations etc
all_titles = final_title_list


title_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1,2), stop_words='english' )
title_vectorizer.fit(all_titles)
print('Title vectorizer fit...', datetime.datetime.now().time())

title_cosine_similarities = cosine_similarity(title_vectorizer.fit_transform(all_titles))
print('Title cosine similarities calculated...', datetime.datetime.now().time())

Title vectorizer fit... 20:00:32.583286
Title cosine similarities calculated... 20:00:33.097641
Wall time: 771 ms


### Calculating new cosine simiarities after filtering out according to selected collab_threshold

In [118]:
%%time

collaboration_percentage_array = np.zeros((len(person_list), len(person_list)))
for x, y in np.ndenumerate(collaboration_percentage_array):
    collaboration_percentage_array[x[0], x[1]] = len(set(people[person_list[x[0]]]['activities'])&set(people[person_list[x[1]]]['activities'])) / len(set(people[person_list[x[0]]]['activities']))* 100
    
collaboration_threshold_array = np.zeros((len(person_list), len(person_list)))
for x in range(len(person_list)):
    for y in range(len(person_list)):
        if collaboration_percentage_array[x, y] <= collab_threshold:
            collaboration_threshold_array[x, y] = 1
            
filtered_title_cosine_similarities = np.multiply(title_cosine_similarities, collaboration_threshold_array)
            

Wall time: 1min 22s


### Preparing the author cosine similarity export

In [226]:
top_number = -top_number
similarity_export_list = [['ID', 'Full Name', 'Publications', 'Number of Publications', 'Similar to ID', 'Similar to Full Name', 'Similar to Publications', 'Cosine Similarity', 'Collaboration Percentage']]


for index, person in enumerate(person_list):
    example = index
    minimum_value = np.partition(filtered_title_cosine_similarities[example], top_number)[top_number]
    top5 = np.nonzero((filtered_title_cosine_similarities[example] >= minimum_value) & (filtered_title_cosine_similarities[example] > similarity_minimum))
    top5_values = filtered_title_cosine_similarities[example][top5]
    for top5_index, match_index  in enumerate(top5[0]):

        building = []
        pubs_building = []

        #same each time
        building.append(person)
        building.append(people[person]['full_name'])
        for pub in people[person]['activities']:
            pubs_building.append(publication_dict[pub])
        pubs_built = '\n'.join(pubs_building)
        building.append(pubs_built)
        building.append(len(people[person]['activities']))

        #different for each match        
        building.append(person_list[match_index])
        building.append(people[person_list[match_index]]['full_name'])
        pubs_building = []
        for pub in people[person_list[match_index]]['activities']:
            pubs_building.append(publication_dict[pub])
        pubs_built = '\n'.join(pubs_building)
        building.append(pubs_built)
        building.append(top5_values[top5_index])
        building.append(len(set(people[person]['activities'])&set(people[person_list[match_index]]['activities']))/len(people[person]['activities'])*100)

        similarity_export_list.append(building)
        
len(similarity_export_list)

with open('Filtered Similarity Export AML.csv','w', newline='', encoding = "UTF-8") as dunno:
    writer = csv.writer(dunno)
    writer.writerows(similarity_export_list)
    
# minimum_value = np.partition(filtered_title_cosine_similarities[example], -5)[-5]
# minimum_value_indexes = np.nonzero(filtered_title_cosine_similarities[example] >= minimum_value)
#cosine_list = [(i, j) for i, j in zip(yikes[0], yikes[1])]

12273