# Introduction

This notebook builds a name mangler for full names, based on the techniques described [here](https://www.rosette.com/blog/overview-fuzzy-name-matching-techniques/).

In [190]:
import random
import numpy as np
import pandas as pd
# from scipy.optimize import basinhopping
# from sklearn.metrics import mutual_info_score
from tqdm import tqdm
import time
import re

In [191]:
# Register tqdm's pandas integration (makes `progress_apply` available on pandas objects).
tqdm.pandas()

In [180]:
# Load the list of full names; the path is relative to the notebook's working directory.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which suggests the CSV
# carries a saved index — confirm, and consider reading it with index_col=0.
name_df = pd.read_csv("../../data/namelist.csv")

In [181]:
# Remove leading and trailing whitespace (there isn't any, but just in case)
name_df['Name'] = name_df['Name'].str.strip()
# Disabled: separating into firstname / lastname columns. The split is done
# per name inside mangle_names instead, so the frame keeps a single 'Name' column.
# name_df[['firstname', 'lastname']]=name_df['Name'].str.split(" ", expand=True)

In [182]:
# Preview the first two rows to check the loaded columns.
name_df.head(2)

Unnamed: 0,Name
0,Tristian Wunsch
1,Rosamond Klocko


# Dirtying ideas

Some of the cleanup methods mentioned [here](https://www.rosette.com/blog/overview-fuzzy-name-matching-techniques/) can be reversed to formulate methods for "dirtying" the names.

These include : 

1. Inserting / Deleting spaces, adding hyphens / periods 
    - Mary Ellen -> MaryEllen / Mary-Ellen / Mary. Ellen
<br><br>
2. Adding titles and honorifics, either at the start or at the end
    - Tim Jones -> Tim Jones Ph.D / Dr. Tim Jones / Mr. Tim Jones / Tim Jones, MD
<br><br>    
3. Adding a hyphen to surnames with two parts, or a hyphen after a vowel
    - Ronald McDonald -> Ronald Mc-Donald
    - Huang Bien -> Huang Bi-En
<br><br>

Any of these 3 methods can be called multiple times on the same name. For example : 

Ronald McDonald -*M2*-> Dr. Ronald McDonald -*M3*-> Dr. Ronald Mc-Donald -*M1*-> Dr. Ronald. Mc-Donald
-*M2*-> Dr. Ronald. Mc-Donald, MD

# Tracking correct names

To track what the original correct name was, we can keep a unique identifier field for each name (one per row in the data). We just need to ensure that the unique identifier isn't altered by the mangling.

# Implementation



In [185]:
def mangle_1(name_dict, cands = [".", "-"], remove_space_thresh=0.4):
    """Dirty the gap between first and last name (method 1).

    Either deletes the first whitespace character of ``name_dict['midgap']``
    or prepends one randomly chosen punctuation mark to it.

    Parameters
    ----------
    name_dict : dict
        Name parts. Only the ``'midgap'`` entry is read and overwritten;
        the dict is modified in place.
    cands : sequence of str
        Punctuation candidates; one is picked with ``random.choice``.
        The sequence itself is never mutated.
    remove_space_thresh : float
        The whitespace is removed when a ``random.uniform(0, 1)`` draw is
        below this value *and* the gap still contains a space. Otherwise
        punctuation is added.

    Returns
    -------
    dict
        The same ``name_dict`` object that was passed in.
    """
    gap = name_dict['midgap']
    draw = random.uniform(0, 1)
    if " " in gap and draw < remove_space_thresh:
        # Raw string: "\s" in a plain literal is an invalid escape sequence
        # (a warning today, an error in a future Python).
        # count=1 removes only the first whitespace character.
        name_dict['midgap'] = re.sub(r"\s", "", gap, count=1)
    else:
        name_dict['midgap'] = random.choice(cands) + gap
    return name_dict


def mangle_2(name_dict, front_cands = ['Dr. ', 'Mr. ', 'Mrs. '], 
             back_cands = [", M.D.", ", PHD", ", CFA"], remove_space_thresh=0.4):
    """Dirty the name with titles / honorifics (method 2).

    One of three things happens per call:

    * the first whitespace of ``'firstgap'`` is removed,
    * the first whitespace of ``'lastgap'`` is removed, or
    * a title is prepended to ``'firstgap'`` or a suffix is prepended to
      ``'lastgap'`` (front or back chosen with equal probability).

    Parameters
    ----------
    name_dict : dict
        Name parts. Only ``'firstgap'`` and ``'lastgap'`` are read and
        written; the dict is modified in place.
    front_cands : sequence of str
        Titles that may be placed before the first name.
    back_cands : sequence of str
        Suffixes that may be placed after the last name.
    remove_space_thresh : float
        A single ``random.uniform(0, 1)`` draw is compared against this
        value. Whitespace removal is only considered when the draw is below
        it, and ``'firstgap'`` is checked before ``'lastgap'``.

    Returns
    -------
    dict
        The same ``name_dict`` object that was passed in.
    """
    front_gap = name_dict['firstgap']
    back_gap = name_dict['lastgap']
    # One draw is shared by both removal branches, as in the original design.
    may_remove_space = random.uniform(0, 1) < remove_space_thresh

    # Raw strings below: "\s" in a plain literal is an invalid escape sequence.
    if " " in front_gap and may_remove_space:
        name_dict['firstgap'] = re.sub(r"\s", "", front_gap, count=1)  # first whitespace only
    elif " " in back_gap and may_remove_space:
        name_dict['lastgap'] = re.sub(r"\s", "", back_gap, count=1)  # first whitespace only
    elif random.uniform(0, 1) < 0.5:
        # Add a title to the front
        name_dict['firstgap'] = random.choice(front_cands) + front_gap
    else:
        # Add a suffix to the back
        name_dict['lastgap'] = random.choice(back_cands) + back_gap

    return name_dict

def mangle_3(name_dict):
    """Dirty the name by inserting a hyphen inside a name part (method 3).

    The rules are tried in this order and the first one that applies wins:

    1. The last name contains two adjacent capitalised groups
       (e.g. ``McLeod`` -> ``Mc-Leod``): hyphenate between them.
    2. The last name matches the vowel pattern (a run of non-vowel
       characters, one lowercase vowel, then at least one more word
       character): hyphenate right after that vowel (``Wunsch`` -> ``Wu-nsch``).
    3. Same vowel pattern applied to the first name.

    If none applies the name is left untouched.

    Parameters
    ----------
    name_dict : dict
        Name parts. ``'firstname'`` and ``'lastname'`` are read; at most
        one of them is overwritten. The dict is modified in place.

    Returns
    -------
    dict
        The same ``name_dict`` object, on every path.
    """
    firstname, lastname = name_dict['firstname'], name_dict['lastname']

    # Rule 1: two capitalised groups in the last name (eg. McLeod -> ('Mc', 'Leod')).
    # Note that only the matched groups are kept, exactly as joined here.
    camel = re.search(r"([A-Z][a-z]+)([A-Z][a-z]+)", lastname)
    if camel is not None:
        name_dict['lastname'] = "-".join(camel.groups())
        return name_dict

    # Rules 2 and 3: split after the first "consonant(s) + vowel" prefix.
    vowel_split = r"(\w*?[^aeiou]+[aeiou])(\w+)"
    last_grp = re.search(vowel_split, lastname)
    if last_grp is not None:
        name_dict['lastname'] = "-".join(last_grp.groups())
        return name_dict

    first_grp = re.search(vowel_split, firstname)
    if first_grp is not None:
        name_dict['firstname'] = "-".join(first_grp.groups())

    return name_dict
            
        
            
        
    
        

def mangle_names(fullname, n_iter=5, cand_fns = [mangle_1, mangle_2, mangle_3]):
    """Apply ``n_iter`` randomly chosen manglers to a full name.

    The name is broken into five parts (``firstgap``, ``firstname``,
    ``midgap``, ``lastname``, ``lastgap``); each mangler edits one or more
    of them, and the parts are concatenated again at the end.

    Parameters
    ----------
    fullname : str
        A name with at least two whitespace-separated tokens. The last
        token is treated as the last name; everything before it becomes the
        first name (so ``"Mary Ellen Smith"`` is accepted).
    n_iter : int
        Number of mangling steps. The same mangler may be picked repeatedly.
    cand_fns : sequence of callables
        Manglers taking the parts dict. They may return the dict or modify
        it in place and return ``None``.

    Returns
    -------
    str
        The mangled name.

    Raises
    ------
    ValueError
        If ``fullname`` does not contain at least two tokens.
    """
    tokens = fullname.split()
    if len(tokens) < 2:
        raise ValueError(
            "expected a first and a last name separated by whitespace, got {!r}".format(fullname)
        )

    part_order = ('firstgap', 'firstname', 'midgap', 'lastname', 'lastgap')
    name_dict = {
        'firstgap': "",
        'firstname': " ".join(tokens[:-1]),
        'midgap': " ",
        'lastname': tokens[-1],
        'lastgap': "",
    }

    for _ in range(n_iter):
        mangler_fn = random.choice(cand_fns)
        result = mangler_fn(name_dict)
        # A mangler that edits in place and returns nothing must not wipe the state.
        if result is not None:
            name_dict = result

    # Join by explicit key order rather than relying on dict iteration order.
    return "".join(name_dict[part] for part in part_order)
    
    
    
    

# Trial

Trial on the 100 names with n_iter = 5

In [203]:
# Mangle every name with the default settings (n_iter=5, all three manglers).
name_df['mangled'] = name_df['Name'].apply(mangle_names)

In [204]:
# Show original vs. mangled names (the rendered output elides the middle rows).
name_df

Unnamed: 0,Name,mangled
0,Tristian Wunsch,Mr. Tri-stian-Wu-nsch
1,Rosamond Klocko,Dr.Ro-samond. Klo-cko
2,Georgianna Bahringer,Mr. Georgianna-. Ba-hri-nger
3,Destiney Gutkowski,Mr. Destiney-- Gu-tko-wski
4,Glenda Berge,Gle-nda--. Be-rge
...,...,...
94,Terence Windler,"Terence.- Windler,M.D., PHD"
95,Ocie Pacocha,Ocie-.Pa-co-cha
96,Zechariah Conn,Dr.Ze-chariah- Co-nn
97,Lukas Gleichner,Dr. Mr. Lukas-. Gle-ichner
