# Siamese model for name disambiguation

In this notebook we will create a model that can tell the difference between "Paul" and "Paula". The model can be used as a drop-in replacement for string-comparison methods: it returns a score from 0 to 1 that reflects how similar two names are. For example, here is the intended use:

> compare("Paul", "Paula") -> 0.3  # It indicates different gender


In [None]:
import pandas as pd

In [None]:
# Load the name pairs; the file is read without a header row, so the two columns are named explicitly.
alt_names = pd.read_csv('../data/name_pairs.txt', sep=',', names=['name_a','name_b'], header=None)

In [None]:
# Show 10 randomly chosen rows as a quick look at the loaded pairs.
alt_names.sample(10)

Unnamed: 0,name_a,name_b
16048,Tine,Martin
8327,Kristin,Krissy
305,Ale,Alisha
2914,Clifton,Tone
4824,Foncho,Fon
13622,Radisa,Radomirm
2792,Chus,Suso
4543,Erma,Em
9708,Madzia,Magdzia
2643,Chema,Josemaria


In [None]:
import re


def syllables(word):
    """Split ``word`` into approximate syllables using a sonority hierarchy.

    The word is first cut into chunks by looking at each interior character
    together with its two neighbours:

    * a character that is a strict local sonority minimum starts a new chunk;
    * a character that is no more sonorous than its predecessor and equal in
      sonority to its successor closes the current chunk;
    * any other character is appended to the current chunk.

    Chunks without a vowel are then merged into the preceding syllable, or,
    when they occur before the first vowel-bearing chunk, prepended to it.

    Letters are looked up case-insensitively and characters that are not in
    the hierarchy (digits, punctuation, non-ASCII letters) get the lowest
    sonority, so arbitrary strings no longer raise ``KeyError``.

    Parameters
    ----------
    word : str
        The word to split.

    Returns
    -------
    list of str
        The syllables in order; joined together they reproduce ``word``.
        A word with at most one vowel is returned as a single-element list.
    """
    vowel_re = re.compile('[aeiouy]', re.IGNORECASE)

    # single syllable word
    if len(vowel_re.findall(word)) <= 1:
        return [word]

    # sonority hierarchy: vowels, nasals, fricatives, stops
    hierarchy = {
        'a': 4, 'e': 4, 'i': 4, 'o': 4, 'u': 4, 'y': 4,
        'l': 3, 'm': 3, 'n': 3, 'r': 3, 'w': 3,
        'f': 2, 's': 2, 'v': 2, 'z': 2,
        'b': 1, 'c': 1, 'd': 1, 'g': 1, 'h': 1, 'j': 1, 'k': 1, 'p': 1, 'q': 1, 't': 1, 'x': 1,
    }
    # Unknown characters get sonority 0 (below every listed letter).
    sonority = [hierarchy.get(char.lower(), 0) for char in word]

    # First pass: cut the word into chunks at sonority minima / plateaus.
    # At least two vowels were found above, so len(word) >= 2 here.
    chunks = []
    current = word[0]
    for i in range(1, len(word) - 1):
        previous, val, following = sonority[i - 1], sonority[i], sonority[i + 1]
        phoneme = word[i]

        if previous > val < following:
            # strict local minimum: this character opens the next chunk
            chunks.append(current)
            current = phoneme
        elif previous >= val == following:
            # plateau after a fall: this character closes the current chunk
            current += phoneme
            chunks.append(current)
            current = ''
        else:
            current += phoneme
    current += word[-1]
    chunks.append(current)

    # Second pass: every syllable must contain a vowel, so vowel-less chunks
    # are folded into a neighbouring syllable.
    final_syllables = []
    front = ''
    for chunk in chunks:
        if vowel_re.search(chunk):
            if final_syllables:
                final_syllables.append(chunk)
            else:
                final_syllables.append(front + chunk)
        elif final_syllables:
            final_syllables[-1] += chunk
        else:
            front += chunk
    return final_syllables

In [None]:
import unidecode
from fuzzywuzzy import fuzz

from abydos.distance import (IterativeSubString, BISIM, DiscountedLevenshtein, Prefix, LCSstr, MLIPNS, Strcmp95,
MRA, Editex, SAPS, FlexMetric, JaroWinkler, HigueraMico, Sift4, Eudex, ALINE, Covington, PhoneticEditDistance)

from abydos.phonetic import PSHPSoundexFirst, Ainsworth
# Encoder instances reused by the feature functions in this notebook.
# `pshp_soundex_first` codes are compared for equality in featurize.
pshp_soundex_first = PSHPSoundexFirst()
# `pe` is the Ainsworth encoder; its output is passed to ipa_to_features in sum_ipa.
pe = Ainsworth()



In [None]:
from abydos.phones import *

In [None]:
# One instance per abydos distance measure; featurize queries each of them through .sim().
iss = IterativeSubString()
bisim = BISIM()
dlev = DiscountedLevenshtein()
prefix = Prefix()
lcs = LCSstr()
mlipns = MLIPNS()
strcmp95 = Strcmp95()
mra = MRA()
editex = Editex()
saps = SAPS()
flexmetric = FlexMetric()
# JaroWinkler configured with mode='Jaro'; the resulting feature column is named 'jaro'.
jaro = JaroWinkler(mode='Jaro')
higuera_mico = HigueraMico()
sift4 = Sift4()
eudex = Eudex()
aline = ALINE()
covington = Covington()
phonetic_edit = PhoneticEditDistance()

In [None]:
# Measures applied to every name pair by featurize, in this order.
algos = [iss, bisim, dlev, prefix, lcs, mlipns, strcmp95, mra, editex, saps, flexmetric, jaro, higuera_mico, sift4, eudex,
         aline, covington, phonetic_edit]

# Feature column names: algo_names[i] is the column produced by algos[i],
# so the two lists must keep the same length and order.
algo_names = ['iterativesubstring', 'bisim', 'discountedlevenshtein', 'prefix', 'lcsstr', 'mlipns', 'strcmp95', 'mra',
              'editex', 'saps', 'flexmetric', 'jaro', 'higueramico', 'sift4', 'eudex', 'aline', 'covington',
              'phoneticeditdistance']

In [None]:
def sum_ipa(name_a, name_b):
    """Average position-wise phonetic feature similarity of two names.

    Each name is encoded with ``pe`` and converted with ``ipa_to_features``;
    the two feature sequences are then compared element by element with
    ``cmp_features`` and the results are summed.

    The sum only covers the aligned prefix (``zip`` stops at the shorter
    sequence) but is divided by the length of ``name_a``'s sequence, so the
    score is not symmetric: ``sum_ipa(a, b)`` may differ from ``sum_ipa(b, a)``.
    NOTE(review): confirm whether normalising by the longer sequence was intended.

    Returns
    -------
    float
        The normalised score, or 0.0 when ``name_a`` produces no features
        (e.g. an empty name), which previously raised ``ZeroDivisionError``.
    """
    feat1 = ipa_to_features(pe.encode(name_a))
    feat2 = ipa_to_features(pe.encode(name_b))
    if len(feat1) == 0:
        # Nothing to compare against; avoid dividing by zero.
        return 0.0
    score = sum(cmp_features(f1, f2) for f1, f2 in zip(feat1, feat2)) / len(feat1)
    return score

In [None]:
import re
def _normalize_name(raw):
    """Transliterate to ASCII, lowercase, strip, and drop every non-letter character."""
    return re.sub('[^a-zA-Z]+', '', unidecode.unidecode(raw).lower().strip())


def featurize(df):
    """Return a copy of ``df`` with normalised names and pairwise similarity features.

    The first two columns are treated as the raw names and renamed to ``a`` and
    ``b``; when the frame has exactly three columns the third is renamed to
    ``target``. The caller's frame is left untouched.

    Added columns, in order:

    * ``name_a`` / ``name_b`` - normalised names (see ``_normalize_name``);
    * ``partial``, ``tkn_sort``, ``tkn_set`` - fuzzywuzzy ratios computed on the
      syllable splits of the two names;
    * ``sum_ipa`` - result of ``sum_ipa`` on the normalised names;
    * ``pshp_soundex_first`` - 1 when both names get the same
      ``pshp_soundex_first`` code, else 0;
    * one column per entry of ``algo_names`` holding ``algo.sim(name_a, name_b)``
      for the matching entry of ``algos``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first two columns hold the names to compare.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns described above.
    """
    # Work on a copy so relabelling and new columns never leak into the caller's frame.
    df = df.copy()
    n_columns = len(df.columns)
    if n_columns == 3:
        df.columns = ['a', 'b', 'target']
    elif n_columns == 2:
        df.columns = ['a', 'b']
    else:
        df = df.rename(columns={df.columns[0]: 'a', df.columns[1]: 'b'})

    # Plain lists are assigned positionally, so duplicate index labels are harmless.
    names_a = [_normalize_name(raw) for raw in df['a']]
    names_b = [_normalize_name(raw) for raw in df['b']]
    df['name_a'] = names_a
    df['name_b'] = names_b
    name_pairs = list(zip(names_a, names_b))

    # Syllable splits are only intermediate inputs for the fuzzy ratios below.
    # NOTE(review): the syllable lists are handed to fuzzywuzzy as-is (not joined
    # into a string), exactly as before; confirm this is the intended input.
    syllable_pairs = [(syllables(a), syllables(b)) for a, b in name_pairs]
    df['partial'] = [fuzz.partial_ratio(sa, sb) for sa, sb in syllable_pairs]
    df['tkn_sort'] = [fuzz.token_sort_ratio(sa, sb) for sa, sb in syllable_pairs]
    df['tkn_set'] = [fuzz.token_set_ratio(sa, sb) for sa, sb in syllable_pairs]

    df['sum_ipa'] = [sum_ipa(a, b) for a, b in name_pairs]

    df['pshp_soundex_first'] = [
        1 if pshp_soundex_first.encode(a) == pshp_soundex_first.encode(b) else 0
        for a, b in name_pairs
    ]

    # algo_names and algos are parallel lists: one output column per measure.
    for algo_name, algo in zip(algo_names, algos):
        df[algo_name] = [algo.sim(a, b) for a, b in name_pairs]

    return df

In [None]:
# Positive class: every pair loaded into alt_names is labelled 1.
alt_names['target'] = 1

In [None]:
from itertools import combinations
import random
random.seed(30)

# Build the negative class from random pairs of names that are not listed as alternates.
# Pairs are drawn directly instead of materialising every combination of unique
# names, whose count grows quadratically with the number of names.
N_NEGATIVE_PAIRS = 70040  # undersample for 1:4 class imbalance instead of 1:1000 extreme class imbalance

# Sorting fixes the order of the population so that random.seed() makes the draw
# reproducible; the iteration order of a set of strings can differ between runs.
unique_names = sorted(set(alt_names.name_a) | set(alt_names.name_b))
# Known alternates in both orientations, as a set for O(1) membership tests.
alt_pairs = set(zip(alt_names.name_a, alt_names.name_b)) | set(zip(alt_names.name_b, alt_names.name_a))

# Guard against a rejection loop that could never finish.
if len(unique_names) * (len(unique_names) - 1) <= len(alt_pairs):
    raise ValueError('not enough non-alternate name pairs to sample the negative class')

negative_pairs = []
while len(negative_pairs) < N_NEGATIVE_PAIRS:
    # Two distinct names; the same pair may be drawn more than once, as with random.choices.
    pair = tuple(random.sample(unique_names, 2))
    if pair not in alt_pairs:
        negative_pairs.append(pair)

non_alt = pd.DataFrame(negative_pairs, columns=['name_a', 'name_b'])

In [None]:
# Report how many negative pairs there are per positive pair (truncated to an integer).
print('positive class ratio 1:{}'.format(int(len(non_alt)/len(alt_names))))


positive class ratio 1:4


In [None]:
# Negative class: every sampled non-alternate pair is labelled 0.
non_alt['target'] = 0
# ignore_index=True gives the combined frame a unique 0..n-1 index; without it the
# positive and negative rows would share index labels.
df = pd.concat([alt_names, non_alt], ignore_index=True)
# Release the references to the two source frames; only df is used from here on.
non_alt = None
alt_names = None

In [None]:
# Add the normalised names and all similarity feature columns to the combined frame.
df = featurize(df)

In [None]:
# Show 5 randomly chosen featurized rows.
df.sample(5)

Unnamed: 0,a,b,target,name_a,name_b,partial,tkn_sort,tkn_set,sum_ipa,pshp_soundex_first,...,editex,saps,flexmetric,jaro,higueramico,sift4,eudex,aline,covington,phoneticeditdistance
51948,Shah,Tashe,0,shah,tashe,53,20,20,0.83871,0,...,0.3,0.0,0.46,0.633333,0.3,0.4,0.933824,0.592593,0.622222,0.587097
15602,Aravindha,Kim,0,aravindha,kim,43,13,13,0.261649,0,...,0.222222,0.0,0.255556,0.481481,0.0,0.111111,0.745098,0.187234,0.3,0.28853
4102,Bart,Almina,0,bart,almina,50,17,17,0.655914,0,...,0.083333,0.0,0.116667,0.472222,0.061905,0.166667,0.846078,0.246667,0.377551,0.540323
1384,Tadzio,Cecalie,0,tadzio,cecalie,67,12,12,0.760753,0,...,0.285714,0.0,0.528571,0.539683,0.232143,0.285714,0.922549,0.515152,0.546154,0.75576
2417,Cat,Kate,1,cat,kate,57,50,50,1.0,1,...,0.625,0.0,0.85,0.722222,0.5,0.5,0.868627,0.7,0.671429,0.741935


In [None]:
# Separate the label from the remaining columns.
y = df['target']
# Use the `columns=` keyword: passing the axis positionally to drop() is deprecated
# and triggers a FutureWarning.
X = df.drop(columns='target')

  X = df.drop('target',1)


# Model building

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler


In [None]:
# Baseline model: two scaling steps followed by a random forest.
# random_state is fixed so that repeated runs of the notebook build the same forest.
# NOTE(review): the two scalers are kept as in the original pipeline; confirm they are
# still wanted in front of a tree-based classifier.
base_model_1 = make_pipeline(
    MaxAbsScaler(),
    MinMaxScaler(),
    RandomForestClassifier(
        bootstrap=False,
        criterion="gini",
        max_features=0.25,
        min_samples_leaf=1,
        min_samples_split=4,
        n_estimators=100,
        random_state=1,
    ),
)

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

# Columns that identify a pair rather than describe it; they are not model inputs.
ID_COLUMNS = ['a', 'b', 'name_a', 'name_b']

stratified_kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
n_folds = stratified_kfold.get_n_splits()

scores = []      # one ROC AUC value per fold
oof_frames = []  # out-of-fold predictions of every fold

for fold, (train_index, test_index) in enumerate(stratified_kfold.split(X, y), start=1):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    base_model_1.fit(X_train.drop(columns=ID_COLUMNS), y_train)

    # .copy() so the new columns are written to an independent frame, not a slice of X.
    oof_pred = X_test[['name_a', 'name_b']].copy()
    # predict_proba returns one column per class; column 1 is the probability of target == 1.
    oof_pred['predict_proba'] = base_model_1.predict_proba(X_test.drop(columns=ID_COLUMNS))[:, 1]
    oof_pred['target'] = y_test.tolist()
    oof_frames.append(oof_pred)

    # ROC AUC on the held-out fold; it uses the probabilities computed above.
    scores.append(roc_auc_score(y_test, oof_pred['predict_proba']))
    print('completed fold {} of {}'.format(fold, n_folds))

# Out-of-fold predictions for every row of X, gathered across all folds.
oof_predictions = pd.concat(oof_frames)

  base_model_1.fit(X_train.drop(['a', 'b', 'name_a', 'name_b'], 1), y_train)
  oof_pred['predict_proba'] = base_model_1.predict_proba(X_test.drop(['a', 'b', 'name_a', 'name_b'], 1))


ValueError: Wrong number of items passed 2, placement implies 1