In [9]:
import jellyfish # soundex
import Levenshtein # levenshtein distance
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer # cosine similarity
from sklearn.metrics.pairwise import cosine_similarity # cosine similarity

In [10]:
# Load the names data (Premier League players) from the CSV file in the
# working directory. The preview shows two versions of each field, suffixed
# `_a` and `_b` (forename, surname, id), which the later cells compare.
data = pd.read_csv('names_data.csv')
data.head()

Unnamed: 0,forename_a,surname_a,id_a,forename_b,surname_b,id_b
0,Aaron,Anthony Connolly,456123,Aaron,Anthony Connolly,456123
1,Aaron,Cresswell,456124,Aran,Cresswell,456124
2,Aaron,Ramsdale,456125,Aaron,Ramsdale,456125
3,Aaron,Wan-Bissaka,456126,Aaron,Wan-Bissaka,456126
4,Abd-Al-Ali,Morakinyo Olaposi Koiki,456127,Abd-Al-Ali,Morakinyo Olaposi Koiki,456127


In [11]:
# Build one comparison string per side: "<forename> <surname> <pseudo ID>".
# The ID is cast to str so it can be joined with the name parts.
data['combined_a'] = data['forename_a'].str.cat(
    [data['surname_a'], data['id_a'].astype(str)], sep=' '
)
data['combined_b'] = data['forename_b'].str.cat(
    [data['surname_b'], data['id_b'].astype(str)], sep=' '
)
data.head()

Unnamed: 0,forename_a,surname_a,id_a,forename_b,surname_b,id_b,combined_a,combined_b
0,Aaron,Anthony Connolly,456123,Aaron,Anthony Connolly,456123,Aaron Anthony Connolly 456123,Aaron Anthony Connolly 456123
1,Aaron,Cresswell,456124,Aran,Cresswell,456124,Aaron Cresswell 456124,Aran Cresswell 456124
2,Aaron,Ramsdale,456125,Aaron,Ramsdale,456125,Aaron Ramsdale 456125,Aaron Ramsdale 456125
3,Aaron,Wan-Bissaka,456126,Aaron,Wan-Bissaka,456126,Aaron Wan-Bissaka 456126,Aaron Wan-Bissaka 456126
4,Abd-Al-Ali,Morakinyo Olaposi Koiki,456127,Abd-Al-Ali,Morakinyo Olaposi Koiki,456127,Abd-Al-Ali Morakinyo Olaposi Koiki 456127,Abd-Al-Ali Morakinyo Olaposi Koiki 456127


## Method 1: Jaccard

In [12]:
def jaccard_similarity(str1: str, str2: str) -> float:
    """Return the Jaccard similarity of two strings at whitespace-token level.

    Each string is split on whitespace and turned into a set of tokens; the
    score is ``|A & B| / |A | B|``, i.e. the share of distinct tokens that
    appear in both strings.

    Parameters
    ----------
    str1, str2 : str
        The strings to compare.

    Returns
    -------
    float
        A value in ``[0.0, 1.0]``. ``1.0`` means both token sets are equal,
        ``0.0`` means they share no token. When neither string contains any
        token (both empty or whitespace-only) the result is ``1.0``, matching
        the score of any other pair of identical inputs.
    """
    tokens1 = set(str1.split())
    tokens2 = set(str2.split())
    all_tokens = tokens1 | tokens2
    if not all_tokens:
        # Both sets are empty: the ratio would be 0/0 and raise
        # ZeroDivisionError, so treat two empty token sets as identical.
        return 1.0
    return len(tokens1 & tokens2) / len(all_tokens)

# Score every row: token-level Jaccard similarity between the A and B strings.
data['jaccard_similarity'] = data[['combined_a', 'combined_b']].apply(
    lambda pair: jaccard_similarity(*pair),
    axis=1,
)
data.loc[:, ['combined_a', 'combined_b', 'jaccard_similarity']].head()

Unnamed: 0,combined_a,combined_b,jaccard_similarity
0,Aaron Anthony Connolly 456123,Aaron Anthony Connolly 456123,1.0
1,Aaron Cresswell 456124,Aran Cresswell 456124,0.5
2,Aaron Ramsdale 456125,Aaron Ramsdale 456125,1.0
3,Aaron Wan-Bissaka 456126,Aaron Wan-Bissaka 456126,1.0
4,Abd-Al-Ali Morakinyo Olaposi Koiki 456127,Abd-Al-Ali Morakinyo Olaposi Koiki 456127,1.0


## Method 2: Cosine Similarity

In [15]:
# Create TF-IDF (term frequency-inverse document frequency) vectors for the
# combined name strings. The vectorizer is fitted on both columns stacked
# together so the A and B rows share one vocabulary and one set of IDF weights.
n_rows = len(data)
tfidf_vectorizer = TfidfVectorizer(norm='l2')
tfidf_matrix = tfidf_vectorizer.fit_transform(data['combined_a'].tolist() + data['combined_b'].tolist())
tfidf_a, tfidf_b = tfidf_matrix[:n_rows], tfidf_matrix[n_rows:]

# Only the similarity between row i of A and row i of B is needed, so compute
# the paired values directly instead of a full n_rows x n_rows similarity
# matrix. With norm='l2' every non-empty row has unit length, so the row-wise
# dot product equals the cosine similarity (all-zero rows give 0).
paired_cosine = np.asarray(tfidf_a.multiply(tfidf_b).sum(axis=1)).ravel()
data['cosine_similarity'] = paired_cosine

data[['combined_a', 'combined_b', 'cosine_similarity']].head()

Unnamed: 0,combined_a,combined_b,cosine_similarity
0,Aaron Anthony Connolly 456123,Aaron Anthony Connolly 456123,1.0
1,Aaron Cresswell 456124,Aran Cresswell 456124,0.684849
2,Aaron Ramsdale 456125,Aaron Ramsdale 456125,1.0
3,Aaron Wan-Bissaka 456126,Aaron Wan-Bissaka 456126,1.0
4,Abd-Al-Ali Morakinyo Olaposi Koiki 456127,Abd-Al-Ali Morakinyo Olaposi Koiki 456127,1.0


## Method 3: Levenshtein Distance

In [14]:
# Edit distance between the A and B strings of each row
# (0 means the two strings are identical).
data['levenshtein_distance'] = data[['combined_a', 'combined_b']].apply(
    lambda pair: Levenshtein.distance(*pair),
    axis=1,
)

data.loc[:, ['combined_a', 'combined_b', 'levenshtein_distance']].head()

Unnamed: 0,combined_a,combined_b,levenshtein_distance
0,Aaron Anthony Connolly 456123,Aaron Anthony Connolly 456123,0
1,Aaron Cresswell 456124,Aran Cresswell 456124,2
2,Aaron Ramsdale 456125,Aaron Ramsdale 456125,0
3,Aaron Wan-Bissaka 456126,Aaron Wan-Bissaka 456126,0
4,Abd-Al-Ali Morakinyo Olaposi Koiki 456127,Abd-Al-Ali Morakinyo Olaposi Koiki 456127,0


## Method 4: Soundex

In [16]:
# Encode each combined string with Soundex, one code column per side.
# NOTE(review): the codes shown are 4 characters long, so for these combined
# strings they appear to reflect only the start of the string (mostly the
# forename) - consider encoding forename and surname separately; confirm.
data['soundex_a'] = data['combined_a'].map(jellyfish.soundex)
data['soundex_b'] = data['combined_b'].map(jellyfish.soundex)

# Flag the rows whose two Soundex codes are equal
data['soundex_match'] = data['soundex_a'].eq(data['soundex_b'])

data.loc[:, ['combined_a', 'combined_b', 'soundex_a', 'soundex_b', 'soundex_match']].head()

Unnamed: 0,combined_a,combined_b,soundex_a,soundex_b,soundex_match
0,Aaron Anthony Connolly 456123,Aaron Anthony Connolly 456123,A655,A655,True
1,Aaron Cresswell 456124,Aran Cresswell 456124,A652,A652,True
2,Aaron Ramsdale 456125,Aaron Ramsdale 456125,A656,A656,True
3,Aaron Wan-Bissaka 456126,Aaron Wan-Bissaka 456126,A655,A655,True
4,Abd-Al-Ali Morakinyo Olaposi Koiki 456127,Abd-Al-Ali Morakinyo Olaposi Koiki 456127,A134,A134,True


### View different similarity measures

In [18]:
# Show the first 50 rows of the frame, including the similarity columns
# added by the cells above, so the measures can be compared side by side.
data.head(50)

Unnamed: 0,forename_a,surname_a,id_a,forename_b,surname_b,id_b,combined_a,combined_b,jaccard_similarity,cosine_similarity,levenshtein_distance,soundex_a,soundex_b,soundex_match
0,Aaron,Anthony Connolly,456123,Aaron,Anthony Connolly,456123,Aaron Anthony Connolly 456123,Aaron Anthony Connolly 456123,1.0,1.0,0,A655,A655,True
1,Aaron,Cresswell,456124,Aran,Cresswell,456124,Aaron Cresswell 456124,Aran Cresswell 456124,0.5,0.684849,2,A652,A652,True
2,Aaron,Ramsdale,456125,Aaron,Ramsdale,456125,Aaron Ramsdale 456125,Aaron Ramsdale 456125,1.0,1.0,0,A656,A656,True
3,Aaron,Wan-Bissaka,456126,Aaron,Wan-Bissaka,456126,Aaron Wan-Bissaka 456126,Aaron Wan-Bissaka 456126,1.0,1.0,0,A655,A655,True
4,Abd-Al-Ali,Morakinyo Olaposi Koiki,456127,Abd-Al-Ali,Morakinyo Olaposi Koiki,456127,Abd-Al-Ali Morakinyo Olaposi Koiki 456127,Abd-Al-Ali Morakinyo Olaposi Koiki 456127,1.0,1.0,0,A134,A134,True
5,Adam,Idah,456128,Adam,Idah,456128,Adam Idah 456128,Adam Idah 456128,1.0,1.0,0,A353,A353,True
6,Adam,Lallana,456129,Adam,Lallana,456129,Adam Lallana 456129,Adam Lallana 456129,1.0,1.0,0,A354,A354,True
7,Adam,Masina,456130,Adam,Masina,456130,Adam Masina 456130,Adam Masina 456130,1.0,1.0,0,A355,A355,True
8,Adam,Smith,456131,Adam,Smith,456131,Adam Smith 456131,Adam Smith 456131,1.0,1.0,0,A352,A352,True
9,Adam,Webster,456132,Adam,Webster,456132,Adam Webster 456132,Adam Webster 456132,1.0,1.0,0,A351,A351,True
