In [1]:
import numpy as np
import pandas as pd
from thefuzz import fuzz
from thefuzz import process


In [26]:
# Load the two input tables from the working directory:
#   - base_names.csv      -> canonical names (its 'Base_Name' column is used below)
#   - name_variations.csv -> variations to match (its 'Variation' and
#                            'Matches_With_Base_Name' columns are used below)
base_names_df = pd.read_csv('base_names.csv')
variations_df = pd.read_csv('name_variations.csv')

def fuzzy_match(variation, base_names, threshold=80):  # Adjust threshold as needed
    """Return the base name that best matches ``variation`` and its score.

    Two thefuzz scorers are tried -- ``fuzz.token_sort_ratio`` and
    ``fuzz.partial_ratio`` -- and the candidate with the higher score wins.
    On a tie the ``token_sort_ratio`` candidate is kept.

    Parameters
    ----------
    variation : str
        Name to look up. Non-string values (e.g. ``None`` or the ``NaN`` that
        pandas produces for an empty CSV cell) and blank strings are treated
        as unmatched instead of being passed to thefuzz.
    base_names : iterable of str
        Candidate names. Non-string entries are ignored.
    threshold : int, optional
        Minimum score (inclusive) a candidate needs to be accepted.

    Returns
    -------
    tuple
        ``(best_match, best_score)`` when the best score is at least
        ``threshold``; otherwise ``(None, None)``.
    """
    # Missing / blank input cannot match anything.
    if not isinstance(variation, str) or not variation.strip():
        return None, None

    # Materialise the candidates as a plain list of strings: this drops NaN
    # entries and stops thefuzz from treating a pandas Series as a mapping
    # (which would change the shape of the tuples it returns).
    candidates = [name for name in base_names if isinstance(name, str)]
    if not candidates:
        # extractOne returns None for an empty choice list, which would break
        # the tuple handling below.
        return None, None

    token_result = process.extractOne(variation, candidates, scorer=fuzz.token_sort_ratio)
    partial_result = process.extractOne(variation, candidates, scorer=fuzz.partial_ratio)

    # token_sort_ratio is listed first so that max() keeps it on a tie.
    results = [result for result in (token_result, partial_result) if result is not None]
    if not results:
        return None, None
    best_match, best_score = max(results, key=lambda result: result[1])[:2]

    if best_score >= threshold:
        return best_match, best_score
    return None, None

# Evaluate fuzzy_match against the labelled variations, then export the results.

# Minimum fuzzy score for a candidate to be accepted; adjust as needed.
MATCH_THRESHOLD = 80

base_names = base_names_df['Base_Name'].tolist()

matches = []
correct_matches = 0
total_matches = len(variations_df)  # number of variations evaluated (accuracy denominator)

# Walk the two needed columns directly instead of iterrows(), which builds a
# Series for every row and can change value dtypes along the way.
for variation, actual_match in zip(variations_df['Variation'],
                                   variations_df['Matches_With_Base_Name']):
    best_match, score = fuzzy_match(variation, base_names, threshold=MATCH_THRESHOLD)
    matches.append((variation, best_match, score))

    # NOTE(review): when no match is found best_match is None, which never
    # compares equal to a missing (NaN) label -- confirm whether such rows
    # should count as correct.
    if best_match == actual_match:
        correct_matches += 1

# An empty variations file would otherwise raise ZeroDivisionError.
accuracy = correct_matches / total_matches * 100 if total_matches else float('nan')

results_df = pd.DataFrame(matches, columns=['Variation', 'Best_Match', 'Score'])

# Left-merge so every variation is kept, including those without a match;
# this attaches the base-name columns for inspection in the notebook.
final_df = results_df.merge(base_names_df, left_on='Best_Match', right_on='Base_Name', how='left')

# Only the matching columns are exported; the merged base-name columns stay in final_df.
final_df[['Variation', 'Best_Match', 'Score']].to_csv('matched_names_combined_fuzzy.csv', index=False)

print(f"Fuzzy matching complete. Accuracy: {accuracy:.2f}%")


Fuzzy matching complete. Accuracy: 100.00%


In [27]:
# Display the match results left-merged with base_names_df (one row per variation plus the base-name columns).
final_df

Unnamed: 0,Variation,Best_Match,Score,Base_Name_ID,Base_Name
0,Thomas King,Thomas King,100,15,Thomas King
1,ThomasKing,Thomas King,90,15,Thomas King
2,Maria Garcia,Maria Garcia,100,4,Maria Garcia
3,MaryLewis,Mary Lewis,89,12,Mary Lewis
4,Nancy W.,Nancy Wright,100,16,Nancy Wright
...,...,...,...,...,...
95,Jennifer- Brown,Jennifer Brown,100,2,Jennifer Brown
96,Daniel- Scott,Daniel Scott,100,17,Daniel Scott
97,David M.,David Martinez,100,9,David Martinez
98,Paul Allen.,Paul Allen,100,13,Paul Allen
