In [2]:
'''
Merge athletes who were not found on the team page that match an 
existing athlete in the DB.
'''

import pandas as pd
from util.db_util import Database

db = Database("db/Track.db")

# Placeholder athletes: when web scraping does not find an athlete on the
# team page, the full name is stored in both `first` and `last` and
# grad_year is set to 9999. get_problem_athletes() is expected to return
# exactly those rows.
df = db.get_problem_athletes()

# Collected (good_id, bad_id) pairs: the existing athlete record and the
# placeholder record that duplicates it.
results = []

for _, row in df.iterrows():
    # For placeholder rows `first` holds the full name; split it into tokens.
    full_name = row["first"].strip().split()

    if len(full_name) < 2:
        # A single token (or nothing) cannot be split into first/last.
        print("BAD")
        print(full_name)
        continue

    first = full_name[0]
    last = " ".join(full_name[1:])
    school_id = row["school_id"]
    bad_id = row["athlete_id"]

    # Look for an existing athlete with the same name and school,
    # ignoring the graduation year.
    good_id = db.get_athlete_id_wo_grad_year(first, last, school_id)

    # Record the pair only when a match exists, and never pair a record
    # with itself (a self-merge is never intended).
    if good_id is not None and good_id != bad_id:
        results.append((good_id, bad_id))

results_df = pd.DataFrame(results, columns=["Good", "Bad"])
print(results_df)

# Keep a copy of the proposed merges on disk for review.
file = "Merge ID Results.csv"
results_df.to_csv(file, index=False)


Empty DataFrame
Columns: [Good, Bad]
Index: []


In [7]:
# Hand every (Good, Bad) pair collected in results_df to db.merge_athlete.
# The ids are converted to plain Python ints before the call.
id_pairs = zip(results_df["Good"], results_df["Bad"])
for good_id, bad_id in ((int(good), int(bad)) for good, bad in id_pairs):
    db.merge_athlete(good_id, bad_id)


In [4]:
'''
Merge athletes who are in the database with the same first, last,
school_id but different grad_years (within 4 years of each other). In 
this case, keep the athlete_id with the highest grad_year. 
'''

import pandas as pd
from util.db_util import Database

db = Database("db/Track.db")

# grad_year value used as a placeholder when the real year is unknown.
UNKNOWN_GRAD_YEAR = 9999
# Records whose known grad years differ by more than this are treated as
# different people and are not merged.
MAX_GRAD_YEAR_GAP = 4
# While True the planned merges are only printed; set to False to apply them.
DRY_RUN = True

# 1. Load all athletes
df = pd.read_sql_query("""
    SELECT athlete_id, first, last, gender, school_id, grad_year
    FROM athlete
""", db.conn)

# 2. Group by identity fields
groups = df.groupby(["first", "last", "gender", "school_id"])

merge_count = 0

for _, group in groups:
    if len(group) <= 1:
        continue

    # 3. Pick the record to keep ONCE per group: the highest known grad_year.
    #    The placeholder year is only eligible when no record in the group
    #    has a real year, so a 9999 row never wins over a real one.
    known = group[group["grad_year"] != UNKNOWN_GRAD_YEAR]
    candidates = group if known.empty else known
    keep_row = candidates.loc[candidates["grad_year"].idxmax()]
    keep_id = int(keep_row["athlete_id"])
    keep_year = int(keep_row["grad_year"])

    # 4. Merge all others into keep_id. keep_id is never reassigned inside
    #    this loop, so every merge in the group points in the same direction.
    for _, row in group.iterrows():
        bad_id = int(row["athlete_id"])

        if bad_id == keep_id:
            continue

        bad_year = int(row["grad_year"])

        # The year-gap rule only applies when both years are real; a record
        # with the placeholder year is always folded into the kept record.
        both_years_known = UNKNOWN_GRAD_YEAR not in (keep_year, bad_year)
        if both_years_known and abs(keep_year - bad_year) > MAX_GRAD_YEAR_GAP:
            continue

        print(keep_id, bad_id)
        if not DRY_RUN:
            db.merge_athlete(keep_id, bad_id)
        merge_count += 1

if DRY_RUN:
    print(f"Done. Dry run: {merge_count} duplicate athlete records would be merged.")
else:
    print(f"Done. Merged {merge_count} duplicate athlete records.")


Done. Merged 0 duplicate athlete records.


In [2]:
"""
Do a manual athlete merge.
"""

from util.db_util import Database

# Ids for this one-off merge, passed to db.merge_athlete in this order.
# NOTE(review): other cells call merge_athlete(keep_id, bad_id), so the
# first id is presumably the record that survives - confirm before running.
manual_keep_id, manual_bad_id = 301208, 301209

db = Database("db/Track.db")
db.merge_athlete(manual_keep_id, manual_bad_id)

In [4]:
"""
Finds possible duplicate athletes using fuzzy name matching
and lets the user decide which to merge.
"""

import pandas as pd
from fuzzywuzzy import fuzz

# ==============================================================================
# CONFIGURATION
# ==============================================================================

from util.db_util import Database
# Handle to the track database; db.conn is the connection used for the raw
# SQL in this cell.
db = Database("db/Track.db")

# Minimum similarity score that BOTH the first-name and the last-name
# comparison must reach for a pair of athletes to be reported as a
# potential duplicate.
SIMILARITY_THRESHOLD = 85

# ==============================================================================
# LOAD DATA
# ==============================================================================

# Load every athlete row once; the matching and the interactive review below
# both read from this in-memory frame.
df = pd.read_sql_query("""
    SELECT athlete_id, first, last, gender, school_id, grad_year
    FROM athlete
""", db.conn)

def delete_from_relay(athlete_id):
    """
    Remove every row of the relay_athlete table that references the given
    athlete_id, then commit the change immediately.
    """
    connection = db.conn
    connection.execute(
        "DELETE FROM relay_athlete WHERE athlete_id = ?",
        (athlete_id,),
    )
    connection.commit()

# ==============================================================================
# FUZZY MATCHING
# ==============================================================================

def get_name_similarity(name1, name2):
    """
    Case-insensitive similarity of two names, scored with fuzz.ratio.

    Returns 0 when either value is not a string (for example a NULL name
    loaded from the database). Such a pair is then never reported as a
    duplicate, instead of the whole scan crashing on `.lower()`.
    """
    if not isinstance(name1, str) or not isinstance(name2, str):
        return 0
    return fuzz.ratio(name1.lower(), name2.lower())

def find_duplicates(df, threshold):
    """
    Find pairs of athletes that may be the same person.

    Athletes are only compared with others of the same gender and
    school_id. A pair is reported when both the first-name and the
    last-name similarity (from get_name_similarity) are >= threshold.

    Parameters:
        df: DataFrame with the columns athlete_id, first, last, gender,
            school_id and grad_year.
        threshold: minimum similarity score each name part must reach.

    Returns:
        A DataFrame with one row per candidate pair. The columns are always
        present, even when no pair was found, so callers can sort or select
        by column name on an empty result.
    """
    from itertools import combinations

    columns = [
        'athlete1_id', 'athlete1_first', 'athlete1_last', 'athlete1_grad_year',
        'athlete2_id', 'athlete2_first', 'athlete2_last', 'athlete2_grad_year',
        'gender', 'school_id',
        'first_similarity', 'last_similarity', 'avg_similarity',
    ]
    duplicates = []

    for (gender, school_id), group in df.groupby(['gender', 'school_id']):
        # Convert the group to plain dicts once; indexing the frame with
        # .iloc for every pair would rebuild a row Series each time.
        athletes = group.to_dict('records')

        # combinations() yields each unordered pair once, in row order.
        for a1, a2 in combinations(athletes, 2):
            first_sim = get_name_similarity(a1['first'], a2['first'])
            last_sim = get_name_similarity(a1['last'], a2['last'])

            if first_sim >= threshold and last_sim >= threshold:
                duplicates.append({
                    'athlete1_id': a1['athlete_id'],
                    'athlete1_first': a1['first'],
                    'athlete1_last': a1['last'],
                    'athlete1_grad_year': a1['grad_year'],
                    'athlete2_id': a2['athlete_id'],
                    'athlete2_first': a2['first'],
                    'athlete2_last': a2['last'],
                    'athlete2_grad_year': a2['grad_year'],
                    'gender': gender,
                    'school_id': school_id,
                    'first_similarity': first_sim,
                    'last_similarity': last_sim,
                    'avg_similarity': (first_sim + last_sim) / 2
                })

    return pd.DataFrame(duplicates, columns=columns)

# ==============================================================================
# FIND DUPLICATES
# ==============================================================================

# Scan all athletes for near-identical names within the same gender/school.
duplicates_df = find_duplicates(df, SIMILARITY_THRESHOLD)

print("\n" + "=" * 80)
print(f"POTENTIAL DUPLICATES FOUND: {len(duplicates_df)}")
print("=" * 80 + "\n")

if duplicates_df.empty:
    # Do not call exit() here: inside a notebook it asks the kernel to shut
    # down, which throws away all state. With an empty frame the review
    # loop further down simply iterates zero times.
    print("No potential duplicates found.")
else:
    # Most similar pairs first, so the likeliest duplicates are reviewed first.
    duplicates_df = duplicates_df.sort_values('avg_similarity', ascending=False)
    duplicates_df.to_csv('potential_duplicates.csv', index=False)

    print("Results exported to 'potential_duplicates.csv'\n")

# ==============================================================================
# INTERACTIVE MERGE
# ==============================================================================

# Ids that have already been merged away during this session.
merged_ids = set()
merge_count = 0


def _print_athlete(heading, athlete):
    """Print the heading followed by the identifying fields of one athlete row."""
    print(heading)
    print(f"   ID: {athlete['athlete_id']}")
    print(f"   Name: {athlete['first']} {athlete['last']}")
    print(f"   Gender: {athlete['gender']}")
    print(f"   School ID: {athlete['school_id']}")
    print(f"   Grad Year: {athlete['grad_year']}")


for _, candidate in duplicates_df.iterrows():

    first_id = int(candidate['athlete1_id'])
    second_id = int(candidate['athlete2_id'])

    # Skip pairs that involve a record already merged away in this session.
    if not merged_ids.isdisjoint((first_id, second_id)):
        continue

    first_athlete = df.loc[df['athlete_id'] == first_id].iloc[0]
    second_athlete = df.loc[df['athlete_id'] == second_id].iloc[0]

    print("\n" + "-" * 80)
    print(f"Potential Duplicate (avg similarity: {candidate['avg_similarity']:.1f}%)")

    _print_athlete("\n Athlete 1 (KEEP TARGET):", first_athlete)
    _print_athlete("\n Athlete 2:", second_athlete)

    answer = input("\nChoose (1 = keep athlete1, 2 = keep athlete2, s = skip, q = quit): ").strip().lower()

    if answer == 'q':
        print("\nExiting merge process.")
        break

    if answer == 's':
        print("Skipped.")
        continue

    # --------------------------------------------------
    # MERGE LOGIC
    # --------------------------------------------------
    if answer == '1':
        keep_id, bad_id = first_id, second_id
        print("Keeping 1st athlete, merging 2nd to 1st athlete")
    elif answer == '2':
        keep_id, bad_id = second_id, first_id
        print("Keeping 2nd athlete, merging 1st to 2nd athlete")
    else:
        print("Invalid choice. Skipping.")
        continue

    print(f"\nMerging athlete_id {bad_id} → {keep_id}")

    # NOTE(review): the discarded athlete's relay_athlete rows are deleted
    # (not re-pointed to keep_id) before the merge - confirm this is intended.
    delete_from_relay(bad_id)
    db.merge_athlete(keep_id, bad_id)

    merged_ids.add(bad_id)
    merge_count += 1

print("\n" + "=" * 80)
print(f"Done. Merged {merge_count} athlete records.")
print("=" * 80)



POTENTIAL DUPLICATES FOUND: 21

Results exported to 'potential_duplicates.csv'


--------------------------------------------------------------------------------
Potential Duplicate (avg similarity: 96.0%)

 Athlete 1 (KEEP TARGET):
   ID: 306900
   Name: ALFONZO NEWBERN
   Gender: Boys
   School ID: 94
   Grad Year: 2026

 Athlete 2:
   ID: 307768
   Name: ALONZO NEWBERN
   Gender: Boys
   School ID: 94
   Grad Year: 2026



Choose (1 = keep athlete1, 2 = keep athlete2, s = skip, q = quit):  2


Keeping 2nd athlete, merging 1st to 2nd athlete

Merging athlete_id 306900 → 307768

--------------------------------------------------------------------------------
Potential Duplicate (avg similarity: 96.0%)

 Athlete 1 (KEEP TARGET):
   ID: 296471
   Name: BRAYDEN WUNDERLICH
   Gender: Boys
   School ID: 354
   Grad Year: 2026

 Athlete 2:
   ID: 304667
   Name: BRADEN WUNDERLICH
   Gender: Boys
   School ID: 354
   Grad Year: 2026



Choose (1 = keep athlete1, 2 = keep athlete2, s = skip, q = quit):  2


Keeping 2nd athlete, merging 1st to 2nd athlete

Merging athlete_id 296471 → 304667

--------------------------------------------------------------------------------
Potential Duplicate (avg similarity: 96.0%)

 Athlete 1 (KEEP TARGET):
   ID: 293463
   Name: ZSOPHIA SHARP
   Gender: Girls
   School ID: 335
   Grad Year: 2026

 Athlete 2:
   ID: 312077
   Name: ZOPHIA SHARP
   Gender: Girls
   School ID: 335
   Grad Year: 2027



Choose (1 = keep athlete1, 2 = keep athlete2, s = skip, q = quit):  2


Keeping 2nd athlete, merging 1st to 2nd athlete

Merging athlete_id 293463 → 312077

--------------------------------------------------------------------------------
Potential Duplicate (avg similarity: 96.0%)

 Athlete 1 (KEEP TARGET):
   ID: 293354
   Name: RILEY O'BRIEN
   Gender: Girls
   School ID: 206
   Grad Year: 2026

 Athlete 2:
   ID: 312750
   Name: RILEY OBRIEN
   Gender: Girls
   School ID: 206
   Grad Year: 2026



Choose (1 = keep athlete1, 2 = keep athlete2, s = skip, q = quit):  2


Keeping 2nd athlete, merging 1st to 2nd athlete

Merging athlete_id 293354 → 312750

--------------------------------------------------------------------------------
Potential Duplicate (avg similarity: 96.0%)

 Athlete 1 (KEEP TARGET):
   ID: 289968
   Name: NY'ASIA WHITLEY
   Gender: Girls
   School ID: 131
   Grad Year: 2026

 Athlete 2:
   ID: 301456
   Name: NYASIA WHITLEY
   Gender: Girls
   School ID: 131
   Grad Year: 2026



Choose (1 = keep athlete1, 2 = keep athlete2, s = skip, q = quit):  2


Keeping 2nd athlete, merging 1st to 2nd athlete

Merging athlete_id 289968 → 301456

--------------------------------------------------------------------------------
Potential Duplicate (avg similarity: 96.0%)

 Athlete 1 (KEEP TARGET):
   ID: 302970
   Name: MARKLIA SANDERS
   Gender: Girls
   School ID: 384
   Grad Year: 2026

 Athlete 2:
   ID: 303008
   Name: MARKIA SANDERS
   Gender: Girls
   School ID: 384
   Grad Year: 2026



Choose (1 = keep athlete1, 2 = keep athlete2, s = skip, q = quit):  2


Keeping 2nd athlete, merging 1st to 2nd athlete

Merging athlete_id 302970 → 303008

--------------------------------------------------------------------------------
Potential Duplicate (avg similarity: 96.0%)

 Athlete 1 (KEEP TARGET):
   ID: 301283
   Name: ELLIOT BROOKS
   Gender: Boys
   School ID: 346
   Grad Year: 2026

 Athlete 2:
   ID: 309575
   Name: ELLIOTT BROOKS
   Gender: Boys
   School ID: 346
   Grad Year: 2026



Choose (1 = keep athlete1, 2 = keep athlete2, s = skip, q = quit):  2


Keeping 2nd athlete, merging 1st to 2nd athlete

Merging athlete_id 301283 → 309575

--------------------------------------------------------------------------------
Potential Duplicate (avg similarity: 95.5%)

 Athlete 1 (KEEP TARGET):
   ID: 293229
   Name: KELLY STANLEY
   Gender: Girls
   School ID: 310
   Grad Year: 2025

 Athlete 2:
   ID: 311962
   Name: KELLEY STANLEY
   Gender: Girls
   School ID: 310
   Grad Year: 2025



Choose (1 = keep athlete1, 2 = keep athlete2, s = skip, q = quit):  2


Keeping 2nd athlete, merging 1st to 2nd athlete

Merging athlete_id 293229 → 311962

--------------------------------------------------------------------------------
Potential Duplicate (avg similarity: 95.5%)

 Athlete 1 (KEEP TARGET):
   ID: 305220
   Name: MALIK MOORE
   Gender: Boys
   School ID: 117
   Grad Year: 2027

 Athlete 2:
   ID: 310055
   Name: MAALIK MOORE
   Gender: Boys
   School ID: 117
   Grad Year: 2027



Choose (1 = keep athlete1, 2 = keep athlete2, s = skip, q = quit):  2


Keeping 2nd athlete, merging 1st to 2nd athlete

Merging athlete_id 305220 → 310055

--------------------------------------------------------------------------------
Potential Duplicate (avg similarity: 94.5%)

 Athlete 1 (KEEP TARGET):
   ID: 293012
   Name: LILLY KENNEDY
   Gender: Girls
   School ID: 124
   Grad Year: 2025

 Athlete 2:
   ID: 302986
   Name: LILY KENNEDY
   Gender: Girls
   School ID: 124
   Grad Year: 2025



Choose (1 = keep athlete1, 2 = keep athlete2, s = skip, q = quit):  2


Keeping 2nd athlete, merging 1st to 2nd athlete

Merging athlete_id 293012 → 302986

--------------------------------------------------------------------------------
Potential Duplicate (avg similarity: 94.5%)

 Athlete 1 (KEEP TARGET):
   ID: 291018
   Name: ALEA REAGAN
   Gender: Girls
   School ID: 129
   Grad Year: 2023

 Athlete 2:
   ID: 291085
   Name: ALEXA REAGAN
   Gender: Girls
   School ID: 129
   Grad Year: 2024



Choose (1 = keep athlete1, 2 = keep athlete2, s = skip, q = quit):  2


Keeping 2nd athlete, merging 1st to 2nd athlete

Merging athlete_id 291018 → 291085

--------------------------------------------------------------------------------
Potential Duplicate (avg similarity: 94.5%)

 Athlete 1 (KEEP TARGET):
   ID: 298297
   Name: SIMON NICKLESON
   Gender: Boys
   School ID: 201
   Grad Year: 2026

 Athlete 2:
   ID: 305613
   Name: SIMON NICKELSON
   Gender: Boys
   School ID: 201
   Grad Year: 2026



Choose (1 = keep athlete1, 2 = keep athlete2, s = skip, q = quit):  2


Keeping 2nd athlete, merging 1st to 2nd athlete

Merging athlete_id 298297 → 305613

--------------------------------------------------------------------------------
Potential Duplicate (avg similarity: 94.5%)

 Athlete 1 (KEEP TARGET):
   ID: 294879
   Name: LILLY COX
   Gender: Girls
   School ID: 234
   Grad Year: 2025

 Athlete 2:
   ID: 311554
   Name: LILY COX
   Gender: Girls
   School ID: 234
   Grad Year: 2025



Choose (1 = keep athlete1, 2 = keep athlete2, s = skip, q = quit):  2


Keeping 2nd athlete, merging 1st to 2nd athlete

Merging athlete_id 294879 → 311554

--------------------------------------------------------------------------------
Potential Duplicate (avg similarity: 94.5%)

 Athlete 1 (KEEP TARGET):
   ID: 303636
   Name: KATE WOLFE
   Gender: Girls
   School ID: 258
   Grad Year: 2025

 Athlete 2:
   ID: 312376
   Name: KATIE WOLFE
   Gender: Girls
   School ID: 258
   Grad Year: 2028



Choose (1 = keep athlete1, 2 = keep athlete2, s = skip, q = quit):  2


Keeping 2nd athlete, merging 1st to 2nd athlete

Merging athlete_id 303636 → 312376

--------------------------------------------------------------------------------
Potential Duplicate (avg similarity: 94.5%)

 Athlete 1 (KEEP TARGET):
   ID: 294498
   Name: JERI HOKE-MARTIN
   Gender: Girls
   School ID: 28
   Grad Year: 2024

 Athlete 2:
   ID: 303678
   Name: JERRI HOKE-MARTIN
   Gender: Girls
   School ID: 28
   Grad Year: 2024



Choose (1 = keep athlete1, 2 = keep athlete2, s = skip, q = quit):  2


Keeping 2nd athlete, merging 1st to 2nd athlete

Merging athlete_id 294498 → 303678

--------------------------------------------------------------------------------
Potential Duplicate (avg similarity: 94.0%)

 Athlete 1 (KEEP TARGET):
   ID: 290026
   Name: KRISTINA GIL
   Gender: Girls
   School ID: 6
   Grad Year: 2026

 Athlete 2:
   ID: 301499
   Name: KRYSTINA GIL
   Gender: Girls
   School ID: 6
   Grad Year: 2026



Choose (1 = keep athlete1, 2 = keep athlete2, s = skip, q = quit):  2


Keeping 2nd athlete, merging 1st to 2nd athlete

Merging athlete_id 290026 → 301499

--------------------------------------------------------------------------------
Potential Duplicate (avg similarity: 94.0%)

 Athlete 1 (KEEP TARGET):
   ID: 297613
   Name: JACK HABAGGER
   Gender: Boys
   School ID: 116
   Grad Year: 2025

 Athlete 2:
   ID: 308024
   Name: JACK HABEGGER
   Gender: Boys
   School ID: 116
   Grad Year: 2025



Choose (1 = keep athlete1, 2 = keep athlete2, s = skip, q = quit):  2


Keeping 2nd athlete, merging 1st to 2nd athlete

Merging athlete_id 297613 → 308024

--------------------------------------------------------------------------------
Potential Duplicate (avg similarity: 94.0%)

 Athlete 1 (KEEP TARGET):
   ID: 294031
   Name: ANABELLE HERTWICK
   Gender: Girls
   School ID: 184
   Grad Year: 2026

 Athlete 2:
   ID: 312251
   Name: ANABELLE HERTWECK
   Gender: Girls
   School ID: 184
   Grad Year: 2026



Choose (1 = keep athlete1, 2 = keep athlete2, s = skip, q = quit):  2


Keeping 2nd athlete, merging 1st to 2nd athlete

Merging athlete_id 294031 → 312251

--------------------------------------------------------------------------------
Potential Duplicate (avg similarity: 93.0%)

 Athlete 1 (KEEP TARGET):
   ID: 301007
   Name: WESTON GRIENER
   Gender: Boys
   School ID: 348
   Grad Year: 2025

 Athlete 2:
   ID: 310094
   Name: WESTON GREINER
   Gender: Boys
   School ID: 348
   Grad Year: 2025



Choose (1 = keep athlete1, 2 = keep athlete2, s = skip, q = quit):  2


Keeping 2nd athlete, merging 1st to 2nd athlete

Merging athlete_id 301007 → 310094

--------------------------------------------------------------------------------
Potential Duplicate (avg similarity: 93.0%)

 Athlete 1 (KEEP TARGET):
   ID: 290013
   Name: ZOE SCOTT
   Gender: Girls
   School ID: 6
   Grad Year: 2024

 Athlete 2:
   ID: 301493
   Name: ZOE' SCOTT
   Gender: Girls
   School ID: 6
   Grad Year: 2024



Choose (1 = keep athlete1, 2 = keep athlete2, s = skip, q = quit):  1


Keeping 1st athlete, merging 2nd to 1st athlete

Merging athlete_id 301493 → 290013

--------------------------------------------------------------------------------
Potential Duplicate (avg similarity: 93.0%)

 Athlete 1 (KEEP TARGET):
   ID: 297497
   Name: RASHADD BINGHAM
   Gender: Boys
   School ID: 208
   Grad Year: 2025

 Athlete 2:
   ID: 307030
   Name: RASHAAD BINGHAM
   Gender: Boys
   School ID: 208
   Grad Year: 2025



Choose (1 = keep athlete1, 2 = keep athlete2, s = skip, q = quit):  2


Keeping 2nd athlete, merging 1st to 2nd athlete

Merging athlete_id 297497 → 307030

Done. Merged 21 athlete records.
