In [166]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

# Read the processed Elo ratings table from its pickle file.
# NOTE(review): unpickling runs arbitrary code, so only load files this project produced.
elo_df = pd.read_pickle(Path("../data/processed/elo_ratings.pkl"))

In [167]:
# Keep only rows that carry a date; copy so later column assignments do not touch a view.
elo_df = elo_df.loc[elo_df['date'].notna()].copy()


In [168]:
# 1. Compute peak ELO for each fighter
def compute_peak_elo(elo_df, col='elo'):
    """
    Return each fighter's highest value of `col`.

    Parameters
    ----------
    elo_df : DataFrame with a 'fighter' column and the rating column `col`.
    col : name of the rating column to take the maximum of.

    Returns
    -------
    Series named 'peak_elo', indexed by fighter.
    """
    ratings_by_fighter = elo_df.groupby('fighter')[col]
    return ratings_by_fighter.max().rename('peak_elo')


In [169]:
# Peak rating per fighter, then show the 15 largest values.
peak = compute_peak_elo(elo_df, col='elo')
print("Top 15 peak_elo:", peak.sort_values(ascending=False).head(15), sep="\n")

Top 15 peak_elo:
fighter
Anderson Silva              2070.433229
Fedor Emelianenko           2042.702769
Georges St-Pierre           2037.041816
Daniel Cormier              2034.260411
Islam Makhachev             2017.266154
Jon Jones                   2013.623600
Kamaru Usman                1991.105957
Jose Aldo                   1983.683172
Charles Oliveira            1978.090620
Gegard Mousasi              1961.376107
Alexander Volkanovski       1960.241490
Antonio Rodrigo Nogueira    1954.048041
Ryan Bader                  1950.535981
Stipe Miocic                1946.128052
Demetrious Johnson          1944.449019
Name: peak_elo, dtype: float64


In [170]:
# 2. Compute Area Under Elo Curve (AUEC) for each fighter

def compute_auec_elite(df, col='elo', threshold=1600):
    """
    Compute each fighter's area under the Elo curve, counting only Elo above `threshold`.

    For every fighter the rows are ordered by date, the rating is reduced to
    max(rating - threshold, 0), and the result is integrated over time with the
    trapezoid rule. Time is measured in whole days, so the unit of the result is
    "Elo points above threshold x days".

    Parameters
    ----------
    df : DataFrame with 'fighter' and 'date' columns plus the rating column `col`.
        The input is not modified.
    col : name of the rating column to integrate.
    threshold : rating level below which a fight contributes nothing.

    Returns
    -------
    Series named 'auec', indexed by fighter. Fighters with fewer than two
    dated rows get 0.0.
    """
    df = df.copy()
    df['date'] = pd.to_datetime(df['date'], errors='coerce')

    # Remember every fighter present so that those who lose rows below still
    # appear in the result (with 0.0) instead of silently vanishing.
    all_fighters = df.groupby('fighter').size().index

    # errors='coerce' turns unparseable dates into NaT. Converted to int64 day
    # counts, NaT becomes a huge negative sentinel that would corrupt the time
    # gaps, so such rows cannot take part in the integration.
    dated = df.dropna(subset=['date'])

    if dated.empty:
        return pd.Series(0.0, index=all_fighters, name='auec', dtype=float)

    # Sort once; groupby keeps the within-group row order.
    dated = dated.sort_values(['fighter', 'date'])

    def _auec_group(g):
        # A single point spans no time, hence no area.
        if len(g) < 2:
            return 0.0

        elo_vals = g[col].to_numpy(dtype=float)

        # convert dates to integer day counts
        days = g['date'].to_numpy().astype('datetime64[D]').astype('int64')

        # elite-only portion: max(Elo - threshold, 0)
        elite_elo = np.maximum(elo_vals - threshold, 0)

        # time gaps between consecutive fights (no clipping)
        dt = np.diff(days)

        # trapezoid rule: mean elite Elo of the two endpoints times the gap
        elite_avg = 0.5 * (elite_elo[:-1] + elite_elo[1:])
        return (elite_avg * dt).sum()

    # Select the needed columns explicitly so apply() never operates on the
    # grouping column (newer pandas versions warn about that).
    auec = dated.groupby('fighter')[['date', col]].apply(_auec_group)

    # Restore fighters whose rows were all undated.
    auec = auec.reindex(all_fighters, fill_value=0.0)

    return auec.rename('auec')


In [171]:
# Area under the elite part of the Elo curve per fighter, then show the 15 largest values.
auec = compute_auec_elite(elo_df, col='elo')
print("\nTop 15 AUEC:", auec.sort_values(ascending=False).head(15), sep="\n")


Top 15 AUEC:
fighter
Fedor Emelianenko           2.086856e+06
Jon Jones                   1.667733e+06
Anderson Silva              1.624027e+06
Georges St-Pierre           1.530296e+06
Jose Aldo                   1.495959e+06
Gegard Mousasi              1.426802e+06
Shinya Aoki                 1.369656e+06
Josh Barnett                1.308771e+06
Antonio Rodrigo Nogueira    1.286666e+06
Wanderlei Silva             1.178708e+06
Quinton Jackson             1.176589e+06
Dan Henderson               1.125512e+06
Lyoto Machida               1.118070e+06
Ryan Bader                  1.075716e+06
Fabricio Werdum             1.053782e+06
Name: auec, dtype: float64


  auec = df.groupby('fighter').apply(_auec_group)


In [172]:
# 3. Add opponent pre-fight ELO to each fight record
def add_opponent_elo_pre(elo_df, col='elo_pre'):
    """
    Attach the opponent's value of `col` to every fight row as 'elo_pre_opp'.

    Each fight row is matched to the row with the same date and the fighter and
    opponent swapped, and that row's `col` is copied over. Rows with no mirrored
    counterpart get NaN.

    Parameters
    ----------
    elo_df : DataFrame with 'fighter', 'opponent', 'date' and `col` columns.
        The input is not modified.
    col : column holding the fighter's own pre-fight rating.

    Returns
    -------
    A new DataFrame with the same rows in the same order, a fresh RangeIndex,
    'date' converted to datetime, and an 'elo_pre_opp' column appended.
    """
    keys = ['fighter', 'opponent', 'date']

    df = elo_df.copy()
    df['date'] = pd.to_datetime(df['date'])

    # If the column is already there (e.g. the notebook cell was re-run), drop
    # it first; otherwise the merge would emit elo_pre_opp_x / elo_pre_opp_y
    # and no plain 'elo_pre_opp' column at all.
    if col != 'elo_pre_opp' and 'elo_pre_opp' in df.columns:
        df = df.drop(columns='elo_pre_opp')

    # The opponent's view of each bout: swap the two name columns so that the
    # row keyed (fighter=X, opponent=Y) carries Y's own rating.
    opponent_view = df[keys + [col]].rename(
        columns={
            'fighter': 'opponent',
            'opponent': 'fighter',
            col: 'elo_pre_opp',
        }
    )

    # Keep one rating per key (the first seen) so that duplicated bout rows
    # cannot multiply the rows of the result.
    opponent_view = opponent_view.drop_duplicates(subset=keys)

    return df.merge(opponent_view, on=keys, how='left', validate='many_to_one')


In [173]:
# Attach each opponent's pre-fight Elo as 'elo_pre_opp'. The guard keeps this
# cell safe to re-run: merging a second time onto a frame that already has the
# column would produce suffixed duplicates instead of 'elo_pre_opp'.
if 'elo_pre_opp' not in elo_df.columns:
    elo_df = add_opponent_elo_pre(elo_df, col='elo_pre')

# 4. Compute Strength of Schedule (SoS) for each fighter
def compute_strength_of_schedule(elo_df, opp_col='elo_pre_opp'):
    """
    Return each fighter's mean opponent rating.

    Parameters
    ----------
    elo_df : DataFrame with a 'fighter' column and the opponent-rating column `opp_col`.
    opp_col : column holding the opponent's pre-fight rating. The default is
        'elo_pre_opp', the column name the rest of this notebook creates and uses.

    Returns
    -------
    Series named 'strength_of_schedule', indexed by fighter. Missing opponent
    ratings are skipped by the mean.

    Raises
    ------
    KeyError if `opp_col` is not a column of `elo_df`.
    """
    if opp_col not in elo_df.columns:
        raise KeyError(
            f"Column not found: {opp_col!r}; add opponent ratings first or pass opp_col explicitly"
        )

    opponent_ratings = elo_df.groupby('fighter')[opp_col]
    return opponent_ratings.mean().rename('strength_of_schedule')


In [174]:
# 5. Compute Quality-Adjusted Wins (QAW) for each fighter

def compute_quality_adjusted_wins(elo_df, opp_col='elo_pre_opp', baseline=1500):
    """
    Sum, per fighter, how far each beaten opponent's rating was from `baseline`.

    Only rows whose 'result' equals 'win' are counted. A win over an opponent
    rated below `baseline` contributes a negative amount.

    Parameters
    ----------
    elo_df : DataFrame with 'fighter', 'result' and `opp_col` columns.
    opp_col : column holding the opponent's pre-fight rating.
    baseline : rating subtracted from every beaten opponent's rating.

    Returns
    -------
    Series named 'quality_adjusted_wins', indexed by fighter. Fighters with no
    winning row do not appear in it.
    """
    is_win = elo_df['result'] == 'win'
    won = elo_df[is_win].copy()

    # Value of each win relative to the baseline rating.
    margin_over_baseline = won[opp_col] - baseline

    totals = margin_over_baseline.groupby(won['fighter']).sum()
    return totals.rename('quality_adjusted_wins')


In [175]:
# Quality-adjusted wins per fighter, then show the 15 largest values.
qaw = compute_quality_adjusted_wins(elo_df, opp_col='elo_pre_opp', baseline=1500)
print("\nTop 15 quality_adjusted_wins:", qaw.sort_values(ascending=False).head(15), sep="\n")


Top 15 quality_adjusted_wins:
fighter
Georges St-Pierre           4678.180844
Dan Henderson               4037.994857
Lyoto Machida               3631.566908
Jon Jones                   3618.600447
Fabricio Werdum             3477.779405
Antonio Rodrigo Nogueira    3421.677806
Donald Cerrone              3109.822498
Anderson Silva              3088.382364
Robbie Lawler               3036.707295
Mauricio Rua                2976.478978
Fedor Emelianenko           2918.336716
Daniel Cormier              2811.508113
Andrei Arlovski             2738.889071
Max Holloway                2736.250398
Frankie Edgar               2598.244733
Name: quality_adjusted_wins, dtype: float64


In [176]:
# Make sure dates are datetimes so min/max and subtraction work on them.
elo_df['date'] = pd.to_datetime(elo_df['date'])

# One row per fighter: number of recorded results, number of wins, first and last fight date.
fighter_summary = elo_df.groupby('fighter').agg(
    n_fights=('result', 'count'),
    n_wins=('result', lambda outcomes: outcomes.eq('win').sum()),
    first_date=('date', 'min'),
    last_date=('date', 'max'),
)

# Career length in years: whole days between first and last fight over 365.25.
fighter_summary = fighter_summary.assign(
    career_years=lambda s: (s['last_date'] - s['first_date']).dt.days / 365.25
)


In [177]:
# Build Legacy Metrics DataFrame

from scipy.stats import zscore

def build_legacy_metrics(
    elo_df,
    min_fights=10,
    min_career_years=5,
    min_wins=0,
):
    """
    Build the per-fighter legacy table and rank eligible fighters by legacy score.

    Four raw metrics are computed per fighter (peak Elo, area under the elite
    Elo curve, strength of schedule, quality-adjusted wins), fighters failing
    the eligibility filters are removed, each metric is z-scored across the
    remaining fighters, and the four z-scores are summed into 'legacy_score'.

    Parameters
    ----------
    elo_df : DataFrame with 'fighter', 'opponent', 'date', 'result', 'elo' and
        'elo_pre' columns ('elo_pre_opp' is added when missing). Not modified.
    min_fights : minimum number of recorded results a fighter needs.
    min_career_years : minimum span in years between first and last fight.
    min_wins : minimum number of wins a fighter needs.

    Returns
    -------
    DataFrame indexed by fighter holding the raw metrics, the summary columns
    used for filtering, one 'z_<metric>' column per metric and 'legacy_score',
    sorted by 'legacy_score' in descending order.
    """
    df = elo_df.copy()
    df['date'] = pd.to_datetime(df['date'])

    # --- core metrics ---
    peak = compute_peak_elo(df, col='elo')
    auec = compute_auec_elite(df, col='elo')

    if 'elo_pre_opp' not in df.columns:
        df = add_opponent_elo_pre(df, col='elo_pre')

    sos = compute_strength_of_schedule(df, opp_col='elo_pre_opp')
    qaw = compute_quality_adjusted_wins(df, opp_col='elo_pre_opp', baseline=1500)

    # combine raw metrics
    legacy = pd.concat([peak, auec, sos, qaw], axis=1)

    # Fighters without a single win are absent from `qaw`, so concat leaves
    # NaN for them. Quality-adjusted wins is a sum over wins, so its true value
    # for them is 0. Left as NaN, the z-score would be NaN too and the row sum
    # below would skip it, scoring a winless fighter as exactly average.
    legacy['quality_adjusted_wins'] = legacy['quality_adjusted_wins'].fillna(0.0)

    # --- fighter-level summary for filters ---
    fighter_summary = (
        df.groupby('fighter')
          .agg(
              n_fights=('result', 'count'),
              n_wins=('result', lambda x: (x == 'win').sum()),
              first_date=('date', 'min'),
              last_date=('date', 'max')
          )
    )
    fighter_summary['career_years'] = (
        (fighter_summary['last_date'] - fighter_summary['first_date'])
        .dt.days / 365.25
    )

    # join
    legacy = legacy.join(fighter_summary, how='left')

    # --- apply eligibility filters ---
    mask = (
        (legacy['n_fights'] >= min_fights) &
        (legacy['career_years'] >= min_career_years) &
        (legacy['n_wins'] >= min_wins)
    )
    legacy_elig = legacy.loc[mask].copy()

    # --- z-scores are computed on eligible fighters only ---
    metric_cols = ['peak_elo', 'auec', 'strength_of_schedule',
                   'quality_adjusted_wins']

    for col in metric_cols:
        # nan_policy='omit' keeps a remaining NaN from poisoning the mean/std
        # of the whole column.
        legacy_elig[f'z_{col}'] = zscore(
            legacy_elig[col].astype(float),
            nan_policy='omit'
        )

    z_cols = [f'z_{c}' for c in metric_cols]
    # Row-wise sum skips NaN, i.e. a still-missing z-score contributes 0.
    legacy_elig['legacy_score'] = legacy_elig[z_cols].sum(axis=1)

    # final sort
    legacy_elig = legacy_elig.sort_values('legacy_score', ascending=False)

    return legacy_elig



In [178]:
# Rank fighters with at least 10 recorded results and a career spanning 5+ years.
legacy_filtered = build_legacy_metrics(elo_df, min_fights=10, min_career_years=5, min_wins=0)

# Show the 30 highest legacy scores.
legacy_filtered.head(30)




  auec = df.groupby('fighter').apply(_auec_group)


Unnamed: 0_level_0,peak_elo,auec,strength_of_schedule,quality_adjusted_wins,n_fights,n_wins,first_date,last_date,career_years,z_peak_elo,z_auec,z_strength_of_schedule,z_quality_adjusted_wins,legacy_score
fighter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Fedor Emelianenko,2042.702769,2086856.0,1602.456482,2918.336716,47,40,2000-05-21,2023-02-04,22.707734,4.244909,10.001825,2.155886,2.145149,18.547769
Georges St-Pierre,2037.041816,1530296.0,1681.715489,4678.180844,28,26,2002-01-25,2017-11-04,15.775496,4.18758,7.226038,2.97048,2.802445,17.186543
Jon Jones,2013.6236,1667733.0,1629.154706,3618.600447,29,28,2008-04-12,2024-11-16,16.596851,3.950419,7.911492,2.43028,2.406695,16.698886
Anderson Silva,2070.433229,1624027.0,1604.449434,3088.382364,46,34,1997-06-25,2021-09-24,24.249144,4.525741,7.693516,2.176368,2.20866,16.604285
Jose Aldo,1983.683172,1495959.0,1586.907769,1720.316309,44,34,2004-08-10,2025-05-10,20.747433,3.647206,7.054788,1.996082,1.697692,14.395768
Antonio Rodrigo Nogueira,1954.048041,1286666.0,1630.103589,3421.677806,44,33,1999-06-12,2015-08-01,16.136893,3.347086,6.01096,2.440032,2.333145,14.131223
Dan Henderson,1941.963034,1125512.0,1677.137795,4037.994857,47,32,1997-06-15,2016-10-08,19.315537,3.224699,5.207226,2.923432,2.563338,13.918695
Daniel Cormier,2034.260411,972620.1,1662.904674,2811.508113,25,22,2009-09-25,2020-08-15,10.888433,4.159412,4.444692,2.77715,2.105249,13.486502
Lyoto Machida,1891.771828,1118070.0,1674.553,3631.566908,39,27,2003-05-02,2022-05-13,19.030801,2.716403,5.170109,2.896867,2.411538,13.194917
Josh Barnett,1895.296714,1308771.0,1599.720934,2317.373933,45,37,1997-01-11,2016-09-03,19.644079,2.7521,6.121208,2.127771,1.920691,12.921769


In [179]:
from pathlib import Path

# Ten highest legacy scores (legacy_filtered is already sorted by score).
top10 = legacy_filtered.head(10).copy()

# Expose the fighter name as a regular column when it lives in the index.
if top10.index.name == 'fighter':
    top10 = top10.reset_index()

# 1-based rank in score order (1–10).
top10['Rank'] = range(1, len(top10) + 1)

# Columns to publish, in output order.
cols = [
    'Rank',
    'fighter',
    'z_peak_elo',
    'z_auec',
    'z_strength_of_schedule',
    'z_quality_adjusted_wins',
    'legacy_score',
]

# Select the columns and swap in human-readable headers in one step.
top10 = top10[cols].rename(
    columns={
        'fighter': 'Fighter',
        'z_peak_elo': 'z-Peak Elo',
        'z_auec': 'z-AUEC',
        'z_strength_of_schedule': 'z-SoS',
        'z_quality_adjusted_wins': 'z-QAW',
        'legacy_score': 'Score',
    }
)

# Two decimals for every numeric column.
numeric_cols = top10.select_dtypes(include='number').columns
top10[numeric_cols] = top10[numeric_cols].round(2)

# Write the table as markdown, creating the output folder when needed.
out_path = Path("../outputs/tables/top10_legacy.md")
out_path.parent.mkdir(parents=True, exist_ok=True)

md_table = top10.to_markdown(index=False)
out_path.write_text(md_table, encoding="utf-8")

 


1127

In [181]:
# Show the ten highest legacy scores.
print("Legacy Score (Top 10):", legacy_filtered[['legacy_score']].head(10), sep="\n")

Legacy Score (Top 10):
                          legacy_score
fighter                               
Fedor Emelianenko            18.547769
Georges St-Pierre            17.186543
Jon Jones                    16.698886
Anderson Silva               16.604285
Jose Aldo                    14.395768
Antonio Rodrigo Nogueira     14.131223
Dan Henderson                13.918695
Daniel Cormier               13.486502
Lyoto Machida                13.194917
Josh Barnett                 12.921769
