In [103]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

# Load elo ratings dataframe
# Later cells read the columns 'fighter', 'opponent', 'date', 'elo', 'elo_pre'
# and 'result' from this frame.
# NOTE(review): read_pickle unpickles the file, and unpickling can execute
# arbitrary code -- only load a pickle from a trusted source (presumably this
# project's own processing step; confirm).
elo_df = pd.read_pickle("../data/processed/elo_ratings.pkl")

In [104]:
# Drop rows with a missing 'date': the AUEC integration and the career-span
# columns computed further down are both date-based. `.copy()` detaches the
# result from the loaded frame. Note that this rebinds `elo_df`.
elo_df = elo_df.dropna(subset=['date']).copy()


In [105]:
# 1. Compute peak ELO for each fighter
def compute_peak_elo(elo_df, col='elo'):
    """
    Highest rating each fighter ever held.

    Groups `elo_df` by 'fighter', takes the maximum of `col` within each
    group, and returns the result as a Series named 'peak_elo' indexed by
    fighter.
    """
    ratings_by_fighter = elo_df.groupby('fighter')[col]
    career_best = ratings_by_fighter.max()
    return career_best.rename('peak_elo')


In [106]:
# Peak rating per fighter, then the 15 highest peaks (header and table on
# separate lines).
peak = compute_peak_elo(elo_df, col='elo')
print("Top 15 peak_elo:", peak.sort_values(ascending=False).head(15), sep="\n")

Top 15 peak_elo:
fighter
Fedor Emelianenko           2077.165165
Anderson Silva              2073.807168
Daniel Cormier              2056.812280
Georges St-Pierre           2040.125781
Islam Makhachev             2035.443321
Jon Jones                   2021.886224
Kamaru Usman                1993.506546
José Aldo                   1988.461740
Charles Oliveira            1982.916004
Gegard Mousasi              1968.751702
Stipe Miocic                1967.854349
Antônio Rodrigo Nogueira    1962.842555
Alexander Volkanovski       1962.726177
Ryan Bader                  1960.346527
Demetrious Johnson          1946.358127
Name: peak_elo, dtype: float64


In [107]:
# 2. Compute Area Under Elo Curve (AUEC) for each fighter

def compute_auec_elite(df, col='elo', threshold=1600):
    """
    Area under each fighter's Elo curve, counting only Elo above `threshold`.

    For every fighter the rows are ordered by date, the rating is reduced to
    max(Elo - threshold, 0), and that clipped curve is integrated over time
    (in days) with the trapezoid rule between consecutive rows.

    Parameters
    ----------
    df : DataFrame
        Must contain 'fighter', 'date' and `col` columns.
    col : str
        Name of the rating column to integrate.
    threshold : float
        Rating level below which no area is accumulated.

    Returns
    -------
    Series
        Named 'auec', indexed by fighter, in rating-points x days.
        Fighters with fewer than two dated rows get 0.0.
    """
    df = df.copy()
    df['date'] = pd.to_datetime(df['date'], errors='coerce')

    # Nothing to integrate; return an empty result of the usual shape.
    if df.empty:
        return pd.Series(
            [], index=pd.Index([], name='fighter'), dtype='float64', name='auec'
        )

    # Sort once; groupby keeps this row order inside every group, so the
    # per-fighter function does not need to sort again.
    df = df.sort_values(['fighter', 'date'])

    def _auec_group(g):
        # Unparseable dates were coerced to NaT above. Casting NaT to an
        # integer day count yields a huge sentinel value that would corrupt
        # the time gaps, so those rows are excluded from the integration.
        g = g[g['date'].notna()]
        if len(g) < 2:
            return 0.0

        # raw Elo values
        elo_vals = g[col].values.astype(float)

        # convert dates to day counts
        days = g['date'].values.astype('datetime64[D]').astype('int64')

        # elite-only portion: max(Elo - threshold, 0)
        elite_elo = np.maximum(elo_vals - threshold, 0)

        # time gaps between consecutive rows (no clipping)
        dt = np.diff(days)

        # average elite Elo across each gap
        elite_avg = 0.5 * (elite_elo[:-1] + elite_elo[1:])

        # area = sum(elite_avg * dt)
        return float((elite_avg * dt).sum())

    # Select only the columns the integration needs: the grouping column is
    # then not passed to apply(), which avoids the pandas deprecation warning
    # about apply operating on grouping columns.
    auec = df.groupby('fighter')[['date', col]].apply(_auec_group)

    return auec.rename('auec')


In [108]:
# Elite area-under-curve per fighter, then the 15 largest values (blank line,
# header, table).
auec = compute_auec_elite(elo_df, col='elo')
print("\nTop 15 AUEC:", auec.sort_values(ascending=False).head(15), sep="\n")


Top 15 AUEC:
fighter
Fedor Emelianenko           2.146039e+06
Jon Jones                   1.699193e+06
Anderson Silva              1.648552e+06
Georges St-Pierre           1.543359e+06
José Aldo                   1.526540e+06
Gegard Mousasi              1.508794e+06
Shinya Aoki                 1.424938e+06
Antônio Rodrigo Nogueira    1.371285e+06
Quinton Jackson             1.201365e+06
Josh Barnett                1.199558e+06
Ryan Bader                  1.146413e+06
Dan Henderson               1.139428e+06
Lyoto Machida               1.136476e+06
Fabrício Werdum             1.121335e+06
Wanderlei Silva             1.109477e+06
Name: auec, dtype: float64


  auec = df.groupby('fighter').apply(_auec_group)


In [109]:
# 3. Add the opponent's pre-fight Elo to each fight record
def add_opponent_elo_pre(elo_df, col='elo_pre'):
    """
    Return a copy of `elo_df` with an 'elo_pre_opp' column: the opponent's
    value of `col` for the same fight.

    Each fight is expected to appear once per participant, so the opponent's
    row is the one with 'fighter' and 'opponent' swapped on the same 'date'.
    Rows with no such mirrored record get NaN.

    Parameters
    ----------
    elo_df : DataFrame
        Must contain 'fighter', 'opponent', 'date' and `col`.
    col : str
        Column holding the fighter's own pre-fight rating.

    Returns
    -------
    DataFrame
        Same rows, in the same order, as `elo_df` (with a fresh integer
        index), 'date' converted to datetime, plus 'elo_pre_opp'.
        The input frame is not modified.
    """
    df = elo_df.copy()
    df['date'] = pd.to_datetime(df['date'])

    keys = ['fighter', 'opponent', 'date']

    # If the column is left over from an earlier call, drop it so the merge
    # below does not produce 'elo_pre_opp_x' / 'elo_pre_opp_y' instead.
    if col != 'elo_pre_opp':
        df = df.drop(columns='elo_pre_opp', errors='ignore')

    # Lookup table: every record re-keyed from the opponent's point of view.
    # Keys are de-duplicated (first record wins) so that a repeated
    # (fighter, opponent, date) record cannot multiply rows in the merge.
    opp_lookup = (
        df[keys + [col]]
        .rename(columns={
            'fighter': 'opponent',
            'opponent': 'fighter',
            col: 'elo_pre_opp',
        })
        .drop_duplicates(subset=keys, keep='first')
    )

    # A single left merge keeps one output row per input row;
    # validate= makes pandas raise if the lookup keys were not unique.
    return df.merge(opp_lookup, on=keys, how='left', validate='many_to_one')


In [110]:
# Guarded so that re-running this cell does not merge the opponent rating in a
# second time (a repeated merge would suffix the column instead of replacing it).
if 'elo_pre_opp' not in elo_df.columns:
    elo_df = add_opponent_elo_pre(elo_df, col='elo_pre')

# 4. Compute Strength of Schedule (SoS) for each fighter
def compute_strength_of_schedule(elo_df, opp_col='opp_elo_pre'):
    """
    Mean opponent pre-fight rating per fighter.

    Parameters
    ----------
    elo_df : DataFrame
        Must contain 'fighter' and the opponent-rating column.
    opp_col : str
        Column holding the opponent's pre-fight rating. The default name
        ('opp_elo_pre') differs from the column that add_opponent_elo_pre()
        creates ('elo_pre_opp'); when the default is requested but only
        'elo_pre_opp' exists, that column is used instead.

    Returns
    -------
    Series
        Named 'strength_of_schedule', indexed by fighter.

    Raises
    ------
    KeyError
        If the requested column (and, for the default, its fallback) is absent.
    """
    if opp_col not in elo_df.columns:
        # Only the historical default is redirected; an explicitly passed
        # name that does not exist is still an error.
        if opp_col == 'opp_elo_pre' and 'elo_pre_opp' in elo_df.columns:
            opp_col = 'elo_pre_opp'
        else:
            raise KeyError(f"Column not found: {opp_col}")

    sos = (
        elo_df
        .groupby('fighter')[opp_col]
        .mean()
        .rename('strength_of_schedule')
    )
    return sos


In [111]:
# Number of recorded results per fighter, used as the eligibility filter.
fighter_summary = (
    elo_df.groupby('fighter')
        .agg(n_fights=('result', 'count'))
)

min_fights = 10  # adjust to taste

eligible_fighters = fighter_summary.query("n_fights >= @min_fights").index

# Compute SoS in this cell: no cell above defines `sos`, so a top-to-bottom
# run would otherwise stop here with a NameError.
sos = compute_strength_of_schedule(elo_df, opp_col='elo_pre_opp')

sos_filtered = sos.loc[eligible_fighters]

print(sos_filtered.sort_values(ascending=False).head(15))



fighter
Mark Hunt            1700.478797
Georges St-Pierre    1685.822926
B.J. Penn            1682.087725
Dan Henderson        1680.101196
Lyoto Machida        1678.598724
Daniel Cormier       1674.050214
Fabrício Werdum      1673.316526
Douglas Lima         1672.827286
Chris Weidman        1666.510812
Cain Velasquez       1664.603441
Frankie Edgar        1657.516057
Tatsuya Mizuno       1656.654945
Vitor Belfort        1647.299791
Tyron Woodley        1646.535695
Patricky Pitbull     1643.195332
Name: strength_of_schedule, dtype: float64


In [112]:
# Unfiltered strength of schedule for every fighter, then the 15 highest
# values (blank line, header, table).
sos = compute_strength_of_schedule(elo_df, opp_col='elo_pre_opp')
print("\nTop 15 strength_of_schedule:", sos.sort_values(ascending=False).head(15), sep="\n")


Top 15 strength_of_schedule:
fighter
Wagner Martins     1925.704787
Danny Kingad       1897.682829
Yuki Yamamoto      1896.320088
Yuya Wakamatsu     1893.390552
Arnaud Lepont      1851.421459
Mark Boyer         1848.461528
AJ Broer           1840.428436
Yokthai Sithoar    1838.761885
Kirk Nielsen       1837.898005
Jessie Garcia      1835.359907
Rodney Arp         1832.814010
Bob Breshears      1831.061769
Bryan Ewhers       1830.673083
Steve Fiscus       1829.287540
Jason Roszell      1824.729681
Name: strength_of_schedule, dtype: float64


In [113]:
# 5. Compute Quality-Adjusted Wins (QAW) for each fighter

def compute_quality_adjusted_wins(elo_df, opp_col='elo_pre_opp', baseline=1500):
    """
    Sum, per fighter, of (opponent rating - `baseline`) over that fighter's wins.

    Only rows whose 'result' equals 'win' contribute. Fighters without any
    such row do not appear in the returned Series, which is named
    'quality_adjusted_wins' and indexed by fighter.
    """
    is_win = elo_df['result'] == 'win'
    won = elo_df.loc[is_win]

    # How far above (or below) the baseline each beaten opponent was rated.
    margin_over_baseline = won[opp_col] - baseline

    totals = margin_over_baseline.groupby(won['fighter']).sum()
    return totals.rename('quality_adjusted_wins')


In [114]:
# Quality-adjusted wins against a 1500 baseline, then the 15 highest totals
# (blank line, header, table).
qaw = compute_quality_adjusted_wins(elo_df, opp_col='elo_pre_opp', baseline=1500)
print("\nTop 15 quality_adjusted_wins:", qaw.sort_values(ascending=False).head(15), sep="\n")


Top 15 quality_adjusted_wins:
fighter
Georges St-Pierre           4744.791858
Dan Henderson               4270.612496
Fabrício Werdum             3906.224332
Jon Jones                   3840.711266
Lyoto Machida               3726.845187
Fedor Emelianenko           3551.367339
Antônio Rodrigo Nogueira    3533.043059
Daniel Cormier              3213.702439
Donald Cerrone              3196.750658
Anderson Silva              3137.050928
Andrei Arlovski             3099.138274
Robbie Lawler               3027.150068
Mark Hunt                   3000.335839
Frankie Edgar               2851.418519
Max Holloway                2832.895494
Name: quality_adjusted_wins, dtype: float64


In [115]:
# Make sure 'date' is a datetime column before taking min/max and differences.
elo_df['date'] = pd.to_datetime(elo_df['date'])

# One row per fighter: fight count, win count, first and last fight dates.
fighter_summary = elo_df.groupby('fighter').agg(
    n_fights=('result', 'count'),
    n_wins=('result', lambda outcomes: outcomes.eq('win').sum()),
    first_date=('date', 'min'),
    last_date=('date', 'max'),
)

# Career length in years, measured from first to last recorded fight.
fighter_summary = fighter_summary.assign(
    career_years=lambda s: (s['last_date'] - s['first_date']).dt.days / 365.25
)


In [None]:
# Build Legacy Metrics DataFrame

from scipy.stats import zscore

def build_legacy_metrics(
    elo_df,
    min_fights=10,
    min_career_years=5,
    min_wins=0,
):
    """
    Build, filter and score a per-fighter table of legacy metrics.

    Steps:
      1. Compute peak Elo, elite AUEC, strength of schedule and
         quality-adjusted wins per fighter and combine them.
      2. Join fight count, win count, first/last fight date and career length.
      3. Keep fighters that meet all three eligibility thresholds.
      4. Z-score each metric across the eligible fighters only.
      5. Sum the four z-scores into 'legacy_score' and sort descending.

    Parameters
    ----------
    elo_df : DataFrame
        Must contain 'fighter', 'date', 'elo' and 'result'. If 'elo_pre_opp'
        is missing it is derived, which also needs 'opponent' and 'elo_pre'.
    min_fights : int
        Minimum number of recorded results.
    min_career_years : float
        Minimum span between first and last fight, in years.
    min_wins : int
        Minimum number of wins.

    Returns
    -------
    DataFrame
        Indexed by fighter, sorted by 'legacy_score' (highest first), with the
        raw metrics, the summary columns, the 'z_*' columns and 'legacy_score'.
    """
    df = elo_df.copy()
    df['date'] = pd.to_datetime(df['date'])

    # --- core metrics ---
    peak = compute_peak_elo(df, col='elo')
    auec = compute_auec_elite(df, col='elo')

    if 'elo_pre_opp' not in df.columns:
        df = add_opponent_elo_pre(df, col='elo_pre')

    sos = compute_strength_of_schedule(df, opp_col='elo_pre_opp')
    qaw = compute_quality_adjusted_wins(df, opp_col='elo_pre_opp', baseline=1500)

    # combine raw metrics
    legacy = pd.concat([peak, auec, sos, qaw], axis=1)

    # Fighters with no wins are absent from `qaw` and come out of the concat
    # as NaN. Left as NaN they would be dropped from the z-score and then
    # skipped by the row sum below, i.e. scored as if exactly average on this
    # metric. No wins means no quality-adjusted credit, so record 0 instead.
    legacy['quality_adjusted_wins'] = legacy['quality_adjusted_wins'].fillna(0.0)

    # --- fighter-level summary for filters ---
    fighter_summary = (
        df.groupby('fighter')
          .agg(
              n_fights=('result', 'count'),
              n_wins=('result', lambda x: (x == 'win').sum()),
              first_date=('date', 'min'),
              last_date=('date', 'max')
          )
    )
    fighter_summary['career_years'] = (
        (fighter_summary['last_date'] - fighter_summary['first_date'])
        .dt.days / 365.25
    )

    # join
    legacy = legacy.join(fighter_summary, how='left')

    # --- apply eligibility filters ---
    mask = (
        (legacy['n_fights'] >= min_fights) &
        (legacy['career_years'] >= min_career_years) &
        (legacy['n_wins'] >= min_wins)
    )
    legacy_elig = legacy.loc[mask].copy()

    # --- recompute z-scores only on eligible fighters ---
    metric_cols = ['peak_elo','auec', 'strength_of_schedule',
                   'quality_adjusted_wins']

    for col in metric_cols:
        # nan_policy='omit' leaves missing values out of the mean/std.
        legacy_elig[f'z_{col}'] = zscore(
            legacy_elig[col].astype(float),
            nan_policy='omit'
        )

    z_cols = [f'z_{c}' for c in metric_cols]
    # NOTE: DataFrame.sum skips NaN by default, so a fighter still missing a
    # z-score (e.g. no opponent rating could be matched) contributes 0 for it.
    legacy_elig['legacy_score'] = legacy_elig[z_cols].sum(axis=1)

    # final sort
    legacy_elig = legacy_elig.sort_values('legacy_score', ascending=False)

    return legacy_elig



In [141]:
# Score every fighter with at least 10 fights and a 5-year career
# (no minimum number of wins), then show the 30 best.
legacy_filtered = build_legacy_metrics(elo_df, min_fights=10, min_career_years=5, min_wins=0)

legacy_filtered.head(30)




  auec = df.groupby('fighter').apply(_auec_group)


Unnamed: 0_level_0,peak_elo,auec,strength_of_schedule,quality_adjusted_wins,n_fights,n_wins,first_date,last_date,career_years,z_peak_elo,z_auec,z_strength_of_schedule,z_quality_adjusted_wins,legacy_score
fighter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Fedor Emelianenko,2077.165165,2146039.0,1610.617648,3551.367339,53,44,2000-05-21,2023-02-04,22.707734,4.500431,10.034048,2.206706,2.373012,19.114197
Georges St-Pierre,2040.125781,1543359.0,1685.822926,4744.791858,28,26,2002-01-25,2017-11-04,15.775496,4.131605,7.101646,2.976523,2.818742,17.028516
Jon Jones,2021.886224,1699193.0,1636.837657,3840.711266,29,28,2008-04-12,2024-11-16,16.596851,3.949982,7.859872,2.4751,2.481078,16.766032
Anderson Silva,2073.807168,1648552.0,1603.724158,3137.050928,47,35,1997-06-25,2021-09-24,24.249144,4.466993,7.613472,2.136143,2.218269,16.434877
Antônio Rodrigo Nogueira,1962.842555,1371285.0,1623.498186,3533.043059,48,36,1999-06-12,2015-08-01,16.136893,3.362045,6.2644,2.338554,2.366168,14.331168
Daniel Cormier,2056.81228,1068131.0,1674.050214,3213.702439,26,23,2009-09-25,2020-08-15,10.888433,4.297764,4.789373,2.856015,2.246898,14.19005
José Aldo,1988.46174,1526540.0,1582.159921,1567.287209,45,35,2004-08-10,2025-05-10,20.747433,3.617152,7.01981,1.915407,1.631981,14.184351
Dan Henderson,1942.771928,1139428.0,1680.101196,4270.612496,48,33,1997-06-15,2016-10-08,19.315537,3.162189,5.136276,2.917954,2.641641,13.858061
Fabrício Werdum,1927.547536,1121335.0,1673.316526,3906.224332,37,25,2002-06-16,2023-09-08,21.229295,3.01059,5.048243,2.848505,2.505546,13.412884
Lyoto Machida,1893.368227,1136476.0,1678.598724,3726.845187,39,27,2003-05-02,2022-05-13,19.030801,2.670244,5.121909,2.902575,2.43855,13.133279


In [None]:
from pathlib import Path

# Top 10 by legacy_score (legacy_filtered is already sorted, best first)
top10 = legacy_filtered.head(10).copy()

# Bring fighter out of the index
if top10.index.name == 'fighter':
    top10 = top10.reset_index()

# Add Rank column (1–10)
top10['Rank'] = range(1, len(top10) + 1)

# Choose columns: Rank, Fighter, Legacy Score, etc
cols = ['Rank', 'fighter', 'z_peak_elo', 'z_auec', 'z_strength_of_schedule', 'z_quality_adjusted_wins', 'legacy_score']
top10 = top10[cols]

# Column labels
top10 = top10.rename(columns={
    'fighter': 'Fighter',
    'z_peak_elo': 'z-Peak Elo',
    'z_auec': 'z-AUEC',
    'z_strength_of_schedule': 'z-SoS',
    'z_quality_adjusted_wins': 'z-QAW',
    'legacy_score': 'Score',
})

# Round every numeric column to two decimals for the published table
numeric_cols = top10.select_dtypes(include='number').columns
top10[numeric_cols] = top10[numeric_cols].round(2)

# Save to markdown
out_path = Path("../outputs/tables/top10_legacy.md")
out_path.parent.mkdir(parents=True, exist_ok=True)

md_table = top10.to_markdown(index=False)
# write_text returns the number of characters written; the trailing semicolon
# keeps that number from being echoed as the cell's output.
out_path.write_text(md_table, encoding="utf-8");

 


1127

In [148]:
# Start from the unadjusted legacy results
legacy_unadj = legacy_filtered.copy()

# Define weights for the z-scores
weights = {
    'z_peak_elo': 1.0,
    'z_auec': 0.8,
    'z_strength_of_schedule': 0.7,
    'z_quality_adjusted_wins': 0.7
}

# Compute weighted legacy score.
# Scale each z-score column by its weight, then take a row-wise DataFrame sum.
# `legacy_score` is itself a row-wise .sum(axis=1), which skips missing values;
# using the same operation here keeps the two scores comparable, whereas adding
# the columns with Python's sum() would turn any missing z-score into NaN.
legacy_unadj['legacy_score_weighted'] = (
    legacy_unadj[list(weights)]
    .mul(pd.Series(weights), axis='columns')
    .sum(axis=1)
)

# Sort two different views:

# 1) Unweighted (the original legacy_score)
legacy_unweighted_sorted = legacy_unadj.sort_values('legacy_score', ascending=False)

# 2) Weighted (adjusted)
legacy_weighted_sorted = legacy_unadj.sort_values('legacy_score_weighted', ascending=False)

# Peek at top 10 of each
print("Unweighted Legacy Score (Top 10):")
print(legacy_unweighted_sorted[['legacy_score']].head(10))

print("\nWeighted Legacy Score (Top 10):")
print(legacy_weighted_sorted[['legacy_score_weighted']].head(10))


Unweighted Legacy Score (Top 10):
                          legacy_score
fighter                               
Fedor Emelianenko            19.114197
Georges St-Pierre            17.028516
Jon Jones                    16.766032
Anderson Silva               16.434877
Antônio Rodrigo Nogueira     14.331168
Daniel Cormier               14.190050
José Aldo                    14.184351
Dan Henderson                13.858061
Fabrício Werdum              13.412884
Lyoto Machida                13.133279

Weighted Legacy Score (Top 10):
                          legacy_score_weighted
fighter                                        
Fedor Emelianenko                     15.733472
Georges St-Pierre                     13.869608
Jon Jones                             13.707204
Anderson Silva                        13.605859
José Aldo                             11.716172
Daniel Cormier                        11.701301
Antônio Rodrigo Nogueira              11.666871
Dan Henderson                    