In [1]:
# Import packages
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict


# 1) load fights
fights_path = "../data/processed/fight_history.csv"
fights = pd.read_csv(fights_path, parse_dates=["date"])

# 2) load fighter list
fighters_path = "../data/processed/fighter_list.csv"
fighters = pd.read_csv(fighters_path)


In [2]:
# make bouts dataframe that has one row per bout
def make_bout_table(fights: pd.DataFrame) -> pd.DataFrame:
    df = fights.copy()
    df["date"] = pd.to_datetime(df["date"])

    # Even sides so A-vs-B == B-vs-A
    df["fighter_a"] = df[["fighter", "opponent"]].min(axis=1)
    df["fighter_b"] = df[["fighter", "opponent"]].max(axis=1)
    df["fight_key"] = list(zip(df["fighter_a"], df["fighter_b"], df["date"].dt.date))

    # Map text result to score from the row's fighter perspective
    def score_from_text(x: str):
        x = str(x).strip().lower()
        if x == "win":  return 1.0
        if x == "loss": return 0.0
        if x == "draw": return 0.5
        return None  # NC/unknown - should produce no results as NC is dropped earlier

    df["score_row_fighter"] = df["result"].map(score_from_text)

    rows = []
    for _, g in df.groupby("fight_key", sort=False):
        g = g.sort_values("date")
        row = g.iloc[0]

        sa = row["score_row_fighter"]
        if pd.isna(sa):
            continue  # skip NC/unknown - should produce no results as NC is dropped earlier

        # Make score from fighter_a’s perspective
        if row["fighter"] != row["fighter_a"]:
            sa = 1.0 - sa
        sb = 1.0 - sa if sa in (0.0, 1.0) else 0.5

        # Bout-level flags (any True across rows)
        finish_flag = bool(g["finish"].any()) if "finish" in g else False
        high_tier_title_flag  = bool(g["high_tier_championship_fight"].any()) if "high_tier_championship_fight" in g else False
        low_tier_title_flag  = bool(g["low_tier_championship_fight"].any()) if "low_tier_championship_fight" in g else False

        rows.append({
            "date": row["date"],
            "fighter_a": row["fighter_a"],
            "fighter_b": row["fighter_b"],
            "sa": sa,
            "sb": sb,
            "is_finish": finish_flag,
            "is_high_tier_title": high_tier_title_flag,
            "is_low_tier_title": low_tier_title_flag
        })

    bouts = pd.DataFrame(rows).sort_values("date").reset_index(drop=True)
    return bouts




In [3]:
bouts = make_bout_table(fights)

# Count career fights from the bouts table. 
career_counts = (
    pd.concat([bouts["fighter_a"], bouts["fighter_b"]])
      .value_counts()
      .to_dict()
)

# Map: fighter -> has_wiki_page (from master list)
has_page_dict = fighters.set_index("Fighter")["has_wiki_page"].to_dict()

In [4]:
# 6) Elo helpers + loop
def expected(ra, rb):
    return 1 / (1 + 10 ** ((rb - ra) / 400))

def _decay_toward_floor(elo0: float, days: int, floor: float, half_life_days: float) -> float:
    if days <= 0:
        return elo0
    if elo0 <= floor:
        return elo0  # never lift via inactivity
    gap0 = elo0 - floor
    factor = 0.5 ** (days / float(half_life_days))  # == exp(-ln(2)*days/HL)
    return floor + gap0 * factor

def run_elo(
    bouts: pd.DataFrame,
    career_counts: dict,                 # <-- total career fights per fighter used to set priors and avoid stat padding
    #baseline_main: float = 1500.0,
    #baseline_low: float = 1000.0,        # prior for low-career-count fighters
    k: float = 60.0,
    half_life_days: float = 3650.0,
    floor: float = 1500.0,
    bonus_finish: float = 2.0,
    bonus_high_tier_title: float = 6.0,
    bonus_low_tier_title: float = 3.0,
    grace_days: int = 0,
    min_reliable_fights: int = 5         # for K-scaling by opponent reliability
) -> pd.DataFrame:

    elo = {}                     # explicit init so we choose priors ourselves
    last_date = {}
    fights_seen = defaultdict(int)   # opponent reliability ONLY (not priors)
    logs = []

    def get_or_init_elo(fid: str):
        """Set informative prior that scales linearly based on 2 conditions:
         1. Has >= 10 fights - 1500 baseline elo
         2. Has wiki page but < 10 fights - scale linearly from 1300 to 1500 baseline elo. Rationale: having a wiki page indicates some notability/reliability even with few fights.
         3. Has no wiki page and < 10 fights - scale linearly from 1100 to 1500 baseline elo"""
        if fid in elo:
            return elo[fid]
        total = int(career_counts.get(fid, 0))
        has_page = bool(has_page_dict.get(fid, False))
        prior = 1500 - min(10, max(0, 10 - total)) * 40   # linear from 1100 (0 fights) to 1500 (10+ fights)
        if has_page and total < 10:
            prior = 1500 - min(10, max(0, 10 - total)) * 20   # linear from 1300 (0 fights) to 1500 (10+ fights)  
        #prior = baseline_main if (has_page or total >= 10) else baseline_low
        elo[fid] = prior
        return prior

    def k_scale_vs_opponent(opponent_id: str) -> float:
        """Scale K by opponent reliability (career number of fights)."""
        n = fights_seen[opponent_id]
        rel = max(0.0, min(1.0, n / float(min_reliable_fights)))
        return 0.5 + 0.5 * rel   # 0.5*k for brand-new opp, up to 1.0*k when reliable

    bouts = bouts.sort_values("date").reset_index(drop=True)

    for _, r in bouts.iterrows():
        a, b, d = r["fighter_a"], r["fighter_b"], r["date"]
        sa, sb = float(r["sa"]), float(r["sb"])
        is_finish, is_high_tier_title, is_low_tier_title = bool(r["is_finish"]), bool(r["is_high_tier_title"]), bool(r["is_low_tier_title"])

        # Initialize last_date for first appearances (avoid pre-career decay)
        if a not in last_date: last_date[a] = d
        if b not in last_date: last_date[b] = d

        # Ensure Elo exists using CAREER-BASED prior
        ra = get_or_init_elo(a)
        rb = get_or_init_elo(b)

        # Inactivity decay with grace period
        for f in (a, b):
            days = (d - last_date[f]).days
            effective_days = max(0, days - max(0, grace_days))
            elo[f] = _decay_toward_floor(elo[f], effective_days, floor=floor, half_life_days=half_life_days)

        # Pre-fight ratings after decay
        ra, rb = elo[a], elo[b]
        ea = expected(ra, rb); eb = 1.0 - ea

        # K scaling by opponent reliability
        k_a = k * k_scale_vs_opponent(b)
        k_b = k * k_scale_vs_opponent(a)

        # Winner-only bonuses
        bonus_a = ((bonus_finish if is_finish else 0.0) + (bonus_high_tier_title if is_high_tier_title else 0.0) + (bonus_low_tier_title if is_low_tier_title else 0.0)) if sa == 1.0 else 0.0
        bonus_b = ((bonus_finish if is_finish else 0.0) + (bonus_high_tier_title if is_high_tier_title else 0.0) + (bonus_low_tier_title if is_low_tier_title else 0.0)) if sb == 1.0 else 0.0
        
        # Elo updates
        ra_new = ra + k_a * (sa - ea) + bonus_a
        rb_new = rb + k_b * (sb - eb) + bonus_b

        # Persist & advance state
        elo[a], elo[b] = ra_new, rb_new
        last_date[a] = last_date[b] = d
        fights_seen[a] += 1
        fights_seen[b] += 1

        logs += [
    {
        "date": d, "fighter": a, "opponent": b,
        "elo_pre": ra, "elo": ra_new, "score": sa,
        "result": ("win" if sa == 1.0 else "loss" if sa == 0.0 else "draw"),
        "is_finish": is_finish, "is_high_tier_title": is_high_tier_title, "is_low_tier_title": is_low_tier_title, "bonus": bonus_a,
        "k_eff": k_a, "opp_fights_seen": fights_seen[b],
    },
    {
        "date": d, "fighter": b, "opponent": a,
        "elo_pre": rb, "elo": rb_new, "score": sb,
        "result": ("win" if sb == 1.0 else "loss" if sb == 0.0 else "draw"),
        "is_finish": is_finish, "is_high_tier_title": is_high_tier_title, "is_low_tier_title": is_low_tier_title, "bonus": bonus_b,
        "k_eff": k_b, "opp_fights_seen": fights_seen[a],
    },
]

    return pd.DataFrame(logs).sort_values("date").reset_index(drop=True)


In [5]:
# 7) Run Elo
elo_logs = run_elo(
    bouts=bouts,
    career_counts=career_counts, 
    k=60,
    half_life_days=1825,
    floor=1500,
    bonus_finish=2,
    bonus_high_tier_title=6,
    bonus_low_tier_title=2,
    grace_days=270,
    min_reliable_fights=5
)

# check
print(len(bouts), "bouts  ->  ", len(elo_logs), "elo rows (should be 2x bouts)")
display(elo_logs.head())


44788 bouts  ->   89576 elo rows (should be 2x bouts)


Unnamed: 0,date,fighter,opponent,elo_pre,elo,score,result,is_finish,is_high_tier_title,is_low_tier_title,bonus,k_eff,opp_fights_seen
0,1980-04-25,Rei Zulu,Rickson Gracie,1260.0,1253.9772,0.0,loss,True,False,False,0.0,30.0,1
1,1980-04-25,Rickson Gracie,Rei Zulu,1500.0,1508.0228,1.0,win,True,False,False,2.0,30.0,1
2,1984-01-01,Rei Zulu,Rickson Gracie,1253.9772,1247.120013,0.0,loss,True,False,False,0.0,36.0,2
3,1984-01-01,Rickson Gracie,Rei Zulu,1505.331409,1514.188595,1.0,win,True,False,False,2.0,36.0,2
4,1984-11-30,Eugenio Tadeu,Renan Pitanguy,1400.0,1407.487635,1.0,win,True,False,False,2.0,30.0,1


In [6]:
peak_elo = (
    elo_logs.groupby("fighter")["elo"]
    .max()
    .sort_values(ascending=False)
    .round(2)
    .rename("max_elo")
)

print(peak_elo.head(30))

fighter
Fedor Emelianenko           2077.17
Anderson Silva              2073.81
Daniel Cormier              2056.81
Georges St-Pierre           2040.13
Islam Makhachev             2035.44
Jon Jones                   2021.89
Kamaru Usman                1993.51
José Aldo                   1988.46
Charles Oliveira            1982.92
Gegard Mousasi              1968.75
Stipe Miocic                1967.85
Antônio Rodrigo Nogueira    1962.84
Alexander Volkanovski       1962.73
Ryan Bader                  1960.35
Demetrious Johnson          1946.36
Dan Henderson               1942.77
Donald Cerrone              1941.34
Israel Adesanya             1940.36
Max Holloway                1937.65
Wanderlei Silva             1936.89
Dustin Poirier              1936.61
Robbie Lawler               1932.88
Matt Hughes                 1931.98
Chuck Liddell               1929.73
Fabrício Werdum             1927.55
Benson Henderson            1924.99
Cain Velasquez              1922.82
Valentina Shevchenko

In [7]:
# Save elo ratings df to pickle
elo_logs.to_pickle("../data/processed/elo_ratings.pkl")