In [3]:
# ============================================================
# CHUNK 1 ‚Äî UFC EVENTS SCRAPER
# Scrapes ALL completed UFC event URLs from UFCStats
# Outputs: events.csv
# ============================================================

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time, os, re

# ------------------------------------------------------------
# Output folder
# ------------------------------------------------------------
BASE_DIR = "/Users/shrey24/Desktop/ufc-s-tier"
os.makedirs(BASE_DIR, exist_ok=True)

# ------------------------------------------------------------
# Helper: fetch & parse
# ------------------------------------------------------------
def get_soup(url):
    time.sleep(0.10)
    r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
    if r.status_code != 200:
        print("FAILED:", url)
    return BeautifulSoup(r.text, "html.parser")

# ------------------------------------------------------------
# 1. UFCStats completed events page
# ------------------------------------------------------------
events_url = "http://www.ufcstats.com/statistics/events/completed?page=all"
soup = get_soup(events_url)

# All event URL links
event_links = [a["href"] for a in soup.select("a.b-link.b-link_style_black")]
event_names = [a.text.strip() for a in soup.select("a.b-link.b-link_style_black")]

print(f"Found {len(event_links)} UFC events.")

# ------------------------------------------------------------
# Extract event_id (UUID-like string at end of URL)
# ------------------------------------------------------------
def extract_event_id(url):
    m = re.search(r"/event-details/([a-zA-Z0-9\-]+)", url)
    return m.group(1) if m else None

event_ids = [extract_event_id(u) for u in event_links]

# ------------------------------------------------------------
# Save CSV
# ------------------------------------------------------------
df_events = pd.DataFrame({
    "event_id": event_ids,
    "event_name": event_names,
    "event_url": event_links
})

df_events.to_csv(f"{BASE_DIR}/events.csv", index=False)

df_events.head(), print("Saved:", f"{BASE_DIR}/events.csv")


Found 756 UFC events.
Saved: /Users/shrey24/Desktop/ufc-s-tier/events.csv


(           event_id                              event_name  \
 0  bd92cf5da5413d2a          UFC 323: Dvalishvili vs. Yan 2   
 1  92c96df8bdab5fea   UFC Fight Night: Tsarukyan vs. Hooker   
 2  8db1b36dde268ef6  UFC 322: Della Maddalena vs. Makhachev   
 3  6436029b50a9c255       UFC Fight Night: Bonfim vs. Brown   
 4  0e2c2daf11b5d8f2       UFC Fight Night: Garcia vs. Onama   
 
                                            event_url  
 0  http://www.ufcstats.com/event-details/bd92cf5d...  
 1  http://www.ufcstats.com/event-details/92c96df8...  
 2  http://www.ufcstats.com/event-details/8db1b36d...  
 3  http://www.ufcstats.com/event-details/6436029b...  
 4  http://www.ufcstats.com/event-details/0e2c2daf...  ,
 None)

In [6]:
# ============================================================
# CHUNK 2 ‚Äî UFC FIGHTS SCRAPER (FULLY CLEANED)
# Works on UFCStats NEWER TABLE FORMAT
# ============================================================

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time, os

BASE_DIR = "/Users/shrey24/Desktop/ufc-s-tier"
events_path = f"{BASE_DIR}/events.csv"

df_events = pd.read_csv(events_path)
print("Loaded events:", len(df_events))


# ------------------------------------------------------------
# Cleaning Helper ‚Äî removes ALL \n and joins numbers properly
# ------------------------------------------------------------
def cell(td):
    """Extract clean text from any UFCStats stat cell."""
    return td.get_text(" ", strip=True) if td else None


# ------------------------------------------------------------
# Helper to fetch soup
# ------------------------------------------------------------
def get_soup(url):
    time.sleep(0.20)
    r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
    if r.status_code != 200:
        print("FAILED:", url)
    return BeautifulSoup(r.text, "html.parser")


# ------------------------------------------------------------
# STORAGE
# ------------------------------------------------------------
all_fights = []


# ------------------------------------------------------------
# SCRAPE EACH EVENT PAGE
# ------------------------------------------------------------
for idx, row in df_events.iterrows():

    event_url  = row["event_url"]
    event_id   = row["event_id"]
    event_name = row["event_name"]

    soup = get_soup(event_url)

    table_rows = soup.select("tbody tr")
    fight_order = 1

    for tr in table_rows:
        cols = tr.find_all("td")
        if len(cols) < 10:
            continue

        # --------------------------------------
        # Column 1 ‚Üí Fighters + Fight URL
        # --------------------------------------
        fighter_links = cols[1].select("a")

        if len(fighter_links) < 2:
            continue

        fighter_A = fighter_links[0].get_text(strip=True)
        fighter_B = fighter_links[1].get_text(strip=True)

        # FIGHT URL ‚Üí Actually located in the FIRST <a> of the row (bout link)
        fight_url = cols[0].select_one("a")["href"]

        # --------------------------------------
        # Column 0 ‚Üí W/L Label
        # --------------------------------------
        WL_label = cell(cols[0])

        # --------------------------------------
        # Columns 2‚Äì5 ‚Üí KD / STR / TD / SUB
        # Cleaned using cell()
        # --------------------------------------
        KD  = cell(cols[2])
        STR = cell(cols[3])
        TD  = cell(cols[4])
        SUB = cell(cols[5])

        # --------------------------------------
        # Columns 6‚Äì9 ‚Üí Weight Class / Method / Round End / Time End
        # --------------------------------------
        weight_class = cell(cols[6])
        method       = cell(cols[7])
        round_end    = cell(cols[8])
        time_end     = cell(cols[9])

        # --------------------------------------
        # Scheduled Rounds (3 or 5)
        # --------------------------------------
        if "5" in round_end:
            scheduled_rounds = 5
        else:
            scheduled_rounds = 3

        # --------------------------------------
        # Save fight record
        # --------------------------------------
        all_fights.append({
            "event_id": event_id,
            "event_name": event_name,
            "event_url": event_url,

            "fight_order": fight_order,
            "fight_url": fight_url,

            "fighter_A": fighter_A,
            "fighter_B": fighter_B,

            "WL_label": WL_label,

            "KD": KD,
            "STR": STR,
            "TD": TD,
            "SUB": SUB,

            "weight_class": weight_class,
            "method": method,
            "round_end": round_end,
            "time_end": time_end,

            "scheduled_rounds": scheduled_rounds
        })

        fight_order += 1

    # Progress update every 50 events
    if idx % 50 == 0:
        print(f"Processed {idx}/{len(df_events)} events...")


# ------------------------------------------------------------
# SAVE OUTPUT
# ------------------------------------------------------------
df_fights = pd.DataFrame(all_fights)
df_fights.to_csv(f"{BASE_DIR}/fights.csv", index=False)

print("Saved fights.csv:", df_fights.shape)
df_fights.head()


Loaded events: 756
Processed 0/756 events...
Processed 50/756 events...
Processed 100/756 events...
Processed 150/756 events...
Processed 200/756 events...
Processed 250/756 events...
Processed 300/756 events...
Processed 350/756 events...
Processed 400/756 events...
Processed 450/756 events...
Processed 500/756 events...
Processed 550/756 events...
Processed 600/756 events...
Processed 650/756 events...
Processed 700/756 events...
Processed 750/756 events...
Saved fights.csv: (8482, 17)


Unnamed: 0,event_id,event_name,event_url,fight_order,fight_url,fighter_A,fighter_B,WL_label,KD,STR,TD,SUB,weight_class,method,round_end,time_end,scheduled_rounds
0,bd92cf5da5413d2a,UFC 323: Dvalishvili vs. Yan 2,http://www.ufcstats.com/event-details/bd92cf5d...,1,http://www.ufcstats.com/fight-details/4a0db214...,Petr Yan,Merab Dvalishvili,win,0 0,139 134,5 2,0 2,Bantamweight,U-DEC,5,5:00,5
1,bd92cf5da5413d2a,UFC 323: Dvalishvili vs. Yan 2,http://www.ufcstats.com/event-details/bd92cf5d...,2,http://www.ufcstats.com/fight-details/dfa692db...,Joshua Van,Alexandre Pantoja,win,0 0,2 6,0 0,0 0,Flyweight,KO/TKO,1,0:26,3
2,bd92cf5da5413d2a,UFC 323: Dvalishvili vs. Yan 2,http://www.ufcstats.com/event-details/bd92cf5d...,3,http://www.ufcstats.com/fight-details/fbbb9e72...,Tatsuro Taira,Brandon Moreno,win,0 0,28 9,1 0,0 1,Flyweight,KO/TKO Punches,2,2:24,3
3,bd92cf5da5413d2a,UFC 323: Dvalishvili vs. Yan 2,http://www.ufcstats.com/event-details/bd92cf5d...,4,http://www.ufcstats.com/fight-details/1dc29f4c...,Payton Talbott,Henry Cejudo,win,1 0,134 60,3 1,0 0,Bantamweight,U-DEC,3,5:00,3
4,bd92cf5da5413d2a,UFC 323: Dvalishvili vs. Yan 2,http://www.ufcstats.com/event-details/bd92cf5d...,5,http://www.ufcstats.com/fight-details/6d6ab10c...,Jan Blachowicz,Bogdan Guskov,draw draw,1 1,84 80,0 0,1 0,Light Heavyweight,M-DEC,3,5:00,3


In [20]:
import requests
from bs4 import BeautifulSoup

url = "http://www.ufcstats.com/fight-details/4a0db214d9721d6e"

html = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}).text

print(html[:2500])  # first 2500 chars


<!DOCTYPE html>
<!--[if lt IE 7]>      <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
<!--[if IE 7]>         <html class="no-js lt-ie9 lt-ie8"> <![endif]-->
<!--[if IE 8]>         <html class="no-js ie8 lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js"> <!--<![endif]-->
<head>
  <meta charset="utf-8">
  <meta http-equiv="X-UA-Compatible" content="IE=edge">
  <title>
    Stats | UFC
  </title>
  <meta name="description" content="">
  <meta name="viewport" content="">
  <link rel="stylesheet" href="/blocks/main.css?ver=669261">
  <script src="/js/vendor/modernizr-2.6.2.min.js"></script>
  <script>
    (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
    (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
    m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
    })(window,document,'script','//www.google-analytics.com/analytics.js','ga');

    ga('create', 'UA-2855164-1', 'auto');
  

In [None]:
# ============================================================
# CHUNK 3‚Äî BULK SCRAPER (NO METADATA, FIXED TOTALS + ROUNDS)
# ============================================================

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time, random, re

BASE_DIR = "/Users/shrey24/Desktop/ufc-s-tier"
df_fights = pd.read_csv(f"{BASE_DIR}/fights.csv")
print("Loaded fights:", len(df_fights))

# ------------------------------------------------------------
# REQUEST + HELPERS
# ------------------------------------------------------------
UA_LIST = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
    "Mozilla/5.0 (X11; Linux x86_64)",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
]

def get_soup(url, retries=5):
    for _ in range(retries):
        ua = {"User-Agent": random.choice(UA_LIST)}
        r = requests.get(url, headers=ua)
        if "<h1>Internal Server Error" not in r.text:
            return BeautifulSoup(r.text, "html.parser")
        time.sleep(random.uniform(4, 9))
    print("‚ùå Failed permanently:", url)
    return None

def txt(x):
    return x.get_text(" ", strip=True) if x else None

# ------------------------------------------------------------
# SCRAPE LOOP
# ------------------------------------------------------------
totals_rows = []

for idx, row in df_fights.iterrows():

    fight_url = row["fight_url"]
    soup = get_soup(fight_url)
    if soup is None:
        continue

    # --------------------------------------------------------
    # FIGHTERS
    # --------------------------------------------------------
    fighters = soup.select("h3.b-fight-details__person-name a")
    if len(fighters) < 2:
        print("‚ùå No fighters at", fight_url)
        continue

    fighter_A = txt(fighters[0])
    fighter_B = txt(fighters[1])

    # --------------------------------------------------------
    # TOTALS TABLE (bulletproof)
    # --------------------------------------------------------
    totals_header = soup.find("p", string=re.compile(r"Totals", re.I))
    if not totals_header:
        totals_header = soup.find("p", class_=re.compile("collapse-link_tot", re.I))

    if totals_header:
        totals_table = totals_header.find_next("table")
    else:
        totals_table = soup.select_one("table.b-fight-details__table")

    if totals_table is None:
        print("‚ùå No Totals table:", fight_url)
        continue

    rows = totals_table.select("tbody tr")
    stats = {}

    labels = [
        "KD", "Sig Str", "Sig Str %", "Total Str",
        "Td", "Td %", "Sub Att", "Rev", "Ctrl"
    ]

    for i, label in enumerate(labels):
        col = rows[0].find_all("td")[i + 1]
        p = col.find_all("p")
        stats[f"{label}_A"] = txt(p[0])
        stats[f"{label}_B"] = txt(p[1])

    # --------------------------------------------------------
    # PER ROUND TOTALS TABLE (fixed)
    # --------------------------------------------------------
    per_round_header = soup.select_one("a.b-fight-details__collapse-link_rnd")
    if not per_round_header:
        i_tag = soup.find("i", string=re.compile("Per round", re.I))
        if i_tag:
            per_round_header = i_tag.find_parent("a")

    round_tables_list = []

    if per_round_header:
        table = per_round_header.find_next("table")
        tbody = table.find("tbody")
        current_round = None

        for section in tbody.find_all(["thead", "tr"]):
            if section.name == "thead":
                current_round = txt(section.find("th"))
                continue

            cols = section.find_all("td")
            if len(cols) < 10:
                continue

            p = [c.find_all("p") for c in cols]

            round_tables_list.append({
                "round": current_round,
                "KD_A": txt(p[1][0]),  "KD_B": txt(p[1][1]),
                "SigStr_A": txt(p[2][0]), "SigStr_B": txt(p[2][1]),
                "SigStrPct_A": txt(p[3][0]), "SigStrPct_B": txt(p[3][1]),
                "TotalStr_A": txt(p[4][0]), "TotalStr_B": txt(p[4][1]),
                "Td_A": txt(p[5][0]), "Td_B": txt(p[5][1]),
                "TdPct_A": txt(p[6][0]), "TdPct_B": txt(p[6][1]),
                "SubAtt_A": txt(p[7][0]), "SubAtt_B": txt(p[7][1]),
                "Rev_A": txt(p[8][0]), "Rev_B": txt(p[8][1]),
                "Ctrl_A": txt(p[9][0]), "Ctrl_B": txt(p[9][1]),
            })

    # optional: convert to DF later if needed
    # df_round_totals = pd.DataFrame(round_tables_list)

    # --------------------------------------------------------
    # SIGNIFICANT STRIKES (OVERALL)
    # --------------------------------------------------------
    sig_header = soup.find("p", string=re.compile("Significant Strikes", re.I))
    sig_table = sig_header.find_next("table")
    row0 = sig_table.select("tbody tr")[0].find_all("td")

    sig_stats = {}
    sig_labels = [
        "Sig Str", "Sig Str %", "Head", "Body",
        "Leg", "Distance", "Clinch", "Ground"
    ]

    for i, label in enumerate(sig_labels):
        p = row0[i + 1].find_all("p")
        sig_stats[f"{label}_A"] = txt(p[0])
        sig_stats[f"{label}_B"] = txt(p[1])

    # --------------------------------------------------------
    # SAVE ALL INTO ONE ROW
    # --------------------------------------------------------
    totals_rows.append({
        "fight_url": fight_url,
        "fighter_A": fighter_A,
        "fighter_B": fighter_B,
        **stats,
        **sig_stats,
        # per-round totals stored separately (not merged into CSV)
        "per_round_totals": round_tables_list
    })

    if idx % 200 == 0:
        print(f"Processed {idx}/{len(df_fights)} fights‚Ä¶")

# ------------------------------------------------------------
# SAVE OUTPUT
# ------------------------------------------------------------
df_totals = pd.DataFrame(totals_rows)
df_totals.to_csv(f"{BASE_DIR}/fight_totals.csv", index=False)

print("Saved fight_totals.csv:", df_totals.shape)


Loaded fights: 8482
Processed 0/8482 fights‚Ä¶
Processed 200/8482 fights‚Ä¶
Processed 400/8482 fights‚Ä¶
Processed 600/8482 fights‚Ä¶
Processed 800/8482 fights‚Ä¶
Processed 1000/8482 fights‚Ä¶
Processed 1200/8482 fights‚Ä¶
Processed 1400/8482 fights‚Ä¶
Processed 1600/8482 fights‚Ä¶
Processed 1800/8482 fights‚Ä¶
Processed 2000/8482 fights‚Ä¶
Processed 2200/8482 fights‚Ä¶
Processed 2400/8482 fights‚Ä¶
Processed 2600/8482 fights‚Ä¶
Processed 2800/8482 fights‚Ä¶
Processed 3000/8482 fights‚Ä¶
Processed 3200/8482 fights‚Ä¶
Processed 3400/8482 fights‚Ä¶
Processed 3600/8482 fights‚Ä¶
Processed 3800/8482 fights‚Ä¶
Processed 4000/8482 fights‚Ä¶
Processed 4200/8482 fights‚Ä¶
Processed 4400/8482 fights‚Ä¶
Processed 4600/8482 fights‚Ä¶
Processed 4800/8482 fights‚Ä¶
Processed 5000/8482 fights‚Ä¶
Processed 5200/8482 fights‚Ä¶
Processed 5400/8482 fights‚Ä¶
Processed 5600/8482 fights‚Ä¶
Processed 5800/8482 fights‚Ä¶
Processed 6000/8482 fights‚Ä¶
Processed 6200/8482 fights‚Ä¶
Processed 6400/8482 fights‚

In [1]:
import requests
from bs4 import BeautifulSoup
import re, time, random

# ------------------------------------------------------------
# User-Agent rotation
# ------------------------------------------------------------
UA_LIST = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Mozilla/5.0 (X11; Linux x86_64)",
]

def get_soup(url, retries=5):
    for attempt in range(retries):
        headers = {"User-Agent": random.choice(UA_LIST)}
        r = requests.get(url, headers=headers)

        if "<h1>Internal Server Error" not in r.text:
            return BeautifulSoup(r.text, "html.parser")

        wait = random.uniform(3, 7)
        print(f"‚ö†Ô∏è Blocked ‚Äî retry in {wait:.1f}s")
        time.sleep(wait)

    print("‚ùå Permanent fail:", url)
    return None

def txt(x):
    return x.get_text(" ", strip=True) if x else None

# ------------------------------------------------------------
# TEST FIGHT (Petr Yan vs Merab Dvalishvili)
# ------------------------------------------------------------
fight_url = "http://www.ufcstats.com/fight-details/4a0db214d9721d6e"
print("Testing fight:", fight_url)

s = get_soup(fight_url)
if s is None:
    raise ValueError("Failed to load fight page")

# ------------------------------------------------------------
# Extract fighter profile URLs
# ------------------------------------------------------------
fighter_links = s.select("h3.b-fight-details__person-name a")
fighter_urls = [a["href"] for a in fighter_links]

print("\nFound fighter profile URLs:")
for u in fighter_urls:
    print(" ‚Üí", u)

# ------------------------------------------------------------
# Scrape ONE fighter profile (re-usable function)
# ------------------------------------------------------------
def scrape_fighter(url):

    soup = get_soup(url)
    if soup is None:
        return None

    data = {"fighter_url": url}

    # NAME + NICKNAME
    data["name"] = txt(soup.select_one("span.b-content__title-highlight"))
    data["nickname"] = txt(soup.select_one("p.b-content__Nickname"))

    # BASIC BIO
    for li in soup.select("li.b-list__box-list-item"):
        t = txt(li)
        if not t:
            continue
        if t.startswith("Height:"):
            data["height"] = t.replace("Height:", "").strip()
        elif t.startswith("Weight:"):
            data["weight"] = t.replace("Weight:", "").strip()
        elif t.startswith("Reach:"):
            data["reach"] = t.replace("Reach:", "").strip()
        elif "STANCE" in t.upper():
            data["stance"] = t.split(":")[-1].strip()
        elif t.startswith("DOB:"):
            data["dob"] = t.replace("DOB:", "").strip()
        elif t.startswith("Wins:"):
            data["wins"] = t.replace("Wins:", "").strip()
        elif t.startswith("Losses:"):
            data["losses"] = t.replace("Losses:", "").strip()
        elif t.startswith("Draws:"):
            data["draws"] = t.replace("Draws:", "").strip()
        elif t.startswith("No Contest:"):
            data["nc"] = t.replace("No Contest:", "").strip()

    # ADVANCED STATS
    for li in soup.select("ul.b-list__box-list.b-list__box-list_margin-top li"):
        t = txt(li)
        if not t:
            continue
        if "SLpM:" in t:
            data["SLpM"] = t.split(":")[-1].strip()
        elif "Str. Acc.:" in t:
            data["Str_Acc"] = t.split(":")[-1].strip()
        elif "SApM:" in t:
            data["SApM"] = t.split(":")[-1].strip()
        elif "Str. Def.:" in t:
            data["Str_Def"] = t.split(":")[-1].strip()
        elif "TD Avg.:" in t:
            data["TD_Avg"] = t.split(":")[-1].strip()
        elif "TD Acc.:" in t:
            data["TD_Acc"] = t.split(":")[-1].strip()
        elif "TD Def.:" in t:
            data["TD_Def"] = t.split(":")[-1].strip()
        elif "Sub. Avg.:" in t:
            data["Sub_Avg"] = t.split(":")[-1].strip()

    return data


# ------------------------------------------------------------
# RUN TEST: scrape both fighters
# ------------------------------------------------------------
from pprint import pprint

print("\n================= Fighter A =================")
A = scrape_fighter(fighter_urls[0])
pprint(A)

print("\n================= Fighter B =================")
B = scrape_fighter(fighter_urls[1])
pprint(B)


Testing fight: http://www.ufcstats.com/fight-details/4a0db214d9721d6e

Found fighter profile URLs:
 ‚Üí http://www.ufcstats.com/fighter-details/c03520b5c88ed6b4
 ‚Üí http://www.ufcstats.com/fighter-details/d661ce4da776fc20

{'SApM': '2.80',
 'SLpM': '4.42',
 'Str_Acc': '41%',
 'Sub_Avg': '0.3',
 'TD_Acc': '35%',
 'TD_Avg': '5.97',
 'TD_Def': '76%',
 'dob': 'Jan 10, 1991',
 'fighter_url': 'http://www.ufcstats.com/fighter-details/c03520b5c88ed6b4',
 'height': '5\' 6"',
 'name': 'Merab Dvalishvili',
 'nickname': 'The Machine',
 'reach': '68"',
 'stance': 'Orthodox',
 'weight': '135 lbs.'}

{'SApM': '4.24',
 'SLpM': '5.16',
 'Str_Acc': '54%',
 'Sub_Avg': '0.1',
 'TD_Acc': '49%',
 'TD_Avg': '1.70',
 'TD_Def': '86%',
 'dob': 'Feb 11, 1993',
 'fighter_url': 'http://www.ufcstats.com/fighter-details/d661ce4da776fc20',
 'height': '5\' 7"',
 'name': 'Petr Yan',
 'nickname': 'No Mercy',
 'reach': '67"',
 'stance': 'Switch',
 'weight': '135 lbs.'}


In [2]:
# ============================================================
# CHUNK 4 ‚Äî FIGHTER PROFILE SCRAPER (ADVANCED STATS EDITION)
# ============================================================

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time, random

BASE_DIR = "/Users/shrey24/Desktop/ufc-s-tier"
fights_path = f"{BASE_DIR}/fights.csv"

df_fights = pd.read_csv(fights_path)
print("Loaded fights:", len(df_fights))

# ------------------------------------------------------------
# User-Agent rotation (avoid blocking)
# ------------------------------------------------------------
UA_LIST = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Mozilla/5.0 (X11; Linux x86_64)",
]

def get_soup(url, retries=5):
    """Fetch & retry when UFCStats blocks temporarily."""
    for attempt in range(retries):
        headers = {"User-Agent": random.choice(UA_LIST)}
        r = requests.get(url, headers=headers)

        if "<h1>Internal Server Error" not in r.text:
            return BeautifulSoup(r.text, "html.parser")

        wait = random.uniform(3, 7)
        print(f"‚ö†Ô∏è Blocked fetching {url} ‚Äî retrying in {wait:.1f}s‚Ä¶")
        time.sleep(wait)

    print("‚ùå Permanent failure:", url)
    return None

def txt(x):
    return x.get_text(" ", strip=True) if x else None

# ------------------------------------------------------------
# STEP 1 ‚Äî Gather unique fighter profile URLs
# ------------------------------------------------------------
fighter_urls = set()

print("\nExtracting fighter profile URLs‚Ä¶")

for idx, row in df_fights.iterrows():
    soup = get_soup(row["fight_url"])
    if soup is None:
        continue

    for a in soup.select("h3.b-fight-details__person-name a"):
        fighter_urls.add(a["href"])

    if idx % 300 == 0:
        print(f"Processed {idx}/{len(df_fights)} fights‚Ä¶")

fighter_urls = list(fighter_urls)
print(f"üìå Found {len(fighter_urls)} unique fighter profiles.")

# ------------------------------------------------------------
# STEP 2 ‚Äî Scrape fighter profile pages
# ------------------------------------------------------------

profiles = []

for idx, url in enumerate(fighter_urls):

    soup = get_soup(url)
    if soup is None:
        continue

    # ---------------------------
    # Name
    # ---------------------------
    name = txt(soup.select_one("span.b-content__title-highlight"))
    nickname = txt(soup.select_one("p.b-content__Nickname"))

    # ---------------------------
    # Basic Bio Section
    # ---------------------------
    bio = {
        "height": None,
        "weight": None,
        "reach": None,
        "stance": None,
        "dob": None,
        "wins": None,
        "losses": None,
        "draws": None,
        "nc": None
    }

    for li in soup.select("li.b-list__box-list-item"):
        t = txt(li)
        if not t:
            continue

        if t.startswith("Height:"):
            bio["height"] = t.replace("Height:", "").strip()
        elif t.startswith("Weight:"):
            bio["weight"] = t.replace("Weight:", "").strip()
        elif t.startswith("Reach:"):
            bio["reach"] = t.replace("Reach:", "").strip()
        elif "STANCE:" in t.upper():
            bio["stance"] = t.split(":")[-1].strip()
        elif t.startswith("DOB:"):
            bio["dob"] = t.replace("DOB:", "").strip()
        elif "Wins:" in t:
            bio["wins"] = t.replace("Wins:", "").strip()
        elif "Losses:" in t:
            bio["losses"] = t.replace("Losses:", "").strip()
        elif "Draws:" in t:
            bio["draws"] = t.replace("Draws:", "").strip()
        elif "No Contest:" in t:
            bio["nc"] = t.replace("No Contest:", "").strip()

    # ---------------------------
    # Advanced Stats (SLpM, SApM, Acc, etc.)
    # ---------------------------
    adv_stats = {
        "SLpM": None,
        "Str_Acc": None,
        "SApM": None,
        "Str_Def": None,
        "TD_Avg": None,
        "TD_Acc": None,
        "TD_Def": None,
        "Sub_Avg": None
    }

    stat_rows = soup.select("ul.b-list__box-list.b-list__box-list_margin-top li")

    for li in stat_rows:
        t = txt(li)
        if not t:
            continue

        if "SLpM:" in t:
            adv_stats["SLpM"] = t.split(":")[-1].strip()
        elif "Str. Acc.:" in t:
            adv_stats["Str_Acc"] = t.split(":")[-1].strip()
        elif "SApM:" in t:
            adv_stats["SApM"] = t.split(":")[-1].strip()
        elif "Str. Def.:" in t:
            adv_stats["Str_Def"] = t.split(":")[-1].strip()
        elif "TD Avg.:" in t:
            adv_stats["TD_Avg"] = t.split(":")[-1].strip()
        elif "TD Acc.:" in t:
            adv_stats["TD_Acc"] = t.split(":")[-1].strip()
        elif "TD Def.:" in t:
            adv_stats["TD_Def"] = t.split(":")[-1].strip()
        elif "Sub. Avg.:" in t:
            adv_stats["Sub_Avg"] = t.split(":")[-1].strip()

    profiles.append({
        "fighter_url": url,
        "name": name,
        "nickname": nickname,
        **bio,
        **adv_stats
    })

    if idx % 100 == 0:
        print(f"Scraped {idx}/{len(fighter_urls)} fighters‚Ä¶")

# ------------------------------------------------------------
# SAVE OUTPUT
# ------------------------------------------------------------
df_profiles = pd.DataFrame(profiles)
out_path = f"{BASE_DIR}/fighters_advanced.csv"
df_profiles.to_csv(out_path, index=False)

print("\n===================================================")
print(f"‚úÖ Saved fighters_advanced.csv ‚Äî shape {df_profiles.shape}")
print("===================================================")
df_profiles.head()


Loaded fights: 8482

Extracting fighter profile URLs‚Ä¶
Processed 0/8482 fights‚Ä¶
Processed 300/8482 fights‚Ä¶
Processed 600/8482 fights‚Ä¶
Processed 900/8482 fights‚Ä¶
Processed 1200/8482 fights‚Ä¶
Processed 1500/8482 fights‚Ä¶
Processed 1800/8482 fights‚Ä¶
Processed 2100/8482 fights‚Ä¶
Processed 2400/8482 fights‚Ä¶
Processed 2700/8482 fights‚Ä¶
Processed 3000/8482 fights‚Ä¶
Processed 3300/8482 fights‚Ä¶
Processed 3600/8482 fights‚Ä¶
Processed 3900/8482 fights‚Ä¶
Processed 4200/8482 fights‚Ä¶
Processed 4500/8482 fights‚Ä¶
Processed 4800/8482 fights‚Ä¶
Processed 5100/8482 fights‚Ä¶
Processed 5400/8482 fights‚Ä¶
Processed 5700/8482 fights‚Ä¶
Processed 6000/8482 fights‚Ä¶
Processed 6300/8482 fights‚Ä¶
Processed 6600/8482 fights‚Ä¶
Processed 6900/8482 fights‚Ä¶
Processed 7200/8482 fights‚Ä¶
Processed 7500/8482 fights‚Ä¶
Processed 7800/8482 fights‚Ä¶
Processed 8100/8482 fights‚Ä¶
Processed 8400/8482 fights‚Ä¶
üìå Found 2638 unique fighter profiles.
Scraped 0/2638 fighters‚Ä¶
Scraped 100/

Unnamed: 0,fighter_url,name,nickname,height,weight,reach,stance,dob,wins,losses,draws,nc,SLpM,Str_Acc,SApM,Str_Def,TD_Avg,TD_Acc,TD_Def,Sub_Avg
0,http://www.ufcstats.com/fighter-details/15ea37...,Roberto Romero,El Charro Negro,"5' 8""",145 lbs.,"70""",Orthodox,"Jan 30, 2000",,,,,4.56,49%,6.44,,0.0,0%,71%,0.0
1,http://www.ufcstats.com/fighter-details/cbf1c7...,Salim Touahri,Grizzly,"5' 10""",170 lbs.,"72""",Orthodox,"Sep 28, 1989",,,,,2.6,46%,2.87,,0.0,0%,83%,0.0
2,http://www.ufcstats.com/fighter-details/d343df...,Nina Nunes,,"5' 5""",125 lbs.,"64""",Orthodox,"Dec 03, 1985",,,,,4.39,45%,3.47,,0.24,33%,73%,0.5
3,http://www.ufcstats.com/fighter-details/c17041...,Alex White,The Spartan,"6' 0""",155 lbs.,"71""",Southpaw,"Oct 22, 1988",,,,,3.76,42%,3.07,,0.92,28%,71%,0.5
4,http://www.ufcstats.com/fighter-details/38c7f7...,Gillian Robertson,The Savage,"5' 5""",115 lbs.,"63""",Orthodox,"May 17, 1995",,,,,2.86,48%,2.86,,2.74,40%,41%,0.9


In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import random, time

# Load your existing profiles file
BASE_DIR = "/Users/shrey24/Desktop/ufc-s-tier"
df_profiles = pd.read_csv(f"{BASE_DIR}/fighters_advanced.csv")

print("Profiles loaded:", len(df_profiles))

UA_LIST = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Mozilla/5.0 (X11; Linux x86_64)",
]

def get_soup(url, retries=5):
    for attempt in range(retries):
        headers = {"User-Agent": random.choice(UA_LIST)}
        r = requests.get(url, headers=headers)

        if "<h1>Internal Server Error" not in r.text:
            return BeautifulSoup(r.text, "html.parser")

        wait = random.uniform(3, 7)
        print(f"‚ö†Ô∏è Blocked: retrying {url} in {wait:.1f}s‚Ä¶")
        time.sleep(wait)

    print("‚ùå Permanent failure:", url)
    return None

def txt(x):
    return x.get_text(" ", strip=True) if x else None


def scrape_fight_history(soup):
    rows = soup.select("tr.b-fight-details__table-row")
    history = []

    for r in rows:
        cols = r.find_all("td")
        if len(cols) < 10:
            continue

        wl   = txt(cols[0])
        opp  = txt(cols[1])
        kd   = txt(cols[2])
        strx = txt(cols[3])
        td   = txt(cols[4])
        sub  = txt(cols[5])
        event = txt(cols[6])
        event_link = cols[6].find("a")["href"] if cols[6].find("a") else None
        method = txt(cols[7])
        round_ = txt(cols[8])
        time_ = txt(cols[9])

        history.append({
            "wl": wl,
            "opponent": opp,
            "kd": kd,
            "str": strx,
            "td": td,
            "sub": sub,
            "event": event,
            "event_url": event_link,
            "method": method,
            "round": round_,
            "time": time_
        })

    return history


# ============================================================
# üî• PICK ONE FIGHTER TO TEST
# ============================================================

test_url = df_profiles["fighter_url"].iloc[0]
test_name = df_profiles["name"].iloc[0]

print("\nTesting scrape for:", test_name)
print("URL:", test_url)

soup = get_soup(test_url)

if soup:
    history = scrape_fight_history(soup)
    print(f"\nScraped {len(history)} fights for {test_name}")
    for h in history[:5]:
        print(h)
else:
    print("‚ùå Failed to load fighter page")


Profiles loaded: 2638

Testing scrape for: Roberto Romero
URL: http://www.ufcstats.com/fighter-details/15ea371202eb25e1

Scraped 2 fights for Roberto Romero
{'wl': 'loss', 'opponent': 'Roberto Romero Timmy Cuamba', 'kd': '0 1', 'str': '35 33', 'td': '0 1', 'sub': '0 0', 'event': 'UFC Fight Night: Machado Garry vs. Prates Apr. 26, 2025', 'event_url': 'http://www.ufcstats.com/event-details/b2e3aca4cd363477', 'method': 'KO/TKO Flying Knee', 'round': '2', 'time': '3:55'}
{'wl': 'loss', 'opponent': 'Roberto Romero David Onama', 'kd': '1 1', 'str': '74 121', 'td': '0 1', 'sub': '0 1', 'event': 'UFC 309: Jones vs. Miocic Nov. 16, 2024', 'event_url': 'http://www.ufcstats.com/event-details/daff32bc96d1eabf', 'method': 'U-DEC', 'round': '3', 'time': '5:00'}


In [6]:
# ============================================================
# CHUNK 4B ‚Äî SCRAPE FIGHT HISTORY ONLY (NO PROFILE RESCRAPE)
# ============================================================

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time, random

BASE_DIR = "/Users/shrey24/Desktop/ufc-s-tier"

# Load already-downloaded fighter profiles
profiles_path = f"{BASE_DIR}/fighters_advanced.csv"
df_profiles = pd.read_csv(profiles_path)

print("Loaded fighter profiles:", len(df_profiles))

UA_LIST = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Mozilla/5.0 (X11; Linux x86_64)",
]

def get_soup(url, retries=5):
    for attempt in range(retries):
        headers = {"User-Agent": random.choice(UA_LIST)}
        r = requests.get(url, headers=headers)

        # UFCStats returns Internal Server Error sometimes
        if "<h1>Internal Server Error" not in r.text:
            return BeautifulSoup(r.text, "html.parser")

        wait = random.uniform(3, 7)
        print(f"‚ö†Ô∏è Blocked: retrying {url} in {wait:.1f}s‚Ä¶")
        time.sleep(wait)

    print("‚ùå Permanent failure:", url)
    return None

def txt(x):
    return x.get_text(" ", strip=True) if x else None


# ============================================================
# PARSE FIGHT HISTORY TABLE
# ============================================================
def scrape_fight_history(soup):
    history = []
    rows = soup.select("tr.b-fight-details__table-row")

    for r in rows:
        cols = r.find_all("td")
        if len(cols) < 10:
            continue

        wl = txt(cols[0])
        opponent = txt(cols[1])
        kd = txt(cols[2])
        str_ = txt(cols[3])
        td = txt(cols[4])
        sub = txt(cols[5])
        event = txt(cols[6])

        # event link inside <a>
        event_link = cols[6].find("a")["href"] if cols[6].find("a") else None

        method = txt(cols[7])
        round_ = txt(cols[8])
        time_ = txt(cols[9])

        history.append({
            "wl": wl,
            "opponent": opponent,
            "kd": kd,
            "str": str_,
            "td": td,
            "sub": sub,
            "event": event,
            "event_url": event_link,
            "method": method,
            "round": round_,
            "time": time_
        })

    return history


# ============================================================
# SCRAPE HISTORY FOR ALL FIGHTERS
# ============================================================

fight_history_rows = []

fighter_urls = df_profiles["fighter_url"].unique()
print("Scraping fight histories for", len(fighter_urls), "fighters‚Ä¶")

for idx, url in enumerate(fighter_urls):

    soup = get_soup(url)
    if soup is None:
        continue

    name = df_profiles.loc[df_profiles["fighter_url"] == url, "name"].iloc[0]

    history = scrape_fight_history(soup)

    for h in history:
        fight_history_rows.append({
            "fighter_url": url,
            "fighter_name": name,
            **h
        })

    if idx % 100 == 0:
        print(f"Scraped {idx}/{len(fighter_urls)} fighters‚Ä¶")


# ============================================================
# SAVE OUTPUT
# ============================================================

df_history = pd.DataFrame(fight_history_rows)
out_path = f"{BASE_DIR}/fighters_fight_history.csv"
df_history.to_csv(out_path, index=False)

print("\n===================================================")
print(f"‚úÖ Saved fighters_fight_history.csv ‚Äî shape {df_history.shape}")
print("===================================================")


Loaded fighter profiles: 2638
Scraping fight histories for 2638 fighters‚Ä¶
Scraped 0/2638 fighters‚Ä¶
Scraped 100/2638 fighters‚Ä¶
Scraped 200/2638 fighters‚Ä¶
Scraped 300/2638 fighters‚Ä¶
Scraped 400/2638 fighters‚Ä¶
Scraped 500/2638 fighters‚Ä¶
Scraped 600/2638 fighters‚Ä¶
Scraped 700/2638 fighters‚Ä¶
Scraped 800/2638 fighters‚Ä¶
Scraped 900/2638 fighters‚Ä¶
Scraped 1000/2638 fighters‚Ä¶
Scraped 1100/2638 fighters‚Ä¶
Scraped 1200/2638 fighters‚Ä¶
Scraped 1300/2638 fighters‚Ä¶
Scraped 1400/2638 fighters‚Ä¶
Scraped 1500/2638 fighters‚Ä¶
Scraped 1600/2638 fighters‚Ä¶
Scraped 1700/2638 fighters‚Ä¶
Scraped 1800/2638 fighters‚Ä¶
Scraped 1900/2638 fighters‚Ä¶
Scraped 2000/2638 fighters‚Ä¶
Scraped 2100/2638 fighters‚Ä¶
Scraped 2200/2638 fighters‚Ä¶
Scraped 2300/2638 fighters‚Ä¶
Scraped 2400/2638 fighters‚Ä¶
Scraped 2500/2638 fighters‚Ä¶
Scraped 2600/2638 fighters‚Ä¶

‚úÖ Saved fighters_fight_history.csv ‚Äî shape (19535, 13)


In [7]:
df_history.head()

Unnamed: 0,fighter_url,fighter_name,wl,opponent,kd,str,td,sub,event,event_url,method,round,time
0,http://www.ufcstats.com/fighter-details/15ea37...,Roberto Romero,loss,Roberto Romero Timmy Cuamba,0 1,35 33,0 1,0 0,UFC Fight Night: Machado Garry vs. Prates Apr....,http://www.ufcstats.com/event-details/b2e3aca4...,KO/TKO Flying Knee,2,3:55
1,http://www.ufcstats.com/fighter-details/15ea37...,Roberto Romero,loss,Roberto Romero David Onama,1 1,74 121,0 1,0 1,"UFC 309: Jones vs. Miocic Nov. 16, 2024",http://www.ufcstats.com/event-details/daff32bc...,U-DEC,3,5:00
2,http://www.ufcstats.com/fighter-details/cbf1c7...,Salim Touahri,loss,Salim Touahri Mickey Gall,0 0,48 50,0 1,0 0,"UFC Fight Night: Covington vs. Lawler Aug. 03,...",http://www.ufcstats.com/event-details/03da33a1...,U-DEC,3,5:00
3,http://www.ufcstats.com/fighter-details/cbf1c7...,Salim Touahri,loss,Salim Touahri Keita Nakamura,0 1,34 41,0 0,0 0,UFC Fight Night: Dos Santos vs. Tuivasa Dec. 0...,http://www.ufcstats.com/event-details/d1d20e65...,S-DEC,3,5:00
4,http://www.ufcstats.com/fighter-details/cbf1c7...,Salim Touahri,loss,Salim Touahri Warlley Alves,0 0,35 38,0 2,0 0,"UFC Fight Night: Cerrone vs. Till Oct. 21, 2017",http://www.ufcstats.com/event-details/d6b68eaf...,U-DEC,3,5:00


In [8]:
# ============================================================
# CLEANING PIPELINE FOR GLICKO-2
# ============================================================

import pandas as pd
import re
from dateutil import parser

# ------------------------------------------------------------
# Load raw fight history
# ------------------------------------------------------------
BASE_DIR = "/Users/shrey24/Desktop/ufc-s-tier"
df_history = pd.read_csv(f"{BASE_DIR}/fighters_fight_history.csv")
print("Loaded fight history:", df_history.shape)

# ------------------------------------------------------------
# Helper: clean opponent name by removing fighter's own name
# ------------------------------------------------------------
def clean_opponent(row):
    fighter = str(row["fighter_name"]).strip()
    opp = str(row["opponent"]).strip()

    # Remove fighter name from opponent string
    opp_clean = opp.replace(fighter, "").strip()

    # Normalize whitespace
    opp_clean = " ".join(opp_clean.split())
    return opp_clean

# ------------------------------------------------------------
# Helper: extract date from event string
# Example: "UFC Fight Night: X vs Y Apr. 26, 2025"
# ------------------------------------------------------------
def extract_date(event_str):
    if pd.isna(event_str):
        return None

    # Detect patterns like "Apr. 26, 2025"
    match = re.search(r"[A-Za-z]{3,}\.? \s*\d{1,2}, \s*\d{4}", event_str)
    if match:
        try:
            return parser.parse(match.group(0))
        except:
            return None
    return None

# ------------------------------------------------------------
# Helper: convert WL ‚Üí numeric score (Glicko format)
# ------------------------------------------------------------
def wl_to_score(wl):
    if wl is None:
        return None
    wl = wl.lower()
    if "win" in wl:
        return 1.0
    if "loss" in wl:
        return 0.0
    if "draw" in wl:
        return 0.5
    return None

# ------------------------------------------------------------
# Apply cleaning operations
# ------------------------------------------------------------
df_clean = df_history.copy()

df_clean["opponent_clean"] = df_clean.apply(clean_opponent, axis=1)
df_clean["date"] = df_clean["event"].apply(extract_date)
df_clean["score"] = df_clean["wl"].apply(wl_to_score)

# Drop rows without dates or outcomes
df_clean = df_clean.dropna(subset=["date", "score"])

# Sort fights chronologically (REQUIRED for Glicko)
df_clean = df_clean.sort_values("date").reset_index(drop=True)

print("\nCleaning complete. Final shape:", df_clean.shape)
print(df_clean.head(10))

# ------------------------------------------------------------
# Save cleaned table
# ------------------------------------------------------------
df_clean.to_csv(f"{BASE_DIR}/fighters_history_cleaned.csv", index=False)
print("\nSaved cleaned file ‚Üí fighters_history_cleaned.csv")


Loaded fight history: (19535, 13)

Cleaning complete. Final shape: (19325, 16)
                                         fighter_url   fighter_name    wl  \
0  http://www.ufcstats.com/fighter-details/429e7d...   Royce Gracie   win   
1  http://www.ufcstats.com/fighter-details/d3711d...   Zane Frazier  loss   
2  http://www.ufcstats.com/fighter-details/02fc8f...  Trent Jenkins  loss   
3  http://www.ufcstats.com/fighter-details/598a58...   Kevin Rosier  loss   
4  http://www.ufcstats.com/fighter-details/6ceff8...  Jason DeLucia   win   
5  http://www.ufcstats.com/fighter-details/598a58...   Kevin Rosier   win   
6  http://www.ufcstats.com/fighter-details/63b65a...   Ken Shamrock  loss   
7  http://www.ufcstats.com/fighter-details/429e7d...   Royce Gracie   win   
8  http://www.ufcstats.com/fighter-details/429e7d...   Royce Gracie   win   
9  http://www.ufcstats.com/fighter-details/63b65a...   Ken Shamrock   win   

                      opponent   kd    str   td  sub  \
0    Royce Gracie

In [23]:
df_history = pd.read_csv(f"{BASE_DIR}/fighters_history_cleaned.csv")

# Show the first 50 unique event names
print(df_history["event"].unique()[:50])

# Show how many unique events total
print("Total unique event names:", df_history["event"].nunique())


['UFC 1: The Beginning Nov. 12, 1993' 'UFC 2: No Way Out Mar. 11, 1994'
 'UFC 3: The American Dream Sep. 09, 1994'
 'UFC 4: Revenge of the Warriors Dec. 16, 1994'
 'UFC 5: The Return of the Beast Apr. 07, 1995'
 'UFC 6: Clash of the Titans Jul. 14, 1995'
 'UFC 7: The Brawl in Buffalo Sep. 08, 1995'
 "UFC - Ultimate Ultimate '95 Dec. 16, 1995"
 'UFC 8: David vs Goliath Feb. 16, 1996'
 'UFC 9: Motor City Madness May. 17, 1996'
 'UFC 10: The Tournament Jul. 12, 1996'
 'UFC 11: The Proving Ground Sep. 20, 1996'
 "UFC - Ultimate Ultimate '96 Dec. 07, 1996"
 'UFC 12: Judgement Day Feb. 07, 1997'
 'UFC 13: The Ultimate Force May. 30, 1997'
 "Brazil Open '97 Jun. 15, 1997" 'UFC 14: Showdown Jul. 27, 1997'
 'PRIDE 1 Oct. 11, 1997' 'UFC 15: Collision Course Oct. 17, 1997'
 'UFC - Ultimate Japan Dec. 21, 1997' 'PRIDE 2 Mar. 05, 1998'
 'UFC 16: Battle in the Bayou Mar. 13, 1998'
 'UFC 17: Redemption May. 15, 1998' 'PRIDE 3 Jun. 24, 1998'
 'PRIDE 4 Oct. 11, 1998' 'UFC - Ultimate Brazil Oct. 16, 199

In [26]:
# ============================================================
# ADVANCED GLICKO-2 ENGINE FOR UFC ‚Äî FAST VERSION (CONSTANT VOL)
# ============================================================

import numpy as np
import pandas as pd
from math import log, sqrt, pi
import time

# ===============================
# 1. GLICKO-2 HELPER FUNCTIONS
# ===============================

def _g(RD):
    return 1 / np.sqrt(1 + 3*(RD**2) / (pi**2))

def _E(r, r_j, RD_j):
    return 1 / (1 + np.exp(-_g(RD_j)*(r - r_j)))


# ===============================
# 2. FIGHTER CLASS (NO VOL UPDATE)
# ===============================

class Fighter:
    def __init__(self, name, rating=1500, RD=350, volatility=0.06):
        self.name = name
        self.rating = rating
        self.RD = RD
        self.vol = volatility  # stays constant

    def update_player(self, opp_ratings, opp_RDs, scores):
        if len(opp_ratings) == 0:
            return

        mu = (self.rating - 1500) / 173.7178
        phi = self.RD / 173.7178
        sigma = self.vol

        v_inv = 0
        delta_sum = 0

        for r_j, RD_j, s_j in zip(opp_ratings, opp_RDs, scores):
            mu_j = (r_j - 1500)/173.7178
            phi_j = RD_j/173.7178
            E_j = _E(mu, mu_j, phi_j)
            g_j = _g(phi_j)

            v_inv += (g_j**2) * E_j * (1 - E_j)
            delta_sum += g_j * (s_j - E_j)

        v = 1 / v_inv
        mu_prime = mu + (phi**2 / (phi**2 + sigma**2 + v)) * delta_sum

        # Keep volatility constant
        phi_star = sqrt(phi**2 + sigma**2)
        phi_prime = 1 / sqrt(1/(phi_star**2) + 1/v)

        self.rating = 173.7178 * mu_prime + 1500
        self.RD = 173.7178 * phi_prime
        self.vol = sigma  # unchanged


# ===============================
# 3. ADVANCED SCORE ADJUSTMENT
# ===============================

def compute_advanced_score(base_score, opp_rating, avg_rating,
                           importance_factor,
                           kd_A, kd_B,
                           sig_A, sig_B,
                           td_A, td_B):

    difficulty = opp_rating / avg_rating
    performance = 1.0

    if base_score == 1:
        if kd_A > kd_B:
            performance *= 1.10
        if sig_A > sig_B:
            performance *= 1.05
        if td_A > td_B:
            performance *= 1.05
        if (sig_B > sig_A) and (kd_B > kd_A):
            performance *= 0.85

    return base_score * difficulty * importance_factor * performance


# ===============================
# 4. LOAD + FILTER DATA (UFC-ONLY)
# ===============================

BASE_DIR = "/Users/shrey24/Desktop/ufc-s-tier"

df_fights   = pd.read_csv(f"{BASE_DIR}/fights.csv")
df_totals   = pd.read_csv(f"{BASE_DIR}/fight_totals.csv")
df_history  = pd.read_csv(f"{BASE_DIR}/fighters_history_cleaned.csv")

# FILTER HISTORY TO UFC ONLY
df_history = df_history[df_history["event"].str.contains("UFC", na=False)]

print("Filtered Fighter History:", len(df_history))


# ===============================
# 5. MERGE DATES
# ===============================

df_dates = (
    df_history
    .groupby(["event_url", "fighter_name"])["date"]
    .first()
    .reset_index()
)

df_fights = df_fights.merge(
    df_dates.rename(columns={"fighter_name": "fighter_A", "date": "date_A"}),
    on=["event_url", "fighter_A"],
    how="left"
)

df_fights = df_fights.merge(
    df_dates.rename(columns={"fighter_name": "fighter_B", "date": "date_B"}),
    on=["event_url", "fighter_B"],
    how="left"
)

df_fights["date"] = df_fights["date_A"].fillna(df_fights["date_B"])
df_fights.drop(columns=["date_A", "date_B"], inplace=True)

df_fights["date"] = pd.to_datetime(df_fights["date"])
df_fights = df_fights.sort_values(["date", "fight_order"]).reset_index(drop=True)

print("Fights After Merge:", len(df_fights))


# ===============================
# 6. INITIALIZE FIGHTERS
# ===============================

fighters = {name: Fighter(name) for name in df_history["fighter_name"].unique()}


# ===============================
# 7. MAIN LOOP + PROGRESS
# ===============================

avg_rating = 1500
start_time = time.time()

total_fights = len(df_fights)

for i, row in df_fights.iterrows():

    A = row["fighter_A"]
    B = row["fighter_B"]

    if A not in fighters: fighters[A] = Fighter(A)
    if B not in fighters: fighters[B] = Fighter(B)

    # Scores
    wl = row["WL_label"].lower()
    sA, sB = (1,0) if wl=="win" else (0,1) if wl=="loss" else (0.5,0.5)

    # Importance weighting
    fight_order = int(row["fight_order"])
    event_name  = row["event_name"]

    importance = 1.0
    if "UFC Fight Night" not in event_name and "UFC" in event_name:
        importance = {1:1.15, 2:1.10}.get(fight_order, 1.05)
    else:
        importance = {1:1.10, 2:1.05}.get(fight_order, 1.0)

    # Totals
    subset = df_totals[df_totals["fight_url"] == row["fight_url"]]
    if subset.empty:
    # Skip early UFC events with missing totals
        continue
    t = subset.iloc[0]

    scoreA = compute_advanced_score(
        sA, fighters[B].rating, avg_rating, importance,
        t["KD_A"], t["KD_B"], t["Sig Str_A"], t["Sig Str_B"], t["Td_A"], t["Td_B"]
    )

    scoreB = compute_advanced_score(
        sB, fighters[A].rating, avg_rating, importance,
        t["KD_B"], t["KD_A"], t["Sig Str_B"], t["Sig Str_A"], t["Td_B"], t["Td_A"]
    )

    fighters[A].update_player([fighters[B].rating], [fighters[B].RD], [scoreA])
    fighters[B].update_player([fighters[A].rating], [fighters[A].RD], [scoreB])

    # Progress
    if (i + 1) % 500 == 0:
        elapsed = time.time() - start_time
        print(f"[{i+1}/{total_fights}] {A} vs {B} | Date: {row['date']} | Time: {elapsed:.2f}s")


print(f"\nDONE! Total time: {time.time() - start_time:.2f}s")


Filtered Fighter History: 16260
Fights After Merge: 8482
[500/8482] Thiago Alves vs Ansar Chalangov | Date: 2005-11-19 00:00:00 | Time: 0.21s
[1000/8482] Demian Maia vs Chael Sonnen | Date: 2009-02-21 00:00:00 | Time: 0.41s
[1500/8482] Sam Stout vs Yves Edwards | Date: 2011-06-11 00:00:00 | Time: 0.61s
[2000/8482] Pascal Krauss vs Mike Stumpf | Date: 2013-01-26 00:00:00 | Time: 0.81s
[2500/8482] Justin Salas vs Ben Wall | Date: 2014-05-10 00:00:00 | Time: 1.01s
[3000/8482] Mirsad Bektic vs Lucas Martins | Date: 2015-05-30 00:00:00 | Time: 1.22s
[3500/8482] Michel Prazeres vs JC Cottrell | Date: 2016-07-23 00:00:00 | Time: 1.43s
[4000/8482] Sarah Moras vs Ashlee Evans-Smith | Date: 2017-09-09 00:00:00 | Time: 1.64s
[4500/8482] Talita Bernardo vs Sarah Moras | Date: 2018-10-27 00:00:00 | Time: 1.84s
[5000/8482] Cub Swanson vs Kron Gracie | Date: 2019-10-12 00:00:00 | Time: 2.04s
[5500/8482] Khaos Williams vs Abdul Razak Alhassan | Date: 2020-11-14 00:00:00 | Time: 2.24s
[6000/8482] Max H

In [28]:
# Convert fighters dict to a DataFrame
ratings_df = pd.DataFrame([
    {
        "fighter_name": f.name,
        "rating": f.rating,
        "RD": f.RD,
        "volatility": f.vol
    }
    for f in fighters.values()
])

# Sort by rating (descending)
ratings_df = ratings_df.sort_values("rating", ascending=False).reset_index(drop=True)

print(ratings_df.head(20))


             fighter_name       rating          RD  volatility
0               Jon Jones  1790.900284   84.648332        0.06
1          Anderson Silva  1729.688990   85.436890        0.06
2            Kamaru Usman  1720.229587   90.578832        0.06
3          Daniel Cormier  1708.521504   97.585649        0.06
4       Georges St-Pierre  1707.583212   86.885937        0.06
5   Alexander Volkanovski  1702.085032   93.321560        0.06
6         Islam Makhachev  1696.600801   91.345023        0.06
7         Israel Adesanya  1696.095144   91.677730        0.06
8     Khabib Nurmagomedov  1693.239863  107.174573        0.06
9            Ilia Topuria  1690.470968  122.851840        0.06
10           Alex Pereira  1678.738567  107.135064        0.06
11          Lyoto Machida  1678.027665   84.372669        0.06
12      Shavkat Rakhmonov  1677.863252  132.619700        0.06
13              Jose Aldo  1677.525439   83.951709        0.06
14         Cain Velasquez  1677.330047  104.431637     

In [25]:
missing = df_fights[~df_fights["fight_url"].isin(df_totals["fight_url"])]
print("Missing totals count:", len(missing))
print(missing[["event_name", "fight_url"]].head(20))


Missing totals count: 21
                         event_name  \
29   UFC 4: Revenge of the Warriors   
30   UFC 4: Revenge of the Warriors   
49       UFC 6: Clash of the Titans   
50       UFC 6: Clash of the Titans   
60      UFC 7: The Brawl in Buffalo   
61      UFC 7: The Brawl in Buffalo   
69      UFC - Ultimate Ultimate '95   
70      UFC - Ultimate Ultimate '95   
79          UFC 8: David vs Goliath   
94           UFC 10: The Tournament   
102      UFC 11: The Proving Ground   
103      UFC 11: The Proving Ground   
111     UFC - Ultimate Ultimate '96   
112     UFC - Ultimate Ultimate '96   
113     UFC - Ultimate Ultimate '96   
121           UFC 12: Judgement Day   
122           UFC 12: Judgement Day   
161     UFC 16: Battle in the Bayou   
162     UFC 16: Battle in the Bayou   
170              UFC 17: Redemption   

                                             fight_url  
29   http://www.ufcstats.com/fight-details/b80e6a79...  
30   http://www.ufcstats.com/fight-detail

In [112]:
import pandas as pd
import numpy as np

BASE = "/Users/shrey24/Desktop/ufc-s-tier"

df_events   = pd.read_csv(f"{BASE}/events.csv")
df_fights   = pd.read_csv(f"{BASE}/fights.csv")
df_totals   = pd.read_csv(f"{BASE}/fight_totals.csv")
df_profiles = pd.read_csv(f"{BASE}/fighters_advanced.csv")
df_hist     = pd.read_csv(f"{BASE}/fighters_history_cleaned.csv")


In [113]:
def safe_split(pair):
    """Split 'A B' and convert to ints, replacing '--' with 0."""
    a, b = pair.split()
    a = 0 if a == "--" else int(a)
    b = 0 if b == "--" else int(b)
    return a, b


rows = []

fight_cols = [
    "event_id", "event_name", "event_url",
    "fight_order", "fight_url",
    "weight_class", "method",
    "round_end", "time_end", "scheduled_rounds",
               # üîπ include date here
]

for _, r in df_fights.iterrows():

    # Parse stats safely
    KD_A, KD_B  = safe_split(r.KD)
    STR_A, STR_B = safe_split(r.STR)
    TD_A, TD_B  = safe_split(r.TD)
    SUB_A, SUB_B = safe_split(r.SUB)

    # ---- Fighter A row ----
    rows.append({
        "fight_url": r.fight_url,
        "fighter":   r.fighter_A,
        "opponent":  r.fighter_B,
        "result": (
            "win"  if r.WL_label == "win" else
            "draw" if "draw" in r.WL_label.lower() else
            "nc"
        ),
        **{c: r[c] for c in fight_cols},
        "KD":  KD_A,
        "STR": STR_A,
        "TD":  TD_A,
        "SUB": SUB_A,
    })

    # ---- Fighter B row ----
    rows.append({
        "fight_url": r.fight_url,
        "fighter":   r.fighter_B,
        "opponent":  r.fighter_A,
        "result": (
            "loss" if r.WL_label == "win" else
            "draw" if "draw" in r.WL_label.lower() else
            "nc"
        ),
        **{c: r[c] for c in fight_cols},
        "KD":  KD_B,
        "STR": STR_B,
        "TD":  TD_B,
        "SUB": SUB_B,
    })

df_fight_clean = pd.DataFrame(rows)


In [114]:
df_fight_clean

Unnamed: 0,fight_url,fighter,opponent,result,event_id,event_name,event_url,fight_order,weight_class,method,round_end,time_end,scheduled_rounds,KD,STR,TD,SUB
0,http://www.ufcstats.com/fight-details/4a0db214...,Petr Yan,Merab Dvalishvili,win,bd92cf5da5413d2a,UFC 323: Dvalishvili vs. Yan 2,http://www.ufcstats.com/event-details/bd92cf5d...,1,Bantamweight,U-DEC,5,5:00,5,0,139,5,0
1,http://www.ufcstats.com/fight-details/4a0db214...,Merab Dvalishvili,Petr Yan,loss,bd92cf5da5413d2a,UFC 323: Dvalishvili vs. Yan 2,http://www.ufcstats.com/event-details/bd92cf5d...,1,Bantamweight,U-DEC,5,5:00,5,0,134,2,2
2,http://www.ufcstats.com/fight-details/dfa692db...,Joshua Van,Alexandre Pantoja,win,bd92cf5da5413d2a,UFC 323: Dvalishvili vs. Yan 2,http://www.ufcstats.com/event-details/bd92cf5d...,2,Flyweight,KO/TKO,1,0:26,3,0,2,0,0
3,http://www.ufcstats.com/fight-details/dfa692db...,Alexandre Pantoja,Joshua Van,loss,bd92cf5da5413d2a,UFC 323: Dvalishvili vs. Yan 2,http://www.ufcstats.com/event-details/bd92cf5d...,2,Flyweight,KO/TKO,1,0:26,3,0,6,0,0
4,http://www.ufcstats.com/fight-details/fbbb9e72...,Tatsuro Taira,Brandon Moreno,win,bd92cf5da5413d2a,UFC 323: Dvalishvili vs. Yan 2,http://www.ufcstats.com/event-details/bd92cf5d...,3,Flyweight,KO/TKO Punches,2,2:24,3,0,28,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16959,http://www.ufcstats.com/fight-details/ccee020b...,David Levicki,Johnny Rhodes,loss,a6a9ab5a824e8f66,UFC 2: No Way Out,http://www.ufcstats.com/event-details/a6a9ab5a...,13,Open Weight,KO/TKO Punches,1,12:13,3,0,4,0,0
16960,http://www.ufcstats.com/fight-details/4b9ae533...,Patrick Smith,Ray Wizard,win,a6a9ab5a824e8f66,UFC 2: No Way Out,http://www.ufcstats.com/event-details/a6a9ab5a...,14,Open Weight,SUB Guillotine Choke,1,0:58,3,0,1,0,1
16961,http://www.ufcstats.com/fight-details/4b9ae533...,Ray Wizard,Patrick Smith,loss,a6a9ab5a824e8f66,UFC 2: No Way Out,http://www.ufcstats.com/event-details/a6a9ab5a...,14,Open Weight,SUB Guillotine Choke,1,0:58,3,0,1,0,0
16962,http://www.ufcstats.com/fight-details/4acab678...,Scott Morris,Sean Daugherty,win,a6a9ab5a824e8f66,UFC 2: No Way Out,http://www.ufcstats.com/event-details/a6a9ab5a...,15,Open Weight,SUB Guillotine Choke,1,0:20,3,0,1,1,1


In [115]:
stats_rows = []

for _, r in df_totals.iterrows():
    
    # ----- Fighter A -----
    stats_rows.append({
        "fight_url": r.fight_url,
        "fighter": r.fighter_A,
        "KD": r.KD_A,
        "SigStr": r["Sig Str_A"],
        "SigStrPct": r["Sig Str %_A"],
        "TotalStr": r["Total Str_A"],
        "Td": r["Td_A"],
        "TdPct": r["Td %_A"],
        "SubAtt": r["Sub Att_A"],
        "Rev": r["Rev_A"],
        "Ctrl": r["Ctrl_A"],
        "Head": r["Head_A"],
        "Body": r["Body_A"],
        "Leg": r["Leg_A"],
        "Distance": r["Distance_A"],
        "Clinch": r["Clinch_A"],
        "Ground": r["Ground_A"],
        "per_round_totals": r["per_round_totals"]
    })

    # ----- Fighter B -----
    stats_rows.append({
        "fight_url": r.fight_url,
        "fighter": r.fighter_B,
        "KD": r.KD_B,
        "SigStr": r["Sig Str_B"],
        "SigStrPct": r["Sig Str %_B"],
        "TotalStr": r["Total Str_B"],
        "Td": r["Td_B"],
        "TdPct": r["Td %_B"],
        "SubAtt": r["Sub Att_B"],
        "Rev": r["Rev_B"],
        "Ctrl": r["Ctrl_B"],
        "Head": r["Head_B"],
        "Body": r["Body_B"],
        "Leg": r["Leg_B"],
        "Distance": r["Distance_B"],
        "Clinch": r["Clinch_B"],
        "Ground": r["Ground_B"],
        "per_round_totals": r["per_round_totals"]
    })

df_stats_long = pd.DataFrame(stats_rows)


In [116]:
df_stats_long

Unnamed: 0,fight_url,fighter,KD,SigStr,SigStrPct,TotalStr,Td,TdPct,SubAtt,Rev,Ctrl,Head,Body,Leg,Distance,Clinch,Ground,per_round_totals
0,http://www.ufcstats.com/fight-details/4a0db214...,Merab Dvalishvili,0,134 of 383,34%,196 of 458,2 of 29,6%,2,1,5:12,105 of 341,22 of 34,7 of 8,116 of 353,18 of 30,0 of 0,"[{'round': 'Round 1', 'KD_A': '0', 'KD_B': '0'..."
1,http://www.ufcstats.com/fight-details/4a0db214...,Petr Yan,0,139 of 230,60%,159 of 251,5 of 9,55%,0,0,2:55,109 of 195,17 of 19,13 of 16,119 of 204,18 of 24,2 of 2,"[{'round': 'Round 1', 'KD_A': '0', 'KD_B': '0'..."
2,http://www.ufcstats.com/fight-details/dfa692db...,Alexandre Pantoja,0,6 of 11,54%,6 of 11,0 of 0,---,0,0,0:00,4 of 8,1 of 1,1 of 2,5 of 10,1 of 1,0 of 0,"[{'round': 'Round 1', 'KD_A': '0', 'KD_B': '0'..."
3,http://www.ufcstats.com/fight-details/dfa692db...,Joshua Van,0,2 of 4,50%,2 of 4,0 of 0,---,0,0,0:00,2 of 4,0 of 0,0 of 0,2 of 3,0 of 1,0 of 0,"[{'round': 'Round 1', 'KD_A': '0', 'KD_B': '0'..."
4,http://www.ufcstats.com/fight-details/fbbb9e72...,Brandon Moreno,0,9 of 17,52%,9 of 17,0 of 0,---,1,0,0:05,8 of 14,0 of 2,1 of 1,7 of 15,0 of 0,2 of 2,"[{'round': 'Round 1', 'KD_A': '0', 'KD_B': '0'..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16917,http://www.ufcstats.com/fight-details/ccee020b...,David Levicki,0,4 of 5,80%,95 of 102,0 of 0,---,0,0,--,4 of 5,0 of 0,0 of 0,1 of 2,2 of 2,1 of 1,"[{'round': 'Round 1', 'KD_A': '0', 'KD_B': '0'..."
16918,http://www.ufcstats.com/fight-details/4b9ae533...,Patrick Smith,0,1 of 1,100%,1 of 1,0 of 1,0%,1,0,--,0 of 0,1 of 1,0 of 0,0 of 0,1 of 1,0 of 0,"[{'round': 'Round 1', 'KD_A': '0', 'KD_B': '0'..."
16919,http://www.ufcstats.com/fight-details/4b9ae533...,Ray Wizard,0,1 of 1,100%,2 of 2,0 of 0,---,0,0,--,0 of 0,0 of 0,1 of 1,1 of 1,0 of 0,0 of 0,"[{'round': 'Round 1', 'KD_A': '0', 'KD_B': '0'..."
16920,http://www.ufcstats.com/fight-details/4acab678...,Scott Morris,0,1 of 1,100%,2 of 2,1 of 1,100%,1,0,--,1 of 1,0 of 0,0 of 0,0 of 0,1 of 1,0 of 0,"[{'round': 'Round 1', 'KD_A': '0', 'KD_B': '0'..."


In [117]:
def parse_pair(x):
    if pd.isna(x): 
        return (np.nan, np.nan)
    x = str(x).strip()
    # Format "35 of 67"
    if "of" in x:
        a, b = x.split("of")
        return (int(a.strip()), int(b.strip()))
    # Format "35 67"
    nums = x.split()
    if len(nums) == 2:
        return (int(nums[0]), int(nums[1]))
    return (np.nan, np.nan)

def parse_int(x):
    try: return int(str(x).strip())
    except: return np.nan

def parse_float(x):
    try: return float(str(x).strip().replace("%",""))
    except: return np.nan
    
def parse_time(t):
    # "3:55" ‚Üí seconds
    if isinstance(t, str) and ":" in t:
        m, s = t.split(":")
        return int(m)*60 + int(s)
    return np.nan


In [118]:
# KD
df_stats_long["KD"] = df_stats_long["KD"].apply(parse_int)

# Sig Str
df_stats_long["SigStr_Landed"], df_stats_long["SigStr_Att"] = zip(
    *df_stats_long["SigStr"].apply(parse_pair)
)

# Sig Str %
df_stats_long["SigStrPct"] = df_stats_long["SigStrPct"].apply(parse_float)

# Total Strikes
df_stats_long["TotalStr_Landed"], df_stats_long["TotalStr_Att"] = zip(
    *df_stats_long["TotalStr"].apply(parse_pair)
)

# Takedowns
df_stats_long["Td"] = df_stats_long["Td"].apply(parse_int)

# Takedown %
df_stats_long["TdPct"] = df_stats_long["TdPct"].apply(parse_float)

# Sub Attempts
df_stats_long["SubAtt"] = df_stats_long["SubAtt"].apply(parse_int)

# Reversals
df_stats_long["Rev"] = df_stats_long["Rev"].apply(parse_int)

# Control time to seconds
df_stats_long["Ctrl"] = df_stats_long["Ctrl"].apply(parse_time)


In [119]:
# Head strikes
df_stats_long["Head_Landed"], df_stats_long["Head_Att"] = zip(
    *df_stats_long["Head"].apply(parse_pair)
)

# Body strikes
df_stats_long["Body_Landed"], df_stats_long["Body_Att"] = zip(
    *df_stats_long["Body"].apply(parse_pair)
)

# Leg strikes
df_stats_long["Leg_Landed"], df_stats_long["Leg_Att"] = zip(
    *df_stats_long["Leg"].apply(parse_pair)
)

# Distance strikes
df_stats_long["Dist_Landed"], df_stats_long["Dist_Att"] = zip(
    *df_stats_long["Distance"].apply(parse_pair)
)

# Clinch strikes
df_stats_long["Clinch_Landed"], df_stats_long["Clinch_Att"] = zip(
    *df_stats_long["Clinch"].apply(parse_pair)
)

# Ground strikes
df_stats_long["Ground_Landed"], df_stats_long["Ground_Att"] = zip(
    *df_stats_long["Ground"].apply(parse_pair)
)



In [120]:
df_totals.columns

Index(['fight_url', 'fighter_A', 'fighter_B', 'KD_A', 'KD_B', 'Sig Str_A',
       'Sig Str_B', 'Sig Str %_A', 'Sig Str %_B', 'Total Str_A', 'Total Str_B',
       'Td_A', 'Td_B', 'Td %_A', 'Td %_B', 'Sub Att_A', 'Sub Att_B', 'Rev_A',
       'Rev_B', 'Ctrl_A', 'Ctrl_B', 'Head_A', 'Head_B', 'Body_A', 'Body_B',
       'Leg_A', 'Leg_B', 'Distance_A', 'Distance_B', 'Clinch_A', 'Clinch_B',
       'Ground_A', 'Ground_B', 'per_round_totals'],
      dtype='object')

In [121]:
# ============================================================
# ADVANCED GLICKO-2 ENGINE ‚Äî COMPATIBLE WITH LONG FORMAT df_fight_clean
# ============================================================

import numpy as np
import pandas as pd
from math import sqrt, pi
import time

# ============================================================
# 1. HELPER FUNCTIONS
# ============================================================
def _g(RD):
    return 1 / np.sqrt(1 + 3*(RD**2) / (pi**2))

def _E(mu, mu_j, phi_j):
    return 1 / (1 + np.exp(-_g(phi_j) * (mu - mu_j)))


# ============================================================
# 2. FIGHTER CLASS
# ============================================================
class Fighter:
    def __init__(self, name, rating=1500, RD=350, volatility=0.06):
        self.name = name
        self.rating = rating
        self.RD = RD
        self.vol = volatility

    def update_player(self, opp_ratings, opp_RDs, scores):
        if len(opp_ratings) == 0:
            return

        mu  = (self.rating - 1500) / 173.7178
        phi = self.RD / 173.7178
        sigma = self.vol

        v_inv = 0
        delta_sum = 0

        for r_j, RD_j, s_j in zip(opp_ratings, opp_RDs, scores):
            mu_j  = (r_j - 1500) / 173.7178
            phi_j = RD_j / 173.7178

            g = _g(phi_j)
            E = _E(mu, mu_j, phi_j)

            v_inv += (g * g) * E * (1 - E)
            delta_sum += g * (s_j - E)

        v = 1 / v_inv

        mu_prime = mu + (phi*phi / (phi*phi + sigma*sigma + v)) * delta_sum

        phi_star = sqrt(phi*phi + sigma*sigma)
        phi_prime = 1 / sqrt(1/(phi_star*phi_star) + 1/v)

        self.rating = 173.7178 * mu_prime + 1500
        self.RD = 173.7178 * phi_prime


# ============================================================
# 3. LOAD HISTORY DATES FOR MERGE
# ============================================================
df_hist = pd.read_csv(f"{BASE}/fighters_history_cleaned.csv")
df_hist = df_hist[df_hist["event"].str.contains("UFC", na=False)]
df_hist["date"] = pd.to_datetime(df_hist["date"])

df_hist_dates = df_hist[["fighter_name", "event_url", "date"]].drop_duplicates()


# ============================================================
# 4. MERGE DATE INTO df_fight_clean
# ============================================================
df_fight_clean = df_fight_clean.merge(
    df_hist_dates.rename(columns={"fighter_name": "fighter"}),
    on=["fighter", "event_url"],
    how="left"
)

df_fight_clean = df_fight_clean.sort_values(["date", "fight_url"]).reset_index(drop=True)


# ============================================================
# 5. INIT FIGHTER OBJECTS
# ============================================================
all_fighters = pd.concat([
    df_fight_clean["fighter"],
    df_fight_clean["opponent"]
]).unique()

fighters = {name: Fighter(name) for name in all_fighters}


rows_for_glicko_table = []


# ============================================================
# 6. MAIN GLICKO UPDATE LOOP ‚Äî FIXED FOR LONG FORMAT
# ============================================================

avg_rating = 1500
start = time.time()

for idx, row in df_fight_clean.iterrows():

    A = row["fighter"]
    B = row["opponent"]

    fA = fighters[A]
    fB = fighters[B]

    # Save BEFORE values
    rows_for_glicko_table.append({
        "fight_url": row.fight_url,
        "event_url": row.event_url,
        "date": row.date,
        "fighter": A,
        "opponent": B,
        "rating_before": fA.rating,
        "RD_before": fA.RD
    })

    # Convert result into numeric score
    r = row["result"]
    sA = 1 if r == "win" else 0 if r == "loss" else 0.5
    sB = 1 - sA if r in ["win", "loss"] else 0.5

    # We need opponent row ‚Äî guaranteed because 2 rows per fight
    opp_row = df_fight_clean[
        (df_fight_clean.fight_url == row.fight_url) &
        (df_fight_clean.fighter == B)
    ].iloc[0]

    # -----------------------------
    # BASIC GLICKO: Use sA and sB ONLY
    # (Remove advanced score dependencies for now)
    # -----------------------------

    scoreA = sA
    scoreB = sB

    # Update ratings
    fA.update_player([fB.rating], [fB.RD], [scoreA])
    fB.update_player([fA.rating], [fA.RD], [scoreB])

print("DONE ‚Äî Time:", time.time() - start)


# ============================================================
# 7. SAVE Final Table
# ============================================================
df_glicko = pd.DataFrame(rows_for_glicko_table)
df_glicko.to_csv(f"{BASE}/glicko_ratings.csv", index=False)

print("Saved glicko_ratings.csv ‚Äî shape:", df_glicko.shape)


DONE ‚Äî Time: 22.051298141479492
Saved glicko_ratings.csv ‚Äî shape: (16964, 7)


In [122]:
# KD
df_totals["KD_A"] = df_totals["KD_A"].apply(parse_int)
df_totals["KD_B"] = df_totals["KD_B"].apply(parse_int)

# Sig Str
df_totals["SigStr_A_L"], df_totals["SigStr_A_Att"] = zip(*df_totals["Sig Str_A"].apply(parse_pair))
df_totals["SigStr_B_L"], df_totals["SigStr_B_Att"] = zip(*df_totals["Sig Str_B"].apply(parse_pair))

# Sig Str %
df_totals["SigStrPct_A"] = df_totals["Sig Str %_A"].apply(parse_float)
df_totals["SigStrPct_B"] = df_totals["Sig Str %_B"].apply(parse_float)

# Total Strikes
df_totals["TotStr_A_L"], df_totals["TotStr_A_Att"] = zip(*df_totals["Total Str_A"].apply(parse_pair))
df_totals["TotStr_B_L"], df_totals["TotStr_B_Att"] = zip(*df_totals["Total Str_B"].apply(parse_pair))

# Takedowns
df_totals["Td_A_L"], df_totals["Td_A_Att"] = zip(*df_totals["Td_A"].apply(parse_pair))
df_totals["Td_B_L"], df_totals["Td_B_Att"] = zip(*df_totals["Td_B"].apply(parse_pair))


# Takedown %
df_totals["TdPct_A"] = df_totals["Td %_A"].apply(parse_float)
df_totals["TdPct_B"] = df_totals["Td %_B"].apply(parse_float)

# Sub Attempts
df_totals["SubAtt_A"] = df_totals["Sub Att_A"].apply(parse_int)
df_totals["SubAtt_B"] = df_totals["Sub Att_B"].apply(parse_int)

# Reversals
df_totals["Rev_A"] = df_totals["Rev_A"].apply(parse_int)
df_totals["Rev_B"] = df_totals["Rev_B"].apply(parse_int)

# Control (mm:ss to seconds)
df_totals["Ctrl_A"] = df_totals["Ctrl_A"].apply(parse_time)
df_totals["Ctrl_B"] = df_totals["Ctrl_B"].apply(parse_time)

# Head strikes
df_totals["Head_A_L"], df_totals["Head_A_Att"] = zip(*df_totals["Head_A"].apply(parse_pair))
df_totals["Head_B_L"], df_totals["Head_B_Att"] = zip(*df_totals["Head_B"].apply(parse_pair))

# Body
df_totals["Body_A_L"], df_totals["Body_A_Att"] = zip(*df_totals["Body_A"].apply(parse_pair))
df_totals["Body_B_L"], df_totals["Body_B_Att"] = zip(*df_totals["Body_B"].apply(parse_pair))

# Leg
df_totals["Leg_A_L"], df_totals["Leg_A_Att"] = zip(*df_totals["Leg_A"].apply(parse_pair))
df_totals["Leg_B_L"], df_totals["Leg_B_Att"] = zip(*df_totals["Leg_B"].apply(parse_pair))

# Distance
df_totals["Dist_A_L"], df_totals["Dist_A_Att"] = zip(*df_totals["Distance_A"].apply(parse_pair))
df_totals["Dist_B_L"], df_totals["Dist_B_Att"] = zip(*df_totals["Distance_B"].apply(parse_pair))

# Clinch
df_totals["Clinch_A_L"], df_totals["Clinch_A_Att"] = zip(*df_totals["Clinch_A"].apply(parse_pair))
df_totals["Clinch_B_L"], df_totals["Clinch_B_Att"] = zip(*df_totals["Clinch_B"].apply(parse_pair))

# Ground
df_totals["Ground_A_L"], df_totals["Ground_A_Att"] = zip(*df_totals["Ground_A"].apply(parse_pair))
df_totals["Ground_B_L"], df_totals["Ground_B_Att"] = zip(*df_totals["Ground_B"].apply(parse_pair))


In [123]:
# ------------------------------------------------------------
# 2. MERGE CLEAN TOTALS INTO df_fight_clean
# ------------------------------------------------------------
df = df_fight_clean.copy()

df = df.merge(df_totals.copy(), on="fight_url", how="left")


# ------------------------------------------------------------
# 3. Identify fighter side (A or B)
# ------------------------------------------------------------
df["is_A"] = (df["fighter"] == df["fighter_A"])

def pick(A, B):
    return np.where(df["is_A"], df[A], df[B])


# ------------------------------------------------------------
# 4. PICK FIGHTER + OPPONENT VALUES (numeric only!)
# ------------------------------------------------------------

# Fighter
df["KD_f"]        = pick("KD_A", "KD_B")
df["SigStr_f"]    = pick("SigStr_A_L", "SigStr_B_L")
df["TD_f"]        = pick("Td_A_L", "Td_B_L")
df["Ctrl_f"]      = pick("Ctrl_A", "Ctrl_B")
df["Head_f"]      = pick("Head_A_L", "Head_B_L")
df["Body_f"]      = pick("Body_A_L", "Body_B_L")
df["Leg_f"]       = pick("Leg_A_L", "Leg_B_L")

# Opponent
df["KD_o"]        = pick("KD_B", "KD_A")
df["SigStr_o"]    = pick("SigStr_B_L", "SigStr_A_L")
df["TD_o"]        = pick("Td_B_L", "Td_A_L")
df["Ctrl_o"]      = pick("Ctrl_B", "Ctrl_A")
df["Head_o"]      = pick("Head_B_L", "Head_A_L")
df["Body_o"]      = pick("Body_B_L", "Body_A_L")
df["Leg_o"]       = pick("Leg_B_L", "Leg_A_L")


# ------------------------------------------------------------
# 5. DIFFERENCES ‚Äî now works perfectly
# ------------------------------------------------------------
df["KD_diff"]       = df["KD_f"] - df["KD_o"]
df["SigStr_diff"]   = df["SigStr_f"] - df["SigStr_o"]
df["TD_diff"]       = df["TD_f"] - df["TD_o"]
df["Ctrl_diff"]     = df["Ctrl_f"] - df["Ctrl_o"]
df["Head_diff"]     = df["Head_f"] - df["Head_o"]
df["Body_diff"]     = df["Body_f"] - df["Body_o"]
df["Leg_diff"]      = df["Leg_f"] - df["Leg_o"]


# ------------------------------------------------------------
# 6. Final cleanup
# ------------------------------------------------------------
df = df.sort_values(["event_id", "fight_order"]).reset_index(drop=True)

print("Final df shape:", df.shape)
df.head()


Final df shape: (16964, 115)


Unnamed: 0,fight_url,fighter,opponent,result,event_id,event_name,event_url,fight_order,weight_class,method,...,Head_o,Body_o,Leg_o,KD_diff,SigStr_diff,TD_diff,Ctrl_diff,Head_diff,Body_diff,Leg_diff
0,http://www.ufcstats.com/fight-details/810c9da0...,Ciryl Gane,Tai Tuivasa,win,00a905a4a4a2b071,UFC Fight Night: Gane vs. Tuivasa,http://www.ufcstats.com/event-details/00a905a4...,1,Heavyweight,KO/TKO Punches,...,17.0,4.0,8.0,0.0,81.0,0.0,-3.0,47.0,26.0,8.0
1,http://www.ufcstats.com/fight-details/810c9da0...,Tai Tuivasa,Ciryl Gane,loss,00a905a4a4a2b071,UFC Fight Night: Gane vs. Tuivasa,http://www.ufcstats.com/event-details/00a905a4...,1,Heavyweight,KO/TKO Punches,...,64.0,30.0,16.0,0.0,-81.0,0.0,3.0,-47.0,-26.0,-8.0
2,http://www.ufcstats.com/fight-details/b8ca1acd...,Robert Whittaker,Marvin Vettori,win,00a905a4a4a2b071,UFC Fight Night: Gane vs. Tuivasa,http://www.ufcstats.com/event-details/00a905a4...,2,Middleweight,U-DEC,...,15.0,7.0,11.0,0.0,41.0,1.0,-16.0,34.0,-1.0,8.0
3,http://www.ufcstats.com/fight-details/b8ca1acd...,Marvin Vettori,Robert Whittaker,loss,00a905a4a4a2b071,UFC Fight Night: Gane vs. Tuivasa,http://www.ufcstats.com/event-details/00a905a4...,2,Middleweight,U-DEC,...,49.0,6.0,19.0,0.0,-41.0,-1.0,16.0,-34.0,1.0,-8.0
4,http://www.ufcstats.com/fight-details/20f316f9...,Nassourdine Imavov,Joaquin Buckley,win,00a905a4a4a2b071,UFC Fight Night: Gane vs. Tuivasa,http://www.ufcstats.com/event-details/00a905a4...,3,Middleweight,U-DEC,...,27.0,8.0,11.0,0.0,5.0,2.0,207.0,13.0,1.0,-9.0


In [124]:
# ============================================================
# MERGE FIGHTER PROFILES BY NAME
# ============================================================

prof = df_profiles[[
    "name",
    "height",
    "weight",
    "reach",
    "stance",
    "dob",
    "SLpM",
    "Str_Acc",
    "SApM",
    "Str_Def",
    "TD_Avg",
    "TD_Acc",
    "TD_Def",
    "Sub_Avg"
]].rename(columns={
    "name": "fighter"
})

df = df.merge(
    prof,
    on="fighter",
    how="left"
)

# ============================================================
# MERGE OPPONENT PROFILES
# ============================================================

opp_prof = df_profiles[[
    "name",
    "height",
    "weight",
    "reach",
    "stance",
    "dob",
    "SLpM",
    "Str_Acc",
    "SApM",
    "Str_Def",
    "TD_Avg",
    "TD_Acc",
    "TD_Def",
    "Sub_Avg"
]].rename(columns={
    "name": "opponent",
    "height": "opp_height",
    "weight": "opp_weight",
    "reach": "opp_reach",
    "stance": "opp_stance",
    "dob": "opp_dob",
    "SLpM": "opp_SLpM",
    "Str_Acc": "opp_Str_Acc",
    "SApM": "opp_SApM",
    "Str_Def": "opp_Str_Def",
    "TD_Avg": "opp_TD_Avg",
    "TD_Acc": "opp_TD_Acc",
    "TD_Def": "opp_TD_Def",
    "Sub_Avg": "opp_Sub_Avg"
})

df = df.merge(
    opp_prof,
    on="opponent",
    how="left"
)


In [125]:
df.shape

(17006, 141)

In [126]:
df.result.value_counts()

result
win     8352
loss    8352
nc       180
draw     122
Name: count, dtype: int64

In [127]:
# Convert DOB columns
df["dob"] = pd.to_datetime(df["dob"], errors="coerce")
df["opp_dob"] = pd.to_datetime(df["opp_dob"], errors="coerce")

# Compute ages on fight date
df["fighter_age"] = (df["date"] - df["dob"]).dt.days / 365.25
df["opponent_age"] = (df["date"] - df["opp_dob"]).dt.days / 365.25

# Age difference (fighter - opponent)
df["age_diff"] = df["fighter_age"] - df["opponent_age"]


In [128]:
df_glicko

Unnamed: 0,fight_url,event_url,date,fighter,opponent,rating_before,RD_before
0,http://www.ufcstats.com/fight-details/00835554...,http://www.ufcstats.com/event-details/a6a9ab5a...,1994-03-11,Royce Gracie,Patrick Smith,1500.000000,350.000000
1,http://www.ufcstats.com/fight-details/00835554...,http://www.ufcstats.com/event-details/a6a9ab5a...,1994-03-11,Patrick Smith,Royce Gracie,1478.260649,281.384239
2,http://www.ufcstats.com/fight-details/17ee4caf...,http://www.ufcstats.com/event-details/a6a9ab5a...,1994-03-11,Remco Pardoel,Alberta Cerra Leon,1500.000000,350.000000
3,http://www.ufcstats.com/fight-details/17ee4caf...,http://www.ufcstats.com/event-details/a6a9ab5a...,1994-03-11,Alberta Cerra Leon,Remco Pardoel,1478.260649,281.384239
4,http://www.ufcstats.com/fight-details/3b020d49...,http://www.ufcstats.com/event-details/a6a9ab5a...,1994-03-11,Orlando Wiet,Robert Lucarelli,1500.000000,350.000000
...,...,...,...,...,...,...,...
16959,http://www.ufcstats.com/fight-details/fe6b45e7...,http://www.ufcstats.com/event-details/05fbfe62...,NaT,Brian Ortega,Mike de la Torre,1634.537142,75.316882
16960,http://www.ufcstats.com/fight-details/febbd4eb...,http://www.ufcstats.com/event-details/cd42bbe8...,NaT,Dooho Choi,Thiago Tavares,1526.844500,94.274619
16961,http://www.ufcstats.com/fight-details/febbd4eb...,http://www.ufcstats.com/event-details/cd42bbe8...,NaT,Thiago Tavares,Dooho Choi,1544.461726,74.024749
16962,http://www.ufcstats.com/fight-details/fee1d48d...,http://www.ufcstats.com/event-details/577ec7e1...,NaT,Nate Marquardt,Crafton Wallace,1582.778635,65.785098


In [129]:
# ============================================================
# CHUNK 1 ‚Äî MERGE FIGHTER SIDE + CLEAN BEFORE OPP MERGE
# ============================================================

# Sort for safety
df_glicko = df_glicko.sort_values(["fight_url", "fighter"]).reset_index(drop=True)
df = df.sort_values(["fight_url", "fighter"]).reset_index(drop=True)

# ---------------------------
# 1. MERGE FIGHTER SIDE
# ---------------------------
df = df.merge(
    df_glicko.rename(columns={
        "rating_before": "g_rating_before",
        "RD_before": "g_RD_before"
    }),
    on=["fight_url", "fighter"],
    how="left"
)

# ---------------------------
# 2. CLEAN UP LEFTOVER COLUMNS
# ---------------------------
cols_to_drop = [
    "opponent_y", "event_url_y", "date_y",
    "g_RD_before_y", "g_rating_before_y"
]

df = df.drop(columns=[c for c in cols_to_drop if c in df.columns], errors="ignore")

df = df.rename(columns={
    "opponent_x": "opponent",
    "event_url_x": "event_url",
    "date_x": "date"
})

# Remove duplicate columns
df = df.loc[:, ~df.columns.duplicated()]


In [130]:
if "opponent" in df_glicko.columns:
    df_glicko = df_glicko.drop(columns=["opponent"])


In [131]:
# ============================================================
# CHUNK 2 ‚Äî MERGE OPPONENT SIDE + DIFF FEATURES
# ============================================================

# 3. MERGE OPPONENT SIDE
df = df.merge(
    df_glicko.rename(columns={
        "fighter": "opponent",
        "rating_before": "opp_rating_before",
        "RD_before": "opp_RD_before"
    }),
    on=["fight_url", "opponent"],
    how="left"
)

# 4. COMPUTE DIFFERENCES
df["rating_diff"] = df["g_rating_before"] - df["opp_rating_before"]
df["RD_diff"] = df["g_RD_before"] - df["opp_RD_before"]


In [132]:
df.columns.tolist()

['fight_url',
 'fighter',
 'opponent',
 'result',
 'event_id',
 'event_name',
 'event_url_x',
 'fight_order',
 'weight_class',
 'method',
 'round_end',
 'time_end',
 'scheduled_rounds',
 'KD',
 'STR',
 'TD',
 'SUB',
 'date_x',
 'fighter_A',
 'fighter_B',
 'KD_A',
 'KD_B',
 'Sig Str_A',
 'Sig Str_B',
 'Sig Str %_A',
 'Sig Str %_B',
 'Total Str_A',
 'Total Str_B',
 'Td_A',
 'Td_B',
 'Td %_A',
 'Td %_B',
 'Sub Att_A',
 'Sub Att_B',
 'Rev_A',
 'Rev_B',
 'Ctrl_A',
 'Ctrl_B',
 'Head_A',
 'Head_B',
 'Body_A',
 'Body_B',
 'Leg_A',
 'Leg_B',
 'Distance_A',
 'Distance_B',
 'Clinch_A',
 'Clinch_B',
 'Ground_A',
 'Ground_B',
 'per_round_totals',
 'SigStr_A_L',
 'SigStr_A_Att',
 'SigStr_B_L',
 'SigStr_B_Att',
 'SigStrPct_A',
 'SigStrPct_B',
 'TotStr_A_L',
 'TotStr_A_Att',
 'TotStr_B_L',
 'TotStr_B_Att',
 'Td_A_L',
 'Td_A_Att',
 'Td_B_L',
 'Td_B_Att',
 'TdPct_A',
 'TdPct_B',
 'SubAtt_A',
 'SubAtt_B',
 'Head_A_L',
 'Head_A_Att',
 'Head_B_L',
 'Head_B_Att',
 'Body_A_L',
 'Body_A_Att',
 'Body_B_L',
 'B

In [133]:
# Keep fighter-side event_url/date
df = df.rename(columns={
    "event_url_x": "event_url",
    "date_x": "date"
})

# Drop the opponent-side duplicates
df = df.drop(columns=["event_url_y", "date_y"])


In [134]:
import re
import numpy as np
import pandas as pd

# =========================
# 1. Helper functions
# =========================

def height_to_inches(h):
    if pd.isna(h):
        return np.nan
    h = str(h).strip()

    if h in ["--", "", "nan", "None"]:
        return np.nan

    m = re.match(r"(\d+)'\s*(\d+)", h)
    if m:
        feet = int(m.group(1))
        inches = int(m.group(2))
        return feet * 12 + inches
    
    return np.nan


def reach_to_inches(r):
    if pd.isna(r):
        return np.nan
    r = str(r).replace('"', '').replace(" ", "").strip()

    if r in ["--", "", "nan", "None"]:
        return np.nan

    return pd.to_numeric(r, errors="coerce")


# =========================
# 2. Convert heights
# =========================

df["fighter_height_inches"]  = df["height"].apply(height_to_inches)
df["opponent_height_inches"] = df["opp_height"].apply(height_to_inches)


# =========================
# 3. Convert reach
# =========================

df["fighter_reach_inches"]  = df["reach"].apply(reach_to_inches)
df["opponent_reach_inches"] = df["opp_reach"].apply(reach_to_inches)



# =========================
# 5. Physical diffs
# =========================

df["height_diff"] = df["fighter_height_inches"] - df["opponent_height_inches"]
df["reach_diff"]  = df["fighter_reach_inches"]  - df["opponent_reach_inches"]


# =========================
# 6. Preview
# =========================

print(df[[
    "fighter", "opponent",
    "height_diff", "reach_diff"
]].head())


          fighter        opponent  height_diff  reach_diff
0      Holly Holm    Irene Aldana         -1.0         1.0
1    Irene Aldana      Holly Holm          1.0        -1.0
2       Josh Neer  Joshua Burkman          1.0         0.0
3  Joshua Burkman       Josh Neer         -1.0         0.0
4   Kazula Vargas  Paddy Pimblett         -2.0        -2.0


In [135]:
df.columns.tolist()

['fight_url',
 'fighter',
 'opponent',
 'result',
 'event_id',
 'event_name',
 'event_url',
 'fight_order',
 'weight_class',
 'method',
 'round_end',
 'time_end',
 'scheduled_rounds',
 'KD',
 'STR',
 'TD',
 'SUB',
 'date',
 'fighter_A',
 'fighter_B',
 'KD_A',
 'KD_B',
 'Sig Str_A',
 'Sig Str_B',
 'Sig Str %_A',
 'Sig Str %_B',
 'Total Str_A',
 'Total Str_B',
 'Td_A',
 'Td_B',
 'Td %_A',
 'Td %_B',
 'Sub Att_A',
 'Sub Att_B',
 'Rev_A',
 'Rev_B',
 'Ctrl_A',
 'Ctrl_B',
 'Head_A',
 'Head_B',
 'Body_A',
 'Body_B',
 'Leg_A',
 'Leg_B',
 'Distance_A',
 'Distance_B',
 'Clinch_A',
 'Clinch_B',
 'Ground_A',
 'Ground_B',
 'per_round_totals',
 'SigStr_A_L',
 'SigStr_A_Att',
 'SigStr_B_L',
 'SigStr_B_Att',
 'SigStrPct_A',
 'SigStrPct_B',
 'TotStr_A_L',
 'TotStr_A_Att',
 'TotStr_B_L',
 'TotStr_B_Att',
 'Td_A_L',
 'Td_A_Att',
 'Td_B_L',
 'Td_B_Att',
 'TdPct_A',
 'TdPct_B',
 'SubAtt_A',
 'SubAtt_B',
 'Head_A_L',
 'Head_A_Att',
 'Head_B_L',
 'Head_B_Att',
 'Body_A_L',
 'Body_A_Att',
 'Body_B_L',
 'Body_

In [136]:
# ==============================================================
#   UFC BASELINE LOGISTIC REGRESSION PIPELINE
#   (PREFIGHT ‚Ä¢ LEAKAGE-FREE ‚Ä¢ TIME-AWARE ‚Ä¢ CLIPPED + LOGGED)
# ==============================================================

import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import TimeSeriesSplit


# --------------------------------------------------------------
# 0. START WITH MASTER DF
# --------------------------------------------------------------
print("Master rows:", len(df))


# --------------------------------------------------------------
# 1. Keep only win/loss rows
# --------------------------------------------------------------
df = df[df["result"].isin(["win", "loss"])].copy()
df["WL_label"] = df["result"]
df["target"] = (df["WL_label"] == "win").astype(int)
print("After removing NC/draw:", len(df))


# --------------------------------------------------------------
# 2. Remove fights missing Glicko ratings
# --------------------------------------------------------------
df = df.dropna(subset=["g_rating_before", "opp_rating_before"])
print("After dropping missing Glicko:", len(df))


# --------------------------------------------------------------
# 3. Fix date column
# --------------------------------------------------------------
df = df.dropna(subset=["date"])
df["date"] = pd.to_datetime(df["date"])
print("After dropping missing dates:", len(df))


# --------------------------------------------------------------
# 4. Remove pre-modern UFC (before 2005)
# --------------------------------------------------------------
df = df[df["date"] >= "2005-01-01"]
print("After filtering pre-2005:", len(df))


# --------------------------------------------------------------
# 5. Drop fighters with fewer than 2 fights
# --------------------------------------------------------------
valid_fighters = df["fighter"].value_counts()
valid_fighters = valid_fighters[valid_fighters >= 2].index

df = df[df["fighter"].isin(valid_fighters)]
df = df[df["opponent"].isin(valid_fighters)]
print("After removing 1-fight profiles:", len(df))


# --------------------------------------------------------------
# 6. TEMPORAL FEATURES (PREFIGHT ONLY)
# --------------------------------------------------------------
df = df.sort_values(["fighter", "date"])

df["fights_before"] = df.groupby("fighter").cumcount()
df["days_since_last_fight"] = df.groupby("fighter")["date"].diff().dt.days
df["wins_before"] = df.groupby("fighter")["target"].cumsum().shift(1)

df["win_rate_before"] = np.where(
    df["fights_before"] > 0,
    df["wins_before"] / df["fights_before"],
    0.0
)

df["recent_win_rate_3"] = (
    df.groupby("fighter")["target"].shift(1).rolling(3).mean()
)
df["recent_win_rate_5"] = (
    df.groupby("fighter")["target"].shift(1).rolling(5).mean()
)

df[[
    "days_since_last_fight",
    "recent_win_rate_3",
    "recent_win_rate_5"
]] = df[[
    "days_since_last_fight",
    "recent_win_rate_3",
    "recent_win_rate_5"
]].fillna(0)


# --------------------------------------------------------------
# 7. MERGE OPPONENT TEMPORAL FEATURES
# --------------------------------------------------------------
temporal_cols = [
    "fights_before",
    "days_since_last_fight",
    "win_rate_before",
    "recent_win_rate_3",
    "recent_win_rate_5",
]

opp_temp = (
    df[["fight_url", "fighter"] + temporal_cols]
      .rename(columns={"fighter": "opponent"})
)

opp_temp = opp_temp.rename(
    columns={c: f"opp_{c}" for c in temporal_cols}
)

df = df.merge(
    opp_temp,
    on=["fight_url", "opponent"],
    how="left"
)


# --------------------------------------------------------------
# 8. CLEAN PERCENTAGES
# --------------------------------------------------------------
percent_cols = [
    "Str_Acc", "Str_Def", "TD_Acc", "TD_Def",
    "opp_Str_Acc", "opp_Str_Def", "opp_TD_Acc", "opp_TD_Def"
]

for col in percent_cols:
    df[col] = (
        df[col].astype(str)
        .str.replace("%", "", regex=False)
        .replace(["None", ""], np.nan)
        .astype(float) / 100.0
    )


# --------------------------------------------------------------
# 9. CLEAN NUMERIC STATS
# --------------------------------------------------------------
num_cols = [
    "SLpM", "SApM", "TD_Avg", "Sub_Avg",
    "opp_SLpM", "opp_SApM", "opp_TD_Avg", "opp_Sub_Avg"
]

for col in num_cols:
    df[col] = pd.to_numeric(df[col], errors="coerce")


df.replace([np.inf, -np.inf], 0, inplace=True)


# --------------------------------------------------------------
# 10. RAW FEATURE SET
# --------------------------------------------------------------
raw_feature_cols = [
    "rating_diff", "RD_diff",
    "height_diff", "reach_diff", "age_diff",

    "SLpM", "SApM", "Str_Acc", "Str_Def",
    "TD_Avg", "TD_Acc", "TD_Def", "Sub_Avg",

    "opp_SLpM", "opp_SApM", "opp_Str_Acc", "opp_Str_Def",
    "opp_TD_Avg", "opp_TD_Acc", "opp_TD_Def", "opp_Sub_Avg",

    "fights_before",
    "days_since_last_fight",
    "win_rate_before",
    "recent_win_rate_3",
    "recent_win_rate_5",

    "opp_fights_before",
    "opp_days_since_last_fight",
    "opp_win_rate_before",
    "opp_recent_win_rate_3",
    "opp_recent_win_rate_5",
]

df_model = df[raw_feature_cols + ["target", "date"]].fillna(0)


# --------------------------------------------------------------
# 11. TIME-BASED TRAIN / TEST SPLIT
# --------------------------------------------------------------
train = df_model[df_model["date"] < "2021-01-01"].copy()
test  = df_model[df_model["date"] >= "2021-01-01"].copy()

print("Train size:", len(train))
print("Test size:", len(test))


# --------------------------------------------------------------
# 12. QUANTILE CLIPPING (FIT ON TRAIN ONLY)
# --------------------------------------------------------------
clip_cols = [
    "days_since_last_fight",
    "opp_days_since_last_fight",
    "fights_before",
    "opp_fights_before",
]

for col in clip_cols:
    lo, hi = train[col].quantile([0.001, 0.999])
    train[col] = train[col].clip(lo, hi)
    test[col]  = test[col].clip(lo, hi)


# --------------------------------------------------------------
# 13. LOG TRANSFORM (AFTER CLIPPING)
# --------------------------------------------------------------
for col in clip_cols:
    train[f"log_{col}"] = np.log1p(train[col])
    test[f"log_{col}"]  = np.log1p(test[col])


# --------------------------------------------------------------
# 14. FINAL FEATURE SET (LOGGED TEMPORAL)
# --------------------------------------------------------------
feature_cols = [
    "rating_diff", "RD_diff",
    "height_diff", "reach_diff", "age_diff",

    "SLpM", "SApM", "Str_Acc", "Str_Def",
    "TD_Avg", "TD_Acc", "TD_Def", "Sub_Avg",

    "opp_SLpM", "opp_SApM", "opp_Str_Acc", "opp_Str_Def",
    "opp_TD_Avg", "opp_TD_Acc", "opp_TD_Def", "opp_Sub_Avg",

    "log_fights_before",
    "log_days_since_last_fight",
    "win_rate_before",
    "recent_win_rate_3",
    "recent_win_rate_5",

    "log_opp_fights_before",
    "log_opp_days_since_last_fight",
    "opp_win_rate_before",
    "opp_recent_win_rate_3",
    "opp_recent_win_rate_5",
]

X_train = train[feature_cols]
y_train = train["target"]

X_test = test[feature_cols]
y_test = test["target"]


# --------------------------------------------------------------
# 15. TIME-SERIES CV (OOF AUC ESTIMATE)
# --------------------------------------------------------------
tscv = TimeSeriesSplit(n_splits=5)
oof_probs = np.zeros(len(X_train))

for i, (tr_idx, va_idx) in enumerate(tscv.split(X_train)):
    model = LogisticRegression(max_iter=3000)
    model.fit(X_train.iloc[tr_idx], y_train.iloc[tr_idx])
    oof_probs[va_idx] = model.predict_proba(X_train.iloc[va_idx])[:, 1]
    print(f"Fold {i+1} AUC: {roc_auc_score(y_train.iloc[va_idx], oof_probs[va_idx]):.4f}")

print("OOF AUC:", roc_auc_score(y_train, oof_probs))


# --------------------------------------------------------------
# 16. FINAL MODEL ‚Üí TEST
# --------------------------------------------------------------
final_logit = LogisticRegression(max_iter=3000)
final_logit.fit(X_train, y_train)

test_probs = final_logit.predict_proba(X_test)[:, 1]
test_preds = (test_probs >= 0.5).astype(int)

print("\n===== FINAL TEST RESULTS (POST-2021) =====")
print("Accuracy:", accuracy_score(y_test, test_preds))
print("AUC:", roc_auc_score(y_test, test_probs))


Master rows: 17006
After removing NC/draw: 16704
After dropping missing Glicko: 16704
After dropping missing dates: 16114
After filtering pre-2005: 15254
After removing 1-fight profiles: 14758
Train size: 9822
Test size: 5016
Fold 1 AUC: 0.8727
Fold 2 AUC: 0.8827
Fold 3 AUC: 0.8839
Fold 4 AUC: 0.8700
Fold 5 AUC: 0.8919
OOF AUC: 0.7639203229830631

===== FINAL TEST RESULTS (POST-2021) =====
Accuracy: 0.7523923444976076
AUC: 0.8359062801268794


### XGBOOST 

In [137]:
from scipy.stats import skew

skew_features = [
    "fights_before",
    "days_since_last_fight",
    "win_rate_before",
    "recent_win_rate_3",
    "recent_win_rate_5",
    "opp_fights_before",
    "opp_days_since_last_fight",
    "opp_win_rate_before",
    "opp_recent_win_rate_3",
    "opp_recent_win_rate_5",
]

train_df = df[df["date"] < "2021-01-01"]

skewness = (
    train_df[skew_features]
    .apply(lambda x: skew(x.dropna()))
    .sort_values(ascending=False)
)

print(skewness)


days_since_last_fight        5.052990
opp_days_since_last_fight    5.052990
fights_before                1.605737
opp_fights_before            1.605737
recent_win_rate_5            0.888874
opp_recent_win_rate_5        0.888874
recent_win_rate_3            0.520479
opp_recent_win_rate_3        0.520479
win_rate_before             -0.230062
opp_win_rate_before         -0.230062
dtype: float64


In [138]:
# ===============================================================
#   OPTUNA-TUNED XGBOOST (TSCV + EARLY STOPPING)
#   UFC PREFIGHT MODEL ‚Äî LEAKAGE-FREE + ROBUST TEMPORAL FEATURES
# ===============================================================

import numpy as np
import pandas as pd
import optuna
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score
from xgboost import XGBClassifier


# ===============================================================
# 1. RAW FEATURE SET
# ===============================================================

raw_feature_cols = [
    # Glicko
    "rating_diff", "RD_diff",

    # Physical
    "height_diff", "reach_diff", "age_diff",

    # Fighter A stats
    "SLpM", "SApM", "Str_Acc", "Str_Def",
    "TD_Avg", "TD_Acc", "TD_Def", "Sub_Avg",

    # Fighter B stats
    "opp_SLpM", "opp_SApM", "opp_Str_Acc", "opp_Str_Def",
    "opp_TD_Avg", "opp_TD_Acc", "opp_TD_Def", "opp_Sub_Avg",

    # TEMPORAL (RAW ‚Äî will be transformed)
    "fights_before",
    "days_since_last_fight",
    "win_rate_before",
    "recent_win_rate_3",
    "recent_win_rate_5",

    "opp_fights_before",
    "opp_days_since_last_fight",
    "opp_win_rate_before",
    "opp_recent_win_rate_3",
    "opp_recent_win_rate_5",
]


# ===============================================================
# 2. TRAIN / TEST SPLIT (TIME-AWARE)
# ===============================================================

df_model = df[raw_feature_cols + ["target", "date"]].copy()
df_model[raw_feature_cols] = df_model[raw_feature_cols].fillna(0)

train = df_model[df_model["date"] < "2021-01-01"].copy()
test  = df_model[df_model["date"] >= "2021-01-01"].copy()


# ===============================================================
# 3. QUANTILE CLIPPING (FIT ON TRAIN ONLY)
# ===============================================================

clip_config = {
    "days_since_last_fight": (0.001, 0.999),
    "opp_days_since_last_fight": (0.001, 0.999),
    "fights_before": (0.001, 0.999),
    "opp_fights_before": (0.001, 0.999),
}

for col, (lo_q, hi_q) in clip_config.items():
    lo, hi = train[col].quantile([lo_q, hi_q])
    train[col] = train[col].clip(lo, hi)
    test[col]  = test[col].clip(lo, hi)


# ===============================================================
# 4. LOG TRANSFORMS (AFTER CLIPPING)
# ===============================================================

log_cols = [
    "days_since_last_fight",
    "opp_days_since_last_fight",
    "fights_before",
    "opp_fights_before",
]

for col in log_cols:
    train[f"log_{col}"] = np.log1p(train[col])
    test[f"log_{col}"]  = np.log1p(test[col])


# ===============================================================
# 5. FINAL FEATURE SET (REPLACE RAW WITH LOGGED)
# ===============================================================

feature_cols = [
    # Glicko
    "rating_diff", "RD_diff",

    # Physical
    "height_diff", "reach_diff", "age_diff",

    # Fighter A stats
    "SLpM", "SApM", "Str_Acc", "Str_Def",
    "TD_Avg", "TD_Acc", "TD_Def", "Sub_Avg",

    # Fighter B stats
    "opp_SLpM", "opp_SApM", "opp_Str_Acc", "opp_Str_Def",
    "opp_TD_Avg", "opp_TD_Acc", "opp_TD_Def", "opp_Sub_Avg",

    # TEMPORAL (TRANSFORMED)
    "log_fights_before",
    "log_days_since_last_fight",
    "win_rate_before",
    "recent_win_rate_3",
    "recent_win_rate_5",

    "log_opp_fights_before",
    "log_opp_days_since_last_fight",
    "opp_win_rate_before",
    "opp_recent_win_rate_3",
    "opp_recent_win_rate_5",
]


X_train = train[feature_cols].values
y_train = train["target"].values

X_test  = test[feature_cols].values
y_test  = test["target"].values


# ===============================================================
# 6. OPTUNA OBJECTIVE (TIME SERIES CV)
# ===============================================================

def objective(trial):

    params = {
        "n_estimators": trial.suggest_int("n_estimators", 450, 700),
        "learning_rate": trial.suggest_float("learning_rate", 0.08, 0.16, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 4),
        "min_child_weight": trial.suggest_int("min_child_weight", 2, 4),
        "gamma": trial.suggest_float("gamma", 0.7, 1.6),
        "subsample": trial.suggest_float("subsample", 0.85, 0.98),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.65, 0.75),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.25, 0.55),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.20, 0.60),
        "eval_metric": "logloss",
        "tree_method": "hist",
        "random_state": 42,
    }

    tscv = TimeSeriesSplit(n_splits=3)
    aucs = []

    for tr_idx, va_idx in tscv.split(X_train):
        model = XGBClassifier(**params, early_stopping_rounds=50)
        model.fit(
            X_train[tr_idx], y_train[tr_idx],
            eval_set=[(X_train[va_idx], y_train[va_idx])],
            verbose=False
        )
        preds = model.predict_proba(X_train[va_idx])[:, 1]
        aucs.append(roc_auc_score(y_train[va_idx], preds))

    return np.mean(aucs)


# ===============================================================
# 7. RUN OPTUNA
# ===============================================================

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=25)

best_params = study.best_params
best_params.update({
    "eval_metric": "logloss",
    "tree_method": "hist",
    "random_state": 42,
})

print("\n===== BEST OPTUNA PARAMS =====")
print(best_params)


# ===============================================================
# 8. OOF PREDICTIONS (5-FOLD TSCV)
# ===============================================================

tscv = TimeSeriesSplit(n_splits=5)
oof_preds = np.zeros(len(X_train))

for i, (tr_idx, va_idx) in enumerate(tscv.split(X_train)):
    model = XGBClassifier(**best_params, early_stopping_rounds=50)
    model.fit(
        X_train[tr_idx], y_train[tr_idx],
        eval_set=[(X_train[va_idx], y_train[va_idx])],
        verbose=False
    )
    oof_preds[va_idx] = model.predict_proba(X_train[va_idx])[:, 1]
    print(f"Fold {i+1} AUC: {roc_auc_score(y_train[va_idx], oof_preds[va_idx]):.4f}")

print("OOF AUC:", roc_auc_score(y_train, oof_preds))


# ===============================================================
# 9. FINAL MODEL (TIME-RESPECTING HOLDOUT)
# ===============================================================

X_tr, X_va, y_tr, y_va = train_test_split(
    X_train, y_train, test_size=0.10, shuffle=False
)

final_xgb = XGBClassifier(**best_params, early_stopping_rounds=50)
final_xgb.fit(
    X_tr, y_tr,
    eval_set=[(X_va, y_va)],
    verbose=False
)

print("Best iteration:", final_xgb.best_iteration)


# ===============================================================
# 10. TEST EVALUATION
# ===============================================================

test_probs = final_xgb.predict_proba(X_test)[:, 1]
test_preds = (test_probs >= 0.5).astype(int)

print("\n===== FINAL TEST RESULTS =====")
print("Accuracy:", accuracy_score(y_test, test_preds))
print("AUC:", roc_auc_score(y_test, test_probs))


# ===============================================================
# 11. SAVE STACKING FILES
# ===============================================================

stack_train = pd.DataFrame({"oof_xgb": oof_preds, "y": y_train})
stack_test  = pd.DataFrame({"xgb_test_pred": test_probs, "y": y_test})

print("Saved stack_train and stack_test.")


[I 2025-12-12 22:13:33,337] A new study created in memory with name: no-name-3618ce2b-d2e3-419d-9c34-c99b60118229


[I 2025-12-12 22:13:35,386] Trial 0 finished with value: 0.9052889330546758 and parameters: {'n_estimators': 648, 'learning_rate': 0.12551317382241198, 'max_depth': 3, 'min_child_weight': 2, 'gamma': 1.1984420769747786, 'subsample': 0.9753735637300452, 'colsample_bytree': 0.7042917683569636, 'reg_alpha': 0.48387135723947267, 'reg_lambda': 0.2808930978822641}. Best is trial 0 with value: 0.9052889330546758.
[I 2025-12-12 22:13:37,112] Trial 1 finished with value: 0.9017737799036029 and parameters: {'n_estimators': 675, 'learning_rate': 0.1363919630258007, 'max_depth': 4, 'min_child_weight': 3, 'gamma': 1.5646649939808026, 'subsample': 0.8858330362848159, 'colsample_bytree': 0.7196621859793436, 'reg_alpha': 0.27922124076623167, 'reg_lambda': 0.2623404039339965}. Best is trial 0 with value: 0.9052889330546758.
[I 2025-12-12 22:13:38,745] Trial 2 finished with value: 0.9031316038123559 and parameters: {'n_estimators': 650, 'learning_rate': 0.13579980791331317, 'max_depth': 4, 'min_child_we


===== BEST OPTUNA PARAMS =====
{'n_estimators': 551, 'learning_rate': 0.15914731775867555, 'max_depth': 3, 'min_child_weight': 2, 'gamma': 1.0279761357150037, 'subsample': 0.914458000739117, 'colsample_bytree': 0.7417115727161239, 'reg_alpha': 0.4451986241383403, 'reg_lambda': 0.3781677753684609, 'eval_metric': 'logloss', 'tree_method': 'hist', 'random_state': 42}
Fold 1 AUC: 0.8656
Fold 2 AUC: 0.9065
Fold 3 AUC: 0.9224
Fold 4 AUC: 0.9080
Fold 5 AUC: 0.9274
OOF AUC: 0.7825867329111824
Best iteration: 478

===== FINAL TEST RESULTS =====
Accuracy: 0.793859649122807
AUC: 0.8894383268596313
Saved stack_train and stack_test.


In [None]:
# ===============================================================
#   OPTUNA-TUNED XGBOOST (TSCV + EARLY STOPPING)
#   UFC PREFIGHT MODEL ‚Äî LEAKAGE-FREE + STREAMLIT-READY
# ===============================================================

import numpy as np
import pandas as pd
import optuna
import warnings
import os
import json

warnings.filterwarnings("ignore")

from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score
from xgboost import XGBClassifier

from sklearn.isotonic import IsotonicRegression
from sklearn.metrics import brier_score_loss
import joblib


# ===============================================================
# 0. SETUP
# ===============================================================

os.makedirs("models", exist_ok=True)


# ===============================================================
# 1. RAW FEATURE SET
# ===============================================================

raw_feature_cols = [
    # Glicko
    "rating_diff", "RD_diff",

    # Physical
    "height_diff", "reach_diff", "age_diff",

    # Fighter A stats
    "SLpM", "SApM", "Str_Acc", "Str_Def",
    "TD_Avg", "TD_Acc", "TD_Def", "Sub_Avg",

    # Fighter B stats
    "opp_SLpM", "opp_SApM", "opp_Str_Acc", "opp_Str_Def",
    "opp_TD_Avg", "opp_TD_Acc", "opp_TD_Def", "opp_Sub_Avg",

    # Temporal (raw)
    "fights_before",
    "days_since_last_fight",
    "win_rate_before",
    "recent_win_rate_3",
    "recent_win_rate_5",

    "opp_fights_before",
    "opp_days_since_last_fight",
    "opp_win_rate_before",
    "opp_recent_win_rate_3",
    "opp_recent_win_rate_5",
]


# ===============================================================
# 2. TRAIN / TEST SPLIT (TIME-AWARE)
# ===============================================================

df_model = df[raw_feature_cols + ["target", "date", "fighter"]].copy()
df_model[raw_feature_cols] = df_model[raw_feature_cols].fillna(0)

train = df_model[df_model["date"] < "2021-01-01"].copy()
test  = df_model[df_model["date"] >= "2021-01-01"].copy()


# ===============================================================
# 3. QUANTILE CLIPPING (FIT ON TRAIN ONLY)
# ===============================================================

clip_config = {
    "days_since_last_fight": (0.001, 0.999),
    "opp_days_since_last_fight": (0.001, 0.999),
    "fights_before": (0.001, 0.999),
    "opp_fights_before": (0.001, 0.999),
}

clip_bounds = {}

for col, (lo_q, hi_q) in clip_config.items():
    lo, hi = train[col].quantile([lo_q, hi_q])

    train[col] = train[col].clip(lo, hi)
    test[col]  = test[col].clip(lo, hi)

    clip_bounds[col] = {"lo": float(lo), "hi": float(hi)}

with open("models/clip_bounds.json", "w") as f:
    json.dump(clip_bounds, f)

print("Saved clip bounds ‚Üí models/clip_bounds.json")


# ===============================================================
# 4. LOG TRANSFORMS
# ===============================================================

log_cols = [
    "days_since_last_fight",
    "opp_days_since_last_fight",
    "fights_before",
    "opp_fights_before",
]

for col in log_cols:
    train[f"log_{col}"] = np.log1p(train[col])
    test[f"log_{col}"]  = np.log1p(test[col])


# ===============================================================
# 5. FINAL FEATURE SET
# ===============================================================

feature_cols = [
    "rating_diff", "RD_diff",
    "height_diff", "reach_diff", "age_diff",

    "SLpM", "SApM", "Str_Acc", "Str_Def",
    "TD_Avg", "TD_Acc", "TD_Def", "Sub_Avg",

    "opp_SLpM", "opp_SApM", "opp_Str_Acc", "opp_Str_Def",
    "opp_TD_Avg", "opp_TD_Acc", "opp_TD_Def", "opp_Sub_Avg",

    "log_fights_before",
    "log_days_since_last_fight",
    "win_rate_before",
    "recent_win_rate_3",
    "recent_win_rate_5",

    "log_opp_fights_before",
    "log_opp_days_since_last_fight",
    "opp_win_rate_before",
    "opp_recent_win_rate_3",
    "opp_recent_win_rate_5",
]

X_train = train[feature_cols].values
y_train = train["target"].values

X_test  = test[feature_cols].values
y_test  = test["target"].values


# ===============================================================
# 6. OPTUNA OBJECTIVE
# ===============================================================

def objective(trial):

    params = {
        "n_estimators": trial.suggest_int("n_estimators", 500, 750),
        "learning_rate": trial.suggest_float("learning_rate", 0.10, 0.16),
        "max_depth": trial.suggest_int("max_depth", 3, 4),
        "min_child_weight": trial.suggest_int("min_child_weight", 2, 4),
        "gamma": trial.suggest_float("gamma", 0.6, 1.4),
        "subsample": trial.suggest_float("subsample", 0.85, 0.98),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.65, 0.80),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.20, 0.60),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.20, 0.70),
        "eval_metric": "logloss",
        "tree_method": "hist",
        "random_state": 42,
    }

    tscv = TimeSeriesSplit(n_splits=3)
    aucs = []

    for tr_idx, va_idx in tscv.split(X_train):
        model = XGBClassifier(**params, early_stopping_rounds=50)
        model.fit(
            X_train[tr_idx], y_train[tr_idx],
            eval_set=[(X_train[va_idx], y_train[va_idx])],
            verbose=False
        )
        preds = model.predict_proba(X_train[va_idx])[:, 1]
        aucs.append(roc_auc_score(y_train[va_idx], preds))

    return np.mean(aucs)


# ===============================================================
# 7. RUN OPTUNA
# ===============================================================

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=25)

best_params = study.best_params
best_params.update({
    "eval_metric": "logloss",
    "tree_method": "hist",
    "random_state": 42,
})

print("\n===== BEST OPTUNA PARAMS =====")
print(best_params)


# ===============================================================
# 8. OOF PREDICTIONS
# ===============================================================

tscv = TimeSeriesSplit(n_splits=5)
oof_preds = np.zeros(len(X_train))

for i, (tr_idx, va_idx) in enumerate(tscv.split(X_train)):
    model = XGBClassifier(**best_params, early_stopping_rounds=50)
    model.fit(
        X_train[tr_idx], y_train[tr_idx],
        eval_set=[(X_train[va_idx], y_train[va_idx])],
        verbose=False
    )
    oof_preds[va_idx] = model.predict_proba(X_train[va_idx])[:, 1]
    print(f"Fold {i+1} AUC: {roc_auc_score(y_train[va_idx], oof_preds[va_idx]):.4f}")

print("OOF AUC:", roc_auc_score(y_train, oof_preds))

# ===============================================================
# 8.5 PROBABILITY CALIBRATION (OOF-BASED)
# ===============================================================

calibrator = IsotonicRegression(out_of_bounds="clip")
calibrator.fit(oof_preds, y_train)

# Evaluate calibration quality
print("\n===== CALIBRATION CHECK =====")
print("Brier (uncalibrated):", brier_score_loss(y_train, oof_preds))
print("Brier (calibrated):  ", brier_score_loss(y_train, calibrator.transform(oof_preds)))

# Save calibrator
joblib.dump(calibrator, "models/xgb_calibrator.pkl")
print("Saved probability calibrator ‚Üí models/xgb_calibrator.pkl")


# ===============================================================
# 9. FINAL MODEL (FULL TRAIN FIT ‚Äî DEPLOYMENT SAFE)
# ===============================================================

final_xgb = XGBClassifier(**best_params)
final_xgb.fit(X_train, y_train)

print("Final model trained on full training data")


# ===============================================================
# 10. TEST EVALUATION (CALIBRATED)
# ===============================================================

test_probs_raw = final_xgb.predict_proba(X_test)[:, 1]
test_probs_cal = calibrator.transform(test_probs_raw)

test_preds = (test_probs_cal >= 0.5).astype(int)

print("\n===== FINAL TEST RESULTS (CALIBRATED) =====")
print("Accuracy:", accuracy_score(y_test, test_preds))
print("AUC:", roc_auc_score(y_test, test_probs_cal))
print("Brier:", brier_score_loss(y_test, test_probs_cal))



pd.DataFrame({
    "oof_xgb_raw": oof_preds,
    "oof_xgb_cal": calibrator.transform(oof_preds),
    "y": y_train
}).to_csv("models/stack_train.csv", index=False)

pd.DataFrame({
    "xgb_test_raw": test_probs_raw,
    "xgb_test_cal": test_probs_cal,
    "y": y_test
}).to_csv("models/stack_test.csv", index=False)


# ===============================================================
# 12. SAVE MODEL + FEATURE COLS
# ===============================================================

final_xgb.save_model("models/xgb_prefight_model.json")

with open("models/xgb_feature_cols.json", "w") as f:
    json.dump(feature_cols, f)

print("Saved model + feature columns")

# ===============================================================
# 13. SAVE FEATURE-COMPLETE FIGHTER SNAPSHOT (STREAMLIT-SAFE)
# ===============================================================

fighters_latest = (
    df.sort_values("date")
      .groupby("fighter", as_index=False)
      .tail(1)
)

# Safety check
missing = set(feature_cols) - set(fighters_latest.columns)
assert not missing, f"Missing features: {missing}"

fighters_latest.to_csv("models/fighters_latest.csv", index=False)

print("‚úÖ Saved feature-complete fighters_latest.csv")



[I 2025-12-12 22:14:23,158] A new study created in memory with name: no-name-13a5f7bd-a85d-4246-9776-a23c94fb586b


Saved clip bounds ‚Üí models/clip_bounds.json


[I 2025-12-12 22:14:24,879] Trial 0 finished with value: 0.9038166137636338 and parameters: {'n_estimators': 721, 'learning_rate': 0.14906759410701576, 'max_depth': 4, 'min_child_weight': 3, 'gamma': 0.9679762619167162, 'subsample': 0.9662651387764357, 'colsample_bytree': 0.6634278094039024, 'reg_alpha': 0.410858431713551, 'reg_lambda': 0.6720686141245213}. Best is trial 0 with value: 0.9038166137636338.
[I 2025-12-12 22:14:26,746] Trial 1 finished with value: 0.9027350322730632 and parameters: {'n_estimators': 668, 'learning_rate': 0.13613151671314846, 'max_depth': 4, 'min_child_weight': 4, 'gamma': 0.8381996754010844, 'subsample': 0.9442324278063621, 'colsample_bytree': 0.6504248416003906, 'reg_alpha': 0.23079541323808994, 'reg_lambda': 0.3492997826722123}. Best is trial 0 with value: 0.9038166137636338.
[I 2025-12-12 22:14:28,491] Trial 2 finished with value: 0.9032319115777477 and parameters: {'n_estimators': 623, 'learning_rate': 0.15169695073361003, 'max_depth': 4, 'min_child_wei


===== BEST OPTUNA PARAMS =====
{'n_estimators': 692, 'learning_rate': 0.14082808651298515, 'max_depth': 3, 'min_child_weight': 2, 'gamma': 1.0448206710514119, 'subsample': 0.8521052087418425, 'colsample_bytree': 0.7529344677758869, 'reg_alpha': 0.4492207551438479, 'reg_lambda': 0.5705957276012827, 'eval_metric': 'logloss', 'tree_method': 'hist', 'random_state': 42}
Fold 1 AUC: 0.8670
Fold 2 AUC: 0.9066
Fold 3 AUC: 0.9262
Fold 4 AUC: 0.9074
Fold 5 AUC: 0.9282
OOF AUC: 0.7829695188071972

===== CALIBRATION CHECK =====
Brier (uncalibrated): 0.18609025303113208
Brier (calibrated):   0.16245471248242513
Saved probability calibrator ‚Üí models/xgb_calibrator.pkl
Final model trained on full training data

===== FINAL TEST RESULTS (CALIBRATED) =====
Accuracy: 0.8070175438596491
AUC: 0.8863104572544891
Brier: 0.14332624549891734
Saved model + feature columns
Saved fighters_latest.csv


### GRU Sequence Embeddings & XGBOOST

In [155]:
# ===============================================================
# CHUNK 1: IMPORTS, SETTINGS, STYLE FEATURES
# ===============================================================

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import optuna
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import roc_auc_score, accuracy_score, brier_score_loss
from sklearn.isotonic import IsotonicRegression
from xgboost import XGBClassifier
import joblib
import json

device = (
    torch.device("cuda") if torch.cuda.is_available()
    else torch.device("mps") if torch.backends.mps.is_available()
    else torch.device("cpu")
)

SEQ_LEN = 5
EMBED_DIM = 32
HIDDEN_DIM = 64
BATCH_SIZE = 64
EPOCHS = 25
LR = 1e-3

SEQ_FEATURES = [
    "SLpM", "SApM",
    "Str_Acc", "Str_Def",
    "TD_Avg", "TD_Acc",
    "TD_Def", "Sub_Avg",
]

STYLE_TARGETS = [
    "SLpM",
    "TD_Avg",
    "Sub_Avg",
    "Str_Acc",
]


In [156]:
# ===============================================================
# CHUNK 2: SEQUENCE BUILDING + GRU STYLE ENCODER
# ===============================================================

def build_fighter_sequences(df):
    rows = []
    for fighter, g in df.groupby("fighter"):
        g = g.sort_values("date").reset_index(drop=True)
        history = []

        for i in range(len(g)):
            if i > 0:
                history.append(g.loc[i-1, SEQ_FEATURES].values)

            seq = history[-SEQ_LEN:]
            if len(seq) < SEQ_LEN:
                pad = [np.zeros(len(SEQ_FEATURES))] * (SEQ_LEN - len(seq))
                seq = pad + seq

            rows.append({
                "fight_url": g.loc[i, "fight_url"],
                "fighter": fighter,
                "date": g.loc[i, "date"],
                "sequence": np.stack(seq).astype(np.float32),
                "target": g.loc[i, STYLE_TARGETS].values.astype(np.float32)
            })
    return pd.DataFrame(rows)


class FightSeqDataset(Dataset):
    def __init__(self, df):
        self.X = np.stack(df["sequence"].values)
        self.y = np.stack(df["target"].values)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return torch.from_numpy(self.X[idx]), torch.from_numpy(self.y[idx])


class FighterGRU(nn.Module):
    def __init__(self):
        super().__init__()
        self.gru = nn.GRU(len(SEQ_FEATURES), HIDDEN_DIM, batch_first=True)
        self.embed = nn.Linear(HIDDEN_DIM, EMBED_DIM)
        self.head  = nn.Linear(HIDDEN_DIM, len(STYLE_TARGETS))

    def forward(self, x):
        h, _ = self.gru(x)
        ctx = h.mean(dim=1)
        emb = self.embed(ctx)
        out = self.head(ctx)
        return emb, out


In [157]:
# ===============================================================
# CHUNK 3: TRAIN GRU + EXTRACT STYLE EMBEDDINGS
# ===============================================================

df[SEQ_FEATURES] = df[SEQ_FEATURES].fillna(0)
df_seq = build_fighter_sequences(df)

train_seq = df_seq[df_seq["date"] < "2021-01-01"]
test_seq  = df_seq[df_seq["date"] >= "2021-01-01"]

train_loader = DataLoader(
    FightSeqDataset(train_seq),
    batch_size=BATCH_SIZE,
    shuffle=True
)

gru = FighterGRU().to(device)
optimizer = torch.optim.Adam(gru.parameters(), lr=LR)
criterion = nn.MSELoss()

gru.train()
for epoch in range(EPOCHS):
    losses = []
    for X, y in train_loader:
        X, y = X.to(device), y.to(device)
        optimizer.zero_grad()
        _, preds = gru(X)
        loss = criterion(preds, y)
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
    print(f"Epoch {epoch+1}/{EPOCHS} | Style MSE: {np.mean(losses):.4f}")

gru.eval()
with torch.no_grad():
    all_X = torch.stack([
        torch.tensor(seq, dtype=torch.float32)
        for seq in df_seq["sequence"].values
    ]).to(device)

    all_emb, _ = gru(all_X)
    all_emb = all_emb.cpu().numpy()

emb_cols = [f"gru_style_{i}" for i in range(EMBED_DIM)]
emb_df = pd.DataFrame(all_emb, columns=emb_cols)
emb_df.insert(0, "fighter", df_seq["fighter"].values)
emb_df.insert(0, "fight_url", df_seq["fight_url"].values)


Epoch 1/25 | Style MSE: 1.3277
Epoch 2/25 | Style MSE: 0.4854
Epoch 3/25 | Style MSE: 0.3827
Epoch 4/25 | Style MSE: 0.3233
Epoch 5/25 | Style MSE: 0.2885
Epoch 6/25 | Style MSE: 0.2649
Epoch 7/25 | Style MSE: 0.2463
Epoch 8/25 | Style MSE: 0.2325
Epoch 9/25 | Style MSE: 0.2240
Epoch 10/25 | Style MSE: 0.2137
Epoch 11/25 | Style MSE: 0.2070
Epoch 12/25 | Style MSE: 0.2018
Epoch 13/25 | Style MSE: 0.1963
Epoch 14/25 | Style MSE: 0.1931
Epoch 15/25 | Style MSE: 0.1896
Epoch 16/25 | Style MSE: 0.1896
Epoch 17/25 | Style MSE: 0.1846
Epoch 18/25 | Style MSE: 0.1822
Epoch 19/25 | Style MSE: 0.1819
Epoch 20/25 | Style MSE: 0.1806
Epoch 21/25 | Style MSE: 0.1790
Epoch 22/25 | Style MSE: 0.1784
Epoch 23/25 | Style MSE: 0.1772
Epoch 24/25 | Style MSE: 0.1774
Epoch 25/25 | Style MSE: 0.1769


In [158]:
# ===============================================================
# CHUNK 4: XGBOOST PIPELINE (OOF + CALIBRATION)
# ===============================================================

df = df.merge(emb_df, on=["fight_url", "fighter"], how="left")
df = df.merge(
    emb_df,
    left_on=["fight_url", "opponent"],
    right_on=["fight_url", "fighter"],
    how="left",
    suffixes=("", "_opp")
)

for i in range(EMBED_DIM):
    df[f"gru_style_diff_{i}"] = df[f"gru_style_{i}"] - df[f"gru_style_{i}_opp"]

feature_cols = [
    "rating_diff", "RD_diff",
    "height_diff", "reach_diff", "age_diff",

    "SLpM", "SApM", "Str_Acc", "Str_Def",
    "TD_Avg", "TD_Acc", "TD_Def", "Sub_Avg",

    "opp_SLpM", "opp_SApM", "opp_Str_Acc", "opp_Str_Def",
    "opp_TD_Avg", "opp_TD_Acc", "opp_TD_Def", "opp_Sub_Avg",

    "win_rate_before",
    "recent_win_rate_3",
    "recent_win_rate_5",
    "opp_win_rate_before",
    "opp_recent_win_rate_3",
    "opp_recent_win_rate_5",
] + [f"gru_style_diff_{i}" for i in range(EMBED_DIM)]

df_model = df[feature_cols + ["target", "date"]].fillna(0)
train = df_model[df_model["date"] < "2021-01-01"]
test  = df_model[df_model["date"] >= "2021-01-01"]

X_train = train[feature_cols].values
y_train = train["target"].values
X_test  = test[feature_cols].values
y_test  = test["target"].values

params = {
    "n_estimators": 600,
    "learning_rate": 0.12,
    "max_depth": 3,
    "subsample": 0.9,
    "colsample_bytree": 0.7,
    "eval_metric": "logloss",
    "tree_method": "hist",
    "random_state": 42,
}

tscv = TimeSeriesSplit(n_splits=5)
oof = np.zeros(len(X_train))

for i, (tr, va) in enumerate(tscv.split(X_train)):
    model = XGBClassifier(**params)
    model.fit(X_train[tr], y_train[tr])
    oof[va] = model.predict_proba(X_train[va])[:, 1]
    print(f"Fold {i+1} AUC:", roc_auc_score(y_train[va], oof[va]))

cal = IsotonicRegression(out_of_bounds="clip")
cal.fit(oof, y_train)

final_xgb = XGBClassifier(**params)
final_xgb.fit(X_train, y_train)

test_probs = cal.transform(final_xgb.predict_proba(X_test)[:, 1])

print("\n===== FINAL RESULTS =====")
print("Accuracy:", accuracy_score(y_test, test_probs >= 0.5))
print("AUC:", roc_auc_score(y_test, test_probs))
print("Brier:", brier_score_loss(y_test, test_probs))

# ===============================================================
# CHUNK 5: SAVE ARTIFACTS (GRU + XGB + CALIBRATION)
# ===============================================================

import os
os.makedirs("models", exist_ok=True)

# 1Ô∏è‚É£ Save final XGBoost model
final_xgb.save_model("models/gru_xgb_prefight_model.json")

# 2Ô∏è‚É£ Save isotonic calibrator
joblib.dump(cal, "models/gru_xgb_isotonic_calibrator.pkl")

# 3Ô∏è‚É£ Save feature column order (CRITICAL)
with open("models/gru_xgb_feature_cols.json", "w") as f:
    json.dump(feature_cols, f, indent=2)

# 4Ô∏è‚É£ Save GRU encoder weights
torch.save(gru.state_dict(), "models/gru_style_encoder.pt")

# 5Ô∏è‚É£ Save GRU config (so loading is deterministic)
gru_config = {
    "seq_len": SEQ_LEN,
    "embed_dim": EMBED_DIM,
    "hidden_dim": HIDDEN_DIM,
    "seq_features": SEQ_FEATURES,
    "style_targets": STYLE_TARGETS,
}

with open("models/gru_style_config.json", "w") as f:
    json.dump(gru_config, f, indent=2)

print("‚úÖ Saved GRU encoder, XGB model, calibrator, and metadata")



Fold 1 AUC: 0.8398803697351411
Fold 2 AUC: 0.8785655758693092
Fold 3 AUC: 0.8986671257551484
Fold 4 AUC: 0.8885189927443449
Fold 5 AUC: 0.916091947479727

===== FINAL RESULTS =====
Accuracy: 0.745958751393534
AUC: 0.8253891020358957
Brier: 0.1745996419962172
‚úÖ Saved GRU encoder, XGB model, calibrator, and metadata
