In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import time
import re
import numpy as np

In [9]:
def get_soup(url, max_retries=3):
    headers = {"User-Agent": "Mozilla/5.0"}
    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            if response.status_code == 200:
                return BeautifulSoup(response.content, 'html.parser')
            else:
                print(f"Warning: Status code {response.status_code} for {url}")
                time.sleep(2)
        except Exception as e:
            print(f"Error fetching {url}: {e}")
            if attempt < max_retries - 1:
                time.sleep(2)
    return None

def clean_text(text):
    if text:
        return re.sub(r'\s+', ' ', text.strip())
    return ''

def strip_label(full_text, label_with_colon):
    if not full_text:
        return ''
    pattern = r'^\s*' + re.escape(label_with_colon) + r'\s*'
    return re.sub(pattern, '', full_text, flags=re.IGNORECASE).strip()

In [None]:
# Load fighters from the cleaned CSV
try:
    fighters_df = pd.read_csv('fighter_stats_cleaned.csv')
    fighter_ids = fighters_df['fighter_id'].unique().tolist()
    print(f"Loaded {len(fighter_ids)} unique fighters.")
except FileNotFoundError:
    print("Error: fighter_stats_cleaned.csv not found. Please run the fighter scraper first.")
    fighter_ids = []

Loaded 10 unique fighters.


In [11]:
def get_fight_urls(fighter_id):
    """Get list of fight URLs for a given fighter ID."""
    url = f"http://ufcstats.com/fighter-details/{fighter_id}"
    soup = get_soup(url)
    fight_urls = []
    
    if not soup:
        return []
        
    # Find the fights table
    table = soup.find('table', class_='b-fight-details__table')
    if not table:
        return []
        
    rows = table.find_all('tr')[1:] # Skip header
    for row in rows:
        cols = row.find_all('td')
        if not cols:
            continue
            
        first_col = cols[0]
        link = first_col.find('a')
        if link and 'href' in link.attrs:
            fight_urls.append(link['href'])
            
    return fight_urls

In [12]:
def scrape_fight_details(fight_url):
    """Scrape detailed stats for a single fight."""
    soup = get_soup(fight_url)
    if not soup:
        return None
        
    fight_data = {}
    fight_data['fight_id'] = fight_url.split('/')[-1]
    
    # --- Event ID ---
    event_link = soup.find('a', href=re.compile(r'event-details'))
    if event_link:
        fight_data['event_id'] = event_link['href'].split('/')[-1]
    else:
        fight_data['event_id'] = None
        
    # --- Fighters ---
    fighters = soup.find_all('div', class_='b-fight-details__person')
    if len(fighters) >= 2:
        f1_div = fighters[0]
        f2_div = fighters[1]
        
        f1_link = f1_div.find('a', class_='b-link b-fight-details__person-link')
        f2_link = f2_div.find('a', class_='b-link b-fight-details__person-link')
        
        fight_data['fighter1_id'] = f1_link['href'].split('/')[-1] if f1_link else None
        fight_data['fighter2_id'] = f2_link['href'].split('/')[-1] if f2_link else None
        
        fight_data['fighter1_name'] = clean_text(f1_link.text) if f1_link else None
        fight_data['fighter2_name'] = clean_text(f2_link.text) if f2_link else None
        
        f1_status = clean_text(f1_div.find('i', class_='b-fight-details__person-status').text)
        f2_status = clean_text(f2_div.find('i', class_='b-fight-details__person-status').text)
        
        if f1_status == 'W':
            fight_data['winner_id'] = fight_data['fighter1_id']
        elif f2_status == 'W':
            fight_data['winner_id'] = fight_data['fighter2_id']
        else:
            fight_data['winner_id'] = None
            
    # --- Fight Meta ---
    details_div = soup.find('div', class_='b-fight-details__fight')
    if details_div:
        fight_data['weight_class'] = clean_text(soup.find('i', class_='b-fight-details__fight-title').text)
        
        for item in details_div.find_all('i', class_='b-fight-details__text-item_first'):
            label = clean_text(item.find('i', class_='b-fight-details__label').text)
            value = clean_text(item.get_text().replace(label, ''))
            
            if 'Method' in label:
                fight_data['method'] = value
            elif 'Round' in label:
                fight_data['round'] = value
            elif 'Time' in label:
                fight_data['time'] = value
                
    # --- Stats Tables ---
    tables = soup.find_all('table', class_='b-fight-details__table')
    
    if len(tables) >= 1:
        totals_row = tables[0].find('tbody').find('tr')
        cols = totals_row.find_all('td')
        
        def get_p_values(col_idx):
            p_tags = cols[col_idx].find_all('p')
            return [clean_text(p.text) for p in p_tags]
            
        kds = get_p_values(1)
        fight_data['fighter1_KD'] = kds[0]
        fight_data['fighter2_KD'] = kds[1]
        
        sig_str = get_p_values(2)
        fight_data['fighter1_SigStrikes_Landed'] = sig_str[0].split(' of ')[0]
        fight_data['fighter1_SigStrikes_Att'] = sig_str[0].split(' of ')[1]
        fight_data['fighter2_SigStrikes_Landed'] = sig_str[1].split(' of ')[0]
        fight_data['fighter2_SigStrikes_Att'] = sig_str[1].split(' of ')[1]
        
        tot_str = get_p_values(4)
        fight_data['fighter1_TotalStrikes_Landed'] = tot_str[0].split(' of ')[0]
        fight_data['fighter1_TotalStrikes_Att'] = tot_str[0].split(' of ')[1]
        fight_data['fighter2_TotalStrikes_Landed'] = tot_str[1].split(' of ')[0]
        fight_data['fighter2_TotalStrikes_Att'] = tot_str[1].split(' of ')[1]
        
        tds = get_p_values(5)
        fight_data['fighter1_Takedowns_Landed'] = tds[0].split(' of ')[0]
        fight_data['fighter1_Takedowns_Att'] = tds[0].split(' of ')[1]
        fight_data['fighter2_Takedowns_Landed'] = tds[1].split(' of ')[0]
        fight_data['fighter2_Takedowns_Att'] = tds[1].split(' of ')[1]
        
        subs = get_p_values(7)
        fight_data['fighter1_Submission_Att'] = subs[0]
        fight_data['fighter2_Submission_Att'] = subs[1]
        
        revs = get_p_values(8)
        fight_data['fighter1_Reversals'] = revs[0]
        fight_data['fighter2_Reversals'] = revs[1]
        
        ctrl = get_p_values(9)
        fight_data['fighter1_Control_time'] = ctrl[0]
        fight_data['fighter2_Control_time'] = ctrl[1]
        
    if len(tables) >= 2:
        sig_row = tables[1].find('tbody').find('tr')
        cols = sig_row.find_all('td')
        
        def get_p_values_sig(col_idx):
            p_tags = cols[col_idx].find_all('p')
            return [clean_text(p.text) for p in p_tags]
            
        head = get_p_values_sig(3)
        fight_data['fighter1_Head_strikes'] = head[0].split(' of ')[0]
        fight_data['fighter2_Head_strikes'] = head[1].split(' of ')[0]
        
        body = get_p_values_sig(4)
        fight_data['fighter1_Body_strikes'] = body[0].split(' of ')[0]
        fight_data['fighter2_Body_strikes'] = body[1].split(' of ')[0]
        
        leg = get_p_values_sig(5)
        fight_data['fighter1_Leg_strikes'] = leg[0].split(' of ')[0]
        fight_data['fighter2_Leg_strikes'] = leg[1].split(' of ')[0]
        
        dist = get_p_values_sig(6)
        fight_data['fighter1_Distance_strikes'] = dist[0].split(' of ')[0]
        fight_data['fighter2_Distance_strikes'] = dist[1].split(' of ')[0]
        
        clinch = get_p_values_sig(7)
        fight_data['fighter1_Clinch_strikes'] = clinch[0].split(' of ')[0]
        fight_data['fighter2_Clinch_strikes'] = clinch[1].split(' of ')[0]
        
        ground = get_p_values_sig(8)
        fight_data['fighter1_Ground_strikes'] = ground[0].split(' of ')[0]
        fight_data['fighter2_Ground_strikes'] = ground[1].split(' of ')[0]
        
    # Validate data: Check for missing values (None, empty string, or '--')
    for key, value in fight_data.items():
        if value is None or value == "" or value == "--":
            return None
            
    return fight_data

In [None]:
# Main Execution Loop
all_fights = []
processed_fight_ids = set()

# Limit for testing (remove or increase for full run)
TEST_LIMIT = 10

print(f"Starting scrape for {len(fighter_ids)} fighters...")

for i, f_id in enumerate(fighter_ids):
    if TEST_LIMIT and i >= TEST_LIMIT:
        break
        
    print(f"Processing fighter {i+1}/{len(fighter_ids)}: {f_id}")
    
    fight_urls = get_fight_urls(f_id)
    
    for url in fight_urls:
        fight_id = url.split('/')[-1]
        
        if fight_id in processed_fight_ids:
            continue
            
        print(f"  Scraping fight: {fight_id}")
        try:
            fight_data = scrape_fight_details(url)
            if fight_data:
                all_fights.append(fight_data)
                processed_fight_ids.add(fight_id)
        except Exception as e:
            print(f"  Error scraping fight {fight_id}: {e}")
            
        time.sleep(1) # Be respectful
        
    time.sleep(1)

print(f"Scraping complete. Total unique fights: {len(all_fights)}")

# Save to CSV
fights_df = pd.DataFrame(all_fights)
fights_df.to_csv('fights_cleaned.csv', index=False)
print("Saved to fights_cleaned.csv")
display(fights_df.head())

Starting scrape for 10 fighters...
Processing fighter 1/10: 59a9d6dac61c2540
  Scraping fight: ce99b089400a4ad3
Processing fighter 2/10: 3329d692aea4dc28
  Scraping fight: 3fdb4c4b39218cf4
  Scraping fight: a12d3a6c8aaf87ca
  Scraping fight: 984ce1062186f3ce
  Scraping fight: 5a2b86570110191b
Processing fighter 3/10: 2f5cbecbbe18bac4
  Scraping fight: fffdc57255274be1
  Scraping fight: ae989e21c3839b49
  Scraping fight: f085f32bbb3220e1
  Scraping fight: 066756ea40b134a7
  Scraping fight: 0bc7e4852f75a06d
  Scraping fight: a7e8416f0d57c8e0
  Scraping fight: 4d5c3ddc740b89ca
  Scraping fight: cb1e57b5f8c92922
  Scraping fight: 412735f3e0b66ea6
  Scraping fight: e508ba268cb295c0
  Scraping fight: f9373ab59b7a853b
Processing fighter 4/10: 7279654c7674cd24
  Scraping fight: 0fcf42b68f2f0fa3
  Scraping fight: f820c6292e19d854
  Scraping fight: f711b3ec6bc3c407
  Scraping fight: 14c505d7ad55883c
Processing fighter 5/10: 989b85f6540c86b1
  Scraping fight: 99d0517adc67fc40
Processing fighter 6

Unnamed: 0,fight_id,event_id,fighter1_id,fighter2_id,fighter1_name,fighter2_name,winner_id,weight_class,method,fighter1_KD,...,fighter1_Body_strikes,fighter2_Body_strikes,fighter1_Leg_strikes,fighter2_Leg_strikes,fighter1_Distance_strikes,fighter2_Distance_strikes,fighter1_Clinch_strikes,fighter2_Clinch_strikes,fighter1_Ground_strikes,fighter2_Ground_strikes
0,ce99b089400a4ad3,66e981516e2476d1,eb393afdbe3293d5,59a9d6dac61c2540,Ismael Bonfim,Nariman Abbasov,eb393afdbe3293d5,Lightweight Bout,Decision - Unanimous,1,...,8,4,3,2,32,14,5,0,3,0
1,3fdb4c4b39218cf4,7956f026e2672c47,3329d692aea4dc28,7a5e8c94a86f9895,Hamdy Abdelwahab,Chris Barnett,3329d692aea4dc28,Heavyweight Bout,Decision - Unanimous,0,...,5,4,1,1,10,5,6,3,25,0
2,a12d3a6c8aaf87ca,400c7b43c86d27d3,3329d692aea4dc28,da7f113b5ea39c43,Hamdy Abdelwahab,Mohammed Usman,da7f113b5ea39c43,Heavyweight Bout,Decision - Unanimous,0,...,1,1,6,0,16,9,0,0,0,0
3,984ce1062186f3ce,80dbeb1dd5b53e64,3329d692aea4dc28,f41b9f5efc7162d6,Hamdy Abdelwahab,Jamal Pogues,3329d692aea4dc28,Heavyweight Bout,Decision - Split,0,...,0,0,3,8,7,22,0,0,4,0
4,fffdc57255274be1,5717efc6f271cd52,2f5cbecbbe18bac4,41e83a89929d1327,Shamil Abdurakhimov,Jailton Almeida,41e83a89929d1327,Heavyweight Bout,KO/TKO,0,...,0,0,0,0,1,0,0,0,0,14


In [None]:
fights_df = pd.read_csv('fights_cleaned.csv')
print(fights_df.dtypes)

FileNotFoundError: [Errno 2] No such file or directory: 'ufc_fights_cleaned.csv'