In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import time
import re
from datetime import datetime

In [None]:
def get_soup(url, max_retries=3, retry_delay=2):
    """Fetch ``url`` and return it parsed as a BeautifulSoup tree.

    The request is attempted up to ``max_retries`` times. A non-200 response,
    or any exception raised while fetching or parsing, counts as a failed
    attempt and is reported with ``print``.

    Args:
        url: Address of the page to download.
        max_retries: Maximum number of attempts before giving up.
        retry_delay: Seconds to wait between two consecutive attempts.

    Returns:
        The parsed ``BeautifulSoup`` document built with ``html.parser``,
        or ``None`` when every attempt failed.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            if response.status_code == 200:
                return BeautifulSoup(response.content, 'html.parser')
            print(f"Warning: Status code {response.status_code} for {url}")
        except Exception as e:
            # Deliberately broad: this helper is best-effort and signals
            # failure to the caller by returning None instead of raising.
            print(f"Error fetching {url}: {e}")
        # Back off only when another attempt will actually follow; sleeping
        # after the final failure would just delay returning None.
        if attempt < max_retries - 1:
            time.sleep(retry_delay)
    return None

def clean_text(text):
    """Trim ``text`` and collapse each run of whitespace into one space.

    Falsy input (``None`` or an empty string) yields an empty string.
    """
    if not text:
        return ''
    trimmed = text.strip()
    return re.sub(r'\s+', ' ', trimmed)

In [None]:
EVENTS_URL = "http://ufcstats.com/statistics/events/completed?page=all"

# Date layouts tried in order, e.g. "November 11, 2023" and "Nov 11, 2023".
_EVENT_DATE_FORMATS = ("%B %d, %Y", "%b %d, %Y")


def _normalize_event_date(date_str):
    """Return ``date_str`` as YYYY-MM-DD, the raw text if unparseable, or None if empty."""
    if not date_str:
        return None
    for fmt in _EVENT_DATE_FORMATS:
        try:
            return datetime.strptime(date_str, fmt).strftime("%Y-%m-%d")
        except ValueError:
            continue
    # Keep the original text so the information is not lost.
    return date_str


def _parse_event_row(row):
    """Convert one ``<tr>`` of the events table into an event dict, or None to skip it."""
    cols = row.find_all('td')
    if len(cols) < 2:
        return None

    # Column 0 holds the event link (name) plus the date span; column 1 the location.
    name_cell, location_cell = cols[0], cols[1]

    link = name_cell.find('a')
    href = link.get('href') if link else None
    if not href:
        # Without a target URL there is no event id to record.
        return None

    date_span = name_cell.find('span', class_='b-statistics__date')
    date_str = clean_text(date_span.text) if date_span else None

    return {
        # rstrip so a trailing slash does not produce an empty id.
        'event_id': href.strip().rstrip('/').split('/')[-1],
        'event_name': clean_text(link.text),
        'date': _normalize_event_date(date_str),
        'location': clean_text(location_cell.text),
    }


def get_all_events(url=EVENTS_URL):
    """Scrape all completed UFC events.

    Args:
        url: Listing page to scrape. Defaults to the ufcstats.com
            completed-events page requested with ``page=all``.

    Returns:
        A list of dicts with the keys ``event_id``, ``event_name``, ``date``
        and ``location``. ``date`` is ``YYYY-MM-DD`` when it can be parsed,
        the raw text when it cannot, and ``None`` when the row has no date.
        An empty list is returned when the page cannot be fetched or the
        events table is not found.
    """
    print(f"Scraping events from {url}...")

    soup = get_soup(url)
    if soup is None:
        print("Failed to retrieve events page.")
        return []

    table = soup.find('table', class_='b-statistics__table-events')
    if table is None:
        print("Could not find events table.")
        return []

    events_data = []
    # The first <tr> is the header row.
    for row in table.find_all('tr')[1:]:
        event = _parse_event_row(row)
        if event is not None:
            events_data.append(event)

    print(f"Found {len(events_data)} events.")
    return events_data

In [None]:
# Run the scraper and persist the result.
OUTPUT_CSV = 'ufc_events.csv'
EVENT_COLUMNS = ['event_id', 'event_name', 'date', 'location']

events = get_all_events()

# Fix the column order explicitly so an empty scrape still yields a frame
# with the expected schema.
events_df = pd.DataFrame(events, columns=EVENT_COLUMNS)

if events_df.empty:
    # get_all_events() returns [] on fetch/parse failure; do not overwrite a
    # previously saved file with an empty one.
    print(f"No events scraped; {OUTPUT_CSV} was not written.")
else:
    events_df.to_csv(OUTPUT_CSV, index=False)
    print(f"Saved to {OUTPUT_CSV}")

# Preview the first rows
display(events_df.head())