In [2]:
#1. Import libraries

import requests
import pandas as pd
from bs4 import BeautifulSoup
import time
import re

In [3]:
import requests
from bs4 import BeautifulSoup
import time

def get_soup(url, max_retries=3, timeout=10, retry_delay=2):
    """Fetch a page and parse it with BeautifulSoup, retrying on failure.

    Args:
        url: Address of the page to download.
        max_retries: Total number of attempts before giving up.
        timeout: Seconds to wait for the server on each attempt, so a
            stalled connection cannot hang the notebook.
        retry_delay: Seconds to pause between consecutive attempts.

    Returns:
        A BeautifulSoup object built from the response body when an attempt
        returns HTTP 200, otherwise None once every attempt has failed.
    """
    headers = {"User-Agent": "Mozilla/5.0"}

    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)

            if response.status_code == 200:
                print(f"Success: Retrieved {url}")
                return BeautifulSoup(response.content, 'html.parser')

            print(f"Warning: Status code {response.status_code} for {url}")

        except Exception as e:
            # Deliberately broad: a single bad page must not abort a long
            # scraping run; the failure is reported and the attempt retried.
            print(f"Error fetching {url}: {e}")

        # Both failure paths share one back-off, and nothing sleeps after the
        # final attempt because no further request will follow it.
        if attempt < max_retries - 1:
            print(f"Retrying... ({attempt + 1}/{max_retries})")
            time.sleep(retry_delay)

    return None

In [None]:
# Smoke-test get_soup against the fighter index page
test_url = "http://ufcstats.com/statistics/fighters"
soup = get_soup(test_url)

if not soup:
    # Every attempt failed, so there is nothing to inspect
    print("\n--- TEST FAILED ---")
    print("The function returned None.")
else:
    print("\n--- TEST PASSED ---")
    print(f"Object Type: {type(soup)}")

    # The <title> text is a quick way to confirm the expected page came back
    if soup.title:
        page_title = soup.title.text.strip()
    else:
        page_title = "No Title Found"
    print(f"Page Title: '{page_title}'")

    # Show the start of the markup to eyeball its structure
    print("\n--- HTML PREVIEW (First 500 chars) ---")
    print(soup.prettify()[:500])

In [None]:
# 3. Helper functions (currently unused)

def clean_height(height_str):
    """Convert a feet-and-inches height string to total inches.

    Accepts the feet value followed by an apostrophe and the inches value
    followed by a double quote, with any amount of whitespace (including
    none) before the feet value and between the two parts.

    Args:
        height_str: Height text, or a missing-value placeholder.

    Returns:
        Total inches as an int, or None when the input is empty, 'N/A',
        not a string, or not in the expected format.
    """
    # Non-string values (None, NaN from a DataFrame column, ...) are treated
    # as missing instead of being handed to the regex engine.
    if not isinstance(height_str, str):
        return None
    if not height_str or height_str == 'N/A':
        return None

    # \s* tolerates raw cell text with padding and the compact 6'0" spelling.
    match = re.match(r"\s*(\d+)'\s*(\d+)\"", height_str)
    if not match:
        return None

    feet, inches = (int(part) for part in match.groups())
    return feet * 12 + inches

def clean_weight(weight_str):
    """Parse a weight given in pounds into an int.

    The ' lbs.' suffix is removed and surrounding whitespace trimmed before
    conversion. Returns None for empty input, 'N/A', or text that is not a
    whole number once the suffix is gone.
    """
    is_missing = not weight_str or weight_str == 'N/A'
    if is_missing:
        return None

    digits = weight_str.replace(' lbs.', '').strip()
    try:
        pounds = int(digits)
    except ValueError:
        return None
    return pounds
    
def clean_reach(reach_str):
    """Parse a reach given in inches into an int.

    The ' in.' suffix is removed and surrounding whitespace trimmed before
    conversion. Returns None for empty input, 'N/A', or text that is not a
    whole number once the suffix is gone.
    """
    if reach_str and reach_str != 'N/A':
        try:
            inches = int(reach_str.replace(' in.', '').strip())
        except ValueError:
            inches = None
        return inches
    return None
    
def clean_time_to_seconds(time_str):
    """Turn an 'MM:SS' clock reading into a total number of seconds.

    Returns None for empty input, 'N/A', or text that does not split into
    exactly two integer parts around a colon.
    """
    if not time_str or time_str == 'N/A':
        return None

    try:
        # Unpacking fails with ValueError unless there are exactly two parts.
        minutes, seconds = [int(piece) for piece in time_str.split(':')]
    except ValueError:
        return None
    return minutes * 60 + seconds

In [None]:
def strip_label(full_text, label_with_colon):
    """Drop a leading 'Label:' prefix from text shaped like 'Label: value'.

    The label is matched literally and without regard to case, and only at
    the start of the text (after optional whitespace). Empty or missing
    text yields ''.
    """
    if not full_text:
        return ''

    # re.escape keeps punctuation in the label from acting as regex syntax.
    leading_label = re.compile(
        r'^\s*' + re.escape(label_with_colon) + r'\s*',
        re.IGNORECASE,
    )
    remainder = leading_label.sub('', full_text)
    return remainder.strip()

def clean_text(text):
    """Trim text and collapse each run of whitespace into a single space.

    Empty or missing input yields ''.
    """
    if not text:
        return ''
    trimmed = text.strip()
    return re.sub(r'\s+', ' ', trimmed)

def parse_percentage(text):
    """Convert percentage text such as '45%' to a float.

    Args:
        text: Text expected to contain a number and a percent sign.

    Returns:
        The numeric value as a float (e.g. 45.0), or None when the text is
        empty, has no '%', or is not a number once the '%' is removed.
    """
    if not text or '%' not in text:
        return None
    try:
        return float(text.replace('%', '').strip())
    except ValueError:
        # Placeholders such as '--%' are reported as missing rather than
        # raised, matching how the clean_* helpers treat unparseable input.
        return None

In [None]:
# Get fighter details from the fighter's main page: bio fields (height, weight, reach, stance, DOB) and advanced stats (e.g., SLpM, Str. Acc., SApM, Str. Def., TD Avg., TD Acc., TD Def., Sub. Avg.)

def get_fighter_advanced_details(fighter_url):
    """Scrape bio fields and career statistics from one fighter's page.

    Args:
        fighter_url: URL of the fighter's detail page.

    Returns:
        dict holding the whitespace-normalized text found next to each
        recognized label. Possible keys: height, weight, reach, stance, dob,
        strikes_landed_per_min, striking_accuracy, strikes_absorbed_per_min,
        striking_defense, takedown_avg, takedown_accuracy, takedown_defense,
        submission_avg. Labels absent from the page produce no key. The dict
        is empty when the page could not be retrieved.
    """
    fighter_soup = get_soup(fighter_url)
    if not fighter_soup:
        print(f"Failed to retrieve fighter page: {fighter_url}")
        return {}

    # Bio labels are stored under their own lower-cased name.
    bio_keys = ('height', 'weight', 'reach', 'stance', 'dob')

    # (substring expected in the page label, key used in the result);
    # checked in this order, first match wins.
    stat_keys = (
        ('SLpM', 'strikes_landed_per_min'),
        ('Str. Acc', 'striking_accuracy'),
        ('SApM', 'strikes_absorbed_per_min'),
        ('Str. Def', 'striking_defense'),
        ('TD Avg', 'takedown_avg'),
        ('TD Acc', 'takedown_accuracy'),
        ('TD Def', 'takedown_defense'),
        ('Sub. Avg', 'submission_avg'),
    )

    details = {}

    # Bio information. Every lookup goes through fighter_soup, the page
    # fetched for this fighter, never a notebook-level variable.
    bio_box = fighter_soup.find('div', class_='b-list__info-box')
    if bio_box:
        for item in bio_box.find_all('li', class_='b-list__box-list-item'):
            label_tag = item.find('i')
            if not label_tag:
                continue

            label_raw = clean_text(label_tag.text)        # e.g., 'DOB:'
            label_key = label_raw.rstrip(':').lower()     # 'dob'
            if label_key not in bio_keys:
                continue

            # The item's full text still starts with the label; strip it off.
            value_full = clean_text(item.get_text(separator=' '))
            details[label_key] = strip_label(value_full, label_raw)

    # Career statistics
    for stat_box in fighter_soup.find_all('div', class_='b-list__info-box-left'):
        for item in stat_box.find_all('li', class_='b-list__box-list-item'):
            label_elem = item.find('i')
            if not (label_elem and label_elem.text):
                continue

            label = clean_text(label_elem.text)
            value = clean_text(item.text.replace(label, ''))

            for marker, key in stat_keys:
                if marker in label:
                    details[key] = value
                    break

    return details

In [None]:
# Scrape Fighter Data

def get_fighter_data(letters='abcdefghijklmnopqrstuvwxyz'):
    """Scrape every fighter listed on the per-letter index pages.

    For each letter the index table is read, and for each fighter row the
    fighter's detail page is fetched to add bio fields and career
    statistics. Fighters with any required field absent, empty, or '--' are
    skipped.

    Args:
        letters: Iterable of index letters to visit. Defaults to a-z; pass a
            shorter string (e.g. 'a') for a quick trial run.

    Returns:
        list of dicts, one per fighter with complete data, combining the
        index-table fields (fighter_id, firstname, lastname, nickname, wins,
        losses, draws) with the dict from get_fighter_advanced_details.
    """
    base_url = "http://ufcstats.com/statistics/fighters"

    # Fields that must be present and non-placeholder for a fighter to be
    # kept. 'nickname' is intentionally not required.
    required_fields = (
        'fighter_id', 'firstname', 'lastname', 'wins', 'losses', 'draws',
        'height', 'weight', 'reach', 'stance', 'dob',
        'strikes_landed_per_min', 'striking_accuracy',
        'strikes_absorbed_per_min', 'striking_defense', 'takedown_avg',
        'takedown_accuracy', 'takedown_defense', 'submission_avg',
    )
    fighters_data = []

    for letter in letters:
        letter_url = f"{base_url}?char={letter}&page=all"
        print(f"\nScraping fighters starting with '{letter.upper()}' from {letter_url}")

        letter_soup = get_soup(letter_url)
        if not letter_soup:
            print(f"Failed to retrieve data for letter '{letter.upper()}'. Skipping...")
            continue

        # Find fighter table
        table = letter_soup.find('table', class_='b-statistics__table')
        if not table:
            continue

        for row in table.find_all('tr')[1:]:  # Skip header row
            cols = row.find_all('td')
            # Columns 0-9 are indexed below, so shorter rows cannot be used.
            if len(cols) < 10:
                continue

            # Without a profile link there is no id and no detail page.
            link = cols[0].find('a')
            href = link.get('href') if link else None
            if not href:
                continue

            # Height, weight, reach and stance (columns 3-6) are taken from
            # the detail page instead of the index table.
            fighter_info = {
                'fighter_id': href.rstrip('/').split('/')[-1],
                'firstname': clean_text(cols[0].text),
                'lastname': clean_text(cols[1].text),
                'nickname': clean_text(cols[2].text),
                'wins': clean_text(cols[7].text),
                'losses': clean_text(cols[8].text),
                'draws': clean_text(cols[9].text),
            }

            # Get fighter url and parse additional details from fighter's main page
            fighter_url = f"http://ufcstats.com/fighter-details/{fighter_info['fighter_id']}"
            fighter_details = get_fighter_advanced_details(fighter_url)

            fighter_full = {**fighter_info, **fighter_details}

            # .get() yields None for fields the detail page did not provide,
            # so those count as missing alongside '' and '--'.
            missing = any(
                fighter_full.get(field) in (None, "", "--")
                for field in required_fields
            )
            if missing:
                print(f"Skipping incomplete data for fighter: {fighter_full}") #can change to return first and last names
                continue

            fighters_data.append(fighter_full)

        # Be respectful with request rate
        time.sleep(1)

    print(f"Total fighters found: {len(fighters_data)}")
    return fighters_data