In [None]:
#1. Import libraries

import requests
import pandas as pd
from bs4 import BeautifulSoup
import time
import re

In [None]:
import requests
from bs4 import BeautifulSoup
import time

# Corrected function
def get_soup(url, max_retries=3):
    """Download ``url`` and return it parsed as a BeautifulSoup document.

    A GET request is sent with a browser-like ``User-Agent`` header and a
    10 second timeout. The request is attempted up to ``max_retries`` times;
    a non-200 status code or any exception counts as a failed attempt.

    Args:
        url: Address of the page to download.
        max_retries: Maximum number of attempts before giving up.

    Returns:
        The page parsed with the ``html.parser`` backend on the first
        attempt that returns status 200, or ``None`` if every attempt fails.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    retry_delay = 2  # seconds to wait between attempts

    for attempt in range(max_retries):
        try:
            # The timeout keeps a stalled connection from hanging the notebook.
            response = requests.get(url, headers=headers, timeout=10)

            if response.status_code == 200:
                print(f"Success: Retrieved {url}")
                return BeautifulSoup(response.content, 'html.parser')

            print(f"Warning: Status code {response.status_code} for {url}")

        except Exception as e:
            # Deliberately broad: scraping is best-effort and a single bad
            # page must not abort the whole run.
            print(f"Error fetching {url}: {e}")

        # Back off only when another attempt will follow; sleeping after the
        # final failure would just delay returning None.
        if attempt < max_retries - 1:
            print(f"Retrying... ({attempt + 1}/{max_retries})")
            time.sleep(retry_delay)

    return None

In [None]:
# Smoke test: fetch the fighter index page through get_soup and inspect it.
test_url = "http://ufcstats.com/statistics/fighters"
soup = get_soup(test_url)

if not soup:
    # Nothing usable came back from get_soup.
    print("\n--- TEST FAILED ---")
    print("The function returned None.")
else:
    print("\n--- TEST PASSED ---")
    print(f"Object Type: {type(soup)}")

    # The <title> text is a quick way to confirm the expected page was fetched.
    if soup.title:
        page_title = soup.title.text.strip()
    else:
        page_title = "No Title Found"
    print(f"Page Title: '{page_title}'")

    # Print the beginning of the prettified HTML to eyeball its structure.
    print("\n--- HTML PREVIEW (First 500 chars) ---")
    print(soup.prettify()[:500])

In [None]:
def strip_label(full_text, label_with_colon):
    """Drop a leading label from text shaped like 'Label: value'.

    The label is matched literally and case-insensitively at the start of
    ``full_text`` (leading whitespace allowed). The remaining text is
    returned stripped. Falsy input yields an empty string.
    """
    if full_text:
        leading_label = re.compile(
            r'^\s*' + re.escape(label_with_colon) + r'\s*',
            re.IGNORECASE,
        )
        return leading_label.sub('', full_text).strip()
    return ''

def clean_text(text):
    """Trim ``text`` and collapse each run of whitespace into one space.

    Falsy input (``None`` or an empty string) yields an empty string.
    """
    if not text:
        return ''
    trimmed = text.strip()
    return re.sub(r'\s+', ' ', trimmed)

def parse_percentage(text):
    """Turn percentage text such as '45%' into a float (45.0).

    Returns ``None`` when ``text`` is falsy or contains no '%' sign.
    """
    if not text or '%' not in text:
        return None
    number = text.replace('%', '').strip()
    return float(number)

In [None]:
# Get bio details and career statistics from a fighter's detail page (e.g., SLpM, Str. Acc., SApM, Str. Def., TD Avg., TD Acc., TD Def., Sub. Avg.)

def get_fighter_advanced_details(fighter_url):
    """Collect bio fields and career statistics from one fighter page.

    Args:
        fighter_url: URL of the fighter's detail page.

    Returns:
        A dict of raw text values. Bio entries use the keys ``height``,
        ``weight``, ``reach``, ``stance`` and ``dob``. Career statistics use
        the keys ``strikes_landed_per_min``, ``striking_accuracy``,
        ``strikes_absorbed_per_min``, ``striking_defense``, ``takedown_avg``,
        ``takedown_accuracy``, ``takedown_defense`` and ``submission_avg``.
        Only entries found on the page are included. An empty dict is
        returned when the page cannot be retrieved.
    """
    fighter_soup = get_soup(fighter_url)
    if not fighter_soup:
        print(f"Failed to retrieve fighter page: {fighter_url}")
        return {}

    # Bio labels (lower-cased, colon removed) that are stored under that same key.
    bio_keys = ('height', 'weight', 'reach', 'stance', 'dob')

    # Label fragment -> output key, checked in this order; the first hit wins.
    stat_keys = {
        'SLpM': 'strikes_landed_per_min',
        'Str. Acc': 'striking_accuracy',
        'SApM': 'strikes_absorbed_per_min',
        'Str. Def': 'striking_defense',
        'TD Avg': 'takedown_avg',
        'TD Acc': 'takedown_accuracy',
        'TD Def': 'takedown_defense',
        'Sub. Avg': 'submission_avg',
    }

    details = {}

    # --- Bio information ---
    bio_box = fighter_soup.find('div', class_='b-list__info-box')
    bio_items = bio_box.find_all('li', class_='b-list__box-list-item') if bio_box else []
    for bio_item in bio_items:
        label_tag = bio_item.find('i')
        if not label_tag:
            continue

        label_raw = clean_text(label_tag.text)       # e.g., 'DOB:'
        label_key = label_raw.rstrip(':').lower()    # e.g., 'dob'

        # The list item's text still starts with the label, so cut it off.
        value_full = clean_text(bio_item.get_text(separator=' '))
        value_only = strip_label(value_full, label_raw)

        if label_key in bio_keys:
            details[label_key] = value_only

    # --- Career statistics ---
    for stat_box in fighter_soup.find_all('div', class_='b-list__info-box-left'):
        for stat_item in stat_box.find_all('li', class_='b-list__box-list-item'):
            label_elem = stat_item.find('i')
            if not (label_elem and label_elem.text):
                continue

            label = clean_text(label_elem.text)
            value = clean_text(stat_item.text.replace(label, ''))

            for fragment, key in stat_keys.items():
                if fragment in label:
                    details[key] = value
                    break

    return details

In [None]:
# Scrape Fighter Data

def get_fighter_data(limit = None):
    """Scrape the ufcstats.com fighter index (letters a-z) and each fighter's page.

    For every row of each letter's fighter table, the basic listing fields
    are combined with the result of ``get_fighter_advanced_details``.
    Records with any value equal to ``""`` or ``"--"`` are skipped, as are
    rows without a fighter link and fighters whose detail page yields no data.

    Args:
        limit: Maximum number of fighters to collect. A falsy value
            (``None`` or ``0``) means no limit.

    Returns:
        A list of dicts, one per collected fighter.
    """
    base_url = "http://ufcstats.com/statistics/fighters"
    fighters_data = []

    for letter in 'abcdefghijklmnopqrstuvwxyz':
        # Once the limit is reached there is no point requesting more letters.
        if limit and len(fighters_data) >= limit:
            break

        letter_url = f"{base_url}?char={letter}&page=all"
        print(f"\nScraping fighters starting with '{letter.upper()}' from {letter_url}")

        soup = get_soup(letter_url)
        if not soup:
            print(f"Failed to retrieve data for letter '{letter.upper()}'. Skipping...")
            continue

        table = soup.find('table', class_='b-statistics__table')
        if not table:
            continue

        rows = table.find_all('tr')[1:]  # Skip header row

        for row in rows:
            # Re-check the limit before spending a request on another fighter.
            if limit and len(fighters_data) >= limit:
                break

            cols = row.find_all('td')
            if len(cols) < 10:
                continue

            # The fighter id is the last path segment of the profile link.
            # Without it there is no detail page to request, so skip the row
            # instead of fetching '.../fighter-details/None'.
            link = cols[0].find('a')
            href = link.get('href') if link is not None else None
            fighter_id = href.split('/')[-1] if href else None
            if not fighter_id:
                print(
                    "Skipping row without a fighter link: "
                    f"{clean_text(cols[0].text)} {clean_text(cols[1].text)}"
                )
                continue

            fighter_info = {
                'fighter_id': fighter_id,
                'firstname': clean_text(cols[0].text),
                'lastname' : clean_text(cols[1].text),
                'nickname': clean_text(cols[2].text),

                'wins': clean_text(cols[7].text),
                'losses': clean_text(cols[8].text),
                'draws': clean_text(cols[9].text),
            }

            fighter_url = f"http://ufcstats.com/fighter-details/{fighter_id}"
            fighter_details = get_fighter_advanced_details(fighter_url)

            # Throttle after every detail-page request, including the ones
            # whose fighter ends up being skipped below.
            time.sleep(0.5)

            # An empty dict means the detail page could not be retrieved or
            # held none of the expected fields; keeping the row would leave
            # the advanced columns empty.
            if not fighter_details:
                print(f"Skipping fighter with no detail data: {fighter_url}")
                continue

            fighter_full = {**fighter_info, **fighter_details}

            # NOTE(review): an empty nickname also counts as missing here, so
            # fighters without a nickname are dropped - confirm this is intended.
            missing = any(field == "" or field == "--" for field in fighter_full.values())
            if missing:
                print(f"Skipping incomplete data for fighter: {fighter_full}") #can change to return first and last names
                continue

            fighters_data.append(fighter_full)

        # Be respectful with request rate
        time.sleep(1)

    print(f"Total fighters found: {len(fighters_data)}")
    return fighters_data

In [None]:
# Scrape a small sample: stop after 10 collected fighters.
data = get_fighter_data(limit=10)

# Tabulate the scraped records and save them unmodified as the raw CSV.
df = pd.DataFrame(data=data)
df.to_csv(path_or_buf='fighter_stats_raw.csv', index=False)

# Show the table, then the column types pandas inferred.
display(df)
print(df.dtypes)

In [None]:
# Clean the UFC fighter CSV: convert every feature to a numeric type (except the id/name/nickname fields)
#
#Variable Legend:
#column name:               short:              Description:
#fighter_id	                fighter_id	        Unique identifier for fighter (end of URL)
#firstname	                first_name	        Fighter given name	
#lastname	                last_name	        Fighter family/surname	
#nickname	                nickname	        Fighter nickname/alias	
#wins	                    wins	            Official wins	
#losses	                    losses	            Official losses	
#draws	                    draws           	Official draws	
#height                 	height	            Stated height in inches
#weight	                    weight          	Stated weight in lb
#reach	                    reach	            Arm reach in inches
#stance	                    stance	            Primary fighting stance, encoded as 0 = Orthodox, 1 = Southpaw, 2 = Switch (any other value -> NaN)
#dob	                    dob	                Year of birth (YYYY), extracted from the date of birth
#strikes_landed_per_min	    SLpM	            Significant Strikes Landed per Minute	strikes/min
#striking_accuracy	        Str. Acc.	        Significant Striking Accuracy	percent (0–100)
#strikes_absorbed_per_min	SApM	            Significant Strikes Absorbed per Minute	strikes/min
#striking_defense	        Str. Def.	        Significant Strike Defence (opponent strikes that did NOT land)	percent (0–100)
#takedown_avg	            TD Avg.	            Average Takedowns Landed per 15 minutes	takedowns/15 min
#takedown_accuracy	        TD Acc.	            Takedown Accuracy percent (0–100)
#takedown_defense	        TD Def.	            Takedown Defence (opponent TD attempts that did NOT land)	percent (0–100)
#submission_avg	            Sub. Avg.	        Average Submissions Attempted per 15 minutes	subs/15 min

import pandas as pd
import numpy as np
import re
from pathlib import Path

# --------- CONFIG (edit these) ----------
INPUT_CSV = "fighter_stats_raw.csv"        # CSV file read by the cleaning step
OUTPUT_CSV = "fighter_stats_cleaned.csv"   # CSV file written by the cleaning step

# Integer code for each recognised stance; keys are lower-case stance names.
STANCE_MAP = dict(orthodox=0, southpaw=1, switch=2)

# --------- HELPERS ----------
def height_to_inches(s):
    """Convert a height given as text into inches.

    Recognised forms, checked in this order:
      * centimetres: ``173 cm``
      * feet with optional inches, where the feet marker is ``'``, ``ft``,
        ``foot`` or ``feet``: ``5' 8"``, ``5'8``, ``5 ft 8 in``, ``6 feet``
      * inches with a unit: ``68"``, ``68 in``
      * a bare number, taken to be inches: ``68``

    Args:
        s: Height value; anything that is not NA is converted with ``str``.

    Returns:
        The height in inches as a float, or ``np.nan`` when ``s`` is NA or
        matches none of the forms above.
    """
    if pd.isna(s):
        return np.nan
    t = str(s).strip().lower()

    # 173 cm -> inches
    m = re.search(r'(\d+(?:\.\d+)?)\s*cm', t)
    if m:
        return float(m.group(1)) / 2.54

    # Feet plus optional inches. The feet marker may be a word (ft/foot/feet),
    # an apostrophe, or both. This must run before the inches-only pattern,
    # otherwise "5 ft 8 in" would be read as just "8 in".
    m = re.search(
        r'(\d+(?:\.\d+)?)\s*(?:(?:feet|foot|ft\.?)\s*\'?|\')\s*'
        r'(?:(\d+(?:\.\d+)?)\s*(?:in(?:ch(?:es)?)?\.?|")?)?',
        t,
    )
    if m:
        inches = float(m.group(2)) if m.group(2) else 0.0
        return float(m.group(1)) * 12 + inches

    # 68", 68 in
    m = re.search(r'(\d+(?:\.\d+)?)\s*(?:in|")', t)
    if m:
        return float(m.group(1))

    # bare number => assume inches
    m = re.search(r'^\s*(\d+(?:\.\d+)?)\s*$', t)
    if m:
        return float(m.group(1))

    return np.nan

def weight_to_lbs(s):
    """Convert a weight given as text into pounds.

    Kilogram values ('70 kg') are multiplied by 2.2046226218. Values with a
    pound unit ('155 lbs.', '155 lb') and bare numbers are taken as pounds.
    Returns ``np.nan`` for NA input or text matching none of these forms.
    """
    if pd.isna(s):
        return np.nan
    text = str(s).strip().lower()

    # Metric input needs converting, so it is checked first.
    kilos = re.search(r'(\d+(?:\.\d+)?)\s*kg', text)
    if kilos:
        return float(kilos.group(1)) * 2.2046226218

    # Already in pounds: either with an explicit unit or as a bare number.
    pound_patterns = (
        r'(\d+(?:\.\d+)?)\s*(?:lb|lbs|pounds)\.?',
        r'^\s*(\d+(?:\.\d+)?)\s*$',
    )
    for pattern in pound_patterns:
        found = re.search(pattern, text)
        if found:
            return float(found.group(1))

    return np.nan

def reach_to_inches(s):
    """Convert a reach given as text into inches.

    Centimetre values ('183 cm') are divided by 2.54. Values with an inch
    unit ('72"', '72 in') and bare numbers are taken as inches. Returns
    ``np.nan`` for NA input or text matching none of these forms.
    """
    if pd.isna(s):
        return np.nan
    text = str(s).strip().lower()

    # Metric input needs converting, so it is checked first.
    centimetres = re.search(r'(\d+(?:\.\d+)?)\s*cm', text)
    if centimetres:
        return float(centimetres.group(1)) / 2.54

    # Already in inches: either with an explicit unit or as a bare number.
    inch_patterns = (
        r'(\d+(?:\.\d+)?)\s*(?:in|")',
        r'^\s*(\d+(?:\.\d+)?)\s*$',
    )
    for pattern in inch_patterns:
        found = re.search(pattern, text)
        if found:
            return float(found.group(1))

    return np.nan

def extract_year(s):
    """Return the first run of four digits in ``s`` as a float.

    Returns ``np.nan`` when ``s`` is NA or contains no four-digit run.
    """
    if pd.isna(s):
        return np.nan
    found = re.search(r'(\d{4})', str(s))
    if found is None:
        return np.nan
    return float(found.group(1))

def stance_to_code(s):
    """Look up the ``STANCE_MAP`` code for a stance name.

    The name is stripped and lower-cased before the lookup. NA input and
    names that are not in ``STANCE_MAP`` give ``np.nan``.
    """
    if pd.isna(s):
        return np.nan
    key = str(s).strip().lower()
    if key in STANCE_MAP:
        return STANCE_MAP[key]
    return np.nan

def pct_to_float(s):
    """Parse percentage text such as '45%' into a number (45).

    Every '%' character is removed and the remainder is stripped before
    numeric parsing. NA input gives ``np.nan``; text that is not numeric is
    coerced to NaN by ``pd.to_numeric``.
    """
    if pd.isna(s):
        return np.nan
    without_sign = str(s).replace('%', '').strip()
    return pd.to_numeric(without_sign, errors='coerce')

def to_float(s):
    """Parse ``s`` with ``pd.to_numeric``, coercing unparseable input to NaN."""
    numeric = pd.to_numeric(s, errors='coerce')
    return numeric

def find_col(df, candidates):
    """Find the column of ``df`` matching the earliest candidate name.

    Names are compared case-insensitively and candidates are tried in the
    order given. The column name is returned exactly as it appears in
    ``df.columns``, or ``None`` when no candidate matches.
    """
    by_lower = {}
    for name in df.columns:
        by_lower[name.lower()] = name

    wanted = (cand.lower() for cand in candidates)
    return next((by_lower[key] for key in wanted if key in by_lower), None)

# --------- LOAD ----------
df = pd.read_csv(INPUT_CSV)

# --------- CONVERSIONS ----------
# Each entry pairs the accepted spellings of a column name (common typos
# included) with the converter applied to every value of that column.
# A column that is absent from the file is simply left out.
elementwise_conversions = [
    # Body measurements and bio
    (["height"], height_to_inches),                        # -> inches
    (["weight"], weight_to_lbs),                           # -> pounds
    (["reach"], reach_to_inches),                          # -> inches
    (["stance"], stance_to_code),                          # -> 0/1/2
    (["dob", "date_of_birth", "birthdate"], extract_year),  # -> year only
    # Percent columns
    (["striking_accuracy", "striking_accuarcy"], pct_to_float),
    (["striking_defense", "striking_defence"], pct_to_float),
    (["takedown_accuracy"], pct_to_float),
    (["takedown_defense", "takedown_defence"], pct_to_float),
    # Rate/average columns
    (["strikes_landed_per_min"], to_float),
    (["strikes_absorbed_per_min"], to_float),
    (["takedown_avg", "takedown_average"], to_float),
    (["submission_avg", "submission_average"], to_float),
]
for names, converter in elementwise_conversions:
    column = find_col(df, names)
    if column:
        df[column] = df[column].apply(converter)

# W/L/D are converted a whole column at a time.
for names in (["wins"], ["losses"], ["draws"]):
    column = find_col(df, names)
    if column:
        df[column] = pd.to_numeric(df[column], errors="coerce")

# --------- ENFORCE: everything numeric except these 4 ----------
text_column_names = (
    ["fighter_id"],
    ["firstname", "first_name"],
    ["lastname", "last_name"],
    ["nickname"],
)
keep_object = {
    column
    for column in (find_col(df, opts) for opts in text_column_names)
    if column
}

for col in df.columns:
    if col in keep_object:
        continue
    df[col] = pd.to_numeric(df[col], errors="coerce")

# --------- SAVE & REPORT ----------
df.to_csv(OUTPUT_CSV, index=False)

print("Saved:", OUTPUT_CSV)
print("\nNon-numeric columns kept as text:", sorted(keep_object))
print(df.dtypes.head(25))
