In [1]:
import pandas as pd

In [2]:
# Load the fighter table; index_col=0 makes the first CSV column (firstname) the index.
matches = pd.read_csv("ufc_comprehensive_data.csv", index_col=0)

In [3]:
# Preview the first rows to see the raw string formats (e.g. "155 lbs.", "38%", "--").
matches.head()

Unnamed: 0_level_0,lastname,fighter_url,nickname,height,weight,reach,stance,wins,losses,draws,...,strikes_landed_per_min,striking_accuracy,strikes_absorbed_per_min,striking_defense,takedown_avg,takedown_accuracy,takedown_defense,submission_avg,total_ufc_fights,fight_history
firstname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Tom,Aaron,http://ufcstats.com/fighter-details/93fe7332d1...,,--,155 lbs.,--,,5,3,0,...,0.0,0%,0.0,0%,0.0,0%,0%,0.0,63,"[{'result': 'loss', 'opponent': 'Tom Aaron Mat..."
Danny,Abbadi,http://ufcstats.com/fighter-details/15df64c02b...,The Assassin,"5' 11""",155 lbs.,--,Orthodox,4,6,0,...,3.29,38%,4.41,57%,0.0,0%,77%,0.0,67,"[{'result': 'loss', 'opponent': 'Danny Abbadi ..."
Nariman,Abbasov,http://ufcstats.com/fighter-details/59a9d6dac6...,Bayraktar,"5' 8""",155 lbs.,"66.0""",Orthodox,28,4,0,...,3.0,20%,5.67,46%,0.0,0%,66%,0.0,1,"[{'result': 'loss', 'opponent': 'Nariman Abbas..."
Darion,Abbey,http://ufcstats.com/fighter-details/4961467134...,,"6' 2""",265 lbs.,"80.0""",Orthodox,9,5,0,...,8.44,50%,14.06,28%,0.0,0%,0%,0.0,1,"[{'result': 'loss', 'opponent': 'Darion Abbey ..."
David,Abbott,http://ufcstats.com/fighter-details/b361180739...,Tank,"6' 0""",265 lbs.,--,Switch,10,15,0,...,1.35,30%,3.55,38%,1.07,33%,66%,0.0,114,"[{'result': 'loss', 'opponent': 'David Abbott ..."


In [4]:
# (rows, columns) of the raw table.
matches.shape

(4447, 27)

In [5]:
# Inspect column dtypes: height, weight, reach and the percentage fields load as object (strings).
matches.dtypes

lastname                     object
fighter_url                  object
nickname                     object
height                       object
weight                       object
reach                        object
stance                       object
wins                          int64
losses                        int64
draws                         int64
belt                        float64
full_name                    object
height_detail                object
weight_detail                object
reach_detail                 object
stance_detail                object
dob                          object
strikes_landed_per_min      float64
striking_accuracy            object
strikes_absorbed_per_min    float64
striking_defense             object
takedown_avg                float64
takedown_accuracy            object
takedown_defense             object
submission_avg              float64
total_ufc_fights              int64
fight_history                object
dtype: object

In [6]:
import pandas as pd
import numpy as np

# Read the raw fighter table; the first CSV column becomes the index.
matches = pd.read_csv("ufc_comprehensive_data.csv", index_col=0)

# Columns removed before cleaning: the *_detail fields plus
# 'total_ufc_fights' and 'belt'.
cols_to_drop = [
    'height_detail', 'weight_detail', 'reach_detail', 'stance_detail',
    'total_ufc_fights', 'belt',
]

# Work on a copy so `matches` keeps the untouched raw values.
# errors='ignore' skips any listed column that is not present.
matches_cleaned = matches.copy().drop(columns=cols_to_drop, errors='ignore')

# --- Replace 'dob' with 'age' (as of 2025-10-17) ---
# Fixed reference date so that re-running the notebook always yields the same ages.
REF_DATE = pd.Timestamp(2025, 10, 17)  # "today"

def dob_to_age(dob_str, ref=REF_DATE):
    """Return the age in whole years on `ref` for a date-of-birth string.

    Parameters
    ----------
    dob_str : str or scalar
        Birth date formatted like "Jul 13, 1978".
    ref : pd.Timestamp
        Date on which the age is evaluated (defaults to REF_DATE).

    Returns
    -------
    int or float
        Completed years as an int, or np.nan when the value is missing,
        blank, or does not match the expected format.
    """
    if pd.isna(dob_str):
        return np.nan
    text = str(dob_str).strip()
    if not text:
        return np.nan

    # errors="coerce" turns anything that does not match the format into NaT
    born = pd.to_datetime(text, format="%b %d, %Y", errors="coerce")
    if pd.isna(born):
        return np.nan

    # One year less when the birthday has not yet come round in ref's year
    birthday_pending = (ref.month, ref.day) < (born.month, born.day)
    return ref.year - born.year - (1 if birthday_pending else 0)

# Swap the raw birth-date strings for an integer age column.
if 'dob' not in matches_cleaned.columns:
    print("Warning: 'dob' column not found; skipping age calculation.")
else:
    ages = matches_cleaned['dob'].apply(dob_to_age)
    # Nullable 'Int64' keeps whole-number ages while still allowing missing values
    matches_cleaned['age'] = ages.astype('Int64')
    matches_cleaned = matches_cleaned.drop(columns=['dob'])

# Function to clean and convert height to inches
def clean_height(height_str):
    """Convert a height value to total inches.

    Accepts feet-and-inches text such as ``5' 11"`` as well as plain inch
    values such as ``71"`` or ``71 in``.

    Parameters
    ----------
    height_str : str or scalar
        Raw height value.

    Returns
    -------
    float
        Height in inches, or ``np.nan`` when the value is missing, empty,
        or cannot be parsed as a number (e.g. ``"--"``).
    """
    if pd.isna(height_str):
        return np.nan
    # Drop the inch mark up front; it carries no information for parsing
    text = str(height_str).strip().replace('"', '')
    if not text:
        return np.nan
    try:
        if "'" in text:
            parts = text.split("'")
            feet = float(parts[0])
            # Strip before testing so a blank inches part (e.g. 6' ") counts
            # as zero inches instead of failing the float conversion
            inches_part = parts[1].strip()
            inches = float(inches_part) if inches_part else 0.0
            return feet * 12 + inches
        # No feet marker: the value is already expressed in inches
        return float(text.replace('in', '').strip())
    except ValueError:
        # Only conversion failures mean "unparseable"; anything else
        # (e.g. KeyboardInterrupt) must propagate
        return np.nan

# Function to clean weight - updated to handle various formats
def clean_weight(weight_str):
    """Convert a weight value to a float number of pounds.

    Handles numeric inputs and strings carrying a unit suffix such as
    ``"155 lbs."``, ``"155lb"`` or ``"155 pounds"``; the unit is matched
    case-insensitively.

    Parameters
    ----------
    weight_str : str, int, float or None
        Raw weight value.

    Returns
    -------
    float
        Weight in lbs, or ``np.nan`` when the value is missing, empty,
        the literal text "nan", or not parseable as a number (e.g. ``"--"``).
    """
    if pd.isna(weight_str):
        return np.nan
    if isinstance(weight_str, (int, float)):
        return float(weight_str)

    # Lower-case first so "LBS" / "Pounds" are stripped like their lower-case forms
    text = str(weight_str).strip().lower()
    if text in ('', 'nan'):
        return np.nan

    for unit in ('pounds', 'lbs', 'lb'):
        text = text.replace(unit, '')
    # A trailing abbreviation period (as in "155 lbs.") is left in place:
    # once spaces are removed the text is "155.", which float() accepts
    text = text.replace(' ', '')
    try:
        return float(text)
    except ValueError:
        return np.nan

# Function to clean reach in inches
def clean_reach(reach_str):
    """Convert a reach value such as ``66.0"`` or ``66 in`` to inches.

    Parameters
    ----------
    reach_str : str or scalar
        Raw reach value.

    Returns
    -------
    float
        Reach in inches, or ``np.nan`` when the value is missing, empty,
        or not parseable as a number (e.g. ``"--"``).
    """
    if pd.isna(reach_str) or reach_str == '':
        return np.nan
    text = str(reach_str).strip().replace('"', '').replace('in', '').strip()
    try:
        return float(text)
    except ValueError:
        # Only a failed numeric conversion counts as "missing"; other
        # exceptions (e.g. KeyboardInterrupt) must not be swallowed
        return np.nan

# Function to clean percentage values
def clean_percentage(perc_str):
    """Convert a percentage string such as ``"38%"`` to a float.

    The number is returned as written (``"38%"`` -> ``38.0``); it is not
    rescaled to a fraction.

    Parameters
    ----------
    perc_str : str or scalar
        Raw percentage value.

    Returns
    -------
    float
        The numeric value, or ``np.nan`` when the input is missing, empty,
        or not parseable as a number.
    """
    if pd.isna(perc_str) or perc_str == '':
        return np.nan
    text = str(perc_str).replace('%', '').strip()
    try:
        return float(text)
    except ValueError:
        # Only a failed numeric conversion counts as "missing"; other
        # exceptions (e.g. KeyboardInterrupt) must not be swallowed
        return np.nan

# Debug: show what the raw weight strings look like before any parsing
raw_weight = matches['weight']
weight_sample = raw_weight.dropna().head(10)
print("Original weight column sample (first 10 non-null values):")
print("-" * 40)
print(weight_sample)
print(f"Weight column dtype: {raw_weight.dtype}")
print()

# Parse the physical measurements into floats: height and reach in inches,
# weight in lbs. Each column is run through its matching cleaner.
measurement_cleaners = {
    'height': clean_height,
    'weight': clean_weight,
    'reach': clean_reach,
}
for measurement, cleaner in measurement_cleaners.items():
    matches_cleaned[measurement] = matches_cleaned[measurement].apply(cleaner)

# Numeric stance codes (1=orthodox, 2=southpaw, 3=switch)
stance_mapping = dict(orthodox=1, southpaw=2, switch=3)

def convert_stance(stance_str):
    """Map a stance label to its numeric code.

    Matching ignores case and surrounding whitespace. Missing values,
    empty strings and labels absent from `stance_mapping` give np.nan.
    """
    is_blank = pd.isna(stance_str) or stance_str == ''
    if is_blank:
        return np.nan
    normalized = str(stance_str).lower().strip()
    try:
        return stance_mapping[normalized]
    except KeyError:
        # Label is not one of the stances in the mapping
        return np.nan

# Encode stance labels as their numeric codes
matches_cleaned['stance'] = matches_cleaned['stance'].apply(convert_stance)

# Convert accuracy and defense percentages from "NN%" strings to floats
percentage_columns = (
    'striking_accuracy',
    'striking_defense',
    'takedown_accuracy',
    'takedown_defense',
)
for pct_col in percentage_columns:
    matches_cleaned[pct_col] = matches_cleaned[pct_col].apply(clean_percentage)

# Verify the conversions: dtypes, a sample, summary statistics and missing counts
print("Data types after conversion:")
print("-" * 40)
columns_to_check = ['height', 'weight', 'reach', 'stance',
                    'striking_accuracy', 'striking_defense',
                    'takedown_accuracy', 'takedown_defense']

for col in columns_to_check:
    print(f"{col}: {matches_cleaned[col].dtype}")

print("\nSample of converted data:")
print("-" * 40)
print(matches_cleaned[columns_to_check].head(10))

print("\nStatistics for numeric columns:")
print("-" * 40)
# (column, printed label, unit) -- one loop instead of three copy-pasted stanzas
measurement_labels = (
    ('height', 'Height', 'inches'),
    ('weight', 'Weight', 'lbs'),
    ('reach', 'Reach', 'inches'),
)
for stat_col, label, unit in measurement_labels:
    values = matches_cleaned[stat_col]
    if values.notna().any():
        print(f"{label} ({unit}): min={values.min():.1f}, max={values.max():.1f}, mean={values.mean():.1f}")
    else:
        print(f"{label}: No valid data")

print("\nMissing values in converted columns:")
print("-" * 40)
total = len(matches_cleaned)  # row count is the same for every column
for col in columns_to_check:
    missing = matches_cleaned[col].isna().sum()
    print(f"{col}: {missing} missing ({missing/total*100:.1f}%)")

# Additional debug for weight column if still having issues
print("\nWeight column debugging:")
print("-" * 40)
converted_weight = matches_cleaned['weight']
print(f"Total rows: {total}")
print(f"Non-null weight values: {converted_weight.notna().sum()}")
print(f"Null weight values: {converted_weight.isna().sum()}")
if converted_weight.notna().any():
    print("Sample of converted weight values:")
    print(converted_weight.dropna().head(10))

# Organize Columns
desired = [
    'name', 'lastname', 'nickname', 'full_name',
    'height', 'age', 'weight', 'reach', 'stance', 'wins', 'losses', 'draws',
    'strikes_landed_per_min', 'striking_accuracy', 'strikes_absorbed_per_min',
    'striking_defense', 'takedown_avg',
    'takedown_accuracy', 'takedown_defense', 'submission_avg',
    'fighter_url', 'fight_history',
]

# Preferred columns first (in the order above), then whatever else the frame
# holds, in its current order. Names in `desired` that the frame lacks are skipped.
current_cols = list(matches_cleaned.columns)
ordered = [c for c in desired if c in current_cols]
ordered += [c for c in current_cols if c not in desired]

matches_cleaned = matches_cleaned[ordered]


# Write the cleaned table to a new CSV (the index is written along with the columns)
output_filename = "ufc_data_numeric_converted.csv"
matches_cleaned.to_csv(output_filename)
print(f"\nData saved to: {output_filename}")

# Show every dtype in the cleaned frame to confirm the conversions took effect
print(
    "\nAll data types in the cleaned dataset:",
    "-" * 40,
    matches_cleaned.dtypes,
    sep="\n",
)


Original weight column sample (first 10 non-null values):
----------------------------------------
firstname
Tom         155 lbs.
Danny       155 lbs.
Nariman     155 lbs.
Darion      265 lbs.
David       265 lbs.
Hamdy       264 lbs.
Mansur      185 lbs.
Shamil      235 lbs.
Hiroyuki    145 lbs.
Daichi      170 lbs.
Name: weight, dtype: object
Weight column dtype: object

Data types after conversion:
----------------------------------------
height: float64
weight: float64
reach: float64
stance: float64
striking_accuracy: float64
striking_defense: float64
takedown_accuracy: float64
takedown_defense: float64

Sample of converted data:
----------------------------------------
           height  weight  reach  stance  striking_accuracy  striking_defense  \
firstname                                                                       
Tom           NaN   155.0    NaN     NaN                0.0               0.0   
Danny        71.0   155.0    NaN     1.0               38.0              5