In [1]:
# UFC Data Analysis - Fighters Data Cleaning
# Notebook 3: Limpieza de datos de luchadores

import re
import warnings
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
warnings.filterwarnings('ignore')

In [2]:
# 1. LOAD DATA
# Single source of truth for the input path so the error message below always
# names the file that was actually requested.
RAW_FIGHTERS_PATH = '../data/raw/raw_fighters.csv'

print("\n1. CARGANDO DATOS ORIGINALES...")
try:
    fighters_df = pd.read_csv(RAW_FIGHTERS_PATH)
except FileNotFoundError:
    # Report the missing path and stop: carrying on would only fail a few
    # lines later with a confusing NameError on `fighters_df`.
    print(f"❌ Error: No se encontró {RAW_FIGHTERS_PATH}")
    raise

print(f"✓ {len(fighters_df)} luchadores cargados")
print(f"Columnas: {list(fighters_df.columns)}")

# Work on a copy so the raw frame stays available for comparison
fighters_clean = fighters_df.copy()

print("\n2. ESTADO INICIAL DEL DATASET")
print("-" * 30)
print(f"Shape inicial: {fighters_clean.shape}")
print("Valores nulos por columna:")
print(fighters_clean.isnull().sum())


1. CARGANDO DATOS ORIGINALES...
✓ 4443 luchadores cargados
Columnas: ['fighter_id', 'first', 'last', 'nickname', 'height', 'weight', 'reach', 'stance', 'wins', 'defeats', 'draws', 'belt', 'dob', 'slpm', 'str_acc', 'sapm', 'str_def', 'td_avg', 'td_acc', 'td_def', 'sub_avg']

2. ESTADO INICIAL DEL DATASET
------------------------------
Shape inicial: (4443, 21)
Valores nulos por columna:
fighter_id       0
first           16
last             0
nickname      1979
height         354
weight          86
reach         1976
stance         869
wins             0
defeats          0
draws            0
belt             0
dob            762
slpm             0
str_acc          0
sapm             0
str_def          0
td_avg           0
td_acc           0
td_def           0
sub_avg          0
dtype: int64


In [3]:
# Show column dtypes and non-null counts of the raw frame before any cleaning
fighters_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4443 entries, 0 to 4442
Data columns (total 21 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   fighter_id  4443 non-null   object 
 1   first       4427 non-null   object 
 2   last        4443 non-null   object 
 3   nickname    2464 non-null   object 
 4   height      4089 non-null   object 
 5   weight      4357 non-null   object 
 6   reach       2467 non-null   object 
 7   stance      3574 non-null   object 
 8   wins        4443 non-null   int64  
 9   defeats     4443 non-null   int64  
 10  draws       4443 non-null   int64  
 11  belt        4443 non-null   bool   
 12  dob         3681 non-null   object 
 13  slpm        4443 non-null   float64
 14  str_acc     4443 non-null   object 
 15  sapm        4443 non-null   float64
 16  str_def     4443 non-null   object 
 17  td_avg      4443 non-null   float64
 18  td_acc      4443 non-null   object 
 19  td_def      4443 non-null  

In [4]:
# 3. LIMPIEZA DE NOMBRES
def clean_name(name):
    """Normalize a fighter name.

    Leading/trailing whitespace is removed, inner runs of whitespace are
    collapsed to a single space and the result is title-cased. Missing values
    (anything ``pd.isna`` reports as NA) are returned unchanged.
    """
    if pd.isna(name):
        return name
    trimmed = str(name).strip()
    single_spaced = re.sub(r'\s+', ' ', trimmed)
    return single_spaced.title()

# Clean first and last names
print("Limpiando nombres y apellidos...")
fighters_clean['first'] = fighters_clean['first'].apply(clean_name)
fighters_clean['last'] = fighters_clean['last'].apply(clean_name)

# Build the full name. Missing parts are turned into empty strings BEFORE the
# join; stringifying NaN and then deleting the text 'nan ' afterwards would
# also eat the end of real names such as "Hernan" or "Adnan".
full_name_column = (
    fighters_clean['first'].fillna('').astype(str)
    + ' '
    + fighters_clean['last'].fillna('').astype(str)
).str.strip()

# Put 'full_name' right before 'first'. If the column already exists (the cell
# was run before) just refresh it, because DataFrame.insert refuses duplicates.
if 'full_name' in fighters_clean.columns:
    fighters_clean['full_name'] = full_name_column
else:
    col_index = fighters_clean.columns.get_loc('first')
    fighters_clean.insert(col_index, 'full_name', full_name_column)



# Limpiar apodos
def clean_nickname(nickname):
    """Normalize a nickname, returning ``None`` when there is nothing usable.

    Surrounding whitespace and wrapping single/double quotes are removed.
    Missing values and the placeholder texts ``''``, ``'nan'`` and ``'none'``
    (case-insensitive) yield ``None``.
    """
    if pd.isna(nickname):
        return None
    # Strip whitespace, then wrapping quotes, then whitespace that was inside
    # the quotes. The placeholder check runs on the fully stripped text so
    # that values like ' nan ' or '"none"' are not kept as real nicknames.
    nickname = str(nickname).strip().strip('"\'').strip()
    if nickname.lower() in ('', 'nan', 'none'):
        return None
    return nickname

# Normalize every nickname, then report how many usable ones remain
cleaned_nicknames = fighters_clean['nickname'].apply(clean_nickname)
fighters_clean['nickname'] = cleaned_nicknames

valid_nickname_count = fighters_clean['nickname'].notna().sum()
print(f"✓ Nombres limpios. Apodos válidos: {valid_nickname_count}")



Limpiando nombres y apellidos...
✓ Nombres limpios. Apodos válidos: 2464


In [5]:
# Preview the first rows to check the new 'full_name' column and cleaned nicknames
fighters_clean.head(5)

Unnamed: 0,fighter_id,full_name,first,last,nickname,height,weight,reach,stance,wins,...,belt,dob,slpm,str_acc,sapm,str_def,td_avg,td_acc,td_def,sub_avg
0,ee457ef1e1c326c1,Ben Earwood,Ben,Earwood,,"5' 8""",170 lbs.,,Orthodox,12,...,False,,0.5,55%,0.2,33%,3.0,100%,0%,0.0
1,5713c1d2fac539ac,Cody East,Cody,East,The Freight Train,"6' 3""",245 lbs.,,Orthodox,12,...,False,"Jun 29, 1988",4.15,43%,7.32,40%,3.97,100%,16%,0.0
2,05866d8c3a321856,Marvin Eastman,Marvin,Eastman,The Beastman,"5' 9""",185 lbs.,"73.0""",Orthodox,18,...,False,"Jun 08, 1971",2.21,35%,3.92,42%,1.38,33%,81%,0.0
3,bc7230f231701d66,Mike Easton,Mike,Easton,The Hulk,"5' 6""",135 lbs.,"70.0""",Southpaw,13,...,False,"Jan 25, 1984",3.51,37%,4.28,61%,0.75,22%,66%,0.0
4,4acb99524a9a81ab,Maurice Eazel,Maurice,Eazel,,"5' 7""",135 lbs.,,,7,...,False,,0.0,0%,0.0,0%,0.0,0%,0%,0.0


In [6]:
# Drop the 'first' and 'last' columns, now redundant with 'full_name'.
# errors='ignore' plus rebinding (instead of inplace=True) keeps the cell
# re-runnable: a second execution is a no-op rather than a KeyError.
fighters_clean = fighters_clean.drop(columns=['first', 'last'], errors='ignore')
fighters_clean.head(5)

Unnamed: 0,fighter_id,full_name,nickname,height,weight,reach,stance,wins,defeats,draws,belt,dob,slpm,str_acc,sapm,str_def,td_avg,td_acc,td_def,sub_avg
0,ee457ef1e1c326c1,Ben Earwood,,"5' 8""",170 lbs.,,Orthodox,12,3,0,False,,0.5,55%,0.2,33%,3.0,100%,0%,0.0
1,5713c1d2fac539ac,Cody East,The Freight Train,"6' 3""",245 lbs.,,Orthodox,12,3,0,False,"Jun 29, 1988",4.15,43%,7.32,40%,3.97,100%,16%,0.0
2,05866d8c3a321856,Marvin Eastman,The Beastman,"5' 9""",185 lbs.,"73.0""",Orthodox,18,15,1,False,"Jun 08, 1971",2.21,35%,3.92,42%,1.38,33%,81%,0.0
3,bc7230f231701d66,Mike Easton,The Hulk,"5' 6""",135 lbs.,"70.0""",Southpaw,13,6,0,False,"Jan 25, 1984",3.51,37%,4.28,61%,0.75,22%,66%,0.0
4,4acb99524a9a81ab,Maurice Eazel,,"5' 7""",135 lbs.,,,7,15,0,False,,0.0,0%,0.0,0%,0.0,0%,0%,0.0


In [7]:
# 4. LIMPIEZA DE DATOS FÍSICOS (CONVERSIÓN A SISTEMA MÉTRICO)

def parse_height_to_cm(height_str):
    """Convert a feet-and-inches height to centimetres.

    Accepts text written like ``5' 8"`` (feet, apostrophe, optional inches).
    A value with no feet marker is treated as a bare number of inches.

    Returns the height in centimetres rounded to one decimal, or ``np.nan``
    when the input is missing or contains no digits.
    """
    if pd.isna(height_str):
        return np.nan

    height_str = str(height_str).strip()
    # The inches group is optional so that a feet-only value such as "6'" is
    # read as 6 ft 0 in instead of falling through to the bare-number branch
    # below, where it would be mistaken for 6 inches.
    match = re.search(r"(\d+)'\s*(\d+)?", height_str)
    if match:
        feet = int(match.group(1))
        inches = int(match.group(2) or 0)
        return round((feet * 12 + inches) * 2.54, 1)

    # No feet marker: assume the first number found is a height in inches
    numbers = re.findall(r'\d+', height_str)
    if numbers:
        return round(int(numbers[0]) * 2.54, 1)
    return np.nan

def parse_weight_to_kg(weight_str):
    """Convert a weight given in pounds to kilograms.

    The first integer or decimal number found in the text is taken as the
    weight in pounds. Returns kilograms rounded to one decimal, or ``np.nan``
    when the input is missing or holds no number.
    """
    if not pd.isna(weight_str):
        found = re.search(r'(\d+(?:\.\d+)?)', str(weight_str))
        if found is not None:
            pounds = float(found.group(1))
            return round(pounds * 0.453592, 1)
    return np.nan

def parse_reach_to_cm(reach_str):
    """Convert a reach given in inches to centimetres.

    The first integer or decimal number found in the text is taken as the
    reach in inches. Returns centimetres rounded to one decimal, or ``np.nan``
    when the input is missing or holds no number.
    """
    if pd.isna(reach_str):
        return np.nan

    found = re.search(r'(\d+(?:\.\d+)?)', str(reach_str))
    if found is None:
        return np.nan
    return round(float(found[1]) * 2.54, 1)

# Overwrite the imperial text columns with their metric equivalents
physical_parsers = {
    'height': parse_height_to_cm,
    'weight': parse_weight_to_kg,
    'reach': parse_reach_to_cm,
}
for physical_col, parser in physical_parsers.items():
    fighters_clean[physical_col] = fighters_clean[physical_col].apply(parser)

# Report how many values per column could be converted
total_fighters = len(fighters_clean)
parsed_counts = fighters_clean[list(physical_parsers)].notna().sum()
print(f"✓ Alturas procesadas: {parsed_counts['height']}/{total_fighters}")
print(f"✓ Pesos procesados: {parsed_counts['weight']}/{total_fighters}")
print(f"✓ Alcances procesados: {parsed_counts['reach']}/{total_fighters}")

✓ Alturas procesadas: 4089/4443
✓ Pesos procesados: 4357/4443
✓ Alcances procesados: 2467/4443


In [8]:
# 5. LIMPIEZA DE STANCE
def clean_stance(stance):
    """Normalize a stance label.

    The value is stripped and title-cased. Missing values and the placeholder
    texts ``''``, ``'Nan'`` and ``'None'`` (after title-casing) become
    ``'Unknown'``; every other label is returned in its title-cased form.
    """
    if pd.isna(stance):
        return 'Unknown'

    normalized = str(stance).strip().title()
    # Real stances ('Orthodox', 'Southpaw', 'Switch', 'Open Stance',
    # 'Sideways') map to themselves, so only placeholders need translating.
    placeholder_labels = ('', 'Nan', 'None')
    return 'Unknown' if normalized in placeholder_labels else normalized

# Normalize the stance column, overwriting the original values
normalized_stance = fighters_clean['stance'].apply(clean_stance)
fighters_clean['stance'] = normalized_stance

# Show how the fighters are distributed across the resulting labels
stance_counts = fighters_clean['stance'].value_counts()
print("Distribución de stance:", stance_counts, sep="\n")

Distribución de stance:
stance
Orthodox       2741
Unknown         869
Southpaw        605
Switch          218
Open Stance       7
Sideways          3
Name: count, dtype: int64


In [9]:
# Preview the first rows to check the metric conversions and normalized stance
fighters_clean.head(5)

Unnamed: 0,fighter_id,full_name,nickname,height,weight,reach,stance,wins,defeats,draws,belt,dob,slpm,str_acc,sapm,str_def,td_avg,td_acc,td_def,sub_avg
0,ee457ef1e1c326c1,Ben Earwood,,172.7,77.1,,Orthodox,12,3,0,False,,0.5,55%,0.2,33%,3.0,100%,0%,0.0
1,5713c1d2fac539ac,Cody East,The Freight Train,190.5,111.1,,Orthodox,12,3,0,False,"Jun 29, 1988",4.15,43%,7.32,40%,3.97,100%,16%,0.0
2,05866d8c3a321856,Marvin Eastman,The Beastman,175.3,83.9,185.4,Orthodox,18,15,1,False,"Jun 08, 1971",2.21,35%,3.92,42%,1.38,33%,81%,0.0
3,bc7230f231701d66,Mike Easton,The Hulk,167.6,61.2,177.8,Southpaw,13,6,0,False,"Jan 25, 1984",3.51,37%,4.28,61%,0.75,22%,66%,0.0
4,4acb99524a9a81ab,Maurice Eazel,,170.2,61.2,,Unknown,7,15,0,False,,0.0,0%,0.0,0%,0.0,0%,0%,0.0


In [10]:
# Fight-record columns to validate
record_columns = ['wins', 'defeats', 'draws']

# Coerce each record column that is present to a non-negative float:
# unparseable or missing entries become 0 and negative counts are raised to 0.
present_record_columns = [name for name in record_columns if name in fighters_clean.columns]
for col in present_record_columns:
    as_number = pd.to_numeric(fighters_clean[col], errors='coerce')
    fighters_clean[col] = as_number.fillna(0).clip(lower=0).astype(float)

# Total fights = wins + defeats + draws
columns_to_sum = ['wins', 'defeats', 'draws']
total_per_fighter = fighters_clean[columns_to_sum].sum(axis=1)
fighters_clean['total_fights'] = total_per_fighter
fighters_clean.head(5)

Unnamed: 0,fighter_id,full_name,nickname,height,weight,reach,stance,wins,defeats,draws,...,dob,slpm,str_acc,sapm,str_def,td_avg,td_acc,td_def,sub_avg,total_fights
0,ee457ef1e1c326c1,Ben Earwood,,172.7,77.1,,Orthodox,12.0,3.0,0.0,...,,0.5,55%,0.2,33%,3.0,100%,0%,0.0,15.0
1,5713c1d2fac539ac,Cody East,The Freight Train,190.5,111.1,,Orthodox,12.0,3.0,0.0,...,"Jun 29, 1988",4.15,43%,7.32,40%,3.97,100%,16%,0.0,15.0
2,05866d8c3a321856,Marvin Eastman,The Beastman,175.3,83.9,185.4,Orthodox,18.0,15.0,1.0,...,"Jun 08, 1971",2.21,35%,3.92,42%,1.38,33%,81%,0.0,34.0
3,bc7230f231701d66,Mike Easton,The Hulk,167.6,61.2,177.8,Southpaw,13.0,6.0,0.0,...,"Jan 25, 1984",3.51,37%,4.28,61%,0.75,22%,66%,0.0,19.0
4,4acb99524a9a81ab,Maurice Eazel,,170.2,61.2,,Unknown,7.0,15.0,0.0,...,,0.0,0%,0.0,0%,0.0,0%,0%,0.0,22.0


In [11]:

# 7. LIMPIEZA DE FECHA DE NACIMIENTO

def parse_dob(dob_str):
    """Parse a date of birth written as ``'%b %d, %Y'`` (e.g. ``Jun 29, 1988``).

    Returns a ``pd.Timestamp``, or ``pd.NaT`` when the value is missing or
    does not match the expected format. A value that is already a
    ``pd.Timestamp`` is returned unchanged, so applying the function twice
    does not wipe previously parsed dates.
    """
    if pd.isna(dob_str):
        return pd.NaT
    if isinstance(dob_str, pd.Timestamp):
        return dob_str

    try:
        return pd.to_datetime(str(dob_str).strip(), format='%b %d, %Y', errors='raise')
    except (ValueError, TypeError):
        # Only parsing failures mean "no usable date"; a bare `except:` would
        # also swallow KeyboardInterrupt and genuine programming errors.
        return pd.NaT

# Overwrite dob with the parsed datetime version
fighters_clean['dob'] = fighters_clean['dob'].apply(parse_dob)

# Current age in completed years. Subtracting calendar years alone overstates
# the age by one for everyone whose birthday has not yet happened this year,
# so take one year off in that case.
today = datetime.now()
current_year = today.year
dob_dates = pd.to_datetime(fighters_clean['dob'])
birthday_pending = (dob_dates.dt.month > today.month) | (
    (dob_dates.dt.month == today.month) & (dob_dates.dt.day > today.day)
)
fighters_clean['age'] = (
    (current_year - dob_dates.dt.year - birthday_pending.astype(int))
    .where(dob_dates.notna())  # unknown dob -> unknown age
    .astype(float)
)

fighters_clean.head(5)

Unnamed: 0,fighter_id,full_name,nickname,height,weight,reach,stance,wins,defeats,draws,...,slpm,str_acc,sapm,str_def,td_avg,td_acc,td_def,sub_avg,total_fights,age
0,ee457ef1e1c326c1,Ben Earwood,,172.7,77.1,,Orthodox,12.0,3.0,0.0,...,0.5,55%,0.2,33%,3.0,100%,0%,0.0,15.0,
1,5713c1d2fac539ac,Cody East,The Freight Train,190.5,111.1,,Orthodox,12.0,3.0,0.0,...,4.15,43%,7.32,40%,3.97,100%,16%,0.0,15.0,37.0
2,05866d8c3a321856,Marvin Eastman,The Beastman,175.3,83.9,185.4,Orthodox,18.0,15.0,1.0,...,2.21,35%,3.92,42%,1.38,33%,81%,0.0,34.0,54.0
3,bc7230f231701d66,Mike Easton,The Hulk,167.6,61.2,177.8,Southpaw,13.0,6.0,0.0,...,3.51,37%,4.28,61%,0.75,22%,66%,0.0,19.0,41.0
4,4acb99524a9a81ab,Maurice Eazel,,170.2,61.2,,Unknown,7.0,15.0,0.0,...,0.0,0%,0.0,0%,0.0,0%,0%,0.0,22.0,


In [12]:
# 8. COMBAT STATISTICS CLEANING
# Statistics to clean; the percent_stats subset arrives as text such as '55%'
combat_stats = ['slpm', 'str_acc', 'sapm', 'str_def', 'td_avg', 'td_acc', 'td_def', 'sub_avg']
percent_stats = ['str_acc', 'str_def', 'td_acc', 'td_def']

for stat in combat_stats:
    if stat not in fighters_clean.columns:
        continue

    needs_percent_parse = stat in percent_stats and not pd.api.types.is_numeric_dtype(fighters_clean[stat])
    if needs_percent_parse:
        # Text percentage -> fraction in [0, 1]. The dtype guard above makes
        # the cell safe to re-run: a column that is already numeric has been
        # converted before and must not be divided by 100 a second time.
        without_sign = fighters_clean[stat].astype(str).str.replace('%', '', regex=False)
        fighters_clean[stat] = pd.to_numeric(without_sign, errors='coerce') / 100
    else:
        fighters_clean[stat] = pd.to_numeric(fighters_clean[stat], errors='coerce')

fighters_clean.head(5)

Unnamed: 0,fighter_id,full_name,nickname,height,weight,reach,stance,wins,defeats,draws,...,slpm,str_acc,sapm,str_def,td_avg,td_acc,td_def,sub_avg,total_fights,age
0,ee457ef1e1c326c1,Ben Earwood,,172.7,77.1,,Orthodox,12.0,3.0,0.0,...,0.5,0.55,0.2,0.33,3.0,1.0,0.0,0.0,15.0,
1,5713c1d2fac539ac,Cody East,The Freight Train,190.5,111.1,,Orthodox,12.0,3.0,0.0,...,4.15,0.43,7.32,0.4,3.97,1.0,0.16,0.0,15.0,37.0
2,05866d8c3a321856,Marvin Eastman,The Beastman,175.3,83.9,185.4,Orthodox,18.0,15.0,1.0,...,2.21,0.35,3.92,0.42,1.38,0.33,0.81,0.0,34.0,54.0
3,bc7230f231701d66,Mike Easton,The Hulk,167.6,61.2,177.8,Southpaw,13.0,6.0,0.0,...,3.51,0.37,4.28,0.61,0.75,0.22,0.66,0.0,19.0,41.0
4,4acb99524a9a81ab,Maurice Eazel,,170.2,61.2,,Unknown,7.0,15.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.0,


In [13]:
# Cast every remaining column to float, leaving out identifiers, text, the
# 'belt' flag, the 'dob' dates and anything already categorical
exclude_cols = ['fighter_id', 'full_name', 'nickname', 'stance', 'belt', 'dob']
numeric_cols = [col for col in fighters_clean.columns if col not in exclude_cols and fighters_clean[col].dtype != 'category']

for col in numeric_cols:
    fighters_clean[col] = pd.to_numeric(fighters_clean[col], errors='coerce').astype(float)

# Store 'stance' as a categorical column
fighters_clean['stance'] = fighters_clean['stance'].astype('category')

# Store names with pandas' dedicated 'string' dtype (not plain object)
fighters_clean['full_name'] = fighters_clean['full_name'].astype('string')
fighters_clean['nickname'] = fighters_clean['nickname'].astype('string')

# Use 'fighter_id' as the index. Once it has been moved into the index it is
# no longer a column, so guard the call to keep the cell re-runnable.
if 'fighter_id' in fighters_clean.columns:
    fighters_clean = fighters_clean.set_index('fighter_id')

fighters_clean.head(5)

Unnamed: 0_level_0,full_name,nickname,height,weight,reach,stance,wins,defeats,draws,belt,...,slpm,str_acc,sapm,str_def,td_avg,td_acc,td_def,sub_avg,total_fights,age
fighter_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ee457ef1e1c326c1,Ben Earwood,,172.7,77.1,,Orthodox,12.0,3.0,0.0,False,...,0.5,0.55,0.2,0.33,3.0,1.0,0.0,0.0,15.0,
5713c1d2fac539ac,Cody East,The Freight Train,190.5,111.1,,Orthodox,12.0,3.0,0.0,False,...,4.15,0.43,7.32,0.4,3.97,1.0,0.16,0.0,15.0,37.0
05866d8c3a321856,Marvin Eastman,The Beastman,175.3,83.9,185.4,Orthodox,18.0,15.0,1.0,False,...,2.21,0.35,3.92,0.42,1.38,0.33,0.81,0.0,34.0,54.0
bc7230f231701d66,Mike Easton,The Hulk,167.6,61.2,177.8,Southpaw,13.0,6.0,0.0,False,...,3.51,0.37,4.28,0.61,0.75,0.22,0.66,0.0,19.0,41.0
4acb99524a9a81ab,Maurice Eazel,,170.2,61.2,,Unknown,7.0,15.0,0.0,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.0,


In [14]:
# Show the final dtypes and non-null counts after all conversions
fighters_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4443 entries, ee457ef1e1c326c1 to 0c277f3ff66b0208
Data columns (total 21 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   full_name     4443 non-null   string        
 1   nickname      2464 non-null   string        
 2   height        4089 non-null   float64       
 3   weight        4357 non-null   float64       
 4   reach         2467 non-null   float64       
 5   stance        4443 non-null   category      
 6   wins          4443 non-null   float64       
 7   defeats       4443 non-null   float64       
 8   draws         4443 non-null   float64       
 9   belt          4443 non-null   bool          
 10  dob           3681 non-null   datetime64[ns]
 11  slpm          4443 non-null   float64       
 12  str_acc       4443 non-null   float64       
 13  sapm          4443 non-null   float64       
 14  str_def       4443 non-null   float64       
 15  td_avg        44

---
### Limpieza y depuración de datos de luchadores

En esta sección se documentan los pasos realizados para limpiar, transformar y depurar el dataset de luchadores. Se explican las decisiones tomadas y se justifica cada transformación aplicada para asegurar la calidad y consistencia de los datos.

In [15]:
# Null-value review and general summary of the cleaned dataset
print("Valores nulos por columna en fighters_clean:")
print(fighters_clean.isnull().sum())

print("\nPorcentaje de valores nulos por columna:")
print((fighters_clean.isnull().mean() * 100).round(1))

print("\nResumen estadístico de columnas numéricas:")
print(fighters_clean.describe(include=[np.number]))

# By this point the text columns use the 'string' dtype and 'stance' is
# 'category', so selecting 'object' would match nothing and only 'belt' would
# be summarized. Describe everything that is neither numeric nor a date.
print("\nResumen de columnas categóricas:")
print(fighters_clean.describe(exclude=[np.number, 'datetime']))

Valores nulos por columna en fighters_clean:
full_name          0
nickname        1979
height           354
weight            86
reach           1976
stance             0
wins               0
defeats            0
draws              0
belt               0
dob              762
slpm               0
str_acc            0
sapm               0
str_def            0
td_avg             0
td_acc             0
td_def             0
sub_avg            0
total_fights       0
age              762
dtype: int64

Porcentaje de valores nulos por columna:
full_name        0.0
nickname        44.5
height           8.0
weight           1.9
reach           44.5
stance           0.0
wins             0.0
defeats          0.0
draws            0.0
belt             0.0
dob             17.2
slpm             0.0
str_acc          0.0
sapm             0.0
str_def          0.0
td_avg           0.0
td_acc           0.0
td_def           0.0
sub_avg          0.0
total_fights     0.0
age             17.2
dtype: float64

Re

In [16]:

## 12. FINAL VALIDATION

# Drop rows whose index (fighter_id) is duplicated, keeping the first one
initial_count = len(fighters_clean)
fighters_clean = fighters_clean[~fighters_clean.index.duplicated(keep='first')]
final_count = len(fighters_clean)

print(f"Registros eliminados por duplicados: {initial_count - final_count}")

# Validate the integrity of critical data
critical_issues = 0

# fighter_id must be neither null nor repeated
null_ids = fighters_clean.index.isnull().sum()
if null_ids > 0:
    print(f"⚠️ {null_ids} registros con fighter_id nulo")
    critical_issues += null_ids

# Sanity check on the de-duplication above
repeated_ids = fighters_clean.index.duplicated().sum()
if repeated_ids > 0:
    print(f"⚠️ {repeated_ids} fighter_id repetidos tras limpieza")
    critical_issues += repeated_ids

# Names: count both blank and missing values. On the 'string' dtype a missing
# name compares to '' as <NA>, which sum() skips, so fill it in first.
null_names = fighters_clean['full_name'].fillna('').str.strip().eq('').sum()
if null_names > 0:
    print(f"⚠️ {null_names} registros sin nombre completo")

print(f"\n✓ Validación completada. Problemas críticos: {critical_issues}")


Registros eliminados por duplicados: 0

✓ Validación completada. Problemas críticos: 0


In [17]:
# Output locations. Create the target folders up front so the notebook does
# not fail on its very last step when run on a fresh checkout.
processed_path = Path('../data/processed/fighters.csv')
ml_path = Path('../data/ml/fighters.csv')
for output_path in (processed_path, ml_path):
    output_path.parent.mkdir(parents=True, exist_ok=True)

# Save the complete cleaned dataframe
fighters_clean.to_csv(processed_path)

# ML dataset: leave out the 'string'-typed columns and 'dob'
ml_cols = [col for col in fighters_clean.columns if fighters_clean[col].dtype != 'string' and col != 'dob']
fighters_ml = fighters_clean[ml_cols]

# Drop fighters without weight or height
fighters_ml = fighters_ml.dropna(subset=['weight', 'height'])

# Encode category columns as their integer codes
for col in fighters_ml.select_dtypes(['category']).columns:
    fighters_ml[col] = fighters_ml[col].cat.codes

# Encode bool columns as 0/1
for col in fighters_ml.select_dtypes(['bool']).columns:
    fighters_ml[col] = fighters_ml[col].astype(int)

fighters_ml.to_csv(ml_path)