---
### Limpieza y depuración de datos de eventos

En esta sección se documentan los pasos realizados para limpiar, transformar y depurar el dataset de eventos. Se explican las decisiones tomadas y se justifica cada transformación aplicada para asegurar la calidad y consistencia de los datos.

In [1]:
# 1. Importar librerías necesarias
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
# 2. Cargar datos de eventos
print("Cargando datos de eventos...")
events_df = pd.read_csv('../data/raw/raw_events.csv')
print(f"Eventos cargados: {len(events_df)}")
print(f"Columnas: {list(events_df.columns)}")
events_df.head()

Cargando datos de eventos...
Eventos cargados: 746
Columnas: ['event_id', 'name', 'date', 'location']


Unnamed: 0,event_id,name,date,location
0,5efaaf313b652dd7,UFC Fight Night: Lopes vs. Silva,"September 13, 2025","San Antonio, Texas, USA"
1,6e380a4d73ab4f0e,UFC Fight Night: Imavov vs. Borralho,"September 06, 2025","Paris, Ile-de-France, France"
2,754968e325d6f60d,UFC Fight Night: Walker vs. Zhang,"August 23, 2025","Shanghai, Hebei, China"
3,421ccfc6ddb17958,UFC 319: Du Plessis vs. Chimaev,"August 16, 2025","Chicago, Illinois, USA"
4,6cd3dfc54f01287f,UFC Fight Night: Dolidze vs. Hernandez,"August 09, 2025","Las Vegas, Nevada, USA"


In [3]:
# --- Cargar también los eventos próximos (upcoming) ---
try:
    upcoming_df = pd.read_csv('../data/raw/raw_upcoming.csv')
    print(f"Eventos próximos cargados: {len(upcoming_df)}")
    print(f"Columnas: {list(upcoming_df.columns)}")
    upcoming_df.head()
except FileNotFoundError:
    print("No se encontró raw_upcoming.csv. Solo se limpiarán eventos completados.")
    upcoming_df = None

Eventos próximos cargados: 9
Columnas: ['event_id', 'name', 'date', 'location']


In [4]:
# 3. Conversión de fechas a datetime
print("Convirtiendo columna 'date' a datetime...")
events_df['date'] = pd.to_datetime(events_df['date'], errors='coerce')
print(f"Fechas no convertidas (NaT): {events_df['date'].isna().sum()}")
events_df.head()

Convirtiendo columna 'date' a datetime...
Fechas no convertidas (NaT): 0


Unnamed: 0,event_id,name,date,location
0,5efaaf313b652dd7,UFC Fight Night: Lopes vs. Silva,2025-09-13,"San Antonio, Texas, USA"
1,6e380a4d73ab4f0e,UFC Fight Night: Imavov vs. Borralho,2025-09-06,"Paris, Ile-de-France, France"
2,754968e325d6f60d,UFC Fight Night: Walker vs. Zhang,2025-08-23,"Shanghai, Hebei, China"
3,421ccfc6ddb17958,UFC 319: Du Plessis vs. Chimaev,2025-08-16,"Chicago, Illinois, USA"
4,6cd3dfc54f01287f,UFC Fight Night: Dolidze vs. Hernandez,2025-08-09,"Las Vegas, Nevada, USA"


In [5]:
# 4. Normalización de nombres y ubicaciones
# Limpiar espacios y capitalizar nombres y ubicaciones
def clean_text(val):
    if pd.isna(val):
        return val
    val = str(val).strip()
    val = ' '.join(val.split())
    return val.title()

events_df['name'] = events_df['name'].apply(clean_text)
events_df['location'] = events_df['location'].apply(clean_text)

# Extraer país de la columna location (después del último ',')
def extract_country(location):
    if pd.isna(location):
        return np.nan
    parts = location.split(',')
    if len(parts) < 2:
        return np.nan
    return parts[-1].strip()

events_df['country'] = events_df['location'].apply(extract_country)

events_df.head()

Unnamed: 0,event_id,name,date,location,country
0,5efaaf313b652dd7,Ufc Fight Night: Lopes Vs. Silva,2025-09-13,"San Antonio, Texas, Usa",Usa
1,6e380a4d73ab4f0e,Ufc Fight Night: Imavov Vs. Borralho,2025-09-06,"Paris, Ile-De-France, France",France
2,754968e325d6f60d,Ufc Fight Night: Walker Vs. Zhang,2025-08-23,"Shanghai, Hebei, China",China
3,421ccfc6ddb17958,Ufc 319: Du Plessis Vs. Chimaev,2025-08-16,"Chicago, Illinois, Usa",Usa
4,6cd3dfc54f01287f,Ufc Fight Night: Dolidze Vs. Hernandez,2025-08-09,"Las Vegas, Nevada, Usa",Usa


In [6]:
# 5. Validación y limpieza de valores nulos
print("Valores nulos por columna:")
print(events_df[['event_id', 'name', 'date']].isnull().sum())

# Eliminar eventos sin id, nombre o fecha
initial_count = len(events_df)
events_df = events_df.dropna(subset=['event_id', 'name', 'date'])
final_count = len(events_df)
print(f"Eventos eliminados por nulos críticos: {initial_count - final_count}")
events_df.head()

Valores nulos por columna:
event_id    0
name        0
date        0
dtype: int64
Eventos eliminados por nulos críticos: 0


Unnamed: 0,event_id,name,date,location,country
0,5efaaf313b652dd7,Ufc Fight Night: Lopes Vs. Silva,2025-09-13,"San Antonio, Texas, Usa",Usa
1,6e380a4d73ab4f0e,Ufc Fight Night: Imavov Vs. Borralho,2025-09-06,"Paris, Ile-De-France, France",France
2,754968e325d6f60d,Ufc Fight Night: Walker Vs. Zhang,2025-08-23,"Shanghai, Hebei, China",China
3,421ccfc6ddb17958,Ufc 319: Du Plessis Vs. Chimaev,2025-08-16,"Chicago, Illinois, Usa",Usa
4,6cd3dfc54f01287f,Ufc Fight Night: Dolidze Vs. Hernandez,2025-08-09,"Las Vegas, Nevada, Usa",Usa


In [7]:
# 6. Conversión de columnas categóricas
for col in ['location', 'country']:
    if col in events_df.columns:
        events_df[col] = events_df[col].astype('category')
        
events_df['name'] = events_df['name'].astype('string')
# Establecer 'event_id' como índice antes de guardar
if 'event_id' in events_df.columns:
    events_df.set_index('event_id', inplace=True)
events_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 746 entries, 5efaaf313b652dd7 to a6a9ab5a824e8f66
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   name      746 non-null    string        
 1   date      746 non-null    datetime64[ns]
 2   location  746 non-null    category      
 3   country   746 non-null    category      
dtypes: category(2), datetime64[ns](1), string(1)
memory usage: 26.4+ KB


In [8]:
# 7. Guardar dataset limpio
events_df.to_csv('../data/processed/events.csv')

In [9]:
# --- Guardar dataset limpio de eventos próximos (upcoming) ---
if upcoming_df is not None:
    # Repetir limpieza básica para upcoming_df
    upcoming_df['date'] = pd.to_datetime(upcoming_df['date'], errors='coerce')
    for col in ['location']:
        if col in upcoming_df.columns:
            upcoming_df[col] = upcoming_df[col].astype('category')
    upcoming_df['name'] = upcoming_df['name'].astype('string')
    if 'event_id' in upcoming_df.columns:
        upcoming_df.set_index('event_id', inplace=True)
    # Eliminar eventos sin id, nombre o fecha
    upcoming_df = upcoming_df.dropna(subset=['name', 'date'])
    # Añadir columna winner_id vacía
    upcoming_df['winner_id'] = ''
    # Guardar
    upcoming_df.to_csv('../data/processed/upcoming.csv')

In [10]:
# 8. Guardar dataset para ML (columnas categóricas como int, sin nulos críticos)
ml_events_df = events_df.copy()
ml_upcoming_df = upcoming_df.copy()

# Convertir columnas categóricas a códigos numéricos para ML
for col in ['location', 'country']:
    if col in ml_events_df.columns and ml_events_df[col].dtype.name == 'category':
        ml_events_df[col] = ml_events_df[col].cat.codes.astype(int)

# Eliminar filas con nulos críticos para ML
ml_events_df = ml_events_df.dropna(subset=['name', 'date'])
ml_upcoming_df = ml_upcoming_df.dropna(subset=['name', 'date'])

# Guardar
ml_events_df.to_csv('../data/ml/events.csv')
ml_upcoming_df.to_csv('../data/ml/upcoming.csv')