# Análisis Exploratorio y Preparación de Datos - F1

In [2]:
import pandas as pd
import numpy as np

In [None]:
#configurar para mostrar las tablas completas y no cortadas
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.expand_frame_repr', False)

# Cargar el archivo CSV
df = pd.read_csv("../dataset/F1_Datos.csv", encoding='utf-8', low_memory=False)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26759 entries, 0 to 26758
Data columns (total 55 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   raceId           26759 non-null  int64  
 1   year             26759 non-null  int64  
 2   round            26759 non-null  int64  
 3   circuitId        26759 non-null  int64  
 4   name_x           26759 non-null  object 
 5   date             26759 non-null  object 
 6   time_x           8290 non-null   object 
 7   url_x_x          26759 non-null  object 
 8   fp1_date         1799 non-null   object 
 9   fp1_time         1359 non-null   object 
 10  fp2_date         1799 non-null   object 
 11  fp2_time         1359 non-null   object 
 12  fp3_date         1439 non-null   object 
 13  fp3_time         1059 non-null   object 
 14  quali_date       1799 non-null   object 
 15  quali_time       1359 non-null   object 
 16  sprint_date      360 non-null    object 
 17  sprint_time 

In [4]:
# Ver valores nulos por columna
# Paso 1: Verificar columnas con más del 90% de valores nulos
null_percent = df.isnull().mean().sort_values(ascending=False) * 100
null_summary = pd.DataFrame({
    'Column': null_percent.index,
    'Percent_Null': null_percent.values,
    'Data_Type': df.dtypes[null_percent.index].values
})
print(null_summary[null_summary['Percent_Null'] > 90].sort_values(by='Percent_Null', ascending=False))

        Column  Percent_Null Data_Type
0  sprint_time     98.878882    object
1  sprint_date     98.654658    object
2     fp3_time     96.042453    object
3     fp1_time     94.921335    object
4     fp2_time     94.921335    object
5   quali_time     94.921335    object
6     fp3_date     94.622370    object
7     fp2_date     93.277028    object
8   quali_date     93.277028    object
9     fp1_date     93.277028    object


In [5]:
# Paso 2: Definir columnas a eliminar
cols_to_drop= [
    'url_x_x', 'url_y_x', 'url_x_y', 'url_y_y',  # URLs
    'sprint_time', 'sprint_date',
    'fp3_time', 'fp3_date', # Tiempos de prácticas
    'fp2_time', 'fp2_date',
    'fp1_time', 'fp1_date', 
    'quali_time', 'quali_date', # Tiempos de clasificación
    ]

# Paso 3: Eliminar columnas
df_cleaned = df.drop(columns=cols_to_drop)

# limpiar los pilotos que nunca largaron
df_cleaned = df_cleaned[df_cleaned['grid'] > 0]

# Mostrar resumen
print(f"Columnas eliminadas: {len(cols_to_drop)}")
print(f"Columnas restantes: {df_cleaned.shape[1]}")
print("Ejemplo de columnas restantes:", df_cleaned.columns.tolist())

Columnas eliminadas: 14
Columnas restantes: 41
Ejemplo de columnas restantes: ['raceId', 'year', 'round', 'circuitId', 'name_x', 'date', 'time_x', 'circuitRef', 'name_y', 'location', 'country', 'lat', 'lng', 'alt', 'resultId', 'driverId', 'constructorId', 'number_x', 'grid', 'position', 'positionText', 'positionOrder', 'points', 'laps', 'time_y', 'milliseconds', 'fastestLap', 'rank', 'fastestLapTime', 'fastestLapSpeed', 'statusId', 'constructorRef', 'name', 'nationality_x', 'driverRef', 'number_y', 'code', 'forename', 'surname', 'dob', 'nationality_y']


In [6]:
# --- Normalizacion ---

# 1. Normalizar fecha y hora de la carrera
df_cleaned['race_datetime'] = pd.to_datetime(
    df_cleaned['date'] + ' ' + df_cleaned['time_x'], 
    errors='coerce'
)

# 2. Normalizar fecha y hora de finalización de la carrera
df_cleaned['result_datetime'] = pd.to_datetime(
    df_cleaned['date'] + ' ' + df_cleaned['time_y'], 
    errors='coerce'
)

# 3. Extraer componentes temporales de race_datetime
df_cleaned['race_year']    = df_cleaned['race_datetime'].dt.year
df_cleaned['race_month']   = df_cleaned['race_datetime'].dt.month
df_cleaned['race_day']     = df_cleaned['race_datetime'].dt.day
df_cleaned['race_weekday'] = df_cleaned['race_datetime'].dt.day_name()
df_cleaned['race_hour']    = df_cleaned['race_datetime'].dt.hour

#birth_date del piloto al momento de la carrera a partir de dob
df_cleaned['birth_date'] = pd.to_datetime(df_cleaned['dob'], errors='coerce')
# Calcular la edad del piloto al momento de la carrera
df_cleaned['driver_age'] = (df_cleaned['race_datetime'] - df_cleaned['birth_date']).dt.days // 365


# 4. Función para convertir "M:SS.sss" → segundos
def parse_lap_time(t):
    try:
        m, s = t.split(':')
        return int(m) * 60 + float(s)
    except:
        return np.nan

# 5. Aplicar a la columna fastestLapTime
#    (asegúrate de que 'fastestLapTime' exista en df_cleaned)
df_cleaned['fastestLapTime_sec'] = df_cleaned['fastestLapTime'].apply(parse_lap_time)

# 6. Verificación rápida de las nuevas columnas
print(df_cleaned[[
    'date','time_x','race_datetime',
    'time_y','result_datetime',
    'race_year','race_month','race_day','race_weekday','race_hour',
    'fastestLapTime','fastestLapTime_sec'
]].head(5))

         date    time_x       race_datetime       time_y         result_datetime  race_year  race_month  race_day race_weekday  race_hour fastestLapTime  fastestLapTime_sec
0  2009-03-29  06:00:00 2009-03-29 06:00:00  1:34:15.784 2009-03-29 01:34:15.784     2009.0         3.0      29.0       Sunday        6.0       1:28.020              88.020
1  2009-03-29  06:00:00 2009-03-29 06:00:00       +0.807                     NaT     2009.0         3.0      29.0       Sunday        6.0       1:29.066              89.066
2  2009-03-29  06:00:00 2009-03-29 06:00:00       +1.604                     NaT     2009.0         3.0      29.0       Sunday        6.0       1:28.916              88.916
3  2009-03-29  06:00:00 2009-03-29 06:00:00       +4.435                     NaT     2009.0         3.0      29.0       Sunday        6.0       1:28.416              88.416
4  2009-03-29  06:00:00 2009-03-29 06:00:00       +4.879                     NaT     2009.0         3.0      29.0       Sunday        6

🕓 Fechas y Horas
| Columna         | Tipo         | ¿Qué representa?                                            | ¿Para qué sirve?                                                                 |
|-----------------|--------------|-------------------------------------------------------------|----------------------------------------------------------------------------------|
| `race_datetime` | datetime64   | Fecha y hora completa del inicio de la carrera              | Para ordenar eventos cronológicamente, calcular tiempos entre carreras, etc.    |
| `result_datetime` | datetime64 | Fecha y hora estimada de finalización del resultado         | Permite estimar duración de carrera o detectar anomalías si se compara con inicio |

📆 Componentes Temporales
| Columna         | Tipo     | ¿Qué representa?                         | ¿Para qué sirve?                                                                 |
|-----------------|----------|------------------------------------------|----------------------------------------------------------------------------------|
| `race_year`     | int      | Año en que se corrió la carrera          | Agrupar por año, o excluirlo para evitar sesgos históricos                      |
| `race_month`    | int      | Mes de la carrera (1 a 12)               | Analizar patrones estacionales de rendimiento o DNF                            |
| `race_day`      | int      | Día del mes de la carrera                | Menos útil por sí solo, pero útil combinado con otros                         |
| `race_weekday`  | string   | Día de la semana (ej: "Sunday")          | Útil para ver diferencias entre carreras en sábado o domingo                   |
| `race_hour`     | int      | Hora local de inicio de la carrera (0–23)| Analizar influencia del horario en el rendimiento                              |

⏱️ Tiempos de Vuelta Rápida
| Columna              | Tipo     | ¿Qué representa?                                      | ¿Para qué sirve?                                                                 |
|----------------------|----------|-------------------------------------------------------|----------------------------------------------------------------------------------|
| `fastestLapTime`     | string   | Tiempo de vuelta rápida como texto (ej: "1:22.345")  | No usable directamente sin convertirlo                                          |
| `fastestLapTime_sec` | float64  | Tiempo de vuelta rápida en segundos (ej: 82.345)     | Ideal para análisis cuantitativo y comparaciones entre pilotos o circuitos      |


In [7]:
import pandas as pd

# Definir que cuando 'position' es NaN, es un DNF
df_cleaned['DNF'] = df_cleaned['position'].isna().astype(int)

# Etiqueta de victoria
df_cleaned['win'] = (df_cleaned['positionOrder'] == 1).astype(int)

# Definir el sistema de puntos actual
current_scoring = {
    1: 25, 2: 18, 3: 15, 4: 12, 5: 10,
    6: 8, 7: 6, 8: 4, 9: 2, 10: 1
}

# Clasificar circuitos -> street, permanent, mixed
circuit_types = {
    'street': ['monaco', 'marina_bay', 'baku', 'miami', 'vegas', 'jeddah',
               'phoenix', 'long_beach', 'detroit', 'dallas', 'las_vegas',
               'montjuic', 'ain-diab', 'boavista', 'monsanto', 'sebring',
               'pescara', 'pedralbes', 'reims'],
    'permanent': ['silverstone', 'monza', 'spa', 'suzuka', 'catalunya', 'sepang',
                  'shanghai', 'bahrain', 'interlagos', 'villeneuve', 'hockenheimring',
                  'fuji', 'indianapolis', 'imola', 'red_bull_ring', 'galvez',
                  'jerez', 'estoril', 'okayama', 'kyalami', 'ricard', 'yeongam',
                  'brands_hatch', 'zandvoort', 'zolder', 'dijon', 'jarama',
                  'watkins_glen', 'anderstorp', 'mosport', 'tremblant', 'essarts',
                  'george', 'zeltweg', 'aintree', 'riverside', 'buddh', 'americas',
                  'sochi', 'mugello', 'portimao', 'losail'],
    'mixed': ['albert_park', 'valencia', 'yas_marina', 'nurburgring', 'istanbul',
              'magny_cours', 'donington', 'nivelles', 'charade', 'lemans',
              'avus', 'bremgarten']
}

# Clasificación por tipo de circuito
def classify_circuit(circuit_name):
    circuit_name = circuit_name.lower()
    for circuit_type, names in circuit_types.items():
        if any(name in circuit_name for name in names):
            return circuit_type
    return 'other'

df_cleaned['circuit_type'] = df_cleaned['circuitRef'].apply(classify_circuit)

# Mapear posición a puntos reales
df_cleaned['real_points'] = (
    df_cleaned['positionOrder'].map(current_scoring).fillna(0).astype(int)
)

# Cambio de posición
if 'positionOrder' in df_cleaned.columns:
    df_cleaned['position_change'] = (
        df_cleaned['grid'] - df_cleaned['positionOrder']
    ).where(df_cleaned['positionOrder'].notna())
else:
    raise ValueError("La columna 'positionOrder' no existe en df_cleaned")

# Gap con el líder
df_cleaned['gap_to_leader'] = (
    df_cleaned['milliseconds'] - df_cleaned.groupby('raceId')['milliseconds'].transform('min')
)

# Gap con compañero de equipo
df_cleaned['gap_to_team'] = (
    df_cleaned['milliseconds'] - df_cleaned.groupby(['raceId', 'constructorRef'])['milliseconds'].transform('min')
)

# Estadísticas por piloto
driver_stats = (
    df_cleaned
    .groupby('driverRef')
    .agg(
        avg_gap_to_team     = ('gap_to_team', 'mean'),
        avg_gap_to_leader   = ('gap_to_leader', 'mean'),
        avg_points          = ('points', 'mean'),
        total_points        = ('points', 'sum'),
        avg_real_points     = ('real_points', 'mean'),
        total_real_points   = ('real_points', 'sum'),
        races               = ('points', 'count'),
        avg_position_change = ('position_change', 'mean'),
        avg_position        = ('position', 'mean'),
        avg_grid            = ('grid', 'mean'),
        wins                = ('win', 'sum'),
        dnfs                = ('DNF', 'sum'),
    )
    .reset_index()
)

# Tasas
driver_stats['dnf_rate'] = driver_stats['dnfs'] / driver_stats['races']
driver_stats['win_rate'] = driver_stats['wins'] / driver_stats['races']

# Agrupar por piloto y circuito
grp = df_cleaned.groupby(['driverRef', 'circuitRef'])

# Carreras previas
df_cleaned['circuit_prev_races'] = grp.cumcount()

# Usar transform en lugar de apply para evitar errores de índice
df_cleaned['circuit_prev_dnfs'] = grp['DNF'].transform(lambda x: x.shift().expanding().sum())
df_cleaned['circuit_prev_wins'] = grp['win'].transform(lambda x: x.shift().expanding().sum())
df_cleaned['circuit_prev_avg_pos'] = grp['positionOrder'].transform(lambda x: x.shift().expanding().mean())
df_cleaned['circuit_prev_avg_points'] = grp['real_points'].transform(lambda x: x.shift().expanding().mean())

# Tasa sobre carreras previas
df_cleaned['circuit_prev_dnf_rate'] = df_cleaned['circuit_prev_dnfs'] / df_cleaned['circuit_prev_races'].replace(0, pd.NA)
df_cleaned['circuit_prev_win_rate'] = df_cleaned['circuit_prev_wins'] / df_cleaned['circuit_prev_races'].replace(0, pd.NA)

#Crear columnas para las estadisticas de las ultimas 5 carreras (o menos si no hay suficientes)

def last_n_races(df, n=5):
    # Ordenar por driver y año/carrera para asegurar el orden cronológico
    df_sorted = df.sort_values(['driverRef', 'year', 'raceId'])
    # Tomar las últimas n carreras para cada piloto
    return df_sorted.groupby('driverRef').tail(n)

def add_last_n_race_stats(df, n=5):
    # Filtrar las últimas n carreras
    last_n = last_n_races(df, n)
    # Calcular estadísticas
    stats = last_n.groupby('driverRef').agg(
        last_n_avg_position=('positionOrder', 'mean'),
        last_n_total_points=('real_points', 'sum'),
        last_n_wins=('win', 'sum'),
        last_n_dnfs=('DNF', 'sum'),
        last_n_avg_gap_to_team=('gap_to_team', 'mean'),
        last_n_avg_gap_to_leader=('gap_to_leader', 'mean'),
        last_n_avg_position_change=('position_change', 'mean'),
        last_n_avg_grid=('grid', 'mean'),
    ).reset_index()
    # Unir estadísticas al DataFrame original
    df = df.merge(stats, on='driverRef', how='left')
    return df
df_cleaned = add_last_n_race_stats(df_cleaned)

# Unir estadísticas de pilotos al DataFrame principal
df_cleaned = df_cleaned.merge(driver_stats, on='driverRef', how='left')



In [8]:
import pandas as pd
from meteostat import Point, Hourly
from datetime import timedelta

# --- FUNCIONES ---

def fetch_weather(row):
    """
    Dada una fila con 'lat', 'lng', 'alt' y 'race_datetime', devuelve una Serie con datos climáticos.
    Se considera una ventana de -1 a +3 horas desde el inicio de la carrera.
    """
    location = Point(row['lat'], row['lng'], row['alt'])
    start = row['race_datetime'] - timedelta(hours=3)
    end = row['race_datetime'] + timedelta(hours=3)

    try:
        data = Hourly(location, start, end)
        data = data.fetch()
    except Exception:
        return pd.Series({
            'temperature_C': pd.NA,
            'precipitation_mm': pd.NA,
            'wind_speed_kmh': pd.NA,
            'humidity_pct': pd.NA,
            'isRainy': pd.NA
        })

    if data.empty:
        return pd.Series({
            'temperature_C': pd.NA,
            'precipitation_mm': pd.NA,
            'wind_speed_kmh': pd.NA,
            'humidity_pct': pd.NA,
            'isRainy': pd.NA
        })

    # Promediar datos de la ventana
    temp = data['temp'].mean()
    prcp = data['prcp'].fillna(0).sum()  # Suma total de precipitación en la ventana
    wspd = data['wspd'].mean()
    rhum = data['rhum'].mean()

    return pd.Series({
        'temperature_C': temp,
        'precipitation_mm': prcp,
        'wind_speed_kmh': wspd,
        'humidity_pct': rhum,
        'isRainy': prcp > 0
    })


def get_weather_per_race(df: pd.DataFrame) -> pd.DataFrame:
    """
    Extrae una fila por carrera (raceId) y le agrega datos meteorológicos.
    """
    race_cols = ['raceId', 'race_datetime', 'lat', 'lng', 'alt']
    df_races = df[race_cols].drop_duplicates(subset='raceId').reset_index(drop=True)
    weather_data = df_races.apply(fetch_weather, axis=1)
    df_weather = pd.concat([df_races[['raceId']], weather_data], axis=1)
    return df_weather


def add_weather_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Agrega columnas climáticas por carrera al dataset original.
    """
    df_weather = get_weather_per_race(df)
    df_merged = df.merge(df_weather, on='raceId', how='left')
    return df_merged

# --- USO ---

# Aplicar al dataframe original
df_cleaned = add_weather_columns(df_cleaned)




In [9]:
#imprimir cantidad de carreras lluviosas
print("Cantidad de carreras lluviosas:", df_cleaned['isRainy'].sum())
print("Cantidad de carreras no lluviosas:", (~df_cleaned['isRainy']).sum())

print(f"Cantidad de columnas restantes: {df_cleaned.shape[1]}")
print("columnas restantes:", df_cleaned.columns.tolist())

# mostrar tabla con todas las estadísticas de los pilotos para una carrera N
def display_driver_stats(df, race_id):
    # Filtrar datos para la carrera específica
    race_data = df[df['raceId'] == race_id]
    # Mostrar tabla con estadísticas
    display(race_data)

#mostrar carrera con id 1141
display_driver_stats(df_cleaned, 1141)


Cantidad de carreras lluviosas: 673
Cantidad de carreras no lluviosas: 6694
Cantidad de columnas restantes: 92
columnas restantes: ['raceId', 'year', 'round', 'circuitId', 'name_x', 'date', 'time_x', 'circuitRef', 'name_y', 'location', 'country', 'lat', 'lng', 'alt', 'resultId', 'driverId', 'constructorId', 'number_x', 'grid', 'position', 'positionText', 'positionOrder', 'points', 'laps', 'time_y', 'milliseconds', 'fastestLap', 'rank', 'fastestLapTime', 'fastestLapSpeed', 'statusId', 'constructorRef', 'name', 'nationality_x', 'driverRef', 'number_y', 'code', 'forename', 'surname', 'dob', 'nationality_y', 'race_datetime', 'result_datetime', 'race_year', 'race_month', 'race_day', 'race_weekday', 'race_hour', 'birth_date', 'driver_age', 'fastestLapTime_sec', 'DNF', 'win', 'circuit_type', 'real_points', 'position_change', 'gap_to_leader', 'gap_to_team', 'circuit_prev_races', 'circuit_prev_dnfs', 'circuit_prev_wins', 'circuit_prev_avg_pos', 'circuit_prev_avg_points', 'circuit_prev_dnf_rate'

Unnamed: 0,raceId,year,round,circuitId,name_x,date,time_x,circuitRef,name_y,location,country,lat,lng,alt,resultId,driverId,constructorId,number_x,grid,position,positionText,positionOrder,points,laps,time_y,milliseconds,fastestLap,rank,fastestLapTime,fastestLapSpeed,statusId,constructorRef,name,nationality_x,driverRef,number_y,code,forename,surname,dob,nationality_y,race_datetime,result_datetime,race_year,race_month,race_day,race_weekday,race_hour,birth_date,driver_age,fastestLapTime_sec,DNF,win,circuit_type,real_points,position_change,gap_to_leader,gap_to_team,circuit_prev_races,circuit_prev_dnfs,circuit_prev_wins,circuit_prev_avg_pos,circuit_prev_avg_points,circuit_prev_dnf_rate,circuit_prev_win_rate,last_n_avg_position,last_n_total_points,last_n_wins,last_n_dnfs,last_n_avg_gap_to_team,last_n_avg_gap_to_leader,last_n_avg_position_change,last_n_avg_grid,avg_gap_to_team,avg_gap_to_leader,avg_points,total_points,avg_real_points,total_real_points,races,avg_position_change,avg_position,avg_grid,wins,dnfs,dnf_rate,win_rate,temperature_C,precipitation_mm,wind_speed_kmh,humidity_pct,isRainy
25043,1141,2024,21,18,São Paulo Grand Prix,2024-11-03,17:00:00,interlagos,Autódromo José Carlos Pace,São Paulo,Brazil,-23.7036,-46.6997,785,26685,830,9,1.0,17,1.0,1,1,26.0,69,2:06:54.430,7614430.0,67.0,1.0,1:20.472,192.767,1,red_bull,Red Bull,Austrian,max_verstappen,33.0,VER,Max,Verstappen,1997-09-30,Dutch,2024-11-03 17:00:00,2024-11-03 02:06:54.430,2024.0,11.0,3.0,Sunday,17.0,1997-09-30,27.0,80.472,0,1,permanent,25,16,0.0,0.0,8,0.0,2.0,3.625,15.125,0.0,0.25,3.8,76,2,0,0.0,25197.4,2.2,6.0,2294.0,18122.252941,14.002404,2912.5,13.927885,2897,208,-0.605769,3.547486,4.980769,63,29,0.139423,0.302885,21.528571,4.4,11.971429,85.285714,True
25044,1141,2024,21,18,São Paulo Grand Prix,2024-11-03,17:00:00,interlagos,Autódromo José Carlos Pace,São Paulo,Brazil,-23.7036,-46.6997,785,26686,839,214,31.0,4,2.0,2,2,18.0,69,+19.477,7633907.0,64.0,7.0,1:21.771,189.705,1,alpine,Alpine F1 Team,French,ocon,31.0,OCO,Esteban,Ocon,1996-09-17,French,2024-11-03 17:00:00,NaT,2024.0,11.0,3.0,Sunday,17.0,1996-09-17,28.0,81.771,0,0,permanent,18,2,19477.0,0.0,6,1.0,0.0,11.833333,1.5,0.166667,0.0,14.0,18,0,1,0.0,19477.0,-1.6,12.4,4641.150685,49748.808219,2.894737,440.0,2.914474,443,152,0.335526,9.851562,11.5,1,24,0.157895,0.006579,21.528571,4.4,11.971429,85.285714,True
25045,1141,2024,21,18,São Paulo Grand Prix,2024-11-03,17:00:00,interlagos,Autódromo José Carlos Pace,São Paulo,Brazil,-23.7036,-46.6997,785,26687,842,214,10.0,13,3.0,3,3,15.0,69,+22.532,7636962.0,66.0,5.0,1:21.645,189.998,1,alpine,Alpine F1 Team,French,gasly,10.0,GAS,Pierre,Gasly,1996-02-07,French,2024-11-03 17:00:00,NaT,2024.0,11.0,3.0,Sunday,17.0,1996-02-07,28.0,81.645,0,0,permanent,15,10,22532.0,3055.0,6,0.0,0.0,9.166667,5.0,0.0,0.0,9.0,32,0,1,1018.333333,37291.333333,-1.0,8.0,4384.91358,54353.382716,2.845638,424.0,2.852349,425,149,-0.060403,10.083333,11.060403,1,17,0.114094,0.006711,21.528571,4.4,11.971429,85.285714,True
25046,1141,2024,21,18,São Paulo Grand Prix,2024-11-03,17:00:00,interlagos,Autódromo José Carlos Pace,São Paulo,Brazil,-23.7036,-46.6997,785,26688,847,131,63.0,2,4.0,4,4,12.0,69,+23.265,7637695.0,66.0,6.0,1:21.645,189.998,1,mercedes,Mercedes,German,russell,63.0,RUS,George,Russell,1998-02-15,British,2024-11-03 17:00:00,NaT,2024.0,11.0,3.0,Sunday,17.0,1998-02-15,26.0,81.645,0,0,permanent,12,-2,23265.0,0.0,4,1.0,1.0,10.25,6.25,0.25,0.25,3.8,69,1,0,962.2,24688.6,-0.8,3.0,74132.128571,102179.028571,5.312,664.0,5.328,666,125,-0.248,9.185185,10.152,3,17,0.136,0.024,21.528571,4.4,11.971429,85.285714,True
25047,1141,2024,21,18,São Paulo Grand Prix,2024-11-03,17:00:00,interlagos,Autódromo José Carlos Pace,São Paulo,Brazil,-23.7036,-46.6997,785,26689,844,6,16.0,6,5.0,5,5,10.0,69,+30.177,7644607.0,60.0,4.0,1:21.631,190.03,1,ferrari,Ferrari,Italian,leclerc,16.0,LEC,Charles,Leclerc,1997-10-16,Monegasque,2024-11-03 17:00:00,NaT,2024.0,11.0,3.0,Sunday,17.0,1997-10-16,27.0,81.631,0,0,permanent,10,1,30177.0,0.0,4,0.0,0.0,8.5,7.0,0.0,0.0,3.4,70,0,0,12572.0,23361.2,4.2,7.6,4510.722222,30616.583333,9.209459,1363.0,9.155405,1355,148,-0.986486,5.484127,6.486486,8,22,0.148649,0.054054,21.528571,4.4,11.971429,85.285714,True
25048,1141,2024,21,18,São Paulo Grand Prix,2024-11-03,17:00:00,interlagos,Autódromo José Carlos Pace,São Paulo,Brazil,-23.7036,-46.6997,785,26690,846,1,4.0,1,6.0,6,6,8.0,69,+31.372,7645802.0,67.0,2.0,1:21.517,190.296,1,mclaren,McLaren,British,norris,4.0,NOR,Lando,Norris,1999-11-13,British,2024-11-03 17:00:00,NaT,2024.0,11.0,3.0,Sunday,17.0,1999-11-13,24.0,81.517,0,0,permanent,8,-5,31372.0,0.0,4,1.0,0.0,9.5,5.75,0.25,0.0,5.0,60,1,0,5788.6,23044.8,-2.2,2.8,3094.882979,35987.702128,7.421875,950.0,7.328125,938,128,-0.59375,7.05042,7.257812,4,9,0.070312,0.03125,21.528571,4.4,11.971429,85.285714,True
25049,1141,2024,21,18,São Paulo Grand Prix,2024-11-03,17:00:00,interlagos,Autódromo José Carlos Pace,São Paulo,Brazil,-23.7036,-46.6997,785,26691,852,215,22.0,3,7.0,7,7,6.0,69,+42.056,7656486.0,69.0,8.0,1:21.828,189.573,1,rb,RB F1 Team,Italian,tsunoda,22.0,TSU,Yuki,Tsunoda,2000-05-11,Japanese,2024-11-03 17:00:00,NaT,2024.0,11.0,3.0,Sunday,17.0,2000-05-11,24.0,81.828,0,0,permanent,6,-4,42056.0,0.0,2,0.0,0.0,12.0,1.0,0.0,0.0,12.2,8,0,1,0.0,55321.333333,-3.0,9.2,4043.243243,59640.891892,1.011628,87.0,1.0,86,86,-0.72093,12.183099,12.697674,0,15,0.174419,0.0,21.528571,4.4,11.971429,85.285714,True
25050,1141,2024,21,18,São Paulo Grand Prix,2024-11-03,17:00:00,interlagos,Autódromo José Carlos Pace,São Paulo,Brazil,-23.7036,-46.6997,785,26692,857,1,81.0,8,8.0,8,8,4.0,69,+44.943,7659373.0,69.0,3.0,1:21.532,190.261,1,mclaren,McLaren,British,piastri,81.0,PIA,Oscar,Piastri,2001-04-06,Australian,2024-11-03 17:00:00,NaT,2024.0,11.0,3.0,Sunday,17.0,2001-04-06,23.0,81.532,0,0,permanent,4,0,44943.0,13571.0,1,0.0,0.0,14.0,0.0,0.0,0.0,7.2,30,0,0,33119.0,50375.2,0.6,7.8,15431.078947,34016.052632,7.543478,347.0,7.521739,346,46,-0.456522,7.116279,7.434783,2,3,0.065217,0.043478,21.528571,4.4,11.971429,85.285714,True
25051,1141,2024,21,18,São Paulo Grand Prix,2024-11-03,17:00:00,interlagos,Autódromo José Carlos Pace,São Paulo,Brazil,-23.7036,-46.6997,785,26693,859,215,30.0,5,9.0,9,9,2.0,69,+50.452,7664882.0,67.0,10.0,1:22.123,188.892,1,rb,RB F1 Team,Italian,lawson,30.0,LAW,Liam,Lawson,2002-02-11,New Zealander,2024-11-03 17:00:00,NaT,2024.0,11.0,3.0,Sunday,17.0,2002-02-11,22.0,82.123,0,0,permanent,2,-4,50452.0,8396.0,0,,,,,,,14.4,2,0,0,12716.333333,68037.666667,-2.4,12.0,5449.857143,62482.714286,0.545455,6.0,0.545455,6,11,0.545455,12.909091,13.454545,0,0,0.0,0.0,21.528571,4.4,11.971429,85.285714,True
25052,1141,2024,21,18,São Paulo Grand Prix,2024-11-03,17:00:00,interlagos,Autódromo José Carlos Pace,São Paulo,Brazil,-23.7036,-46.6997,785,26694,1,131,44.0,14,10.0,10,10,1.0,69,+50.753,7665183.0,69.0,9.0,1:22.041,189.081,1,mercedes,Mercedes,German,hamilton,44.0,HAM,Lewis,Hamilton,1985-01-07,British,2024-11-03 17:00:00,NaT,2024.0,11.0,3.0,Sunday,17.0,1985-01-07,39.0,82.041,0,0,permanent,1,4,50753.0,27488.0,16,2.0,3.0,6.125,12.25,0.125,0.1875,6.4,43,0,0,15363.8,39090.2,4.0,10.4,5595.488746,17289.504823,13.573239,4818.5,14.515493,5153,355,-0.695775,3.627692,4.312676,105,30,0.084507,0.295775,21.528571,4.4,11.971429,85.285714,True


In [10]:
# Guardar el dataset limpio
df_cleaned.to_csv('../dataset/F1_Datos_limpios.csv', index=False)

Notebook creada el 2025-05-22 para análisis exploratorio y generación de nuevas variables predictoras.