📌 Plan de acción

✅ Paso 1: Preprocesar los datos (convertir variables categóricas con Label Encoding, normalizar si es necesario).

✅ Paso 2: Separar datos en 70% entrenamiento y 30% prueba.

✅ Paso 3: Entrenar Regresión Logística y XGBoost.

✅ Paso 4: Evaluar cuál tiene mejor precisión.

✅ Paso 5: Probar con un partido real (ejemplo: Nacional vs. Once Caldas).

In [5]:
import pandas as pd
# Cargar el archivo CSV en un DataFrame
df = pd.read_csv("partidos_colombia.csv")
df

Unnamed: 0,Fecha,Local,Visitante,Marcador,Goles Local,Goles Visitante,Tarjetas Rojas Local,Tarjetas Rojas Visitante,Resultado Local,Resultado Visitante
0,26-01-2025,Alianza Petrolera,Independiente Medellín,1 - 1,1,1,0,0,Empate,Empate
1,02-02-2025,Deportivo Pereira,Alianza Petrolera,0 - 0,0,0,0,0,Empate,Empate
2,07-02-2025,Deportes Tolima,Alianza Petrolera,3 - 1,3,1,0,0,Victoria,Derrota
3,17-02-2025,Atlético Bucaramanga,Alianza Petrolera,0 - 0,0,0,0,0,Empate,Empate
4,22-02-2025,Alianza Petrolera,Atlético Nacional,3 - 2,3,2,0,0,Victoria,Derrota
...,...,...,...,...,...,...,...,...,...,...
1905,12-10-2023,Deportivo Pasto,Union Magdalena,3 - 1,3,1,0,0,Victoria,Derrota
1906,18-10-2023,Millonarios,Union Magdalena,1 - 1,1,1,0,0,Empate,Empate
1907,20-10-2023,Union Magdalena,Bucaramanga,1 - 2,1,2,0,0,Derrota,Victoria
1908,27-10-2023,Deportes Tolima,Union Magdalena,2 - 1,2,1,0,0,Victoria,Derrota


In [6]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split

# Copia del DataFrame original
df_modelo = df.copy()

# ====== Convertir variables categóricas a numéricas ======
label_encoder = LabelEncoder()

df_modelo["Local"] = label_encoder.fit_transform(df_modelo["Local"])
df_modelo["Visitante"] = label_encoder.fit_transform(df_modelo["Visitante"])
df_modelo["Resultado Local"] = label_encoder.fit_transform(df_modelo["Resultado Local"])
df_modelo["Resultado Visitante"] = label_encoder.fit_transform(df_modelo["Resultado Visitante"])

# ====== Normalizar las variables numéricas ======
scaler = MinMaxScaler()
columnas_numericas = ["Goles Local", "Goles Visitante", "Tarjetas Rojas Local", "Tarjetas Rojas Visitante"]
df_modelo[columnas_numericas] = scaler.fit_transform(df_modelo[columnas_numericas])



In [7]:
def calcular_historico(df, equipo_col, goles_favor_col, goles_contra_col, tarjetas_col, resultado_col, n=5):
    df_sorted = df.sort_index()  # Ordenar para evitar problemas con rolling()

    # Crear promedios móviles de goles y tarjetas
    historico = df_sorted.groupby(equipo_col).rolling(n, min_periods=1).agg({
        goles_favor_col: "mean",
        goles_contra_col: "mean",
        tarjetas_col: "mean"
    }).reset_index(drop=True)

    # Detectar victorias correctamente
    df_sorted["Victoria"] = df_sorted[resultado_col].apply(lambda x: 1 if x in ["Victoria", 2] else 0)

    # Calcular el ratio de victorias con rolling()
    df_sorted[f"Ratio_Victorias_{equipo_col}"] = df_sorted.groupby(equipo_col)["Victoria"].transform(lambda x: x.rolling(n, min_periods=1).mean())

    # Renombrar columnas
    historico.rename(columns={
        goles_favor_col: f"Prom_Goles_Favor_{equipo_col}",
        goles_contra_col: f"Prom_Goles_Contra_{equipo_col}",
        tarjetas_col: f"Prom_Tarjetas_{equipo_col}"
    }, inplace=True)

    # Agregar el Ratio de Victorias corregido
    historico[f"Ratio_Victorias_{equipo_col}"] = df_sorted[f"Ratio_Victorias_{equipo_col}"]

    return historico

# Aplicar el cálculo a Local y Visitante
historico_local = calcular_historico(df_modelo, "Local", "Goles Local", "Goles Visitante", "Tarjetas Rojas Local", "Resultado Local")
historico_visitante = calcular_historico(df_modelo, "Visitante", "Goles Visitante", "Goles Local", "Tarjetas Rojas Visitante", "Resultado Visitante")

# Unir los históricos al df_modelo
df_modelo = pd.concat([df_modelo, historico_local, historico_visitante], axis=1)

# Ver los primeros registros
df_modelo

Unnamed: 0,Fecha,Local,Visitante,Marcador,Goles Local,Goles Visitante,Tarjetas Rojas Local,Tarjetas Rojas Visitante,Resultado Local,Resultado Visitante,Prom_Goles_Favor_Local,Prom_Goles_Contra_Local,Prom_Tarjetas_Local,Ratio_Victorias_Local,Prom_Goles_Favor_Visitante,Prom_Goles_Contra_Visitante,Prom_Tarjetas_Visitante,Ratio_Victorias_Visitante
0,26-01-2025,0,17,1 - 1,0.142857,0.142857,0.0,0.0,1,1,0.142857,0.142857,0.0,0.0,0.000000,0.000000,0.0,0.0
1,02-02-2025,12,0,0 - 0,0.000000,0.000000,0.0,0.0,1,1,0.285714,0.214286,0.0,0.0,0.071429,0.214286,0.0,0.0
2,07-02-2025,9,0,3 - 1,0.428571,0.142857,0.0,0.0,2,0,0.238095,0.142857,0.0,1.0,0.047619,0.142857,0.0,0.0
3,17-02-2025,4,0,0 - 0,0.000000,0.000000,0.0,0.0,1,1,0.285714,0.178571,0.0,0.0,0.107143,0.142857,0.0,0.0
4,22-02-2025,0,5,3 - 2,0.428571,0.285714,0.0,0.0,2,0,0.257143,0.171429,0.0,0.5,0.085714,0.114286,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1905,12-10-2023,11,30,3 - 1,0.428571,0.142857,0.0,0.0,2,0,0.107143,0.178571,0.0,0.8,0.071429,0.214286,0.0,0.4
1906,18-10-2023,24,30,1 - 1,0.142857,0.142857,0.0,0.0,1,1,0.085714,0.142857,0.0,0.4,0.057143,0.228571,0.0,0.2
1907,20-10-2023,30,7,1 - 2,0.142857,0.285714,0.0,0.0,0,2,0.028571,0.085714,0.0,0.4,0.085714,0.228571,0.0,0.2
1908,27-10-2023,9,30,2 - 1,0.285714,0.142857,0.0,0.0,2,0,0.057143,0.171429,0.0,0.6,0.085714,0.228571,0.0,0.2


In [8]:
import pandas as pd

# Suponiendo que df_modelo ya está cargado y contiene los datos
df_modelo['Fecha'] = pd.to_datetime(df_modelo['Fecha'])

# Ordenamos el dataset por fecha para asegurar que los partidos más recientes estén al final
df_modelo = df_modelo.sort_values(by='Fecha')

# Función para calcular tasa de victorias en los últimos 5 partidos
def calcular_tasa_victorias(df, equipo_col, rival_col, resultado_col):
    tasas_victoria = []
    
    for index, row in df.iterrows():
        equipo = row[equipo_col]
        rival = row[rival_col]
        fecha = row['Fecha']
        
        # Filtrar los últimos 5 partidos donde el equipo haya jugado (tanto en 2025 como finales de 2024)
        partidos_equipo = df[
            ((df[equipo_col] == equipo) | (df[rival_col] == equipo)) &
            (df['Fecha'] < fecha)
        ].tail(5)
        
        # Filtrar los últimos 5 partidos entre el equipo y ese rival específico
        partidos_vs_rival = df[
            (((df[equipo_col] == equipo) & (df[rival_col] == rival)) |
             ((df[equipo_col] == rival) & (df[rival_col] == equipo))) &
            (df['Fecha'] < fecha)
        ].tail(5)
        
        # Calcular tasa de victorias
        if len(partidos_equipo) > 0:
            tasa_victorias_equipo = (partidos_equipo[partidos_equipo[equipo_col] == equipo][resultado_col] == 2).sum() / len(partidos_equipo)
        else:
            tasa_victorias_equipo = 0  # Si no hay historial, tasa es 0
        
        if len(partidos_vs_rival) > 0:
            tasa_victorias_vs_rival = (partidos_vs_rival[partidos_vs_rival[equipo_col] == equipo][resultado_col] == 2).sum() / len(partidos_vs_rival)
        else:
            tasa_victorias_vs_rival = 0  # Si no hay enfrentamientos previos, tasa es 0
        
        tasas_victoria.append((tasa_victorias_equipo, tasa_victorias_vs_rival))
    
    return zip(*tasas_victoria)

# Aplicamos la función para calcular las tasas de victoria
(df_modelo['Tasa_Victorias_Local'], df_modelo['Tasa_Victorias_VS_Rival_Local']) = calcular_tasa_victorias(df_modelo, 'Local', 'Visitante', 'Resultado Local')
(df_modelo['Tasa_Victorias_Visitante'], df_modelo['Tasa_Victorias_VS_Rival_Visitante']) = calcular_tasa_victorias(df_modelo, 'Visitante', 'Local', 'Resultado Visitante')

# Verificamos que las tasas estén en el rango correcto
df_modelo[['Tasa_Victorias_Local', 'Tasa_Victorias_Visitante', 'Tasa_Victorias_VS_Rival_Local', 'Tasa_Victorias_VS_Rival_Visitante']].describe()


  df_modelo['Fecha'] = pd.to_datetime(df_modelo['Fecha'])


Unnamed: 0,Tasa_Victorias_Local,Tasa_Victorias_Visitante,Tasa_Victorias_VS_Rival_Local,Tasa_Victorias_VS_Rival_Visitante
count,1910.0,1910.0,1910.0,1910.0
mean,0.196021,0.094136,0.059372,0.034136
std,0.211173,0.168707,0.15531,0.125055
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0
50%,0.2,0.0,0.0,0.0
75%,0.4,0.2,0.0,0.0
max,1.0,0.8,1.0,1.0


In [9]:
df_modelo

Unnamed: 0,Fecha,Local,Visitante,Marcador,Goles Local,Goles Visitante,Tarjetas Rojas Local,Tarjetas Rojas Visitante,Resultado Local,Resultado Visitante,...,Prom_Tarjetas_Local,Ratio_Victorias_Local,Prom_Goles_Favor_Visitante,Prom_Goles_Contra_Visitante,Prom_Tarjetas_Visitante,Ratio_Victorias_Visitante,Tasa_Victorias_Local,Tasa_Victorias_VS_Rival_Local,Tasa_Victorias_Visitante,Tasa_Victorias_VS_Rival_Visitante
1870,2023-01-24,30,15,2 - 1,0.285714,0.142857,0.0,0.0,2,0,...,0.0,0.4,0.142857,0.228571,0.000000,0.20,0.0,0.0,0.0,0.0
1161,2023-01-24,7,13,0 - 0,0.000000,0.000000,0.0,0.0,1,1,...,0.0,0.2,0.028571,0.257143,0.000000,0.00,0.0,0.0,0.0,0.0
1465,2023-01-24,30,15,2 - 1,0.285714,0.142857,0.0,0.0,2,0,...,0.0,0.4,0.200000,0.171429,0.000000,0.20,0.0,0.0,0.0,0.0
1598,2023-01-24,27,20,1 - 1,0.142857,0.142857,0.0,0.0,1,1,...,0.0,0.6,0.085714,0.257143,0.000000,0.40,0.0,0.0,0.0,0.0
1425,2023-01-24,7,13,0 - 0,0.000000,0.000000,0.0,0.0,1,1,...,0.0,0.2,0.257143,0.228571,0.000000,0.00,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27,2025-03-09,5,2,1 - 0,0.142857,0.000000,0.0,0.0,2,0,...,0.0,1.0,0.057143,0.257143,0.000000,0.40,0.0,0.5,0.0,0.0
35,2025-03-09,10,6,0 - 0,0.000000,0.000000,0.0,0.0,1,1,...,0.0,0.0,0.057143,0.171429,0.000000,0.25,0.4,0.5,0.4,0.0
50,2025-03-09,10,6,0 - 0,0.000000,0.000000,0.0,0.0,1,1,...,0.0,0.4,0.028571,0.142857,0.133333,0.20,0.4,0.5,0.4,0.0
42,2025-03-10,9,25,2 - 0,0.285714,0.000000,0.0,0.0,2,0,...,0.0,0.6,0.142857,0.200000,0.000000,0.00,0.0,0.4,0.4,0.0


In [10]:
from sklearn.model_selection import train_test_split

# Definir X (variables predictoras) y Y (variable objetivo)
X = df_modelo.drop(columns=["Resultado Local", "Resultado Visitante", "Marcador", "Goles Local", "Goles Visitante", "Prom_Goles_Favor_Local", "Prom_Goles_Favor_Visitante", "Fecha"])
Y = df_modelo["Resultado Local"]

# Dividir en conjunto de entrenamiento (70%) y prueba (30%)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.30, random_state=42, stratify=Y)


In [11]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Aplicamos la normalización solo a las variables numéricas
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [12]:
# 1️⃣ Regresión Logística
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Entrenar modelo
logreg = LogisticRegression(max_iter=1000, random_state=42)
logreg.fit(X_train_scaled, Y_train)

# Predicciones
Y_pred_logreg = logreg.predict(X_test_scaled)

# Evaluación
print("🔹 Regresión Logística 🔹")
print(f"Accuracy: {accuracy_score(Y_test, Y_pred_logreg):.4f}")
print(classification_report(Y_test, Y_pred_logreg))



🔹 Regresión Logística 🔹
Accuracy: 0.6545
              precision    recall  f1-score   support

           0       0.66      0.57      0.61       132
           1       0.59      0.48      0.53       170
           2       0.68      0.81      0.74       271

    accuracy                           0.65       573
   macro avg       0.64      0.62      0.63       573
weighted avg       0.65      0.65      0.65       573



In [13]:
# 2️⃣ Random Forest

from sklearn.ensemble import RandomForestClassifier

# Entrenar modelo
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, Y_train)  # ⚠️ No requiere normalización

# Predicciones
Y_pred_rf = rf.predict(X_test)

# Evaluación
print("🔹 Random Forest 🔹")
print(f"Accuracy: {accuracy_score(Y_test, Y_pred_rf):.4f}")
print(classification_report(Y_test, Y_pred_rf))


🔹 Random Forest 🔹
Accuracy: 0.6387
              precision    recall  f1-score   support

           0       0.57      0.55      0.56       132
           1       0.60      0.42      0.50       170
           2       0.68      0.82      0.74       271

    accuracy                           0.64       573
   macro avg       0.62      0.60      0.60       573
weighted avg       0.63      0.64      0.63       573



In [14]:
# 3️⃣ Red Neuronal (MLPClassifier)

from sklearn.neural_network import MLPClassifier

# Entrenar modelo
mlp = MLPClassifier(hidden_layer_sizes=(50,50), max_iter=500, random_state=42)
mlp.fit(X_train_scaled, Y_train)

# Predicciones
Y_pred_mlp = mlp.predict(X_test_scaled)

# Evaluación
print("🔹 Red Neuronal (MLP) 🔹")
print(f"Accuracy: {accuracy_score(Y_test, Y_pred_mlp):.4f}")
print(classification_report(Y_test, Y_pred_mlp))


🔹 Red Neuronal (MLP) 🔹
Accuracy: 0.6056
              precision    recall  f1-score   support

           0       0.51      0.52      0.52       132
           1       0.54      0.48      0.51       170
           2       0.68      0.72      0.70       271

    accuracy                           0.61       573
   macro avg       0.58      0.58      0.58       573
weighted avg       0.60      0.61      0.60       573





In [15]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Diccionario para almacenar los resultados
resultados = {}

# Evaluamos cada modelo
modelos = {
    "Regresión Logística": Y_pred_logreg,
    "Random Forest": Y_pred_rf,
    "Red Neuronal (MLP)": Y_pred_mlp
}

for nombre, pred in modelos.items():
    resultados[nombre] = {
        "Accuracy": accuracy_score(Y_test, pred),
        "Precision": precision_score(Y_test, pred, average="weighted"),
        "Recall": recall_score(Y_test, pred, average="weighted"),
        "F1-score": f1_score(Y_test, pred, average="weighted")
    }

# Mostramos los resultados
for modelo, metricas in resultados.items():
    print(f"\n🔹 {modelo} 🔹")
    for metrica, valor in metricas.items():
        print(f"{metrica}: {valor:.4f}")



🔹 Regresión Logística 🔹
Accuracy: 0.6545
Precision: 0.6487
Recall: 0.6545
F1-score: 0.6464

🔹 Random Forest 🔹
Accuracy: 0.6387
Precision: 0.6310
Recall: 0.6387
F1-score: 0.6273

🔹 Red Neuronal (MLP) 🔹
Accuracy: 0.6056
Precision: 0.6018
Recall: 0.6056
F1-score: 0.6029
