In [22]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import FunctionTransformer

# Model testing

In [25]:
# Leer los datos desde el archivo CSV
df = pd.read_csv('final_dataset.csv')

# Crear una nueva columna combinada ordenando alfabéticamente los equipos
df['team_pair'] = df.apply(lambda row: "_".join(sorted([row['home_team_name'], row['away_team_name']])), axis=1)

# Crear un indicador de si el equipo es local
df['home_indicator'] = np.where(df['home_team_name'] < df['away_team_name'], 1, 0)

# Seleccionar las características y la variable objetivo
X = df[['team_pair', 'home_indicator', 'home_team_rank', 'away_team_rank', 'day_of_week', 'home_team_points', 'away_team_points']]
y = df['result']

# Crear un pipeline de preprocesamiento
preprocessor = ColumnTransformer(
    transformers=[
        ('team', OneHotEncoder(handle_unknown='ignore', sparse_output=False), ['team_pair']),
        ('rank', FunctionTransformer(lambda x: np.column_stack([
            np.cos(2 * np.pi * x / 20),
            np.sin(2 * np.pi * x / 20)
        ]), validate=True), ['home_team_rank', 'away_team_rank']),
        ('day', FunctionTransformer(lambda x: np.column_stack([
            np.cos(2 * np.pi * x / 7),
            np.sin(2 * np.pi * x / 7)
        ]), validate=True), ['day_of_week']),
        ('num', SimpleImputer(strategy='mean'), ['home_team_points', 'away_team_points'])
    ]
)

# Crear el pipeline completo con el modelo Random Forest
model_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('scaler', StandardScaler()),
        ('classifier', RandomForestClassifier(random_state=42))
    ]
)

# Inicializar TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=5)

# Listas para almacenar los resultados de cada split
scores_train = []
scores_test = []

# Inicializar la variable para almacenar el modelo ajustado del último split
last_model = None

# Iterar sobre los splits para entrenar el modelo
for split, (train_index, test_index) in enumerate(tscv.split(X), 1):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Entrenar el modelo con el conjunto de entrenamiento
    model_pipeline.fit(X_train, y_train)
    
    # Guardar el modelo del último split
    if split == tscv.get_n_splits():
        last_model = model_pipeline.named_steps['classifier']
    
    # Predecir en el conjunto de entrenamiento y prueba
    y_train_pred = model_pipeline.predict(X_train)
    y_test_pred = model_pipeline.predict(X_test)
    
    # Calcular la exactitud en entrenamiento y prueba
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    
    # Almacenar los resultados
    scores_train.append(train_accuracy)
    scores_test.append(test_accuracy)

    # Mostrar los resultados del último split
    if split == tscv.get_n_splits():
        print(f"Last Split Train Accuracy: {train_accuracy:.4f}")
        print(f"Last Split Test Accuracy: {test_accuracy:.4f}")

# Mostrar los resultados promedios
print(f"Average Train Accuracy: {sum(scores_train)/len(scores_train):.4f}")
print(f"Average Test Accuracy: {sum(scores_test)/len(scores_test):.4f}")

Last Split Train Accuracy: 0.9976
Last Split Test Accuracy: 0.4664
Average Train Accuracy: 0.9995
Average Test Accuracy: 0.4482


In [26]:
# Obtener la importancia de las características después de haber ajustado el modelo
importances = last_model.feature_importances_

# Obtener las columnas procesadas del preprocesador
preprocessor = model_pipeline.named_steps['preprocessor']

# Verificar las columnas que el preprocesador maneja
print("Features processed by the preprocessor:")
print(preprocessor.transformers_)

# Obtener las columnas one-hot de las dos columnas categóricas
one_hot_columns = preprocessor.transformers_[0][1].get_feature_names_out()

# Agregar las otras columnas numéricas
numerical_columns = ['home_team_rank', 'away_team_rank', 'day_of_week', 'home_team_points', 'away_team_points']
feature_names = np.concatenate([one_hot_columns, numerical_columns])

# Verificar las longitudes
print(f"Feature names length: {len(feature_names)}")
print(f"Importances length: {len(importances)}")

# Si las longitudes coinciden, proceder con la creación del DataFrame
if len(feature_names) == len(importances):
    # Crear un DataFrame para mostrar la importancia
    importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': importances
    }).sort_values(by='Importance', ascending=False)

    # Mostrar la importancia de las características
    print(importance_df)
else:
    print("Error: La longitud de los nombres de las características no coincide con la longitud de los valores de importancia.")

# Verificar la longitud de los arrays
print("Feature names:")
print(feature_names)
print("Importances:")
print(importances)

Features processed by the preprocessor:
[('team', OneHotEncoder(handle_unknown='ignore', sparse_output=False), ['team_pair']), ('rank', FunctionTransformer(func=<function <lambda> at 0x00000165E940E0C0>,
                    validate=True), ['home_team_rank', 'away_team_rank']), ('day', FunctionTransformer(func=<function <lambda> at 0x00000165E940CA40>,
                    validate=True), ['day_of_week']), ('remainder', 'drop', [1])]
Feature names length: 305
Importances length: 306
Error: La longitud de los nombres de las características no coincide con la longitud de los valores de importancia.
Feature names:
['team_pair_Alaves_Almeria' 'team_pair_Alaves_Athletic_Club'
 'team_pair_Alaves_Atletico_Madrid' 'team_pair_Alaves_Barcelona'
 'team_pair_Alaves_Cadiz' 'team_pair_Alaves_Celta_Vigo'
 'team_pair_Alaves_Eibar' 'team_pair_Alaves_Elche'
 'team_pair_Alaves_Espanyol' 'team_pair_Alaves_Getafe'
 'team_pair_Alaves_Granada' 'team_pair_Alaves_Huesca'
 'team_pair_Alaves_Levante' 'team_pair_A