Importaciones y carga de datos

In [None]:
import re
import numpy as np
import pandas as pd
import kagglehub
import joblib
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error


# Download the raw FIFA 19 player dataset through kagglehub; `path` is the
# local directory the dataset files end up in.
path = kagglehub.dataset_download("javagarm/fifa-19-complete-player-dataset")

# The player table is the kl.csv file inside that directory (read as latin1).
data = pd.read_csv(f"{path}/kl.csv", encoding="latin1")
data.head()


Using Colab cache for faster access to the 'fifa-19-complete-player-dataset' dataset.


Unnamed: 0.1,Unnamed: 0,ID,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,...,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause
0,0,158023,L. Messi,31.0,https://cdn.sofifa.org/players/4/19/158023.png,Argentina,https://cdn.sofifa.org/flags/52.png,94.0,94,FC Barcelona,...,96.0,33.0,28.0,26.0,6.0,11.0,15.0,14.0,8.0,226.5M
1,1,20801,Cristiano Ronaldo,33.0,https://cdn.sofifa.org/players/4/19/20801.png,Portugal,https://cdn.sofifa.org/flags/38.png,94.0,94,Juventus,...,95.0,28.0,31.0,23.0,7.0,11.0,15.0,14.0,11.0,127.1M
2,2,190871,Neymar Jr,26.0,https://cdn.sofifa.org/players/4/19/190871.png,Brazil,https://cdn.sofifa.org/flags/54.png,92.0,93,Paris Saint-Germain,...,94.0,27.0,24.0,33.0,9.0,9.0,15.0,15.0,11.0,228.1M
3,3,193080,De Gea,27.0,https://cdn.sofifa.org/players/4/19/193080.png,Spain,https://cdn.sofifa.org/flags/45.png,91.0,93,Manchester United,...,68.0,15.0,21.0,13.0,90.0,85.0,87.0,88.0,94.0,138.6M
4,4,192985,K. De Bruyne,27.0,https://cdn.sofifa.org/players/4/19/192985.png,Belgium,https://cdn.sofifa.org/flags/7.png,91.0,92,Manchester City,...,88.0,68.0,58.0,51.0,15.0,13.0,5.0,10.0,13.0,196.4M


Funciones utilitarias

In [None]:
def value_to_number(valor: str):
    """Convert a money-like value such as '€110.5M' or '€565K' to a float.

    Every character other than digits, '.', '-' and the K/M suffix letters is
    removed before parsing. An M/m suffix multiplies the number by 1,000,000
    and a K/k suffix by 1,000.

    Args:
        valor: Raw value. Usually a string; other scalars go through ``str``.

    Returns:
        float: the parsed amount; ``0.0`` when nothing (or just '0') is left
        after cleaning; ``np.nan`` when the input is missing or the cleaned
        text is not a valid number (e.g. free text that happens to contain
        an 'M' or 'K').
    """
    if pd.isna(valor):
        return np.nan

    s = re.sub(r'[^0-9KMkm.\-]', '', str(valor)).strip()
    if s == '' or s == '0':
        return 0.0

    # Unparsable remainders yield NaN instead of raising, so one dirty cell
    # does not abort a whole Series.apply and the value can be imputed later
    # (same contract as height_to_cm / weight_to_kg).
    try:
        if 'M' in s or 'm' in s:
            return float(s.replace('M', '').replace('m', '')) * 1_000_000
        if 'K' in s or 'k' in s:
            return float(s.replace('K', '').replace('k', '')) * 1_000
        return float(s)
    except ValueError:
        return np.nan

def height_to_cm(h):
    """Convert a feet'inches height string (e.g. "5'7") to centimetres.

    Args:
        h: Height text with exactly one apostrophe between feet and inches.
            A trailing inch mark (as in 5'7") is accepted.

    Returns:
        float: feet * 30.48 + inches * 2.54, or ``np.nan`` when ``h`` is not a
        string, has no apostrophe, or either part is not a number.
    """
    if not isinstance(h, str) or "'" not in h:
        return np.nan
    try:
        # Unpacking fails (ValueError) when there is more than one apostrophe.
        feet, inches = h.split("'")
        return float(feet) * 30.48 + float(inches.strip().rstrip('"')) * 2.54
    except ValueError:
        # Only malformed numbers are treated as missing; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate.
        return np.nan

def weight_to_kg(w):
    """Convert a pounds string such as '159lbs' to kilograms.

    Args:
        w: Weight text containing the literal suffix 'lbs'.

    Returns:
        float: pounds * 0.453592, or ``np.nan`` when ``w`` is not a string,
        lacks 'lbs', or the remaining text is not a number.
    """
    if not isinstance(w, str) or 'lbs' not in w:
        return np.nan
    try:
        return float(w.replace('lbs', '')) * 0.453592
    except ValueError:
        # Only a malformed number counts as missing; other exceptions
        # (e.g. KeyboardInterrupt) must not be swallowed.
        return np.nan


Subtransformadores

In [None]:
# Outfield players
class FieldFeatureBuilder(BaseEstimator, TransformerMixin):
    """Build the numeric feature matrix used for outfield players.

    The output columns, in order, are ``out_features``: age clipped to
    [15, 45] and its square, height in cm, weight in kg, the two halves of
    "Work Rate" encoded through ``rate_map``, ``Potential``, ``Special`` and
    ``log1p`` of the scaled wage. Missing values are imputed with the
    per-column medians learned in ``fit``.

    Input columns that are absent are treated as entirely missing instead of
    raising, so they end up imputed like any other missing value.
    """

    # Factor applied to the parsed wage before the log transform.
    # NOTE(review): presumably a EUR -> USD conversion rate; confirm.
    _WAGE_TO_USD = 1.12

    def __init__(self):
        # Ordinal encoding for each half of the "Attack/Defense" work-rate text.
        self.rate_map = {'Low':0, 'Medium':1, 'High':2}
        # Names and order of the columns in the matrix returned by transform().
        self.out_features = [
            'Age','Age2','HeightCM','WeightKG',
            'WorkRate_Att','WorkRate_Def',
            'Potential','Special','Log_WageUSD'
        ]
        # Per-feature medians; filled in by fit().
        self.medians_ = {}

    def fit(self, X, y=None):
        """Learn the per-feature medians used for imputation.

        Args:
            X: Raw player rows (anything ``pd.DataFrame`` accepts).
            y: Ignored; present for scikit-learn API compatibility.

        Returns:
            self
        """
        feats = self._transform(X)
        # A column with no valid value has a NaN median; fall back to 0 (the
        # same default transform() uses) so imputation never leaves NaNs.
        self.medians_ = feats.median().fillna(0).to_dict()
        return self

    def transform(self, X):
        """Return the engineered features as a 2-D NumPy array.

        Args:
            X: Raw player rows (anything ``pd.DataFrame`` accepts).

        Returns:
            np.ndarray of shape (n_rows, len(out_features)).
        """
        feats = self._transform(X)
        fill_values = {c: self.medians_.get(c, 0) for c in self.out_features}
        return feats.fillna(value=fill_values).to_numpy()

    @staticmethod
    def _column(X, name, default=np.nan):
        """Return column ``name`` of X, or a Series of ``default`` if absent."""
        if name in X.columns:
            return X[name]
        return pd.Series(default, index=X.index)

    def _work_rate_part(self, value, position):
        """Encode one side (0 = attack, 1 = defense) of a work-rate value."""
        parts = str(value).split('/', 1)
        if position >= len(parts):
            # No '/' in the text (e.g. a missing value): nothing to encode.
            return np.nan
        return self.rate_map.get(parts[position].strip(), np.nan)

    def _transform(self, X):
        """Compute the (not yet imputed) feature frame for X."""
        X = pd.DataFrame(X)
        # Build only the needed columns on a fresh frame; the caller's data is
        # never modified and the unused raw columns are not copied.
        feats = pd.DataFrame(index=X.index)

        wage = self._column(X, 'Wage').apply(value_to_number).astype(float)
        feats['Log_WageUSD'] = np.log1p(wage * self._WAGE_TO_USD)
        feats['HeightCM'] = self._column(X, 'Height').apply(height_to_cm).astype(float)
        feats['WeightKG'] = self._column(X, 'Weight').apply(weight_to_kg).astype(float)

        # Encode element-wise so a batch where no row contains '/' still
        # produces both columns (as NaN) instead of failing.
        work_rate = self._column(X, 'Work Rate', 'Medium/Medium')
        feats['WorkRate_Att'] = work_rate.map(
            lambda v: self._work_rate_part(v, 0)
        ).astype(float)
        feats['WorkRate_Def'] = work_rate.map(
            lambda v: self._work_rate_part(v, 1)
        ).astype(float)

        feats['Age'] = pd.to_numeric(self._column(X, 'Age'), errors='coerce').clip(15, 45)
        feats['Age2'] = feats['Age'] ** 2

        for passthrough in ('Potential', 'Special'):
            feats[passthrough] = pd.to_numeric(
                self._column(X, passthrough), errors='coerce'
            )

        return feats[self.out_features]
# Goalkeepers
class GKFeatureBuilder(BaseEstimator, TransformerMixin):
    """Select the goalkeeper attribute columns as numbers and impute gaps.

    The columns listed in ``gk_features`` are coerced to numeric (values that
    cannot be parsed become NaN) and missing entries are replaced with the
    per-column medians learned in ``fit``.
    """

    def __init__(self):
        # Columns used as features, in output order.
        self.gk_features = ['GKDiving','GKHandling','GKKicking','GKPositioning','GKReflexes']
        # Per-column medians; populated by fit().
        self.medians_ = {}

    def fit(self, X, y=None):
        """Store the median of every goalkeeper column; ``y`` is ignored."""
        self.medians_ = self._transform(X).median().to_dict()
        return self

    def transform(self, X):
        """Return the imputed goalkeeper columns as a NumPy array."""
        numeric = self._transform(X)
        # Columns without a stored median fall back to 0.
        fill_values = {
            name: self.medians_.get(name, 0) for name in self.gk_features
        }
        return numeric.fillna(value=fill_values).to_numpy()

    def _transform(self, X):
        """Coerce the goalkeeper columns of X to numeric on a copy of the data."""
        frame = pd.DataFrame(X)
        coerced = {
            name: pd.to_numeric(frame.get(name), errors='coerce')
            for name in self.gk_features
        }
        # assign() works on a copy, so the caller's frame is left untouched.
        return frame.assign(**coerced)[self.gk_features]


Modelo combinado

In [None]:
class CombinedFifaModel(BaseEstimator, RegressorMixin):
    """Regressor that routes goalkeepers and outfield players to separate models.

    Rows whose ``Position`` equals 'GK' go through ``pipe_gk``
    (GKFeatureBuilder + RandomForestRegressor); every other row goes through
    ``pipe_field`` (FieldFeatureBuilder + RandomForestRegressor).

    Args:
        rf_params_field: Keyword arguments for the outfield random forest.
            ``None`` (or an empty dict) selects the built-in defaults.
        rf_params_gk: Keyword arguments for the goalkeeper random forest.
            ``None`` (or an empty dict) selects the built-in defaults.
    """

    def __init__(self, rf_params_field=None, rf_params_gk=None):
        self.rf_params_field = rf_params_field or dict(
            n_estimators=200, max_depth=32, min_samples_split=10,
            min_samples_leaf=4, max_features=None, bootstrap=True,
            random_state=42, n_jobs=-1
        )
        self.rf_params_gk = rf_params_gk or dict(
            n_estimators=300, max_depth=18, min_samples_split=6,
            min_samples_leaf=2, max_features=None, bootstrap=True,
            random_state=42, n_jobs=-1
        )
        # NOTE(review): both pipelines are built once here, so changing
        # rf_params_* afterwards (e.g. via set_params) does not reach them.
        self.pipe_field = Pipeline([
            ('build', FieldFeatureBuilder()),
            ('model', RandomForestRegressor(**self.rf_params_field))
        ])
        self.pipe_gk = Pipeline([
            ('build', GKFeatureBuilder()),
            ('model', RandomForestRegressor(**self.rf_params_gk))
        ])

    def fit(self, X, y):
        """Fit the outfield and goalkeeper pipelines on their own rows.

        Rows with a missing ``Position`` or a non-numeric/missing target are
        dropped. A pipeline is only fitted when it has at least one row.

        Args:
            X: Raw player rows; must contain a ``Position`` column.
            y: Targets, one per row of X, in the same order. Any array-like
                is accepted; a Series' own index is ignored.

        Returns:
            self

        Raises:
            ValueError: if ``y`` does not have one value per row of X.
        """
        X = pd.DataFrame(X)
        # Pair targets with rows by position, not by index label: X usually
        # carries a shuffled index after train_test_split, while a plain
        # array or list y would otherwise get a fresh 0..n-1 index and be
        # misaligned when the masks below are combined.
        y = pd.Series(np.asarray(y).ravel(), index=X.index)
        y = pd.to_numeric(y, errors='coerce')

        # Drop rows without Position or without a usable target.
        keep = X['Position'].notna() & y.notna()
        X, y = X.loc[keep], y.loc[keep]

        is_gk = X['Position'].eq('GK')

        # Train the outfield model if there is data for it.
        if (~is_gk).any():
            self.pipe_field.fit(X.loc[~is_gk], y.loc[~is_gk])

        # Train the goalkeeper model if there is data for it.
        if is_gk.any():
            self.pipe_gk.fit(X.loc[is_gk], y.loc[is_gk])

        return self

    def predict(self, X):
        """Predict the target for every row, using the model for its position.

        Args:
            X: Raw player rows; must contain a ``Position`` column. Rows whose
                position is missing are treated as outfield players.

        Returns:
            pd.Series of float predictions indexed like X.
        """
        X = pd.DataFrame(X)
        is_gk = X['Position'].eq('GK').to_numpy()

        # Fill a positional buffer so the result does not depend on the index
        # labels of X being unique.
        values = np.full(len(X), np.nan, dtype=float)
        if (~is_gk).any():
            values[~is_gk] = self.pipe_field.predict(X.loc[~is_gk])
        if is_gk.any():
            values[is_gk] = self.pipe_gk.predict(X.loc[is_gk])
        return pd.Series(values, index=X.index)


Entrenamiento y almacenamiento de pipeline

In [None]:
# Work on a copy so the frame loaded above stays untouched.
df_raw = data.copy()

# Target: 'Overall' as a number; anything that cannot be parsed becomes NaN.
y = pd.to_numeric(df_raw['Overall'], errors='coerce')

# Keep only the rows that have a usable target. The feature frame is the raw
# table itself; the model's feature builders pick the columns they need.
nan_mask = y.isna()
X = df_raw.loc[~nan_mask]
y = y.loc[~nan_mask]

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

pipe_fifa = CombinedFifaModel()
pipe_fifa.fit(X_train, y_train)

# Evaluate on the held-out rows.
pred = pipe_fifa.predict(X_test)
r2, mae = r2_score(y_test, pred), mean_absolute_error(y_test, pred)
print(f"R²={r2:.3f}, MAE={mae:.3f}")

# Persist the fitted model next to the notebook.
joblib.dump(pipe_fifa, "pipeline_fifa.pkl")
print("✅ Pipeline unificado guardado como pipeline_fifa.pkl")

R²=0.961, MAE=0.907
✅ Pipeline unificado guardado como pipeline_fifa.pkl


Prueba

In [None]:
# Reload the model saved by the training cell. joblib.load unpickles the file,
# so only load files you produced yourself, and the model classes defined
# above must already exist in the session.
pipe = joblib.load("pipeline_fifa.pkl")

# Usage example on rows taken straight from the original CSV. The sample is
# seeded so the table below is identical on every Restart & Run All.
nuevos_jugadores = df_raw.sample(3, random_state=42)
predicciones = pipe.predict(nuevos_jugadores)
pd.DataFrame({
    'Name': nuevos_jugadores['Name'],
    'Predicción Overall': predicciones
})


Unnamed: 0,Name,Predicción Overall
3461,F. Al Muwallad,71.895801
3632,N. Powell,72.520553
3315,A. Maher,71.134701



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

