In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

# Load the cleaned, imputed master dataset from a gzip-compressed CSV that is
# expected in the notebook's working directory, then display (rows, columns).
df = pd.read_csv("master_dataset_imputed_v1.csv.gz", compression="gzip")

df.shape


(92671, 63)

In [2]:
def create_base_transformations(df, ref_date=None, skip_cols=("market_value",)):
    """Parse the date columns, derive ``age`` and add log1p copies of skewed numerics.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied; the caller's frame is never modified.
    ref_date : datetime-like, optional
        Date against which ``age`` is measured. Defaults to
        ``pd.Timestamp.today()``; pass a fixed date to get the same ages on
        every run.
    skip_cols : str or iterable of str, optional
        Numeric columns that must never receive a ``<col>_log`` companion.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which ``date_of_birth``, ``joined`` and
        ``contract_expires`` are datetime columns (created as all-NaT when
        absent), plus ``age`` and one ``<col>_log`` column for every numeric
        column that is strictly positive with ``|skew| > 1``.
    """
    df = df.copy()

    # A missing date column becomes an all-NaT datetime column instead of a
    # column of None objects, so the date arithmetic below cannot raise.
    for date_col in ('date_of_birth', 'joined', 'contract_expires'):
        if date_col in df.columns:
            df[date_col] = pd.to_datetime(df[date_col], errors='coerce')
        else:
            df[date_col] = pd.Series(pd.NaT, index=df.index, dtype='datetime64[ns]')

    ref = pd.Timestamp.today() if ref_date is None else pd.Timestamp(ref_date)
    # NOTE: 365-day years ignore leap days, so the floored age can be one year
    # high for rows whose birthday falls shortly after the reference date.
    age_years = (ref - df['date_of_birth']).dt.days / 365
    df['age'] = np.floor(age_years).replace([np.inf, -np.inf], np.nan)

    if isinstance(skip_cols, str):
        skip_cols = (skip_cols,)
    excluded = set(skip_cols)

    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    for c in num_cols:
        if c in excluded:
            continue
        # Skip columns that are themselves the log companion of another column,
        # so calling this function twice does not produce `x_log_log`.
        if c.endswith("_log") and c[:-len("_log")] in df.columns:
            continue

        observed = df[c].dropna()
        if observed.empty:
            continue
        # log1p is applied only to strictly positive, strongly skewed columns.
        if abs(observed.skew()) > 1 and (observed > 0).all():
            df[f"{c}_log"] = np.log1p(df[c])

    return df

# Apply the base transformations and show the new (rows, columns).
# `df` is rebound to the result, so re-running this cell feeds the
# already-transformed frame back into the function.
df = create_base_transformations(df)
df.shape


  df['contract_expires'] = pd.to_datetime(df.get('contract_expires'), errors='coerce')


(92671, 66)

In [3]:
def build_football_features(df):
    """Add football-specific derived features to a copy of ``df``.

    Every feature is optional and is created only when its source columns exist:

    * ``<col>_rolling_5`` / ``<col>_rolling_10`` - per-``player_id`` rolling means
      (``min_periods=1``) of the performance columns.
    * ``goal_involv_per90`` - goals plus assists per 90, NaN treated as 0.
    * ``injury_risk`` - total days out times mean days out, NaN treated as 0.
    * ``mv_gap_from_peak`` / ``mv_pct_from_peak`` - distance of ``pv_mv_current``
      from ``pv_mv_max`` (NaN where the peak is 0).
    * ``age_at_join`` - whole 365-day years between ``date_of_birth`` and ``joined``.
    * ``position_code`` - ordinal code for ``main_position`` (3 attack,
      2 midfield, 1 defence, 0 goalkeeper); unknown or missing values get 1.
    * ``social_hype`` - tweet count times normalised sentiment, NaN treated as 0.

    Infinite values anywhere in the result are replaced by NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied; the caller's frame is never modified.

    Returns
    -------
    pandas.DataFrame
        The copy with the derived columns appended.
    """
    df = df.copy()

    id_col = "player_id" if "player_id" in df.columns else None

    if id_col:
        rolling_cols = [
            "perf_goals", "perf_assists", "perf_minutes_played",
            "goals_per90", "assists_per90"
        ]
        # NOTE(review): the windows follow the current row order inside each
        # player and include the current row; presumably rows are already in
        # chronological order - confirm, or sort before calling.
        for col in rolling_cols:
            if col not in df.columns:
                continue
            per_player = df.groupby(id_col)[col]
            for window in (5, 10):
                df[f"{col}_rolling_{window}"] = per_player.transform(
                    lambda x, w=window: x.rolling(w, min_periods=1).mean()
                )

    if "goals_per90" in df.columns and "assists_per90" in df.columns:
        df["goal_involv_per90"] = df["goals_per90"].fillna(0) + df["assists_per90"].fillna(0)

    if "inj_total_days_out" in df.columns and "inj_mean_days_out" in df.columns:
        df["injury_risk"] = df["inj_total_days_out"].fillna(0) * df["inj_mean_days_out"].fillna(0)

    if "pv_mv_current" in df.columns and "pv_mv_max" in df.columns:
        # NOTE(review): both features are functions of `pv_mv_current`, which a
        # later cell uses as the target column - exclude them from any model
        # that predicts that target.
        df["mv_gap_from_peak"] = df["pv_mv_max"] - df["pv_mv_current"]
        df["mv_pct_from_peak"] = df["mv_gap_from_peak"] / df["pv_mv_max"].replace(0, np.nan)

    if "joined" in df.columns and "date_of_birth" in df.columns:
        # Coerce locally so the feature also works when the caller has not
        # parsed the date columns yet; the stored columns are left untouched.
        joined_on = pd.to_datetime(df["joined"], errors="coerce")
        born_on = pd.to_datetime(df["date_of_birth"], errors="coerce")
        df["age_at_join"] = np.floor((joined_on - born_on).dt.days / 365)

    position_map = {
        "Forward": 3, "Attacker": 3, "Attack": 3, "Winger": 3, "Striker": 3,
        "Midfielder": 2, "Midfield": 2, "CM": 2, "AM": 2,
        "Defender": 1, "Defence": 1, "Defense": 1, "CB": 1, "LB": 1, "RB": 1,
        "Goalkeeper": 0, "GK": 0
    }

    if "main_position" in df.columns:
        # Match labels case-insensitively and ignore surrounding whitespace so
        # that spelling variants do not silently fall through to the default.
        lookup = {label.casefold(): code for label, code in position_map.items()}

        def _position_code(value):
            if isinstance(value, str):
                return lookup.get(value.strip().casefold(), np.nan)
            return np.nan

        # Unknown or missing positions keep the historical default of 1.
        df["position_code"] = df["main_position"].map(_position_code).fillna(1)

    if "tweet_count" in df.columns and "sentiment_score_norm" in df.columns:
        df["social_hype"] = df["tweet_count"].fillna(0) * df["sentiment_score_norm"].fillna(0)

    return df.replace([np.inf, -np.inf], np.nan)

# Append the football-specific features and show the new (rows, columns).
# `df` is rebound to the result of the call.
df = build_football_features(df)
df.shape


(92671, 83)

In [5]:
# Fail with a clear message when this cell is run before the cells that define `df`.
if 'df' not in globals():
    raise RuntimeError("DataFrame 'df' not found. Run earlier FE cells first.")

# Column whose values drive the target encoding of the categorical columns.
target_col = 'pv_mv_current'
# Stop early, listing the numeric columns that are available, if the target is absent.
if target_col not in df.columns:
    raise KeyError(f"target_col '{target_col}' not in df columns. Available numeric columns: {df.select_dtypes(include=[np.number]).columns.tolist()}")

def _cv_target_encode_series(series, y, n_splits=5, smoothing=1.0, seed=42):
    """Out-of-fold, smoothed target encoding of a single categorical series.

    Each row is encoded with statistics computed on the other folds only:
    ``(category_mean * n + global_mean * smoothing) / (n + smoothing)``, where
    ``n`` is the number of non-missing target values seen for that category in
    the training folds. Categories that do not occur in the training folds get
    ``global_mean``.

    Parameters
    ----------
    series : pandas.Series
        Categorical values to encode.
    y : pandas.Series
        Numeric target with the same length as ``series``; rows are paired by
        position, not by index label.
    n_splits : int
        Number of shuffled K-fold splits.
    smoothing : float
        Weight of the global mean in the smoothed estimate.
    seed : int
        ``random_state`` for the fold shuffle.

    Returns
    -------
    pandas.Series
        Float encodings indexed like ``series``.
    """
    # Pair rows by position so the result does not depend on `series` and `y`
    # carrying identical index labels.
    categories = series.reset_index(drop=True)
    target = y.reset_index(drop=True)

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    # The prior is the mean of the full target, validation rows included.
    global_mean = float(target.mean())

    encoded = np.full(len(categories), np.nan, dtype=float)
    for train_idx, val_idx in kf.split(categories):
        # `count` (not `size`) so that rows with a missing target do not
        # inflate the weight given to the category mean, which skips them.
        stats = (
            target.iloc[train_idx]
            .groupby(categories.iloc[train_idx])
            .agg(['mean', 'count'])
        )
        smooth = (stats['mean'] * stats['count'] + global_mean * smoothing) / (stats['count'] + smoothing)
        encoded[val_idx] = categories.iloc[val_idx].map(smooth).fillna(global_mean).to_numpy()

    return pd.Series(encoded, index=series.index, dtype=float).fillna(global_mean)

def encode_categoricals_target(df, target_col, n_splits=5, smoothing=1.0):
    """Swap every object/category column for an out-of-fold target encoding.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the categorical columns and ``target_col``; it is copied.
    target_col : str
        Column whose values are averaged per category.
    n_splits : int
        Fold count forwarded to ``_cv_target_encode_series``.
    smoothing : float
        Smoothing weight forwarded to ``_cv_target_encode_series``.

    Returns
    -------
    tuple[pandas.DataFrame, dict]
        The copy with each categorical column replaced by ``<col>_te``, and a
        mapping from original column name to encoded column name.
    """
    frame = df.copy()

    categorical = frame.select_dtypes(include=['object','category']).columns.tolist()
    if target_col in categorical:
        categorical.remove(target_col)

    # Nothing to encode: hand back the untouched copy.
    if not categorical:
        print("[INFO] No categorical columns to encode.")
        return frame, {}

    target = pd.Series(frame[target_col]).reset_index(drop=True)
    original_index = frame.index
    name_map = {}
    print(f"[INFO] Encoding {len(categorical)} categorical columns using target: '{target_col}'")

    for source_name in categorical:
        # Missing values form their own category before encoding.
        as_text = frame[source_name].fillna("___NA___").astype(str).reset_index(drop=True)
        encoded = _cv_target_encode_series(
            as_text, target, n_splits=n_splits, smoothing=smoothing
        )
        # Restore the frame's own index so the assignment lines up row for row.
        encoded.index = original_index
        encoded_name = f"{source_name}_te"
        frame[encoded_name] = encoded
        name_map[source_name] = encoded_name

    frame = frame.drop(columns=categorical)
    return frame, name_map

# Target-encode every object/category column of `df` against `target_col`
# with 5 folds; the original categorical columns are dropped from the result.
# NOTE(review): `df_encoded` still contains `pv_mv_current` itself and the
# features derived from it (`mv_gap_from_peak`, `mv_pct_from_peak`); drop them
# before fitting any model that predicts this target.
df_encoded, encoders = encode_categoricals_target(df, target_col=target_col, n_splits=5)
print("Encoded columns count:", len(encoders))
print("Sample encoded columns:", list(encoders.items())[:10])

# Persist the encoded frame as a gzip-compressed CSV without the index column.
out_file = "master_dataset_fe_v1_encoded.csv.gz"
df_encoded.to_csv(out_file, index=False, compression='gzip')
print("Saved encoded FE dataset to:", out_file)


[INFO] Encoding 13 categorical columns using target: 'pv_mv_current'
Encoded columns count: 13
Sample encoded columns: [('player_name', 'player_name_te'), ('country_of_birth', 'country_of_birth_te'), ('citizenship', 'citizenship_te'), ('position', 'position_te'), ('main_position', 'main_position_te'), ('foot', 'foot_te'), ('current_club_name', 'current_club_name_te'), ('tr_first_transfer_date', 'tr_first_transfer_date_te'), ('tr_last_transfer_date', 'tr_last_transfer_date_te'), ('team_club_name', 'team_club_name_te')]
Saved encoded FE dataset to: master_dataset_fe_v1_encoded.csv.gz


In [6]:
# Quick checks on the encoded frame: dimensions, first rows, and the
# 20 columns with the highest share of missing values.
print("Shape:", df_encoded.shape)
display(df_encoded.head())
# Per-column missing-value count and percentage, most incomplete first.
missing = pd.DataFrame({
    "Missing_Count": df_encoded.isnull().sum(),
    "Missing_Pct": (df_encoded.isnull().mean() * 100).round(2)
}).sort_values("Missing_Pct", ascending=False)
display(missing.head(20))


Shape: (92671, 83)


Unnamed: 0,player_id,date_of_birth,height,is_eu,current_club_id,joined,contract_expires,player_agent_id,social_media_url_has_url,second_club_url_has_url,...,position_te,main_position_te,foot_te,current_club_name_te,tr_first_transfer_date_te,tr_last_transfer_date_te,team_club_name_te,team_country_name_te,team_competition_id_te,_player_key_te
0,1,1980-09-23,0.0,True,123,2017-01-07,NaT,0.0,0,0,...,966700.0,717035.157715,28314.428496,43.87111,7408.799696,738496.919038,53829.52,53829.520243,53829.52,570477.576588
1,100011,1988-08-04,171.0,False,515,2025-01-03,NaT,8373.0,0,0,...,524205.4,539456.58275,748294.798072,64216.18,164257.739992,46238.116563,53829.52,53829.520243,53829.52,570477.576588
2,10,1978-06-09,184.0,True,123,2016-01-07,NaT,1126.0,1,0,...,615674.1,728567.306046,723256.677195,43.95573,1634.606237,213381.553291,53826.87,53826.870403,53826.87,570477.576588
3,10001,1981-10-12,183.0,True,123,2013-01-07,NaT,0.0,0,0,...,524205.4,539456.58275,545689.002696,43.87111,24673.044981,115109.344533,53829.52,53829.520243,53829.52,570477.576588
4,100001,1991-03-16,180.0,False,14554,NaT,2025-12-31,5240.0,1,0,...,1029637.0,718581.94216,732643.251275,1051575.0,902424.436393,143031.838439,1051575.0,516048.677653,1180506.0,570477.576588


Unnamed: 0,Missing_Count,Missing_Pct
contract_expires,54005,58.28
age_at_join,26042,28.1
joined,25317,27.32
mv_pct_from_peak,23267,25.11
age,1006,1.09
date_of_birth,1006,1.09
height,0,0.0
player_agent_id,0,0.0
player_id,0,0.0
current_club_id,0,0.0
