In [None]:
# ============================================================
# COMPLETE END-TO-END ML PIPELINE
# Train → Validate → Test + Cross Validation
# Models: Random Forest + LightGBM (RF Feature Selection)
# ============================================================

# ----------------------------
# 0. IMPORTS
# ----------------------------
import pandas as pd
import numpy as np
import os, time, joblib, warnings
from datetime import datetime

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from lightgbm import LGBMRegressor
import lightgbm

# Suppress every warning from every module for the rest of the session.
# NOTE(review): this also hides deprecation and data-quality warnings raised
# by the libraries imported above; consider narrowing the filter.
warnings.filterwarnings("ignore")

# ----------------------------
# 1. LOAD DATA
# ----------------------------
# Colab-only step: expose Google Drive under /content/drive so the CSV is reachable.
from google.colab import drive
drive.mount('/content/drive')

# `path` is the dataset folder on Drive (kept with its trailing slash).
path = "/content/drive/MyDrive/datasets_4/"
dataset_csv = os.path.join(path, "master_file2_preprocessed_small.csv")
df = pd.read_csv(dataset_csv)

print("Dataset loaded:", df.shape)

# ----------------------------
# 2. FEATURE ENGINEERING
# ----------------------------
# Each derived column is added only when its source column(s) exist in the CSV.
available_cols = set(df.columns)

if "date_of_birth" in available_cols:
    # Unparseable dates are coerced to NaT, which in turn gives a NaN age.
    birth_dates = pd.to_datetime(df["date_of_birth"], errors="coerce")
    df["date_of_birth"] = birth_dates
    # Age is a plain calendar-year difference against the current year.
    df["age"] = datetime.now().year - birth_dates.dt.year

if "injury_reason" in available_cols:
    # 1 when an injury reason is recorded, 0 when the field is missing.
    df["is_injured"] = df["injury_reason"].notna().astype(int)

if {"goals", "appearances"} <= available_cols:
    # The +1 in the denominator presumably guards against zero appearances.
    df["goals_per_match"] = df["goals"] / (df["appearances"] + 1)

# ----------------------------
# 3. CLEANING
# ----------------------------
TARGET = "value"

# Rows with no target value cannot be used for supervised training.
df = df.dropna(subset=[TARGET])

# Columns excluded from modelling; any that are absent are skipped silently.
drop_cols = [
    "player_id", "date_unix", "Unnamed: 0",
    "text", "tweet_date", "game_date",
    "player_name_y"
]
df = df.drop(columns=drop_cols, errors="ignore")

# Remove columns that hold no data at all.
df = df.dropna(axis="columns", how="all")

print("After cleaning:", df.shape)

# Separate the target from the predictors.
y = df[TARGET]
X = df.drop(columns=TARGET)

# Column order of X; used later to label positions in the NumPy feature matrices.
feature_names = list(X.columns)

# ----------------------------
# 4. TRAIN / VALIDATION / TEST SPLIT (60 / 20 / 20)
# ----------------------------
# Stage 1: hold out 20% of all rows as the final test set.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

# Stage 2: 25% of the remaining 80% becomes validation (20% overall),
# which leaves 60% of the data for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    test_size=0.25,
    random_state=42,
)

# Report the size of each partition.
for split_label, split_features in (
    ("Train:", X_train),
    ("Validation:", X_val),
    ("Test:", X_test),
):
    print(split_label, split_features.shape)

# ----------------------------
# 5. IMPUTATION (NUMERIC)
# ----------------------------
# Two imputers are kept so each model is always scored on data that went
# through the same preprocessing as the data it was trained on.

# (a) Train-only imputer: medians come from the training split alone, so the
#     validation rows never influence the statistics used to score on them.
train_imputer = SimpleImputer(strategy="median")
X_train_imp = train_imputer.fit_transform(X_train)
X_val_imp = train_imputer.transform(X_val)

# (b) Final imputer: fitted on train + validation, i.e. the data the final
#     models are trained on, and applied unchanged to the held-out test set.
#     Previously the test set was transformed with the train-only medians
#     while the final training matrix used re-fitted medians.
imputer = SimpleImputer(strategy="median")
X_full_imp = imputer.fit_transform(X_train_full)
X_test_imp = imputer.transform(X_test)

# SimpleImputer discards columns that had no observed value at fit time.
# If that happens, the matrices no longer line up with `feature_names`,
# so fail loudly here instead of mislabelling features later.
expected_width = len(feature_names)
for matrix_name, matrix in (("X_train_imp", X_train_imp), ("X_full_imp", X_full_imp)):
    if matrix.shape[1] != expected_width:
        raise ValueError(
            f"{matrix_name} has {matrix.shape[1]} columns after imputation but "
            f"{expected_width} feature names exist; a column was entirely "
            "missing in the data the imputer was fitted on."
        )

# ============================================================
# 6. RANDOM FOREST (BASE MODEL + CV)
# ============================================================
print("\n================ RANDOM FOREST =================")

rf_params = {
    "n_estimators": 300,
    "max_depth": 20,
    "min_samples_split": 2,
    "n_jobs": -1,
    "random_state": 42,
}
rf = RandomForestRegressor(**rf_params)

# Shuffled 5-fold splitter, shared by every cross-validation run in this script.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# ---- Cross-validation on the training split only.
# The scorer returns negated RMSE, so the sign is flipped back.
rf_fold_scores = cross_val_score(
    rf,
    X_train_imp,
    y_train,
    scoring="neg_root_mean_squared_error",
    cv=cv,
    n_jobs=-1,
)
rf_cv_rmse = -rf_fold_scores

print("RF CV RMSE (Train):", rf_cv_rmse.mean())

# ---- Fit on the training split and score on validation.
rf.fit(X_train_imp, y_train)

y_val_rf = rf.predict(X_val_imp)
rf_val_rmse = np.sqrt(mean_squared_error(y_val, y_val_rf))
rf_val_r2 = r2_score(y_val, y_val_rf)
print("RF Validation RMSE:", rf_val_rmse)
print("RF Validation R² :", rf_val_r2)

# ---- Refit the same estimator on train + validation for the final model.
rf.fit(X_full_imp, y_train_full)

# ---- Score the final model on the held-out test set.
y_test_rf = rf.predict(X_test_imp)
rf_test_rmse = np.sqrt(mean_squared_error(y_test, y_test_rf))
rf_test_r2 = r2_score(y_test, y_test_rf)
print("RF Test RMSE:", rf_test_rmse)
print("RF Test R² :", rf_test_r2)

# ============================================================
# 7. RF FEATURE SELECTION
# ============================================================
# Rank features by the importances of the fitted random forest, highest first.
# NOTE(review): the importances come from `rf` as it was last fitted; if that
# fit included the validation rows, validation scores computed on the selected
# features will be somewhat optimistic.
importance_values = rf.feature_importances_
rf_importance = pd.Series(importance_values, index=feature_names)
rf_importance = rf_importance.sort_values(ascending=False)

# Keep the leading features whose running importance total stays within 0.95.
cumulative = rf_importance.cumsum()
within_budget = cumulative <= 0.95
selected_features = cumulative.loc[within_budget].index.tolist()

# Fallback: if fewer than 20 features qualify, take the 30 most important instead.
if len(selected_features) < 20:
    selected_features = rf_importance.iloc[:30].index.tolist()

print("\nSelected RF Features:", len(selected_features))
print(selected_features)

# Translate the selected names into column positions of the imputed matrices.
idx_map = {name: position for position, name in enumerate(feature_names)}
sel_idx = [idx_map[name] for name in selected_features]

# Apply the same column subset to every split.
X_train_sel, X_val_sel, X_test_sel, X_full_sel = (
    matrix[:, sel_idx]
    for matrix in (X_train_imp, X_val_imp, X_test_imp, X_full_imp)
)

# ============================================================
# 8. LIGHTGBM (RF-SELECTED FEATURES + CV)
# ============================================================
print("\n================ LIGHTGBM =================")

lgb = LGBMRegressor(
    n_estimators=500,
    learning_rate=0.03,
    max_depth=12,
    subsample=0.8,
    # LightGBM only performs row subsampling when the bagging frequency is
    # non-zero; without this, `subsample=0.8` is silently ignored.
    subsample_freq=1,
    colsample_bytree=0.8,
    min_child_samples=10,
    objective="regression",
    n_jobs=-1,
    random_state=42
)

# ---- Cross Validation (Train only)
# The scorer returns negated RMSE, hence the leading minus sign.
lgb_cv_rmse = -cross_val_score(
    lgb, X_train_sel, y_train,
    scoring="neg_root_mean_squared_error",
    cv=cv, n_jobs=-1
)

print("LGB CV RMSE (Train):", lgb_cv_rmse.mean())

# ---- Train
# Early stopping is monitored on the VALIDATION split. The test set must not
# be used here: choosing the number of trees on test data would leak it into
# the model and make the reported test metrics optimistic.
lgb.fit(
    X_train_sel, y_train,
    eval_set=[(X_val_sel, y_val)],
    eval_metric="rmse",
    callbacks=[lightgbm.early_stopping(50, verbose=False)]
)

# Number of boosting rounds chosen by early stopping; fall back to the
# configured maximum if no best iteration was recorded.
best_rounds = lgb.best_iteration_ or lgb.get_params()["n_estimators"]

# ---- Validation
# Predictions use the best iteration found above, so this score is the one
# early stopping optimised (mildly optimistic); the test score below is the
# unbiased estimate.
y_val_lgb = lgb.predict(X_val_sel)
print("LGB Validation RMSE:", np.sqrt(mean_squared_error(y_val, y_val_lgb)))
print("LGB Validation R² :", r2_score(y_val, y_val_lgb))

# ---- Final Train (Train + Validation)
# Refit on all non-test data with the round count selected on validation.
# No eval_set / early stopping here, so the test set stays untouched.
lgb.set_params(n_estimators=best_rounds)
lgb.fit(X_full_sel, y_train_full)

# ---- Test
y_test_lgb = lgb.predict(X_test_sel)
print("LGB Test RMSE:", np.sqrt(mean_squared_error(y_test, y_test_lgb)))
print("LGB Test R² :", r2_score(y_test, y_test_lgb))

# ============================================================
# 9. SAVE MODELS & FEATURES
# ============================================================
# Output folder lives next to the dataset on Drive; created if missing.
models_dir = path + "models_final/"
os.makedirs(models_dir, exist_ok=True)

joblib.dump(rf, os.path.join(models_dir, "random_forest_final.joblib"))
joblib.dump(lgb, os.path.join(models_dir, "lightgbm_rf_selected_final.joblib"))

# Both models were trained on median-imputed arrays, so the fitted imputer is
# required to prepare new data the same way at inference time. Persist it
# alongside the models (additive: the existing files are unchanged).
joblib.dump(imputer, os.path.join(models_dir, "median_imputer.joblib"))

# Selected feature names, in the column order the LightGBM model was trained
# on, together with their random-forest importance.
selected_table = pd.DataFrame({
    "feature": selected_features,
    "rf_importance": rf_importance[selected_features].values
})
selected_table.to_csv(
    os.path.join(models_dir, "rf_selected_features.csv"), index=False
)

print("\n✔ ALL MODELS AND FEATURES SAVED SUCCESSFULLY")


Mounted at /content/drive
Dataset loaded: (50000, 161)
After cleaning: (48693, 151)
Train: (29215, 150)
Validation: (9739, 150)
Test: (9739, 150)

RF CV RMSE (Train): 3963221.223524092
RF Validation RMSE: 3751508.910542535
RF Validation R² : 0.7853036336011671
RF Test RMSE: 2451479.6815557266
RF Test R² : 0.8962854123091065

Selected RF Features: 32
['current_club_id', 'contract_expires', 'player_agent_id', 'height', 'position_Attack - Centre-Forward', 'team_id', 'outfitter_adidas', 'position_Midfield - Attacking Midfield', 'minutes_played', 'outfitter_Nike', 'outfitter_Skechers', 'position_Attack - Right Winger', 'days_missed', 'goals', 'on_loan_from_club_id', 'position_Attack - Left Winger', 'season_name_x', 'main_position_Attack', 'games_missed', 'position_Midfield - Defensive Midfield', 'position_Midfield - Central Midfield', 'foot_left', 'nb_on_pitch', 'season_name_y_24/25', 'subed_in', 'assists', 'foot_both', 'main_position_Midfield', 'nb_in_group', 'position_Defender - Centre-Ba