In [None]:
import pandas as pd
from google.colab import drive

def safe_load_csv(file_path):
    """Read a CSV into a DataFrame, reporting failures instead of raising.

    Args:
        file_path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The loaded ``DataFrame`` on success, or ``None`` when the file is
        missing or any other error occurs while reading it. In both failure
        cases a message describing the problem is printed.
    """
    try:
        loaded = pd.read_csv(file_path)
    except FileNotFoundError:
        # Missing file gets its own message so the user knows to check the path.
        print(f"Error: The file at {file_path} was not found. Please check the path.")
    except Exception as exc:
        # Anything else (empty file, parse error, permissions, ...) is reported generically.
        print(f"An error occurred while loading the CSV: {exc}")
    else:
        return loaded
    return None

# Make Google Drive visible to the Colab runtime before touching any path under it.
drive.mount('/content/drive')

# Dataset location on the mounted drive.
path = "/content/drive/MyDrive/datasets_2/"
final_path = f"{path}master_file2_preprocessed_small.csv"

# safe_load_csv returns None (after printing the reason) when loading fails.
df = safe_load_csv(final_path)

if df is None:
    print("Failed to load dataset.")
else:
    print("Final dataset loaded:", df.shape)

Mounted at /content/drive
Final dataset loaded: (50000, 161)


In [None]:
import numpy as np
from datetime import datetime

if "date_of_birth" in df.columns:
    # Unparseable dates become NaT rather than raising, so one malformed row
    # cannot abort the cell; those rows end up with a missing age.
    df["date_of_birth"] = pd.to_datetime(df["date_of_birth"], errors='coerce')

    # Age in completed years. Subtracting calendar years alone overstates the
    # age by one for everyone whose birthday has not yet occurred this year,
    # so take one year off for those rows.
    # NOTE(review): age is measured against the day the notebook runs, so the
    # feature changes between runs - consider a fixed reference date.
    today = datetime.now()
    birth = df["date_of_birth"].dt
    birthday_pending = (birth.month > today.month) | (
        (birth.month == today.month) & (birth.day > today.day)
    )
    # For NaT rows both comparisons are False, and the year difference is NaN,
    # so the resulting age stays missing.
    df["age"] = today.year - birth.year - birthday_pending.astype(int)


In [None]:
if "injury_reason" in df.columns:
    # Binary flag: 1 when an injury reason is recorded, 0 when it is missing.
    # Vectorized notna() replaces the row-by-row lambda and yields the same 0/1 values.
    df["is_injured"] = df["injury_reason"].notna().astype("int64")


In [None]:
# Scoring-rate feature, built only when both source columns are present.
if {"goals", "appearances"}.issubset(df.columns):
    # The +1 in the denominator avoids division by zero for players with no
    # appearances; as a consequence the value differs from the plain
    # goals / appearances ratio.
    df["goals_per_match"] = df["goals"].div(df["appearances"].add(1))


In [None]:
numeric_cols = df.select_dtypes(include=['float64','int64']).columns

# Absolute z-score of every numeric cell (column-wise mean / std).
# Cells with missing values, and whole columns that are empty or constant,
# produce NaN here.
z = np.abs((df[numeric_cols] - df[numeric_cols].mean()) / df[numeric_cols].std())

# Outlier filtering is switched off by default so model training can proceed
# on the full data set. Flip the flag to enable it.
APPLY_OUTLIER_FILTER = False
Z_SCORE_LIMIT = 3

if APPLY_OUTLIER_FILTER:
    # A NaN z-score compares False against the limit, so a plain
    # `(z < limit).all(axis=1)` discards every row with any missing numeric
    # value (likely why the earlier version of this filter was too aggressive).
    # Treat NaN as "not an outlier" so only genuinely extreme values remove a row.
    within_limit = (z < Z_SCORE_LIMIT) | z.isna()
    df_no_outliers = df[within_limit.all(axis=1)].copy()
else:
    df_no_outliers = df.copy()

# With the filter disabled the shape printed here equals the input shape.
print("After outlier removal:", df_no_outliers.shape)

After outlier removal: (50000, 161)


In [None]:
# Column to predict; change this if the label lives under another name.
target = "value"

# Rows without a target value cannot be used for supervised training.
has_target = df_no_outliers[target].notna()
df_no_outliers = df_no_outliers[has_target]

# Features are every remaining column; the label is the target column alone.
y = df_no_outliers[target]
X = df_no_outliers.drop(columns=[target])

print("Shapes:", X.shape, y.shape)

Shapes: (48693, 160) (48693,)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print("Train:", X_train.shape, " Test:", X_test.shape)


Train: (38954, 160)  Test: (9739, 160)


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.impute import SimpleImputer
import numpy as np

# Replace missing feature values with the per-column mean learned on the
# training split only; the test split reuses those training means.
# Columns with no observed value at all are dropped by the imputer (it warns
# about them), so the imputed arrays can have fewer columns than X.
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Baseline model: ordinary least squares on all imputed features.
lr = LinearRegression().fit(X_train_imputed, y_train)
y_pred = lr.predict(X_test_imputed)

# Hold-out metrics.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("----- Linear Regression Performance -----")
for label, value in (("MSE:", mse), ("RMSE:", rmse), ("MAE:", mae), ("R²:", r2)):
    print(label, value)

 'tb_emotion' 'game_date' 'tweet_date' 'when']. At least one non-missing value is needed for imputation with strategy='mean'.
 'tb_emotion' 'game_date' 'tweet_date' 'when']. At least one non-missing value is needed for imputation with strategy='mean'.


----- Linear Regression Performance -----
MSE: 49156732348211.16
RMSE: 7011186.229748227
MAE: 2825116.8301904784
R²: 0.1516672078886694


In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# Degree-1 expansion: serves as a sanity check against the plain linear
# baseline, whose recorded RMSE it matches.
poly = PolynomialFeatures(degree=1)
X_train_poly = poly.fit_transform(X_train_imputed)
X_test_poly = poly.transform(X_test_imputed)

model_poly_1 = LinearRegression().fit(X_train_poly, y_train)

# Hold-out error of the degree-1 model.
y_pred_poly_1 = model_poly_1.predict(X_test_poly)
mse_poly_1 = mean_squared_error(y_test, y_pred_poly_1)
rmse_1 = np.sqrt(mse_poly_1)

print("Polynomial Degree 1 → RMSE:", rmse_1)


Polynomial Degree 1 → RMSE: 7011186.229744081


In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# Step 1: shrink the feature set.
# The imputed matrices are plain NumPy arrays (no column names or dtypes to
# select on), so the leading 20 columns are taken positionally.
X_train_small, X_test_small = X_train_imputed[:, :20], X_test_imputed[:, :20]

print("Shapes:", X_train_small.shape, X_test_small.shape)

# Step 2: expand to all degree-2 terms of those 20 columns.
poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(X_train_small)
X_test_poly = poly.transform(X_test_small)

print("Poly shape:", X_train_poly.shape)

# Step 3: fit a linear model on the expanded features and score it on the hold-out set.
model_poly_2 = LinearRegression().fit(X_train_poly, y_train)

y_pred = model_poly_2.predict(X_test_poly)
mse_poly_2 = mean_squared_error(y_test, y_pred)
rmse_2 = np.sqrt(mse_poly_2)

print("Degree 2 RMSE:", rmse_2)


Shapes: (38954, 20) (9739, 20)
Poly shape: (38954, 231)
Degree 2 RMSE: 7189956.039213703


In [None]:
# The imputed matrices are NumPy arrays, so columns are picked by position:
# keep only the leading 10 for the next (degree-3) experiment.
X_train_small = selected_cols = X_train_imputed[:, :10]
X_test_small = X_test_imputed[:, :10]

print("Using the first 10 features.")
print("Shapes:", X_train_small.shape, X_test_small.shape)

Using the first 10 features.
Shapes: (38954, 10) (9739, 10)


In [None]:
# Cubic expansion of the reduced feature matrices prepared in the previous cell.
poly = PolynomialFeatures(degree=3)
X_train_poly = poly.fit_transform(X_train_small)
X_test_poly = poly.transform(X_test_small)

model_poly_3 = LinearRegression().fit(X_train_poly, y_train)

# Hold-out error of the degree-3 model.
y_pred_poly_3 = model_poly_3.predict(X_test_poly)
mse_poly_3 = mean_squared_error(y_test, y_pred_poly_3)
rmse_3 = np.sqrt(mse_poly_3)

print("Polynomial Degree 3 → RMSE:", rmse_3)


Polynomial Degree 3 → RMSE: 7190105.800846882


In [None]:
# ============================================================
# RANDOM FOREST FEATURE SELECTION → LIGHTGBM (OPTIMIZED)
# ============================================================

import numpy as np
import pandas as pd
import joblib, time
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from lightgbm import LGBMRegressor
from scipy import sparse
import lightgbm
import os # Import the os module for directory operations

# ------------------------------------------------------------
# 1. TRAIN STRONG RANDOM FOREST (FULL FEATURES)
# ------------------------------------------------------------

print("\n=== TRAINING RANDOM FOREST (FULL FEATURES) ===")

rf = RandomForestRegressor(
    n_estimators=300,
    max_depth=20,
    min_samples_split=2,
    n_jobs=-1,
    random_state=42
)

# Column names for the imputed matrices.
# The mean imputer discards every column that had no observed value during
# fit; those columns are exactly the ones whose learned statistic is NaN.
# Deriving the kept/skipped lists from the fitted imputer (instead of a
# hand-maintained list) keeps names and matrix columns aligned when the data
# changes.
kept_by_imputer = ~np.isnan(imputer.statistics_)
feature_names = X.columns[kept_by_imputer].tolist()
skipped_imputer_cols = X.columns[~kept_by_imputer].tolist()

# Fail loudly if names and matrix columns ever disagree, rather than
# attaching importances to the wrong feature names further down.
if len(feature_names) != X_train_imputed.shape[1]:
    raise ValueError(
        f"feature_names has {len(feature_names)} entries but the imputed "
        f"training matrix has {X_train_imputed.shape[1]} columns"
    )

# RF prefers dense when feature count is manageable
X_train_rf = X_train_imputed.toarray() if sparse.isspmatrix(X_train_imputed) else X_train_imputed
X_test_rf  = X_test_imputed.toarray()  if sparse.isspmatrix(X_test_imputed) else X_test_imputed

# Time the fit: 300 trees on the full feature set is the slowest step of this cell.
t0 = time.time()
rf.fit(X_train_rf, y_train)
print("RF training time:", round(time.time() - t0, 2), "seconds")

# Hold-out performance of the forest.
y_rf = rf.predict(X_test_rf)
print("RF RMSE:", np.sqrt(mean_squared_error(y_test, y_rf)))
print("RF R² :", r2_score(y_test, y_rf))

# ------------------------------------------------------------
# 2. EXTRACT FEATURE IMPORTANCE
# ------------------------------------------------------------

# Pair each forest importance with its feature name, most important first.
rf_importance = pd.Series(rf.feature_importances_, index=feature_names)
rf_importance = rf_importance.sort_values(ascending=False)

print("\nTop 20 RF Features:")
print(rf_importance.head(20))

# ------------------------------------------------------------
# 3. SELECT TOP FEATURES (RF-BASED)
# ------------------------------------------------------------

# Strategy: walk the features from most to least important and keep those
# whose running importance total is still at or below 0.95.
# (The feature that pushes the total past 0.95 is therefore not included.)
cumulative_importance = rf_importance.cumsum()
within_budget = cumulative_importance <= 0.95
selected_features = within_budget[within_budget].index.tolist()

# Safety fallback: if the cut-off leaves fewer than 20 features, take the top 30 instead.
if len(selected_features) < 20:
    selected_features = rf_importance.index[:30].tolist()

print("\nSelected RF Features Count:", len(selected_features))
print("Selected RF Features:\n", selected_features)

# ------------------------------------------------------------
# 4. REDUCE TRAIN / TEST MATRICES TO SELECTED FEATURES
# ------------------------------------------------------------

# Map every feature name to its column position in the imputed matrices,
# then look up the positions of the selected features (order preserved).
feature_index_map = dict(zip(feature_names, range(len(feature_names))))
selected_indices = list(map(feature_index_map.__getitem__, selected_features))

# Column-slice both splits down to the selected features.
X_train_sel = X_train_imputed[:, selected_indices]
X_test_sel  = X_test_imputed[:, selected_indices]

print("Reduced Train Shape:", X_train_sel.shape)
print("Reduced Test Shape :", X_test_sel.shape)

# ------------------------------------------------------------
# 5. TRAIN LIGHTGBM USING RF-SELECTED FEATURES
# ------------------------------------------------------------

print("\n=== TRAINING LIGHTGBM (RF-SELECTED FEATURES) ===")

# Local re-import keeps this section runnable on its own.
from sklearn.model_selection import train_test_split

# Early stopping needs a monitoring set. Using the test set for that would
# tune the number of trees on the very data the final score is reported on
# (leakage, optimistic metrics). Carve a validation slice out of the training
# data instead and keep the test set untouched until the final evaluation.
X_fit_sel, X_val_sel, y_fit, y_val = train_test_split(
    X_train_sel, y_train, test_size=0.15, random_state=42
)

lgb = LGBMRegressor(
    n_estimators=500,
    learning_rate=0.03,
    max_depth=12,
    subsample=0.8,
    # Row subsampling only takes effect when a bagging frequency is set;
    # with the default frequency of 0, `subsample` is ignored.
    subsample_freq=1,
    colsample_bytree=0.8,
    min_child_samples=10,
    objective="regression",
    n_jobs=-1,
    random_state=42
)

t0 = time.time()
lgb.fit(
    X_fit_sel,
    y_fit,
    eval_set=[(X_val_sel, y_val)],
    eval_metric="rmse",
    # Stop once validation RMSE has not improved for 50 consecutive rounds.
    callbacks=[lightgbm.early_stopping(50, verbose=False)]
)
print("LGB training time:", round(time.time() - t0, 2), "seconds")

# Final, unbiased evaluation on the held-out test set.
y_lgb = lgb.predict(X_test_sel)

print("\n=== FINAL LIGHTGBM PERFORMANCE ===")
print("LGB RMSE:", np.sqrt(mean_squared_error(y_test, y_lgb)))
print("LGB R² :", r2_score(y_test, y_lgb))

# ------------------------------------------------------------
# 6. LIGHTGBM FEATURE IMPORTANCE (GAIN)
# ------------------------------------------------------------

# `lgb.feature_importances_` reports split counts unless the estimator was
# built with importance_type="gain". Ask the underlying booster for gain
# explicitly so the numbers match this section's "GAIN" heading.
lgb_importance = pd.Series(
    lgb.booster_.feature_importance(importance_type="gain"),
    index=selected_features
).sort_values(ascending=False)

print("\nTop 20 LightGBM Features:")
print(lgb_importance.head(20))

# ------------------------------------------------------------
# 7. SAVE ARTIFACTS
# ------------------------------------------------------------

# Output folder on the mounted drive; created on first use.
models_dir = "/content/drive/MyDrive/datasets_2/models_final/"
os.makedirs(models_dir, exist_ok=True)

# Persist both fitted models.
for fitted_model, file_name in (
    (rf, "best_rf.joblib"),
    (lgb, "best_lgb_rf_selected.joblib"),
):
    joblib.dump(fitted_model, os.path.join(models_dir, file_name))

# Persist the selected feature names together with their RF importances.
selected_table = pd.DataFrame({
    "feature": selected_features,
    "rf_importance": rf_importance.loc[selected_features].to_numpy()
})
selected_table.to_csv(os.path.join(models_dir, "rf_selected_features.csv"), index=False)

print("\nArtifacts saved:")
for saved_line in (
    "✓ best_rf.joblib",
    "✓ best_lgb_rf_selected.joblib",
    "✓ rf_selected_features.csv",
):
    print(saved_line)


=== TRAINING RANDOM FOREST (FULL FEATURES) ===
RF training time: 171.76 seconds
RF RMSE: 2589493.891867249
RF R² : 0.8842787727126821

Top 20 RF Features:
current_club_id                           0.230224
contract_expires                          0.163748
player_id                                 0.143295
height                                    0.066780
player_agent_id                           0.058894
team_id                                   0.037636
goals                                     0.022705
outfitter_adidas                          0.020807
minutes_played                            0.018570
position_Midfield - Attacking Midfield    0.016613
position_Attack - Centre-Forward          0.016113
outfitter_Nike                            0.014941
outfitter_Skechers                        0.013381
position_Attack - Left Winger             0.012040
days_missed                               0.011930
games_missed                              0.010468
on_loan_from_club_id        




Artifacts saved:
✓ best_rf.joblib
✓ best_lgb_rf_selected.joblib
✓ rf_selected_features.csv
