In [8]:
import os
os.environ['MPLBACKEND'] = 'Agg'  # Use Agg as a safe default
import matplotlib.pyplot as plt


# Import Required Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import seaborn as sns
import xgboost as xgb
from sklearn.preprocessing import StandardScaler

# --- Load data ---------------------------------------------------------------
# Read the preprocessed outfield table from a path relative to the notebook.
file_path = "preprocessed_data/outfield_processed.csv"
df = pd.read_csv(file_path)

# --- Target / feature matrix -------------------------------------------------
# The market-value column is the regression target. Every other column except
# the "Player" identifier is kept as a feature; errors="ignore" lets the drop
# succeed when "Player" is not present in the file.
y = df["player_market_value_euro"]
X = df.drop(columns=["Player", "player_market_value_euro"], errors="ignore")

# --- Hold-out split ----------------------------------------------------------
# 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- Standardisation ---------------------------------------------------------
# Scaling statistics are learned from the training split only and then applied
# to both splits, so nothing from the test rows leaks into the fit.
# NOTE(review): tree ensembles are generally insensitive to feature scaling,
# so this step is likely optional for XGBoost -- confirm before removing.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- XGBoost regressor -------------------------------------------------------
# Hyperparameters gathered in one mapping so they are easy to read and tune.
xgb_params = {
    "objective": "reg:squarederror",
    "n_estimators": 100,
    "learning_rate": 0.1,
    "max_depth": 6,
    "min_child_weight": 1,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,
}
xgb_model = xgb.XGBRegressor(**xgb_params)
xgb_model.fit(X_train_scaled, y_train)

# --- Predictions on the held-out split ---------------------------------------
y_pred = xgb_model.predict(X_test_scaled)

In [9]:
# Evaluate Model Performance
# Standard regression metrics on the held-out split.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Mean Absolute Percentage Error, in percent.
# A percentage error is undefined where the true value is 0 (the naive formula
# produces inf/nan and a RuntimeWarning), so rows with a zero target are left
# out of the mean. If every target is zero the metric is reported as NaN.
y_true_arr = np.asarray(y_test, dtype=float)
y_pred_arr = np.asarray(y_pred, dtype=float)
nonzero_target = y_true_arr != 0
if nonzero_target.any():
    relative_errors = np.abs(
        (y_true_arr[nonzero_target] - y_pred_arr[nonzero_target])
        / y_true_arr[nonzero_target]
    )
    mape = np.mean(relative_errors) * 100
else:
    mape = float("nan")

print("\n📊 Model Performance Metrics:")
print(f"📌 Mean Absolute Error (MAE): {mae:.2f}")
print(f"📌 Mean Squared Error (MSE): {mse:.2f}")
print(f"📌 Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"📌 Mean Absolute Percentage Error: {mape:.2f}")
print(f"📌 R² Score: {r2:.2f}")

# Feature Importance using XGBoost model
# The model was fit on a scaled array, which keeps the column order of X, so
# X.columns lines up positionally with feature_importances_.
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': xgb_model.feature_importances_
})
feature_importance = feature_importance.sort_values('importance', ascending=False)

# Print top 10 most important features with their importance scores.
# Iterating the two columns directly avoids building a per-row Series (as
# iterrows() does) and the unused index variable.
print("\n🔍 Top 10 Most Important Features:")
top_features = feature_importance.head(10)
for feature_name, importance in zip(top_features['feature'], top_features['importance']):
    print(f"{feature_name}: {importance:.4f}")


📊 Model Performance Metrics:
📌 Mean Absolute Error (MAE): 0.89
📌 Mean Squared Error (MSE): 1.23
📌 Root Mean Squared Error (RMSE): 1.11
📌 Mean Absolute Percentage Error: 5.66
📌 R² Score: 0.29

🔍 Top 10 Most Important Features:
age: 0.0654
Per90_Total Carrying Distance: 0.0460
Per90_npxG: Non-Penalty xG: 0.0351
Per90_Touches (Att 3rd): 0.0343
Per90_Progressive Carrying Distance: 0.0281
Per90_Non-Penalty Goals: 0.0279
Progressive_Play: 0.0255
Per90_Passes Completed (Short): 0.0250
Per90_GCA (Shot): 0.0241
Per90_Passes Attempted (Short): 0.0234
