In [None]:
# Import Required Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from matplotlib import pyplot as plt
import seaborn as sns
from skopt import BayesSearchCV
from skopt.space import Integer, Categorical

# --- Data loading -----------------------------------------------------------
file_path = "../preprocessed_data/outfield_processed.csv"
df = pd.read_csv(file_path)

# --- Target vector and feature matrix ---------------------------------------
# Every column except "Player" and the target itself is used as a feature.
# errors="ignore" keeps the drop from failing if "Player" is absent.
y = df["player_market_value_euro"]
X = df.drop(columns=["Player", "player_market_value_euro"], errors="ignore")

# --- Hold-out split: 80% train / 20% test, fixed seed for repeatability -----
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# --- Hyper-parameter ranges explored by the Bayesian optimizer --------------
search_spaces = dict(
    n_estimators=Integer(100, 500, name='n_estimators'),
    max_depth=Integer(10, 50, name='max_depth'),
    min_samples_split=Integer(2, 20, name='min_samples_split'),
    min_samples_leaf=Integer(1, 10, name='min_samples_leaf'),
    max_features=Categorical(['sqrt', 'log2'], name='max_features'),
)

# --- Search object: 50 optimizer iterations, 5-fold CV, all cores -----------
base_forest = RandomForestRegressor(random_state=42)
bayes_search = BayesSearchCV(
    estimator=base_forest,
    search_spaces=search_spaces,
    scoring='neg_mean_squared_error',
    n_iter=50,
    cv=5,
    n_jobs=-1,
    random_state=42,
    verbose=2,
)

# --- Run the optimization ---------------------------------------------------
print("Performing Bayesian optimization...")
bayes_search.fit(X_train, y_train)

# --- Report the winning configuration ---------------------------------------
# The scorer is negated MSE, so flip the sign and take the square root to
# express the best score in the same units as the target.
best_cv_rmse = np.sqrt(-bayes_search.best_score_)
print("\nBest parameters found:")
print(bayes_search.best_params_)
print(f"Best cross-validation score: {best_cv_rmse:.2f}")

# --- Predict on the hold-out set with the refitted best estimator -----------
best_model = bayes_search.best_estimator_
y_pred = best_model.predict(X_test)

Performing Bayesian optimization...
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END max_depth=26, max_features=log2, min_samples_leaf=9, min_samples_split=8, n_estimators=368; total time=   4.6s
[CV] END max_depth=26, max_features=log2, min_samples_leaf=9, min_samples_split=8, n_estimators=368; total time=   4.6s
[CV] END max_depth=26, max_features=log2, min_samples_leaf=9, min_samples_split=8, n_estimators=368; total time=   4.7s
[CV] END max_depth=26, max_features=log2, min_samples_leaf=9, min_samples_split=8, n_estimators=368; total time=   4.7s
[CV] END max_depth=26, max_features=log2, min_samples_leaf=9, min_samples_split=8, n_estimators=368; total time=   4.7s
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END max_depth=43, max_features=log2, min_samples_leaf=4, min_samples_split=19, n_estimators=446; total time=   6.1s
[CV] END max_depth=43, max_features=log2, min_samples_leaf=4, min_samples_split=19, n_estimators=446; total time=   6.2s
[CV]



Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END max_depth=16, max_features=sqrt, min_samples_leaf=5, min_samples_split=18, n_estimators=142; total time=   2.5s
[CV] END max_depth=16, max_features=sqrt, min_samples_leaf=5, min_samples_split=18, n_estimators=142; total time=   2.5s
[CV] END max_depth=16, max_features=sqrt, min_samples_leaf=5, min_samples_split=18, n_estimators=142; total time=   2.5s
[CV] END max_depth=16, max_features=sqrt, min_samples_leaf=5, min_samples_split=18, n_estimators=142; total time=   2.5s
[CV] END max_depth=16, max_features=sqrt, min_samples_leaf=5, min_samples_split=18, n_estimators=142; total time=   2.5s




Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [2]:

# Evaluate Model Performance
# This cell reads `y_test`, `y_pred`, `best_model`, `X` and `bayes_search`,
# all of which are created by the tuning cell. Run that cell first in the
# same kernel, otherwise this one fails with a NameError.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
# MAPE divides by the true values, so any zero in y_test makes it infinite.
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100  # Calculate MAPE
rmse = np.sqrt(mse)


print("\n📊 Model Performance Metrics:")
print(f"📌 Mean Absolute Error (MAE): {mae:.2f}")
print(f"📌 Mean Squared Error (MSE): {mse:.2f}")
print(f"📌 Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"📌 Mean Absolute Percentage Error: {mape:.2f}")
print(f"📌 R² Score: {r2:.2f}")

# Feature Importance using the best model
# Pair each training column with the importance the fitted forest assigned it,
# then order from most to least important.
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': best_model.feature_importances_
})
feature_importance = feature_importance.sort_values('importance', ascending=False)

# Plot Feature Importance (ten highest-ranked features only)
plt.figure(figsize=(12, 6))
sns.barplot(x='importance', y='feature', data=feature_importance.head(10))
plt.title('Top 10 Most Important Features (Optimized Model)')
plt.xlabel('Feature Importance')
plt.ylabel('Feature Name')
plt.tight_layout()
plt.show()

# Print top 10 most important features
print("\n🔍 Top 10 Most Important Features:")
print(feature_importance.head(10))

# Print the CV results
# The search object in this notebook is `bayes_search`; the previous reference
# to `rf_random` pointed at a name that is never defined here.
cv_results = pd.DataFrame(bayes_search.cv_results_)
print("\nCV Results Summary:")
print(cv_results[['mean_test_score', 'std_test_score', 'rank_test_score']].head())

NameError: name 'y_test' is not defined