# EPL Player Stats Prediction (Goals & Assists)

This notebook predicts player performance metrics including goals, assists, and identifies top performers in the English Premier League.

**Prediction Targets**:
- Total Goals (regression)
- Total Assists (regression)
- Top Scorer Classification (15 or more goals)
- Top Assister Classification (10 or more assists)

**Dataset**: Goals & Assist.xlsx (2,274 players with 34 features)

## 1. Import Libraries

In [None]:
# Data manipulation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# XGBoost
try:
    import xgboost as xgb
    xgb_available = True
except ImportError:
    xgb_available = False
    print("XGBoost not available. Install with: pip install xgboost")

# Notebook-wide configuration: warnings, reproducibility and plot defaults.
import warnings

# Single seed shared by NumPy and by every estimator / split below.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Suppress all warning categories so the cell outputs stay compact.
warnings.filterwarnings(action='ignore')

# Plot appearance: seaborn grid theme and a wide default figure.
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print(
    "‚úì Libraries imported successfully!",
    f"‚úì Random seed set to {RANDOM_STATE}",
    sep="\n",
)

## 2. Load and Explore Dataset

In [None]:
print("=" * 70, " LOADING AND EXPLORING DATASET", "=" * 70, sep="\n")

# Read the player statistics workbook; the path is relative to the notebook.
df = pd.read_excel('../Data/Goals & Assist.xlsx')

# Report the size of the raw table and preview its first ten rows.
print(
    f"\n‚úì Dataset loaded successfully!",
    f"  Shape: {df.shape[0]:,} players √ó {df.shape[1]} features",
    f"\n{df.head(10)}",
    sep="\n",
)

In [None]:
# Schema overview: column listing, dtype counts and missing-value report.
print("\n" + "=" * 70, " DATASET INFORMATION", "=" * 70, sep="\n")

# One line per column: 1-based position, name, dtype and distinct-value count.
print(f"\nColumns ({len(df.columns)}):")
for i, col in enumerate(df.columns, start=1):
    dtype, unique = df[col].dtype, df[col].nunique()
    print(f"  {i:2d}. {col:40s} [{dtype}] - {unique} unique values")

print(
    f"\n\nData Types:\n{df.dtypes.value_counts()}",
    f"\n\nMissing Values: {df.isnull().sum().sum()} total",
    sep="\n",
)

# The per-column breakdown is only shown when at least one cell is missing.
if df.isnull().values.any():
    missing = df.isnull().sum()
    print(f"\nColumns with missing values:", missing[missing > 0], sep="\n")

In [None]:
# Descriptive statistics for the numeric columns, one feature per row.
print("\n" + "=" * 70, " STATISTICAL SUMMARY", "=" * 70, sep="\n")

numerical_cols = df.select_dtypes(include=[np.number]).columns
numeric_summary = df[numerical_cols].describe().transpose()

print("\nNumerical Features Summary:")
print(numeric_summary)

## 3. Data Cleaning and Preprocessing

In [None]:
print("=" * 70)
print(" DATA CLEANING")
print("=" * 70)

# Drop the leftover index column from an earlier export, when present.
if 'Unnamed: 0' in df.columns:
    df = df.drop('Unnamed: 0', axis=1)
    print("\n‚úì Removed 'Unnamed: 0' column")

# Check for duplicate rows
duplicates = df.duplicated().sum()
print(f"\n‚úì Duplicate rows: {duplicates}")

if duplicates > 0:
    df = df.drop_duplicates()
    print(f"  Removed {duplicates} duplicate rows")

# Filter players with minimal playing time (less than 5 matches).
# `initial_count` keeps the pre-filter size so it can be reported later.
if 'Matches Played' in df.columns:
    initial_count = len(df)
    df = df[df['Matches Played'] >= 5]
    removed = initial_count - len(df)
    print(f"\n‚úì Removed {removed} players with < 5 matches played")

# Fill missing values according to each column's type.
#   * numeric columns -> 0 (missing means no activity)
#   * text columns    -> 'Unknown'
# A blanket fillna(0) would put the integer 0 into text columns such as
# 'Position', leaving mixed str/int values that the label encoding applied
# to 'Position' in the feature-engineering step cannot handle.
# Columns of any other dtype (e.g. datetimes) are left untouched.
fill_values = {}
for col in df.columns:
    if pd.api.types.is_numeric_dtype(df[col]):
        fill_values[col] = 0
    elif pd.api.types.is_object_dtype(df[col]) or pd.api.types.is_string_dtype(df[col]):
        fill_values[col] = 'Unknown'

# fillna returns a new frame, so later column assignments on `df` do not
# write into a slice of the unfiltered data.
df = df.fillna(fill_values)
print(f"\n‚úì Missing values handled")

print(f"\n‚úì Clean dataset shape: {df.shape[0]:,} rows √ó {df.shape[1]} columns")

## 4. Exploratory Data Analysis (EDA)

### 4.1 Target Variables Analysis

In [None]:
print("=" * 70, " TARGET VARIABLES ANALYSIS", "=" * 70, sep="\n")

# The whole analysis is skipped unless both target columns are present.
if {'Goals', 'Assists'}.issubset(df.columns):
    # Headline statistics for each target, including the count of players
    # at or above the "top performer" thresholds used later for classification.
    print(
        "\nGoals Statistics:",
        f"  Mean: {df['Goals'].mean():.2f}",
        f"  Median: {df['Goals'].median():.0f}",
        f"  Max: {df['Goals'].max():.0f}",
        f"  Players with 15+ goals: {(df['Goals'] >= 15).sum()}",
        sep="\n",
    )
    print(
        "\nAssists Statistics:",
        f"  Mean: {df['Assists'].mean():.2f}",
        f"  Median: {df['Assists'].median():.0f}",
        f"  Max: {df['Assists'].max():.0f}",
        f"  Players with 10+ assists: {(df['Assists'] >= 10).sum()}",
        sep="\n",
    )

    # Leaderboards need the player name; the headers are printed regardless.
    has_player = 'Player' in df.columns
    leaderboard_cols = ['Player', 'Goals', 'Assists']

    print("\nTop 10 Goal Scorers:")
    if has_player:
        top_scorers = df.nlargest(10, 'Goals')[leaderboard_cols]
        print(top_scorers.to_string(index=False))

    print("\nTop 10 Assist Providers:")
    if has_player:
        top_assisters = df.nlargest(10, 'Assists')[leaderboard_cols]
        print(top_assisters.to_string(index=False))

In [None]:
# 2x2 overview of the targets: two histograms, a scatter and a combined histogram.
if 'Goals' in df.columns and 'Assists' in df.columns:
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    (ax_goals, ax_assists), (ax_scatter, ax_total) = axes

    # Keyword sets shared by several panels.
    title_kw = dict(fontsize=12, fontweight='bold')
    label_kw = dict(fontsize=10)
    hist_kw = dict(bins=30, alpha=0.7, edgecolor='black')

    # Top-left: goals histogram with a dashed marker at the mean.
    ax_goals.hist(df['Goals'], color='#2ecc71', **hist_kw)
    ax_goals.set_title('Goals Distribution', **title_kw)
    ax_goals.set_xlabel('Goals', **label_kw)
    ax_goals.set_ylabel('Number of Players', **label_kw)
    ax_goals.axvline(df['Goals'].mean(), color='red', linestyle='--',
                     label=f'Mean: {df["Goals"].mean():.2f}')
    ax_goals.legend()
    ax_goals.grid(axis='y', alpha=0.3)

    # Top-right: assists histogram with a dashed marker at the mean.
    ax_assists.hist(df['Assists'], color='#3498db', **hist_kw)
    ax_assists.set_title('Assists Distribution', **title_kw)
    ax_assists.set_xlabel('Assists', **label_kw)
    ax_assists.set_ylabel('Number of Players', **label_kw)
    ax_assists.axvline(df['Assists'].mean(), color='red', linestyle='--',
                       label=f'Mean: {df["Assists"].mean():.2f}')
    ax_assists.legend()
    ax_assists.grid(axis='y', alpha=0.3)

    # Bottom-left: relationship between the two targets.
    ax_scatter.scatter(df['Goals'], df['Assists'], alpha=0.5, color='#9b59b6')
    ax_scatter.set_title('Goals vs Assists', **title_kw)
    ax_scatter.set_xlabel('Goals', **label_kw)
    ax_scatter.set_ylabel('Assists', **label_kw)
    ax_scatter.grid(alpha=0.3)

    # Bottom-right: combined contribution. This also adds the
    # 'Goals_Plus_Assists' column to df as a side effect.
    df['Goals_Plus_Assists'] = df['Goals'].add(df['Assists'])
    ax_total.hist(df['Goals_Plus_Assists'], color='#e74c3c', **hist_kw)
    ax_total.set_title('Goals + Assists Distribution', **title_kw)
    ax_total.set_xlabel('Goals + Assists', **label_kw)
    ax_total.set_ylabel('Number of Players', **label_kw)
    ax_total.grid(axis='y', alpha=0.3)

    plt.tight_layout()
    plt.show()

### 4.2 Performance by Position

In [None]:
print("\n" + "=" * 70, " PERFORMANCE BY POSITION", "=" * 70, sep="\n")

if 'Position' in df.columns:
    by_position = df.groupby('Position')

    print("\nPlayers by Position:")
    print(df['Position'].value_counts())

    # Mean goals, assists and minutes for every position label.
    position_stats = by_position[['Goals', 'Assists', 'Minutes']].mean()
    print("\nAverage Stats by Position:")
    print(position_stats.round(2))

    # Per-position means, ordered from highest to lowest for the bar charts.
    position_goals = by_position['Goals'].mean().sort_values(ascending=False)
    position_assists = by_position['Assists'].mean().sort_values(ascending=False)

    fig, axes = plt.subplots(1, 2, figsize=(14, 6))
    panels = [
        (axes[0], position_goals, '#2ecc71', 'Average Goals by Position', 'Average Goals'),
        (axes[1], position_assists, '#3498db', 'Average Assists by Position', 'Average Assists'),
    ]

    # Both panels share the same layout; only data, colour and labels differ.
    for ax, averages, bar_color, title, y_label in panels:
        slots = range(len(averages))
        ax.bar(slots, averages.values, color=bar_color, alpha=0.8)
        ax.set_xticks(slots)
        ax.set_xticklabels(averages.index, rotation=45, ha='right')
        ax.set_title(title, fontsize=12, fontweight='bold')
        ax.set_ylabel(y_label, fontsize=10)
        ax.grid(axis='y', alpha=0.3)

    plt.tight_layout()
    plt.show()

### 4.3 Age Analysis

In [None]:
print("\n" + "=" * 70, " AGE ANALYSIS", "=" * 70, sep="\n")

if 'Age' in df.columns:
    print(
        f"\nAge Statistics:",
        f"  Mean age: {df['Age'].mean():.1f} years",
        f"  Median age: {df['Age'].median():.0f} years",
        f"  Age range: {df['Age'].min():.0f} - {df['Age'].max():.0f} years",
        sep="\n",
    )

    # Side-by-side scatter plots of age against each target.
    fig, axes = plt.subplots(1, 2, figsize=(14, 6))
    panels = [
        (axes[0], 'Goals', '#2ecc71', 'Age vs Goals'),
        (axes[1], 'Assists', '#3498db', 'Age vs Assists'),
    ]
    for ax, target, dot_color, title in panels:
        ax.scatter(df['Age'], df[target], alpha=0.5, color=dot_color)
        ax.set_title(title, fontsize=12, fontweight='bold')
        ax.set_xlabel('Age', fontsize=10)
        ax.set_ylabel(target, fontsize=10)
        ax.grid(alpha=0.3)

    plt.tight_layout()
    plt.show()

### 4.4 Expected Goals (xG) Analysis

In [None]:
print("\n" + "=" * 70, " EXPECTED GOALS (xG) ANALYSIS", "=" * 70, sep="\n")

# Columns whose name mentions an expected-goals / expected-assists metric.
xg_cols = [name for name in df.columns if any(tag in name for tag in ('xG', 'xAG'))]

if not xg_cols:
    print("\n‚ö† No xG metrics found in dataset")
else:
    print(f"\nExpected metrics available: {xg_cols}")

    # The comparison below needs both per-90 columns.
    if {'xG Per 90', 'Goals Per 90'}.issubset(df.columns):
        correlation = df['xG Per 90'].corr(df['Goals Per 90'])
        print(f"\nCorrelation between xG Per 90 and Goals Per 90: {correlation:.3f}")

        # Scatter of expected vs actual scoring rate with a y = x reference
        # line drawn from the origin up to the largest xG value.
        diagonal_end = df['xG Per 90'].max()
        plt.figure(figsize=(10, 6))
        plt.scatter(df['xG Per 90'], df['Goals Per 90'], alpha=0.5, color='#9b59b6')
        plt.plot([0, diagonal_end], [0, diagonal_end],
                 'r--', label='Perfect prediction line')
        plt.title('Expected Goals vs Actual Goals (Per 90 Minutes)',
                  fontsize=14, fontweight='bold')
        plt.xlabel('xG Per 90', fontsize=12)
        plt.ylabel('Goals Per 90', fontsize=12)
        plt.legend()
        plt.grid(alpha=0.3)
        plt.tight_layout()
        plt.show()

        # Positive difference = scored more than expected. This adds the
        # 'xG_Difference' column to df as a side effect.
        df['xG_Difference'] = df['Goals Per 90'].sub(df['xG Per 90'])

        has_player = 'Player' in df.columns
        report_cols = ['Player', 'Goals Per 90', 'xG Per 90', 'xG_Difference']

        print("\nTop 5 Overperformers (Goals > xG):")
        if has_player:
            overperformers = df.nlargest(5, 'xG_Difference')[report_cols]
            print(overperformers.to_string(index=False))

        print("\nTop 5 Underperformers (xG > Goals):")
        if has_player:
            underperformers = df.nsmallest(5, 'xG_Difference')[report_cols]
            print(underperformers.to_string(index=False))

### 4.5 Correlation Analysis

In [None]:
print("\n" + "=" * 70, " CORRELATION ANALYSIS", "=" * 70, sep="\n")

# Candidate columns for the heatmap; only those present in df are used.
important_features = ['Age', 'Matches Played', 'Starts', 'Minutes', '90s Played',
                     'Goals', 'Assists', 'Goals Per 90', 'Assists Per 90']
available_features = [name for name in important_features if name in df.columns]

# The heatmap is only drawn when at least four of the candidates exist.
if len(available_features) >= 4:
    correlation_matrix = df[available_features].corr()

    heatmap_options = dict(
        annot=True, fmt='.2f', cmap='coolwarm', center=0,
        square=True, linewidths=1, cbar_kws={"shrink": 0.8},
    )

    plt.figure(figsize=(12, 10))
    sns.heatmap(correlation_matrix, **heatmap_options)
    plt.title('Feature Correlation Heatmap', fontsize=14, fontweight='bold', pad=20)
    plt.tight_layout()
    plt.show()

    print("\n‚úì Correlation analysis complete")

## 5. Feature Engineering

In [None]:
print("=" * 70)
print(" FEATURE ENGINEERING")
print("=" * 70)

# Work on a copy so the exploratory frame `df` is left untouched.
df_model = df.copy()

# Encode Position as integer codes.
# Casting to str first keeps the encoder working even if the column holds
# mixed value types (the cleaning step fills missing values with the
# integer 0, which would otherwise sit next to position strings).
if 'Position' in df_model.columns:
    le_position = LabelEncoder()
    df_model['Position_Encoded'] = le_position.fit_transform(df_model['Position'].astype(str))
    print("\n‚úì Encoded Position")

# Efficiency metrics. The +1 in each denominator avoids division by zero.
# NOTE: these are computed from the targets themselves (Goals / Assists).
if 'Goals' in df_model.columns and 'Minutes' in df_model.columns:
    df_model['Goals_Per_Minute'] = df_model['Goals'] / (df_model['Minutes'] + 1)
    print("‚úì Created Goals_Per_Minute feature")

if 'Assists' in df_model.columns and 'Minutes' in df_model.columns:
    df_model['Assists_Per_Minute'] = df_model['Assists'] / (df_model['Minutes'] + 1)
    print("‚úì Created Assists_Per_Minute feature")

# Age buckets: (.., 23] -> 0, (23, 28] -> 1, (28, 100] -> 2.
# include_lowest=True closes the first bin on the left so an age of exactly 0
# (what a missing age becomes after the cleaning step) lands in bucket 0.
# Without it pd.cut returns NaN for 0 and the int cast below raises.
if 'Age' in df_model.columns:
    df_model['Age_Category'] = pd.cut(df_model['Age'],
                                      bins=[0, 23, 28, 100],
                                      labels=[0, 1, 2],  # 0=Young, 1=Prime, 2=Veteran
                                      include_lowest=True)
    df_model['Age_Category'] = df_model['Age_Category'].astype(int)
    print("‚úì Created Age_Category feature (0=Young, 1=Prime, 2=Veteran)")

# Share of appearances that were starts (denominator smoothed by +1).
if 'Starts' in df_model.columns and 'Matches Played' in df_model.columns:
    df_model['Starting_Ratio'] = df_model['Starts'] / (df_model['Matches Played'] + 1)
    print("‚úì Created Starting_Ratio feature")

# Goal contributions per 90 minutes played (denominator smoothed by +1).
# NOTE: also computed from the targets.
if 'Goals' in df_model.columns and 'Assists' in df_model.columns and '90s Played' in df_model.columns:
    df_model['Productivity_Score'] = (df_model['Goals'] + df_model['Assists']) / (df_model['90s Played'] + 1)
    print("‚úì Created Productivity_Score feature")

# Binary classification targets: 1 when the player reaches the threshold.
if 'Goals' in df_model.columns:
    df_model['Top_Scorer'] = (df_model['Goals'] >= 15).astype(int)
    print("‚úì Created Top_Scorer binary target (15+ goals)")

if 'Assists' in df_model.columns:
    df_model['Top_Assister'] = (df_model['Assists'] >= 10).astype(int)
    print("‚úì Created Top_Assister binary target (10+ assists)")

print(f"\n‚úì Feature engineering complete")
print(f"  New dataset shape: {df_model.shape[0]:,} rows √ó {df_model.shape[1]} columns")

## 6. Prepare Data for Modeling

In [None]:
print("=" * 70)
print(" FEATURE SELECTION FOR MODELING")
print("=" * 70)

# Columns that must never reach the feature matrix:
#   * identifiers and raw categoricals (Player, Nation, Position)
#   * the regression / classification targets themselves
#   * every column computed directly from a target. Besides the per-90
#     rates this includes the engineered Goals_Per_Minute,
#     Assists_Per_Minute and Productivity_Score: combined with Minutes or
#     90s Played they let a model reconstruct Goals / Assists almost
#     exactly, which is target leakage and inflates every reported metric.
# NOTE(review): the raw dataset may hold further target-derived columns
# (e.g. non-penalty goals); check the printed feature list below.
exclude_cols = ['Player', 'Nation', 'Position', 'Goals', 'Assists', 'Top_Scorer',
                'Top_Assister', 'Goals_Plus_Assists', 'xG_Difference',
                'Goals Per 90', 'Assists Per 90',  # Exclude per 90 stats to avoid data leakage
                'Goals_Per_Minute', 'Assists_Per_Minute', 'Productivity_Score']

# Keep every numeric column that is not excluded, in original column order.
# select_dtypes covers all integer / float widths rather than a fixed
# whitelist of dtype names.
excluded = set(exclude_cols)
numeric_columns = df_model.select_dtypes(include=[np.number]).columns
feature_cols = [col for col in numeric_columns if col not in excluded]

print(f"\nSelected {len(feature_cols)} features for modeling:")
for i, col in enumerate(feature_cols, 1):
    print(f"  {i:2d}. {col}")

# Prepare feature matrix
X = df_model[feature_cols].copy()

# Replace any inf values with 0
X = X.replace([np.inf, -np.inf], 0)

print(f"\n‚úì Feature matrix X shape: {X.shape}")

## 7. Goals Prediction (Regression)

### 7.1 Prepare Data for Goals Prediction

In [None]:
print("=" * 70, " GOALS PREDICTION - DATA PREPARATION", "=" * 70, sep="\n")

# Regression target: total goals per player.
y_goals = df_model['Goals'].copy()

print(
    f"\nTarget (Goals) statistics:",
    f"  Mean: {y_goals.mean():.2f}",
    f"  Std: {y_goals.std():.2f}",
    f"  Min: {y_goals.min():.0f}",
    f"  Max: {y_goals.max():.0f}",
    sep="\n",
)

# 80/20 hold-out split, seeded for reproducibility.
goals_split = train_test_split(X, y_goals, test_size=0.2, random_state=RANDOM_STATE)
X_train_goals, X_test_goals, y_train_goals, y_test_goals = goals_split

print(
    f"\n‚úì Data split completed",
    f"  Training set: {X_train_goals.shape}",
    f"  Test set: {X_test_goals.shape}",
    sep="\n",
)

# Standardize using statistics learned from the training rows only, then
# apply the same transformation to both partitions.
scaler_goals = StandardScaler().fit(X_train_goals)
X_train_goals_scaled = scaler_goals.transform(X_train_goals)
X_test_goals_scaled = scaler_goals.transform(X_test_goals)

print(f"\n‚úì Features scaled")

### 7.2 Train Goals Prediction Models

In [None]:
print("\n" + "=" * 70, " TRAINING GOALS PREDICTION MODELS", "=" * 70, sep="\n")

# --- 1. Linear regression (uses the standardized features) -----------------
print("\n1. Linear Regression:")
lr_goals = LinearRegression().fit(X_train_goals_scaled, y_train_goals)
lr_goals_pred = lr_goals.predict(X_test_goals_scaled)
lr_goals_mae, lr_goals_rmse, lr_goals_r2 = (
    mean_absolute_error(y_test_goals, lr_goals_pred),
    np.sqrt(mean_squared_error(y_test_goals, lr_goals_pred)),
    r2_score(y_test_goals, lr_goals_pred),
)
print(
    f"  MAE: {lr_goals_mae:.2f}",
    f"  RMSE: {lr_goals_rmse:.2f}",
    f"  R¬≤ Score: {lr_goals_r2:.4f}",
    sep="\n",
)

# --- 2. Ridge regression (uses the standardized features) ------------------
print("\n2. Ridge Regression:")
ridge_goals = Ridge(alpha=1.0, random_state=RANDOM_STATE).fit(
    X_train_goals_scaled, y_train_goals
)
ridge_goals_pred = ridge_goals.predict(X_test_goals_scaled)
ridge_goals_mae, ridge_goals_rmse, ridge_goals_r2 = (
    mean_absolute_error(y_test_goals, ridge_goals_pred),
    np.sqrt(mean_squared_error(y_test_goals, ridge_goals_pred)),
    r2_score(y_test_goals, ridge_goals_pred),
)
print(
    f"  MAE: {ridge_goals_mae:.2f}",
    f"  RMSE: {ridge_goals_rmse:.2f}",
    f"  R¬≤ Score: {ridge_goals_r2:.4f}",
    sep="\n",
)

# --- 3. Random forest (trained on the unscaled features) -------------------
print("\n3. Random Forest Regressor:")
rf_goals = RandomForestRegressor(
    n_estimators=100, max_depth=10, random_state=RANDOM_STATE, n_jobs=-1
).fit(X_train_goals, y_train_goals)
rf_goals_pred = rf_goals.predict(X_test_goals)
rf_goals_mae, rf_goals_rmse, rf_goals_r2 = (
    mean_absolute_error(y_test_goals, rf_goals_pred),
    np.sqrt(mean_squared_error(y_test_goals, rf_goals_pred)),
    r2_score(y_test_goals, rf_goals_pred),
)
print(
    f"  MAE: {rf_goals_mae:.2f}",
    f"  RMSE: {rf_goals_rmse:.2f}",
    f"  R¬≤ Score: {rf_goals_r2:.4f}",
    sep="\n",
)

# --- 4. Gradient boosting (trained on the unscaled features) ---------------
print("\n4. Gradient Boosting Regressor:")
gb_goals = GradientBoostingRegressor(
    n_estimators=100, learning_rate=0.1, max_depth=5, random_state=RANDOM_STATE
).fit(X_train_goals, y_train_goals)
gb_goals_pred = gb_goals.predict(X_test_goals)
gb_goals_mae, gb_goals_rmse, gb_goals_r2 = (
    mean_absolute_error(y_test_goals, gb_goals_pred),
    np.sqrt(mean_squared_error(y_test_goals, gb_goals_pred)),
    r2_score(y_test_goals, gb_goals_pred),
)
print(
    f"  MAE: {gb_goals_mae:.2f}",
    f"  RMSE: {gb_goals_rmse:.2f}",
    f"  R¬≤ Score: {gb_goals_r2:.4f}",
    sep="\n",
)

print("\n‚úì Goals prediction models trained successfully!")

In [None]:
# Rank the four goals regressors by test-set MAE (lowest first).
print("\n" + "=" * 70, " GOALS PREDICTION - MODEL COMPARISON", "=" * 70, sep="\n")

comparison_rows = [
    ('Linear Regression', lr_goals_mae, lr_goals_rmse, lr_goals_r2),
    ('Ridge Regression', ridge_goals_mae, ridge_goals_rmse, ridge_goals_r2),
    ('Random Forest', rf_goals_mae, rf_goals_rmse, rf_goals_r2),
    ('Gradient Boosting', gb_goals_mae, gb_goals_rmse, gb_goals_r2),
]
goals_comparison = pd.DataFrame(
    comparison_rows, columns=['Model', 'MAE', 'RMSE', 'R¬≤ Score']
)
goals_comparison = goals_comparison.sort_values('MAE')

print("\n" + goals_comparison.to_string(index=False))

# After sorting, the first row is the model with the smallest MAE.
best_goals_model = goals_comparison['Model'].iloc[0]
print(f"\nüèÜ Best Goals Predictor: {best_goals_model}")

## 8. Assists Prediction (Regression)

In [None]:
print("=" * 70, " ASSISTS PREDICTION", "=" * 70, sep="\n")

# Regression target: total assists per player.
y_assists = df_model['Assists'].copy()

# Same 80/20 seeded split as used for the goals models.
assists_split = train_test_split(X, y_assists, test_size=0.2, random_state=RANDOM_STATE)
X_train_assists, X_test_assists, y_train_assists, y_test_assists = assists_split

# A single random forest is trained for this target.
rf_assists = RandomForestRegressor(
    n_estimators=100, max_depth=10, random_state=RANDOM_STATE, n_jobs=-1
).fit(X_train_assists, y_train_assists)

# Hold-out predictions and error metrics.
rf_assists_pred = rf_assists.predict(X_test_assists)
rf_assists_mae, rf_assists_rmse, rf_assists_r2 = (
    mean_absolute_error(y_test_assists, rf_assists_pred),
    np.sqrt(mean_squared_error(y_test_assists, rf_assists_pred)),
    r2_score(y_test_assists, rf_assists_pred),
)

print(
    f"\nRandom Forest - Assists Prediction:",
    f"  MAE: {rf_assists_mae:.2f}",
    f"  RMSE: {rf_assists_rmse:.2f}",
    f"  R¬≤ Score: {rf_assists_r2:.4f}",
    sep="\n",
)

print("\n‚úì Assists prediction model trained successfully!")

## 9. Top Scorer Classification

In [None]:
print("=" * 70)
print(" TOP SCORER CLASSIFICATION (15+ Goals)")
print("=" * 70)

# Binary target: 1 when the player scored 15 or more goals.
y_top_scorer = df_model['Top_Scorer'].copy()

print(f"\nTarget distribution:")
print(y_top_scorer.value_counts())

# Stratified 80/20 split so both partitions keep the class proportions.
X_train_ts, X_test_ts, y_train_ts, y_test_ts = train_test_split(
    X, y_top_scorer, test_size=0.2, random_state=RANDOM_STATE, stratify=y_top_scorer
)

# Train Random Forest Classifier
rf_top_scorer = RandomForestClassifier(n_estimators=100, max_depth=10,
                                       random_state=RANDOM_STATE, n_jobs=-1)
rf_top_scorer.fit(X_train_ts, y_train_ts)

# Predictions and evaluation
rf_ts_pred = rf_top_scorer.predict(X_test_ts)
rf_ts_acc = accuracy_score(y_test_ts, rf_ts_pred)

print(f"\nRandom Forest - Top Scorer Classification:")
print(f"  Accuracy: {rf_ts_acc:.4f} ({rf_ts_acc*100:.2f}%)")

# labels=[0, 1] pins both classes in the report. Without it the labels are
# inferred from the data, and if the positive class is absent from both the
# test labels and the predictions the two target_names no longer match the
# inferred label count and classification_report raises ValueError.
print(f"\nClassification Report:")
print(classification_report(y_test_ts, rf_ts_pred, labels=[0, 1],
                            target_names=['Not Top Scorer', 'Top Scorer']))

print("\n‚úì Top scorer classification model trained successfully!")

## 10. Top Assister Classification

In [None]:
print("=" * 70)
print(" TOP ASSISTER CLASSIFICATION (10+ Assists)")
print("=" * 70)

# Binary target: 1 when the player provided 10 or more assists.
y_top_assister = df_model['Top_Assister'].copy()

print(f"\nTarget distribution:")
print(y_top_assister.value_counts())

# Stratified 80/20 split so both partitions keep the class proportions.
X_train_ta, X_test_ta, y_train_ta, y_test_ta = train_test_split(
    X, y_top_assister, test_size=0.2, random_state=RANDOM_STATE, stratify=y_top_assister
)

# Train Random Forest Classifier
rf_top_assister = RandomForestClassifier(n_estimators=100, max_depth=10,
                                         random_state=RANDOM_STATE, n_jobs=-1)
rf_top_assister.fit(X_train_ta, y_train_ta)

# Predictions and evaluation
rf_ta_pred = rf_top_assister.predict(X_test_ta)
rf_ta_acc = accuracy_score(y_test_ta, rf_ta_pred)

print(f"\nRandom Forest - Top Assister Classification:")
print(f"  Accuracy: {rf_ta_acc:.4f} ({rf_ta_acc*100:.2f}%)")

# labels=[0, 1] pins both classes in the report. Without it the labels are
# inferred from the data, and if the positive class is absent from both the
# test labels and the predictions the two target_names no longer match the
# inferred label count and classification_report raises ValueError.
print(f"\nClassification Report:")
print(classification_report(y_test_ta, rf_ta_pred, labels=[0, 1],
                            target_names=['Not Top Assister', 'Top Assister']))

print("\n‚úì Top assister classification model trained successfully!")

## 11. Model Summary and Feature Importance

In [None]:
print("=" * 70, " COMPREHENSIVE MODEL SUMMARY", "=" * 70, sep="\n")

# One row per prediction task: the model used, its headline metric and score.
summary_rows = [
    ('Goals (Regression)', best_goals_model, 'MAE',
     f"{goals_comparison.iloc[0]['MAE']:.2f}"),
    ('Assists (Regression)', 'Random Forest', 'MAE',
     f"{rf_assists_mae:.2f}"),
    ('Top Scorer (15+)', 'Random Forest', 'Accuracy',
     f"{rf_ts_acc:.3f}"),
    ('Top Assister (10+)', 'Random Forest', 'Accuracy',
     f"{rf_ta_acc:.3f}"),
]
summary = pd.DataFrame(
    summary_rows,
    columns=['Prediction Task', 'Best Model', 'Primary Metric', 'Performance'],
)

print("\n" + summary.to_string(index=False))

print("\n‚úÖ All player stats models trained and evaluated successfully!")

In [None]:
# Importances reported by the goals random forest, highest first.
print("\n" + "=" * 70, " FEATURE IMPORTANCE (Goals Prediction)", "=" * 70, sep="\n")

feature_importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': rf_goals.feature_importances_,
})
feature_importance = feature_importance.sort_values('importance', ascending=False)

print("\nTop 15 Most Important Features:")
print(feature_importance.head(15).to_string(index=False))

# Horizontal bar chart of the 20 highest-ranked features; the y axis is
# inverted so the most important feature is drawn at the top.
top_features = feature_importance.head(20)
bar_slots = range(len(top_features))

plt.figure(figsize=(12, 8))
plt.barh(bar_slots, top_features['importance'], color='#3498db', alpha=0.8)
plt.yticks(bar_slots, top_features['feature'])
plt.xlabel('Importance', fontsize=12)
plt.ylabel('Feature', fontsize=12)
plt.title('Top 20 Feature Importances (Goals Prediction)', fontsize=14, fontweight='bold')
plt.gca().invert_yaxis()
plt.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()

## 12. Sample Predictions

In [None]:
print("=" * 70, " SAMPLE PREDICTIONS", "=" * 70, sep="\n")

# Take the first rows of the goals test partition as the sample.
sample_size = 10
sample_indices = X_test_goals.index[:sample_size]

# Actual values are looked up by index label; goal predictions reuse the
# random-forest test predictions, assist predictions are computed here.
actual_goals = y_test_goals.loc[sample_indices].values
predicted_goals = np.round(rf_goals_pred[:sample_size], 1)
actual_assists = df_model.loc[sample_indices, 'Assists'].values
predicted_assists = np.round(rf_assists.predict(X.loc[sample_indices]), 1)

sample_predictions = pd.DataFrame({
    'Actual Goals': actual_goals,
    'Predicted Goals': predicted_goals,
    'Actual Assists': actual_assists,
    'Predicted Assists': predicted_assists,
})

# Absolute goals error per sampled player.
sample_predictions['Goals Error'] = (
    sample_predictions['Actual Goals'] - sample_predictions['Predicted Goals']
).abs()

print("\nSample Predictions (First 10 test players):")
print(sample_predictions.to_string(index=False))

print(f"\n‚úì Predictions generated successfully!")

## 13. Summary and Conclusions

In [None]:
print("=" * 70)
print(" SUMMARY AND CONCLUSIONS")
print("=" * 70)

# `df` has already been reduced to players with 5+ matches by the cleaning
# step, so len(df) would repeat the filtered count. The cleaning step stores
# the pre-filter row count in `initial_count`; fall back to len(df) when that
# step did not define it (no 'Matches Played' column, so nothing was filtered).
try:
    total_players = initial_count
except NameError:
    total_players = len(df)

print("\nüìä Dataset Overview:")
print(f"  - Total players analyzed: {total_players:,}")
print(f"  - Players after filtering (5+ matches): {len(df_model):,}")
# feature_cols holds every model input (raw and engineered), so the label
# says "used for modeling" rather than "engineered".
print(f"  - Features used for modeling: {len(feature_cols)}")
print(f"  - Positions covered: {df['Position'].nunique() if 'Position' in df.columns else 'N/A'}")

print("\nüéØ Model Performance Summary:")
print(f"  - Goals Prediction: MAE={goals_comparison.iloc[0]['MAE']:.2f} goals (Best: {best_goals_model})")
print(f"  - Assists Prediction: MAE={rf_assists_mae:.2f} assists")
print(f"  - Top Scorer Classification: {rf_ts_acc*100:.1f}% accuracy")
print(f"  - Top Assister Classification: {rf_ta_acc*100:.1f}% accuracy")

print("\nüí° Key Insights:")
# Three highest-ranked features from the goals random forest.
top_3_features = feature_importance.head(3)['feature'].tolist()
print(f"  - Most important features: {', '.join(top_3_features)}")
print(f"  - Average goals per player: {df['Goals'].mean():.2f}")
print(f"  - Average assists per player: {df['Assists'].mean():.2f}")
if 'Position' in df.columns:
    # Position label with the highest mean goals per player.
    top_position = df.groupby('Position')['Goals'].mean().idxmax()
    print(f"  - Position with most goals: {top_position}")

print("\nüèÜ Applications:")
print("  - Predict golden boot winners")
print("  - Identify top assist providers")
print("  - Find undervalued players (high xG but lower actual goals)")
print("  - Scout players based on predicted performance")

print("\n‚úÖ Notebook execution completed successfully!")
print("\nThese models can now predict EPL player performance metrics.")