<a href="https://colab.research.google.com/github/ANILVATTI56/ScoreSight---Predicting-EPL-Points-Tally-Top-Scorer/blob/main/VATTI_ANIL_KUMAR_REDDY_EPL_SCORE_PREDICTION.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler



In [None]:
# Read the raw Premier League CSV into the working DataFrame used by the cells below.
file_name = "ENGLISH PREMIER LEAGUE DATASET.csv"
df = pd.read_csv(filepath_or_buffer=file_name)


In [None]:
# --- 1. Binary classification target ---
# 'Scored_Goal' is 1 for rows whose 'Goals' value is positive and 0 for every other row.
df['Scored_Goal'] = df['Goals'].gt(0).astype(int)
y = df['Scored_Goal']

In [None]:

# --- 2. Clean and Scale the Numerical Features ---

# Feature matrix: every numeric column except the raw goal count and the label
# derived from it. Without this definition the cell raises NameError on a fresh
# kernel, because X_numerical is otherwise only created by a later cell.
# NOTE(review): this column choice is reconstructed from the preview printed
# after this step (Jersey Number, Age, Appearances, Wins, Losses, Goals per
# match, ...) — confirm it is the intended feature set.
X_numerical = (
    df.drop(columns=['Goals', 'Scored_Goal'], errors='ignore')
      .select_dtypes(include=np.number)
)

# Missing values are replaced by the column median, then each column is standardised.
imputer = SimpleImputer(strategy='median')
scaler = StandardScaler()

# NOTE(review): both transformers are fitted on the full data set, i.e. before
# the train/test split performed later, so test rows influence the medians and
# the scaling statistics. Fit on the training partition only if an unbiased
# hold-out estimate is required.
X_imputed = imputer.fit_transform(X_numerical)
X_scaled = scaler.fit_transform(X_imputed)

# Back to a labelled DataFrame for inspection; the original row index is kept
# so the features stay aligned with the target `y`.
X_cleaned = pd.DataFrame(X_scaled, columns=X_numerical.columns, index=X_numerical.index)

In [None]:
# Preview the scaled features and the target, then report their dimensions.
print("--- Cleaned Features (X) Head ---", X_cleaned.head(), sep="\n")
print("\n--- Target Variable (y) Head ---", y.head(), sep="\n")
print(
    "\n--- Final Data Shape ---",
    f"Features (X) shape: {X_cleaned.shape}",
    f"Target (y) shape: {y.shape}",
    sep="\n",
)

--- Cleaned Features (X) Head ---
   Jersey Number       Age  Appearances      Wins    Losses  Goals per match  \
0      -1.307265  0.505793    -0.059016 -0.024604 -0.277432        -0.225919   
1       0.952346  0.048094    -0.842122 -0.743747 -0.839557        -0.225919   
2      -0.459911 -0.180755    -0.842122 -0.743747 -0.839557        -0.225919   
3      -1.236652 -0.180755     1.115643  1.567784  0.460357        -0.225919   
4      -1.166039 -0.638453    -0.646345 -0.563961 -0.663893        -0.225919   

   Headed goals  Goals with right foot  Goals with left foot  \
0     -0.382722              -0.292880             -0.353012   
1     -0.382722              -0.292880             -0.353012   
2     -0.382722              -0.292880             -0.353012   
3     -0.382722              -0.004710              0.151333   
4     -0.382722              -0.388936             -0.184897   

   Penalties scored  ...   Punches  High Claims   Catches  Sweeper clearances  \
0         -0.209873

In [None]:
# Same inspection as the previous cell: first rows of the feature matrix and
# of the target, followed by both shapes, emitted as one block of text.
print("\n".join([
    "--- Cleaned Features (X) Head ---",
    str(X_cleaned.head()),
    "\n--- Target Variable (y) Head ---",
    str(y.head()),
    "\n--- Final Data Shape ---",
    f"Features (X) shape: {X_cleaned.shape}",
    f"Target (y) shape: {y.shape}",
]))

# The train/test split itself is performed in the next step ("3. Split Data").
# At this point the features are imputed and scaled, ready for model training.

--- Cleaned Features (X) Head ---
   Jersey Number       Age  Appearances      Wins    Losses  Goals per match  \
0      -1.307265  0.505793    -0.059016 -0.024604 -0.277432        -0.225919   
1       0.952346  0.048094    -0.842122 -0.743747 -0.839557        -0.225919   
2      -0.459911 -0.180755    -0.842122 -0.743747 -0.839557        -0.225919   
3      -1.236652 -0.180755     1.115643  1.567784  0.460357        -0.225919   
4      -1.166039 -0.638453    -0.646345 -0.563961 -0.663893        -0.225919   

   Headed goals  Goals with right foot  Goals with left foot  \
0     -0.382722              -0.292880             -0.353012   
1     -0.382722              -0.292880             -0.353012   
2     -0.382722              -0.292880             -0.353012   
3     -0.382722              -0.004710              0.151333   
4     -0.382722              -0.388936             -0.184897   

   Penalties scored  ...   Punches  High Claims   Catches  Sweeper clearances  \
0         -0.209873

In [None]:
# --- 3. Split Data ---
# 80 % of the rows go to training and 20 % to testing.
X_train, X_test, y_train, y_test = train_test_split(
    X_cleaned,
    y,
    stratify=y,       # keep the 0/1 ratio of `y` the same in both partitions
    test_size=0.2,
    random_state=42,  # fixed seed so the partition is reproducible
)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, classification_report

# --- Helper Functions for Evaluation ---
def evaluate_regression(y_test, y_pred, model_name):
    """Print MAE, RMSE and R-squared for a set of regression predictions.

    Args:
        y_test: true target values.
        y_pred: values predicted by the model for the same rows.
        model_name: label used in the printed header.

    Returns:
        None. The metrics are only written to standard output.
    """
    scores = {
        "MAE": mean_absolute_error(y_test, y_pred),
        # RMSE is the square root of the mean squared error.
        "RMSE": np.sqrt(mean_squared_error(y_test, y_pred)),
        "R-squared": r2_score(y_test, y_pred),
    }
    print(f"\n--- {model_name} Metrics ---")
    for label, value in scores.items():
        print(f"  {label}: {value:.4f}")

def evaluate_classification(y_test, y_pred, model_name):
    """Print the accuracy and the full classification report of a classifier.

    Args:
        y_test: true class labels.
        y_pred: labels predicted by the model for the same rows.
        model_name: label used in the printed header.

    Returns:
        None. The metrics are only written to standard output.
    """
    accuracy = accuracy_score(y_test, y_pred)
    print(
        f"\n--- {model_name} Metrics ---",
        f"  Accuracy: {accuracy:.4f}",
        "  Classification Report:",
        sep="\n",
    )
    # zero_division=0 reports 0 instead of warning when a class is never predicted.
    print(classification_report(y_test, y_pred, zero_division=0))

# --- 0. Data Loading and Preprocessing ---
file_name = "ENGLISH PREMIER LEAGUE DATASET.csv"
df = pd.read_csv(file_name)

# 0.1 Convert percentage strings (e.g. '16%') to fractions (0.16).
# The guard is "not already numeric" rather than `dtype == 'object'`: when
# pandas stores text in a dedicated string dtype the old check is False, the
# column is left as text and then silently dropped by select_dtypes below.
# Values that cannot be parsed become NaN and are median-imputed in step 0.3
# instead of aborting the cell with a ValueError.
percent_cols = ['Shooting accuracy %', 'Tackle success %', 'Cross accuracy %']
for col in percent_cols:
    if col in df.columns and not pd.api.types.is_numeric_dtype(df[col]):
        df[col] = pd.to_numeric(df[col].astype(str).str.rstrip('%'), errors='coerce').div(100)

# 0.2 Feature selection: drop the identifier/text columns together with the
# columns the targets below are built from ('Goals' for the regression,
# 'Wins' and 'Appearances' for the win-rate label) and 'Losses'.
X_data = df.drop(columns=['Name', 'Club', 'Position', 'Nationality', 'Jersey Number', 'Goals', 'Appearances', 'Wins', 'Losses'], errors='ignore')

# 0.3 Keep numeric columns only, fill gaps with the column median, standardise.
# NOTE(review): the imputer and scaler are fitted on all rows, before any
# train/test split, so test rows contribute to the fitted statistics.
X_numerical = X_data.select_dtypes(include=np.number)
imputer = SimpleImputer(strategy='median')
scaler = StandardScaler()

X_imputed = imputer.fit_transform(X_numerical)
X_scaled = scaler.fit_transform(X_imputed)
X_cleaned = pd.DataFrame(X_scaled, columns=X_numerical.columns)


# ======================================================================
# 1. Linear Regression — baseline regression model for player performance
# ======================================================================
print("--- 1. Linear Regression (Baseline Regression) ---")

# Regression target: the raw 'Goals' column.
y_reg = df['Goals']

# NOTE(review): X_cleaned still contains goal-derived columns (the feature
# preview printed further down lists e.g. 'Goals per match', 'Headed goals',
# 'Goals with right foot'). The near-perfect R-squared recorded in this cell's
# output is therefore probably target leakage — confirm before relying on it.

# 80/20 hold-out split with a fixed seed.
X_reg_train, X_reg_test, y_reg_train, y_reg_test = train_test_split(
    X_cleaned, y_reg, random_state=42, test_size=0.2
)

# Fit on the training partition, predict the held-out rows, report the metrics.
lr = LinearRegression().fit(X_reg_train, y_reg_train)
y_lr_pred = lr.predict(X_reg_test)
evaluate_regression(y_reg_test, y_lr_pred, model_name="Linear Regression")


# ======================================================================
# 2. Logistic Regression — baseline classification model
# ======================================================================
print("\n--- 2. Logistic Regression (Baseline Classification) ---")

# Binary label 'High_Win_Rate_Player': 1 when a row's win rate is at or above
# the median win rate, 0 otherwise. Zero appearances are replaced by 1 so the
# division is defined (such rows get Wins / 1).
df['Win_Rate'] = df['Wins'].div(df['Appearances'].replace(0, 1))
median_win_rate = df['Win_Rate'].median()
df['High_Win_Rate_Player'] = df['Win_Rate'].ge(median_win_rate).astype(int)
y_cls = df['High_Win_Rate_Player']

# Stratified 80/20 split so both partitions keep the same class ratio.
X_cls_train, X_cls_test, y_cls_train, y_cls_test = train_test_split(
    X_cleaned,
    y_cls,
    stratify=y_cls,
    test_size=0.2,
    random_state=42,
)

# Fit, predict the held-out rows, report the metrics.
log_reg = LogisticRegression(solver='liblinear', random_state=42)
log_reg.fit(X_cls_train, y_cls_train)
y_log_pred = log_reg.predict(X_cls_test)
evaluate_classification(y_cls_test, y_log_pred, model_name="Logistic Regression")

--- 1. Linear Regression (Baseline Regression) ---

--- Linear Regression Metrics ---
  MAE: 0.2237
  RMSE: 0.3266
  R-squared: 0.9993

--- 2. Logistic Regression (Baseline Classification) ---

--- Logistic Regression Metrics ---
  Accuracy: 0.7478
  Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.75      0.75        57
           1       0.75      0.74      0.75        58

    accuracy                           0.75       115
   macro avg       0.75      0.75      0.75       115
weighted avg       0.75      0.75      0.75       115



In [None]:
# Inspect the feature matrix rebuilt by the modelling cell above. `y` is not
# reassigned by that cell, so it is still the 'Scored_Goal' target from step 1.
print("--- Cleaned Features (X) Head ---", X_cleaned.head(), sep="\n")
print("\n--- Target Variable (y) Head ---", y.head(), sep="\n")
print("\n--- Final Data Shape ---")
print(f"Features (X) shape: {X_cleaned.shape}", f"Target (y) shape: {y.shape}", sep="\n")

--- Cleaned Features (X) Head ---
        Age  Goals per match  Headed goals  Goals with right foot  \
0  0.505793        -0.225919     -0.382722              -0.292880   
1  0.048094        -0.225919     -0.382722              -0.292880   
2 -0.180755        -0.225919     -0.382722              -0.292880   
3 -0.180755        -0.225919     -0.382722              -0.004710   
4 -0.638453        -0.225919     -0.382722              -0.388936   

   Goals with left foot  Penalties scored  Freekicks scored     Shots  \
0             -0.353012         -0.209873         -0.212612 -0.300435   
1             -0.353012         -0.209873         -0.212612 -0.300435   
2             -0.353012         -0.209873         -0.212612 -0.300435   
3              0.151333         -0.209873         -0.212612 -0.300435   
4             -0.184897         -0.209873         -0.212612 -0.300435   

   Shots on target  Shooting accuracy %  ...   Punches  High Claims   Catches  \
0        -0.306283             

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, classification_report

# --- Helper Functions (as defined in previous steps) ---

def evaluate_regression(y_test, y_pred, model_name):
    """Print MAE, RMSE and R-squared for a set of regression predictions.

    This redefinition replaces the earlier helper of the same name; it differs
    only in the header line, which also lists the reported metrics.

    Args:
        y_test: true target values.
        y_pred: values predicted by the model for the same rows.
        model_name: label used in the printed header.

    Returns:
        None. The metrics are only written to standard output.
    """
    mae = mean_absolute_error(y_test, y_pred)
    # RMSE is the square root of the mean squared error.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    # The header previously contained the mis-encoded text "RÂ²" (UTF-8 bytes
    # read as Latin-1); it now prints the intended superscript two.
    print(f"\n--- {model_name} Metrics (MAE, RMSE, R²) ---")
    print(f"  MAE: {mae:.4f}")
    print(f"  RMSE: {rmse:.4f}")
    print(f"  R-squared: {r2:.4f}")

def evaluate_classification(y_test, y_pred, model_name):
    """Print accuracy plus the per-class precision/recall/F1 report.

    This redefinition replaces the earlier helper of the same name; it differs
    only in the header line, which also lists the reported metrics.

    Args:
        y_test: true class labels.
        y_pred: labels predicted by the model for the same rows.
        model_name: label used in the printed header.

    Returns:
        None. The metrics are only written to standard output.
    """
    accuracy = accuracy_score(y_test, y_pred)
    summary_lines = (
        f"\n--- {model_name} Metrics (Accuracy, Precision, Recall, F1) ---",
        f"  Accuracy: {accuracy:.4f}",
        "  Classification Report:",
    )
    for line in summary_lines:
        print(line)
    # Precision, recall and F1 come from the report; zero_division=0 yields 0
    # rather than a warning for a class that is never predicted.
    report_text = classification_report(y_test, y_pred, zero_division=0)
    print(report_text)

In [None]:

# --- 0. Data Loading and Preprocessing ---
file_name = "ENGLISH PREMIER LEAGUE DATASET.csv"
df = pd.read_csv(file_name)

# Convert percentage strings (e.g. '16%') to fractions (0.16).
# The guard is "not already numeric" rather than `dtype == 'object'`: when
# pandas stores text in a dedicated string dtype the old check is False, the
# column is left as text and then silently dropped by select_dtypes below.
# Values that cannot be parsed become NaN and are median-imputed afterwards
# instead of aborting the cell with a ValueError.
percent_cols = ['Shooting accuracy %', 'Tackle success %', 'Cross accuracy %']
for col in percent_cols:
    if col in df.columns and not pd.api.types.is_numeric_dtype(df[col]):
        df[col] = pd.to_numeric(df[col].astype(str).str.rstrip('%'), errors='coerce').div(100)

# Feature selection: drop the identifier/text columns together with the
# columns the targets below are built from ('Goals', 'Wins', 'Appearances')
# and 'Losses'.
X_data = df.drop(columns=['Name', 'Club', 'Position', 'Nationality', 'Jersey Number', 'Goals', 'Appearances', 'Wins', 'Losses'], errors='ignore')

# Keep numeric columns only, fill gaps with the column median, standardise.
# NOTE(review): the imputer and scaler are fitted on all rows, before any
# train/test split, so test rows contribute to the fitted statistics.
X_numerical = X_data.select_dtypes(include=np.number)
imputer = SimpleImputer(strategy='median')
scaler = StandardScaler()
X_imputed = imputer.fit_transform(X_numerical)
X_scaled = scaler.fit_transform(X_imputed)
X_cleaned = pd.DataFrame(X_scaled, columns=X_numerical.columns)

# --- 1. Regression Model Training (Random Forest) ---
# Target is the raw 'Goals' column; 80/20 hold-out split with a fixed seed.
# NOTE(review): X_cleaned still contains goal-derived columns (see the feature
# preview printed earlier), so predictions may benefit from target leakage.
y_reg = df['Goals']
X_reg_train, X_reg_test, y_reg_train, y_reg_test = train_test_split(
    X_cleaned, y_reg, random_state=42, test_size=0.2
)

# 100 trees, seeded for reproducibility, trained on all available cores.
rfr = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1).fit(
    X_reg_train, y_reg_train
)
y_rfr_pred = rfr.predict(X_reg_test)

# --- 2. Classification Model Training (Logistic Regression) ---
# Label: 1 when a row's win rate is at or above the median win rate, else 0.
# Zero appearances are replaced by 1 so the division is defined.
df['Win_Rate'] = df['Wins'].div(df['Appearances'].replace(0, 1))
median_win_rate = df['Win_Rate'].median()
df['High_Win_Rate_Player'] = df['Win_Rate'].ge(median_win_rate).astype(int)
y_cls = df['High_Win_Rate_Player']

# Stratified 80/20 split so both partitions keep the same class ratio.
X_cls_train, X_cls_test, y_cls_train, y_cls_test = train_test_split(
    X_cleaned,
    y_cls,
    stratify=y_cls,
    test_size=0.2,
    random_state=42,
)
log_reg = LogisticRegression(solver='liblinear', random_state=42)
log_reg.fit(X_cls_train, y_cls_train)
y_log_pred = log_reg.predict(X_cls_test)


# ======================================================================
# Visualization Code
# ======================================================================

# 1. Actual vs predicted goals for the random forest, written to a PNG file.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_reg_test, y_rfr_pred, alpha=0.6, color='skyblue')

# Dashed diagonal spanning the observed range: points on it are exact predictions.
ax.plot(
    [y_reg_test.min(), y_reg_test.max()],
    [y_reg_test.min(), y_reg_test.max()],
    'r--',
    lw=2,
    label='Perfect Prediction',
)
ax.set_xlabel("Actual Goals")
ax.set_ylabel("Predicted Goals (Random Forest)")
ax.set_title("Regression: Actual vs Predicted Goals")
ax.legend()
ax.grid(True, alpha=0.3)

# The figure is saved and then closed, so it is not rendered inline.
fig.savefig("regression_actual_vs_predicted_scatter.png")
plt.close(fig)


# 2. Confusion matrix of the logistic-regression win-rate classifier, written to a PNG file.
cm = confusion_matrix(y_cls_test, y_log_pred)

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,   # write the count inside every cell
    fmt='d',      # counts are integers
    cmap='Blues',
    cbar=False,
    xticklabels=['Low Win Rate (0)', 'High Win Rate (1)'],
    yticklabels=['Low Win Rate (0)', 'High Win Rate (1)'],
    ax=ax,
)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Classification: Confusion Matrix (Logistic Regression)')

# The figure is saved and then closed, so it is not rendered inline.
fig.savefig("classification_confusion_matrix_logreg.png")
plt.close(fig)