# 📘 ScoreSight - Top Goal Scorer Prediction (EDA & Preprocessing)
Author: [Your Name]  
Internship: Infosys  
Goal: Predict the total number of goals a player scores in a season (regression)

In [None]:
!pip install seaborn

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Set visualization style: seaborn "whitegrid" axes style, "pastel" colour
# palette, and font sizes scaled by 1.1. sns.set changes global plotting
# defaults, so it affects the figures drawn after this call.
sns.set(style="whitegrid", palette="pastel", font_scale=1.1)

In [None]:
# Load Dataset
# The CSV path is relative, so it is resolved against the current working
# directory of the kernel.
df = pd.read_csv("topgoals.csv")
print("Shape of dataset:", df.shape)
# `display` is not imported in this notebook; presumably the IPython built-in,
# so this line needs a notebook/IPython session to run.
display(df.head())

In [None]:
# Data Cleaning - remove columns flagged as leakage or unnecessary
drop_cols = [
    "Rank",
    "Player",
    "Club",
    "Season",
    "Games_in_Season",
    "Club_League_Rank",
    "Club_Total_Goals",
    "Penalty_Goals",
    "Non-Penalty_Goals",
]
# errors="ignore" skips any listed column that is not present in the frame,
# so the cell can be re-run without raising on already-dropped columns.
df = df.drop(columns=drop_cols, errors="ignore")

print("Remaining Columns:", list(df.columns))
print("\nMissing values per column:")
print(df.isna().sum())

In [None]:
# Handle missing values (assignment form - avoids inplace warnings).
# Numeric columns are filled with their median; every other column (object,
# string, categorical, boolean) is filled with its most frequent value.
# The dtype test uses pandas' type helpers instead of `dtype != 'object'`,
# which would route string- or category-typed columns to median() and fail.
# NOTE(review): this loop covers every column of the full frame (including the
# column later used as the target) and runs before the train/test split -
# confirm that full-data imputation is intended.
for col in df.columns:
    if not df[col].isnull().any():
        continue  # nothing to fill; leave the column and its dtype untouched
    if df[col].isnull().all():
        # No observed values: the median would be NaN and mode() would be
        # empty (indexing it with [0] raises), so report and move on.
        print(f"Skipping '{col}': no observed values to impute from")
        continue
    if pd.api.types.is_numeric_dtype(df[col]) and not pd.api.types.is_bool_dtype(df[col]):
        df[col] = df[col].fillna(df[col].median())
    else:
        # mode() is sorted, so [0] is a deterministic choice among ties.
        df[col] = df[col].fillna(df[col].mode()[0])

print("Missing values after imputation:")
print(df.isnull().sum())

In [None]:
# Separate the prediction target from the feature columns
target = "Goals"
y = df[target]
X = df.drop(columns=target)

# Header and summary statistics are emitted on consecutive lines.
print("Target variable (Goals) stats:", y.describe(), sep="\n")

In [None]:
# One-Hot Encode categorical column 'Position'
# drop_first=True omits the indicator for the first category, so k categories
# produce k-1 columns (the omitted level is implied when all others are off).
X_encoded = pd.get_dummies(X, columns=["Position"], drop_first=True)
print("Shape after encoding:", X_encoded.shape)

In [None]:
# Outlier Detection (IQR Method)
def detect_outliers_iqr(data, column, factor=1.5):
    """Return the rows of ``data`` whose ``column`` value is an IQR outlier.

    A value is an outlier when it lies strictly below ``Q1 - factor * IQR`` or
    strictly above ``Q3 + factor * IQR``, where Q1/Q3 are the 25th/75th
    percentiles of ``column`` and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter.
    column : label
        Column of ``data`` to test.
    factor : float, default 1.5
        Width of the fences in IQR units.

    Returns
    -------
    pandas.DataFrame
        The subset of ``data`` (all columns, original index) flagged as
        outliers. Missing values are never flagged, because comparisons
        against NaN are False.
    """
    values = data[column]
    q1, q3 = values.quantile([0.25, 0.75])
    fence = factor * (q3 - q1)
    lower_bound = q1 - fence
    upper_bound = q3 + fence
    # Explicit comparisons (rather than negating `between`) keep NaN rows out.
    is_outlier = (values < lower_bound) | (values > upper_bound)
    return data[is_outlier]

# Report the outlier count for the first five numeric feature columns only.
numeric_cols = X_encoded.select_dtypes(include=np.number).columns
for col in numeric_cols[:5]:
    print(f"{col}: {detect_outliers_iqr(X_encoded, col).shape[0]} outliers")

In [None]:
# EDA - Distribution of Goals
# Histogram (20 bins) of the target with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(y, bins=20, kde=True, color="steelblue", ax=ax)
ax.set_title("Distribution of Goals per Player")
plt.show()

In [None]:
# EDA - Minutes Played vs Goals
# One point per row, coloured by playing position.
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(
    x=df["Minutes_Played"],
    y=y,
    hue=df["Position"],
    alpha=0.7,
    ax=ax,
)
ax.set_title("Minutes Played vs Goals")
plt.show()

In [None]:
# EDA - Age vs Goals
# One point per row; single colour because no grouping variable is used.
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(x=df["Age"], y=y, alpha=0.7, color="darkorange", ax=ax)
ax.set_title("Age vs Goals")
plt.show()

In [None]:
# EDA - Position vs Goals
# Box plot of the target for each playing position.
plt.figure(figsize=(7,5))
# seaborn 0.13 deprecates passing `palette` without `hue` (removal planned for
# 0.14). Mapping the x variable to hue and hiding the redundant legend is the
# documented way to keep one palette colour per box.
sns.boxplot(
    x=df["Position"],
    y=y,
    hue=df["Position"],
    palette="Set2",
    legend=False,
)
plt.title("Goals by Position")
plt.show()

In [None]:
# EDA - Big 6 Club vs Goals
# Box plot of the target for each value of the Big-6 club indicator.
plt.figure(figsize=(7,5))
# seaborn 0.13 deprecates passing `palette` without `hue` (removal planned for
# 0.14). Mapping the x variable to hue and hiding the redundant legend is the
# documented way to keep one palette colour per box.
sns.boxplot(
    x=df["Big_6_Club_Feature"],
    y=y,
    hue=df["Big_6_Club_Feature"],
    palette="muted",
    legend=False,
)
plt.title("Goals by Big 6 vs Non-Big 6 Clubs")
plt.show()

In [None]:
# Correlation Heatmap
# Pairwise correlations between the encoded features and the target.
plt.figure(figsize=(12,8))
# Only 'Position' was one-hot encoded, so other non-numeric columns may still
# be present; numeric_only=True keeps corr() from raising on them
# (pandas >= 2.0 no longer drops such columns silently).
corr = X_encoded.join(y).corr(numeric_only=True)
# center=0 puts the midpoint of the diverging colormap at zero correlation.
sns.heatmap(corr, cmap="coolwarm", center=0, cbar_kws={'shrink':0.5})
plt.title("Correlation Heatmap of Features with Goals")
plt.show()

In [None]:
# Train-Test Split (Setup)
# Hold out 20% of the rows for testing; random_state=42 fixes the shuffle so
# the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42
)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

In [None]:
# Evaluation Template (Dummy Baseline)
# Baseline: predict the mean of the TRAINING targets for every test player.
# The mean must come from y_train only; computing it over all of y would let
# test-set targets leak into the prediction and flatter the baseline.
y_pred_dummy = np.full_like(y_test, y_train.mean(), dtype=float)

mae = mean_absolute_error(y_test, y_pred_dummy)
mse = mean_squared_error(y_test, y_pred_dummy)
rmse = np.sqrt(mse)  # back in the same units as the target
r2 = r2_score(y_test, y_pred_dummy)

print("Baseline Evaluation Metrics:")
print(f"MAE: {mae:.2f}")
print(f"MSE: {mse:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R²: {r2:.2f}")