# 📘 ScoreSight - Top Goal Scorer Prediction (EDA & Preliminary Modeling)
- **Author:** [Your Name]
- **Internship:** Infosys
- **Goal:** Predict the total goals scored by a player in a season (regression)

In [None]:
!pip install seaborn xgboost

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Global plot styling for every figure below: white-grid theme, fonts scaled by 1.05.
sns.set(style="whitegrid", font_scale=1.05)

In [None]:
# Load the dataset from a CSV expected in the current working directory,
# then report its dimensions and preview the first rows.
df = pd.read_csv("topgoals.csv")
print("Shape:", df.shape)
display(df.head())

In [None]:
# Schema overview: column names, per-column dtypes, and null counts per column.
print("Columns:", list(df.columns))
print("\nDtypes:", df.dtypes, sep="\n")
print("\nMissing values (count):", df.isna().sum(), sep="\n")

In [None]:
# Columns removed before modelling. Names that are absent from the loaded
# frame are skipped silently, so this cell can be re-run safely.
drop_cols = [
    "Rank",
    "Player",
    "Club",
    "Season",
    "Games_in_Season",
    "Club_League_Rank",
    "Club_Total_Goals",
    "Penalty_Goals",
    "Non-Penalty_Goals",
]
df = df.drop(columns=drop_cols, errors="ignore")
print("Remaining columns:", list(df.columns))

In [None]:
# Fill missing values column by column:
#   * numeric columns  -> column median
#   * everything else  -> most frequent value (mode)
# The dispatch is on "is this numeric?" rather than "is this object dtype?" so
# that text columns stored with a non-object dtype (e.g. pandas' string dtype)
# are not sent to median(), which would raise.
# NOTE(review): the statistics are computed on the full frame, i.e. before the
# train/test split performed later in the notebook, and the target column is
# imputed as well -- confirm this is acceptable for the evaluation.
for col in df.columns:
    if not df[col].isnull().any():
        continue
    if pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].fillna(df[col].median())
    else:
        modes = df[col].mode()
        # mode() is empty when the column holds no non-null value at all;
        # there is nothing to impute with, so the column is left as-is.
        if not modes.empty:
            df[col] = df[col].fillna(modes.iloc[0])

print("Missing values after imputation:")
print(df.isnull().sum())

In [None]:
# Regression target; abort early if the dataset does not contain it.
target = "Goals"
if target not in df.columns:
    raise ValueError("Target column 'Goals' not found in dataset.")

# Preferred predictors. Any name missing from the frame is skipped, so the
# notebook still runs on a reduced dataset.
# NOTE(review): "Goals_per_90" looks like it may be derived from the target
# itself -- confirm it is not same-season data, otherwise it leaks the answer.
candidate_features = [
    "Age",
    "Position",
    "Appearances",
    "Minutes_Played",
    "Goals_prev_season",
    "Goals_last_3_seasons_avg",
    "Assists",
    "Goals_per_90",
    "Big_6_Club_Feature",
    "League_Goals_per_Match",
]
features = [name for name in candidate_features if name in df.columns]

# Independent copies so later transformations never write back into df.
y = df[target].copy()
X = df[features].copy()

print("Features used:", features)
print("Target stats:", y.describe(), sep="\n")

In [None]:
# One-hot encode "Position" when it is among the selected features. The first
# level is dropped so the dummy columns are not perfectly collinear.
cat_cols = ["Position"] if "Position" in X.columns else []
X_encoded = pd.get_dummies(X, columns=cat_cols, drop_first=True)
print("Shape after encoding:", X_encoded.shape)
display(X_encoded.head())

In [None]:
def detect_outliers_iqr(series):
    """Count the values of ``series`` lying outside the 1.5 * IQR fences.

    A value is counted when it is strictly below ``Q1 - 1.5 * IQR`` or strictly
    above ``Q3 + 1.5 * IQR``, where Q1 and Q3 are the 25th and 75th percentiles.

    Returns the number of such values.
    """
    lower_quartile = series.quantile(0.25)
    upper_quartile = series.quantile(0.75)
    whisker = 1.5 * (upper_quartile - lower_quartile)

    too_low = series < (lower_quartile - whisker)
    too_high = series > (upper_quartile + whisker)
    return (too_low | too_high).sum()

# Report the IQR-based outlier count for each numeric feature column.
numeric_cols = list(X_encoded.select_dtypes(include=[np.number]).columns)
print("Numeric columns:", numeric_cols)
for col in numeric_cols:
    outlier_count = detect_outliers_iqr(X_encoded[col])
    print(f"{col}: {outlier_count} outliers")

In [None]:
# Histogram (with KDE overlay) of the target variable.
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(y, bins=20, kde=True, color="steelblue", ax=ax)
ax.set(
    title="Distribution of Goals per Player (season)",
    xlabel="Goals",
    ylabel="Count",
)
plt.show()

In [None]:
# Scatter of minutes played against goals; skipped when the column is absent.
if "Minutes_Played" in df.columns:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.scatterplot(x=df["Minutes_Played"], y=y, alpha=0.7, ax=ax)
    ax.set(
        title="Minutes Played vs Goals",
        xlabel="Minutes Played",
        ylabel="Goals",
    )
    plt.show()

In [None]:
# Scatter of the Goals_per_90 column against goals; skipped when the column is absent.
if "Goals_per_90" in df.columns:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.scatterplot(x=df["Goals_per_90"], y=y, alpha=0.7, color="darkorange", ax=ax)
    ax.set(
        title="Goals_per_90 vs Goals",
        xlabel="Goals_per_90",
        ylabel="Goals",
    )
    plt.show()

In [None]:
# Box plot of goals for each playing position; skipped when the column is absent.
if "Position" in df.columns:
    n_pos = df["Position"].nunique()
    plt.figure(figsize=(8,5))
    # seaborn 0.13 deprecates passing `palette` without `hue` (to be removed in
    # 0.14). Mapping hue to the same variable as x keeps one colour per position;
    # legend=False suppresses the legend that would merely repeat the x-axis.
    # Requires seaborn >= 0.13, where boxplot accepts `legend`.
    sns.boxplot(
        x=df["Position"],
        y=y,
        hue=df["Position"],
        palette=sns.color_palette("Set2", n_pos),
        legend=False,
    )
    plt.title("Goals by Position")
    plt.xlabel("Position")
    plt.ylabel("Goals")
    plt.show()

In [None]:
# Pairwise correlations between every encoded feature and the target,
# drawn as an annotated heatmap centred on zero.
fig, ax = plt.subplots(figsize=(12, 10))
corr_df = X_encoded.join(y).corr()
sns.heatmap(
    corr_df,
    cmap="coolwarm",
    center=0,
    annot=True,
    fmt=".2f",
    cbar_kws={'shrink': 0.6},
    ax=ax,
)
ax.set_title("Correlation matrix (features + Goals)")
plt.show()

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42
)
print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

In [None]:
# Standardise the numeric feature columns. The scaler is fitted on the training
# split only and then applied to both splits, so no test-set statistics are
# used during fitting. Non-numeric columns are carried over unchanged.
num_cols = list(X_train.select_dtypes(include=[np.number]).columns)
scaler = StandardScaler()

X_train_scaled = X_train.copy()
X_train_scaled[num_cols] = scaler.fit_transform(X_train[num_cols])

X_test_scaled = X_test.copy()
X_test_scaled[num_cols] = scaler.transform(X_test[num_cols])

print("Scaled numeric columns:", num_cols)

In [None]:
def regression_metrics(y_true, y_pred):
    """Compute the standard regression error metrics for a set of predictions.

    Returns a dict with the keys "MAE", "MSE", "RMSE" and "R2" (in that order);
    RMSE is the square root of the MSE value.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    return {
        "MAE": mean_absolute_error(y_true, y_pred),
        "MSE": squared_error,
        "RMSE": np.sqrt(squared_error),
        "R2": r2_score(y_true, y_pred),
    }

# Naive baseline: predict the training-set mean for every test row. Any real
# model should improve on these numbers.
y_pred_baseline = np.full(len(y_test), y_train.mean())
print("Baseline metrics:", regression_metrics(y_test, y_pred_baseline))

In [None]:
def evaluate_regressor(model, X_train, y_train, X_test, y_test, use_scaled=False):
    """Fit ``model`` on the training split and report its test-set metrics.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place, so the caller can inspect the fitted object afterwards.
    X_train, X_test
        Training and test feature matrices.
    y_train, y_test
        Training and test targets.
    use_scaled : bool, default False
        When True, the features are standardised before fitting. The scaler is
        fitted on ``X_train`` only and applied to both splits. For DataFrame
        inputs only the numeric columns are scaled; other columns pass through.

    Returns
    -------
    dict
        The metrics dictionary produced by ``regression_metrics``.
    """
    if use_scaled:
        # Scale the data that was actually passed in rather than reading
        # pre-scaled notebook globals: the result then depends only on the
        # arguments and does not require another cell to have run first.
        local_scaler = StandardScaler()
        if hasattr(X_train, "select_dtypes"):
            numeric = X_train.select_dtypes(include=[np.number]).columns.tolist()
            Xtr, Xte = X_train.copy(), X_test.copy()
            if numeric:
                Xtr[numeric] = local_scaler.fit_transform(X_train[numeric])
                Xte[numeric] = local_scaler.transform(X_test[numeric])
        else:
            # Plain array-like input: every column is scaled.
            Xtr = local_scaler.fit_transform(X_train)
            Xte = local_scaler.transform(X_test)
    else:
        Xtr, Xte = X_train, X_test

    model.fit(Xtr, y_train)
    preds = model.predict(Xte)
    metrics = regression_metrics(y_test, preds)

    print(f"\nModel: {model.__class__.__name__}")
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")
    return metrics

In [None]:
# Ordinary least squares, evaluated on the standardised features.
lin_reg = LinearRegression()
metrics_lin = evaluate_regressor(
    model=lin_reg,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    use_scaled=True,
)

In [None]:
# Random forest with 200 trees and a fixed seed, evaluated on the unscaled features.
rf = RandomForestRegressor(random_state=42, n_estimators=200)
metrics_rf = evaluate_regressor(
    model=rf,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    use_scaled=False,
)

In [None]:
# Gradient-boosted trees (squared-error objective, 200 rounds, fixed seed),
# evaluated on the unscaled features.
xgb = XGBRegressor(
    n_estimators=200,
    objective='reg:squarederror',
    random_state=42,
)
metrics_xgb = evaluate_regressor(
    model=xgb,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    use_scaled=False,
)

In [None]:
# Side-by-side comparison: one row per model, one column per metric.
results = pd.DataFrame(
    [metrics_lin, metrics_rf, metrics_xgb],
    index=["LinearRegression", "RandomForest", "XGBoost"],
)
display(results)

In [None]:
# Horizontal bar chart of the 15 most important features according to the
# fitted random forest.
if hasattr(rf, "feature_importances_"):
    fi = pd.Series(rf.feature_importances_, index=X_train.columns).sort_values(ascending=False)
    top_fi = fi.head(15)
    plt.figure(figsize=(8,6))
    # seaborn 0.13 deprecates passing `palette` without `hue` (to be removed in
    # 0.14). Mapping hue to the feature names keeps one colour per bar;
    # legend=False suppresses the legend that would merely repeat the y-axis.
    # Requires seaborn >= 0.13, where barplot accepts `legend`.
    sns.barplot(
        x=top_fi.values,
        y=top_fi.index,
        hue=top_fi.index,
        palette="viridis",
        legend=False,
    )
    plt.title("Top 15 Feature Importances (Random Forest)")
    # Label the axes so the figure can be read on its own.
    plt.xlabel("Importance")
    plt.ylabel("Feature")
    plt.show()

In [None]:
# Final marker cell: reaching it means every cell above ran without raising.
print("Notebook End - models trained and baseline evaluations complete.")