In [108]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Reading dataset and performing EDA

In [109]:
# Load the dataset from Excel file
df = pd.read_excel("Top_Goals_Excel.xlsx")

In [110]:
# Display the first 5 rows of the dataset to get an overview
df.head()

Unnamed: 0,Season,Rank,Player,Club,Goals,IsTop10,Position,Age,Appearances,Goals_prev_season,Assists,Penalty_Goals,Non-Penalty_Goals,Goals_per_90,Big_6_Club_Feature,Club_League_Rank,Club_Total_Goals,League_Goals_per_Match,Games_in_Season
0,2023-24,1,Erling Haaland,Manchester City,27,1,Forward,23,31,36.0,6.0,1.0,26,0.85,1.0,1,96,2.83,38.0
1,2023-24,2,Cole Palmer,Chelsea,22,1,Attacking Midfielder,22,33,3.0,11.0,9.0,13,0.61,1.0,6,77,2.83,38.0
2,2023-24,3,Alexander Isak,Newcastle United,21,1,Forward,24,30,10.0,2.0,5.0,16,0.76,0.0,7,85,2.83,38.0
3,2023-24,4,Ollie Watkins,Aston Villa,19,1,Forward,28,37,15.0,13.0,0.0,19,0.51,0.0,4,76,2.83,38.0
4,2023-24,4,Dominic Solanke,AFC Bournemouth,19,1,Forward,26,38,6.0,3.0,1.0,18,0.5,0.0,12,54,2.83,38.0


# Storing cleaned dataset in a new csv file

In [111]:
# Save the dataset to a CSV file (backup in CSV format without index column)
df.to_csv("Top_Goals_CSV.csv", index=False)

In [112]:
# Print all column names before cleaning
print(df.columns)

Index(['Season', 'Rank', 'Player', 'Club', 'Goals', 'IsTop10', 'Position',
       'Age', 'Appearances', 'Goals_prev_season', 'Assists', 'Penalty_Goals',
       'Non-Penalty_Goals', 'Goals_per_90', 'Big_6_Club_Feature',
       'Club_League_Rank', 'Club_Total_Goals', 'League_Goals_per_Match',
       'Games_in_Season'],
      dtype='object')


In [113]:
# Drop unnecessary or irrelevant columns that are not useful for analysis
df=df.drop(['Season', 'Rank', 'Player', 'Club','IsTop10','Club_League_Rank', 'Club_Total_Goals','Games_in_Season'],axis=1)

In [114]:
# Print remaining column names after dropping
print(df.columns)

Index(['Goals', 'Position', 'Age', 'Appearances', 'Goals_prev_season',
       'Assists', 'Penalty_Goals', 'Non-Penalty_Goals', 'Goals_per_90',
       'Big_6_Club_Feature', 'League_Goals_per_Match'],
      dtype='object')


In [115]:
#Display first 5 rows after dropping columns
df.head()

Unnamed: 0,Goals,Position,Age,Appearances,Goals_prev_season,Assists,Penalty_Goals,Non-Penalty_Goals,Goals_per_90,Big_6_Club_Feature,League_Goals_per_Match
0,27,Forward,23,31,36.0,6.0,1.0,26,0.85,1.0,2.83
1,22,Attacking Midfielder,22,33,3.0,11.0,9.0,13,0.61,1.0,2.83
2,21,Forward,24,30,10.0,2.0,5.0,16,0.76,0.0,2.83
3,19,Forward,28,37,15.0,13.0,0.0,19,0.51,0.0,2.83
4,19,Forward,26,38,6.0,3.0,1.0,18,0.5,0.0,2.83


In [116]:
# Check for missing (NaN) values in each column
print(df.isnull().sum())

Goals                       0
Position                    0
Age                         0
Appearances                 0
Goals_prev_season         115
Assists                   228
Penalty_Goals               1
Non-Penalty_Goals           0
Goals_per_90                0
Big_6_Club_Feature          0
League_Goals_per_Match      0
dtype: int64


In [117]:
# Check how many duplicate rows are present
print(df.duplicated().sum())

0


In [118]:
# Fill missing values in specific columns with default values:
# - "Goals_prev_season" → replace NaN with 0
# - "Assists" → replace NaN with 0
# - "Penalty_Goals" → replace NaN with 0
df.fillna({
    "Goals_prev_season": 0,
    "Assists": 0,
    "Penalty_Goals": 0
}, inplace=True)

In [119]:
# Remove duplicate rows to avoid redundancy
df = df.drop_duplicates()

In [120]:
# Re-check for missing values after cleaning
print(df.isnull().sum())

Goals                     0
Position                  0
Age                       0
Appearances               0
Goals_prev_season         0
Assists                   0
Penalty_Goals             0
Non-Penalty_Goals         0
Goals_per_90              0
Big_6_Club_Feature        0
League_Goals_per_Match    0
dtype: int64


In [121]:
# Re-check for duplicate rows after cleaning
print(df.duplicated().sum())

0


In [122]:
# Save the cleaned dataset into a new CSV file
df.to_csv("cleaned_Top_Goals.csv", index=False)

# Loading Cleaned dataset

In [123]:
#Load the cleaned dataset
df = pd.read_csv("cleaned_Top_Goals.csv")

In [124]:
#Display Cleaned Dataset
df

Unnamed: 0,Goals,Position,Age,Appearances,Goals_prev_season,Assists,Penalty_Goals,Non-Penalty_Goals,Goals_per_90,Big_6_Club_Feature,League_Goals_per_Match
0,27,Forward,23,31,36.0,6.0,1.0,26,0.85,1.0,2.83
1,22,Attacking Midfielder,22,33,3.0,11.0,9.0,13,0.61,1.0,2.83
2,21,Forward,24,30,10.0,2.0,5.0,16,0.76,0.0,2.83
3,19,Forward,28,37,15.0,13.0,0.0,19,0.51,0.0,2.83
4,19,Forward,26,38,6.0,3.0,1.0,18,0.50,0.0,2.83
...,...,...,...,...,...,...,...,...,...,...,...
319,15,Forward,25,41,0.0,0.0,0.0,15,0.37,0.0,2.58
320,15,Forward,29,36,0.0,0.0,0.0,15,0.42,1.0,2.58
321,15,Attacking Midfielder,24,40,0.0,0.0,0.0,15,0.38,0.0,2.58
322,15,Forward,23,42,0.0,0.0,0.0,15,0.36,0.0,2.58


In [125]:
df.columns

Index(['Goals', 'Position', 'Age', 'Appearances', 'Goals_prev_season',
       'Assists', 'Penalty_Goals', 'Non-Penalty_Goals', 'Goals_per_90',
       'Big_6_Club_Feature', 'League_Goals_per_Match'],
      dtype='object')

# Label Encoding

In [126]:
# Identify categorical columns
# In this dataset, 'Position' is categorical.
categorical_cols = ['Position']

In [127]:
# Apply Label Encoding
# LabelEncoder assigns each unique category an integer.
# Example: Forward=0, Attacking Midfielder=1, Winger=2 ...
# NOTE: The mapping depends on alphabetical order of categories.
# ---------------------------------------------
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])   # replace original col with numeric codes
    label_encoders[col] = le              # store encoder for future inverse transform


In [128]:
# STEP 3: Verify dataset after encoding
# Now all categorical columns are converted into numeric form.
print(df.head())

   Goals  Position  Age  Appearances  Goals_prev_season  Assists  \
0     27         1   23           31               36.0      6.0   
1     22         0   22           33                3.0     11.0   
2     21         1   24           30               10.0      2.0   
3     19         1   28           37               15.0     13.0   
4     19         1   26           38                6.0      3.0   

   Penalty_Goals  Non-Penalty_Goals  Goals_per_90  Big_6_Club_Feature  \
0            1.0                 26          0.85                 1.0   
1            9.0                 13          0.61                 1.0   
2            5.0                 16          0.76                 0.0   
3            0.0                 19          0.51                 0.0   
4            1.0                 18          0.50                 0.0   

   League_Goals_per_Match  
0                    2.83  
1                    2.83  
2                    2.83  
3                    2.83  
4           

# Model Training

In [129]:
# Split features (X) and target (y)
# we want to predict 'Goals' (target) using other features.

X = df.drop("Goals", axis=1)   # all features except 'Goals'
y = df["Goals"]                # target column

In [130]:
print(X)

     Position  Age  Appearances  Goals_prev_season  Assists  Penalty_Goals  \
0           1   23           31               36.0      6.0            1.0   
1           0   22           33                3.0     11.0            9.0   
2           1   24           30               10.0      2.0            5.0   
3           1   28           37               15.0     13.0            0.0   
4           1   26           38                6.0      3.0            1.0   
..        ...  ...          ...                ...      ...            ...   
319         1   25           41                0.0      0.0            0.0   
320         1   29           36                0.0      0.0            0.0   
321         0   24           40                0.0      0.0            0.0   
322         1   23           42                0.0      0.0            0.0   
323         1   29           34                0.0      0.0            0.0   

     Non-Penalty_Goals  Goals_per_90  Big_6_Club_Feature  \
0  

In [131]:
print(y)

0      27
1      22
2      21
3      19
4      19
       ..
319    15
320    15
321    15
322    15
323    15
Name: Goals, Length: 324, dtype: int64


# Splitting data into train and test sets

In [132]:
# Train-test split
# Split into training (80%) and testing (20%)

X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42)


# Training Model using RandomForestRegressor

In [133]:
# Train models
# Since label encoding was applied, we can use tree-based models
# (Random Forest, Decision Tree, Gradient Boosting, etc.)

# Example: Random Forest for regression (predicting Goals count)
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make Predictions

In [134]:
# Make predictions
y_pred = model.predict(X_test)

# Performance Evaluation (Regression Metrics)

In [135]:
# Evaluate Model (Regression Metrics)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R² Score:", r2)

Mean Absolute Error (MAE): 0.3343076923076921
Mean Squared Error (MSE): 0.4136353846153844
Root Mean Squared Error (RMSE): 0.6431449172740031
R² Score: 0.978926691185337
