In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

In [24]:
df2 = pd.read_csv("Top_Goals_CSV.csv")


In [26]:
print(df2.columns)

Index(['Season', 'Rank', 'Player', 'Club', 'Goals', 'IsTop10', 'Position',
       'Age', 'Appearances', 'Goals_prev_season', 'Assists', 'Penalty_Goals',
       'Non-Penalty_Goals', 'Goals_per_90', 'Big_6_Club_Feature',
       'Club_League_Rank', 'Club_Total_Goals', 'League_Goals_per_Match',
       'Games_in_Season'],
      dtype='object')


In [27]:
df2=df2.drop(['Season', 'Rank', 'Player', 'Club','Assists','IsTop10','Club_League_Rank', 'Club_Total_Goals','Games_in_Season'],axis=1)
print(df2.head())

   Goals              Position  Age  Appearances  Goals_prev_season  \
0     27               Forward   23           31               36.0   
1     22  Attacking Midfielder   22           33                3.0   
2     21               Forward   24           30               10.0   
3     19               Forward   28           37               15.0   
4     19               Forward   26           38                6.0   

   Penalty_Goals  Non-Penalty_Goals  Goals_per_90  Big_6_Club_Feature  \
0            1.0                 26          0.85                 1.0   
1            9.0                 13          0.61                 1.0   
2            5.0                 16          0.76                 0.0   
3            0.0                 19          0.51                 0.0   
4            1.0                 18          0.50                 0.0   

   League_Goals_per_Match  
0                    2.83  
1                    2.83  
2                    2.83  
3                    2

In [5]:
# Remove duplicates
print("Duplicates before:", df2.duplicated().sum())
df2 = df2.drop_duplicates()
print("Duplicates after:", df2.duplicated().sum())


Duplicates before: 0
Duplicates after: 0


In [6]:
# Missing values check
print("Missing values:\n", df2.isnull().sum())


Missing values:
 Rank                        0
Player                      0
Club                        0
Goals                       0
IsTop10                     0
Position                    0
Age                         0
Appearances                 0
Goals_prev_season         115
Assists                   228
Penalty_Goals               1
Non-Penalty_Goals           0
Goals_per_90                0
Big_6_Club_Feature          0
Club_League_Rank            0
Club_Total_Goals            0
League_Goals_per_Match      0
Games_in_Season             0
dtype: int64


In [28]:
#One hot encoding
df = pd.get_dummies(df2, columns=['Position'], prefix='Pos', dtype=int)
df.head(5)

Unnamed: 0,Goals,Age,Appearances,Goals_prev_season,Penalty_Goals,Non-Penalty_Goals,Goals_per_90,Big_6_Club_Feature,League_Goals_per_Match,Pos_Attacking Midfielder,Pos_Forward,Pos_Midfielder,Pos_Winger
0,27,23,31,36.0,1.0,26,0.85,1.0,2.83,0,1,0,0
1,22,22,33,3.0,9.0,13,0.61,1.0,2.83,1,0,0,0
2,21,24,30,10.0,5.0,16,0.76,0.0,2.83,0,1,0,0
3,19,28,37,15.0,0.0,19,0.51,0.0,2.83,0,1,0,0
4,19,26,38,6.0,1.0,18,0.5,0.0,2.83,0,1,0,0


In [34]:
#Label Encoding
df1 = df2.copy()

print("Columns:", df1.columns)

le = LabelEncoder()
df1["Position_encoded"] = le.fit_transform(df1["Position"])

print(df1.head())

print(df1.info())

Columns: Index(['Goals', 'Position', 'Age', 'Appearances', 'Goals_prev_season',
       'Penalty_Goals', 'Non-Penalty_Goals', 'Goals_per_90',
       'Big_6_Club_Feature', 'League_Goals_per_Match'],
      dtype='object')
   Goals              Position  Age  Appearances  Goals_prev_season  \
0     27               Forward   23           31               36.0   
1     22  Attacking Midfielder   22           33                3.0   
2     21               Forward   24           30               10.0   
3     19               Forward   28           37               15.0   
4     19               Forward   26           38                6.0   

   Penalty_Goals  Non-Penalty_Goals  Goals_per_90  Big_6_Club_Feature  \
0            1.0                 26          0.85                 1.0   
1            9.0                 13          0.61                 1.0   
2            5.0                 16          0.76                 0.0   
3            0.0                 19          0.51             

In [40]:
X = df.drop("Goals", axis=1)
y = df["Goals"]

In [41]:
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42)


In [43]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [46]:
# Make predictions
y_pred = model.predict(X_test)

In [47]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("MSE:", mse)
print("MAE:", mae)
print("R2 Score:", r2)

MSE: 0.35011999999999993
MAE: 0.2858461538461537
R2 Score: 0.982162582901242
