In [2]:
import pandas as pd
import matplotlib as plt
import numpy as np

In [3]:
data = pd.read_csv('premier-league-matches.csv')

In [4]:
print(type(data))

<class 'pandas.core.frame.DataFrame'>


In [5]:
data.head()

Unnamed: 0,Season_End_Year,Wk,Date,Home,HomeGoals,AwayGoals,Away,FTR
0,1993,1,1992-08-15,Coventry City,2,1,Middlesbrough,H
1,1993,1,1992-08-15,Leeds United,2,1,Wimbledon,H
2,1993,1,1992-08-15,Sheffield Utd,2,1,Manchester Utd,H
3,1993,1,1992-08-15,Crystal Palace,3,3,Blackburn,D
4,1993,1,1992-08-15,Arsenal,2,4,Norwich City,A


In [6]:
popped_items = ['Season_End_Year', 'Wk', 'Date','FTR']
for i in popped_items:
    data.pop(i)

In [7]:
data.head()

Unnamed: 0,Home,HomeGoals,AwayGoals,Away
0,Coventry City,2,1,Middlesbrough
1,Leeds United,2,1,Wimbledon
2,Sheffield Utd,2,1,Manchester Utd
3,Crystal Palace,3,3,Blackburn
4,Arsenal,2,4,Norwich City


In [8]:
#rearranging order of the dataframe
data = data[['Home', 'Away', 'HomeGoals', 'AwayGoals']]

In [9]:
data.head()

Unnamed: 0,Home,Away,HomeGoals,AwayGoals
0,Coventry City,Middlesbrough,2,1
1,Leeds United,Wimbledon,2,1
2,Sheffield Utd,Manchester Utd,2,1
3,Crystal Palace,Blackburn,3,3
4,Arsenal,Norwich City,2,4


In [10]:
missingValues = data.isnull().sum()

print(missingValues)

Home         0
Away         0
HomeGoals    0
AwayGoals    0
dtype: int64


In [11]:
X = data.iloc[:,:-2].values
y = data.iloc[:,-2:].values

In [12]:
print(X)

[['Coventry City' 'Middlesbrough']
 ['Leeds United' 'Wimbledon']
 ['Sheffield Utd' 'Manchester Utd']
 ...
 ['Aston Villa' 'Brighton']
 ['Leeds United' 'Tottenham']
 ['Brentford' 'Manchester City']]


In [13]:
print(y)

[[2 1]
 [2 1]
 [2 1]
 ...
 [2 1]
 [1 4]
 [1 0]]


In [14]:
#Categorical encoding of home and away
from sklearn.preprocessing import LabelEncoder
le_home = LabelEncoder()
le_away = LabelEncoder()


In [15]:
X[:,0] = le_home.fit_transform(X[:,0])

In [16]:
X[:,1] = le_away.fit_transform(X[:,1])

In [17]:
#verifying
print(X[:5])

[[15 28]
 [23 48]
 [36 27]
 [16 4]
 [0 30]]


In [18]:
#splitting the data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
y_train_home, y_train_away = y_train[:, 0], y_train[:, 1]
y_test_home, y_test_away = y_test[:, 0], y_test[:, 1]

In [20]:
from sklearn.metrics import mean_squared_error
def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    return rmse


In [21]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, PoissonRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor

models = [
    LinearRegression(),
    RandomForestRegressor(n_estimators=100, random_state=42),
    GradientBoostingRegressor(n_estimators=100, random_state=42),
    SVR(kernel='rbf'),
    MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42),
    KNeighborsRegressor(n_neighbors=5),
    PoissonRegressor()
]

print("Home Score Predictions:")
for model in models:
    rmse = evaluate_model(model, X_train, X_test, y_train_home, y_test_home)
    print(f"{type(model).__name__} RMSE: {rmse}")

print("\nAway Score Predictions:")
for model in models:
    rmse = evaluate_model(model, X_train, X_test, y_train_away, y_test_away)
    print(f"{type(model).__name__} RMSE: {rmse}")

Home Score Predictions:
LinearRegression RMSE: 1.2704108321418641
RandomForestRegressor RMSE: 1.3222073243006434
GradientBoostingRegressor RMSE: 1.2278582894042795
SVR RMSE: 1.3032252995503752
MLPRegressor RMSE: 1.2905312482025002
KNeighborsRegressor RMSE: 1.3230421630671991
PoissonRegressor RMSE: 1.2704535571121718

Away Score Predictions:
LinearRegression RMSE: 1.1087642519476595
RandomForestRegressor RMSE: 1.1660923047997143
GradientBoostingRegressor RMSE: 1.0768757553912722
SVR RMSE: 1.1172075449724903
MLPRegressor RMSE: 1.1474148253005887
KNeighborsRegressor RMSE: 1.18209839204645
PoissonRegressor RMSE: 1.1087557739238638


In [23]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Assuming you have your true values (y_test) and predictions (y_pred) for both home and away scores

def evaluate_predictions(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    
    # Custom accuracy for rounded predictions
    rounded_true = np.round(y_true)
    rounded_pred = np.round(y_pred)
    exact_match = np.mean(rounded_true == rounded_pred)
    
    within_one = np.mean(np.abs(rounded_true - rounded_pred) <= 1)
    
    print(f"Mean Absolute Error: {mae:.2f}")
    print(f"Root Mean Squared Error: {rmse:.2f}")
    print(f"R-squared Score: {r2:.2f}")
    print(f"Exact Match Accuracy: {exact_match:.2%}")
    print(f"Within One Goal Accuracy: {within_one:.2%}")

# Make predictions on the test set
y_pred_home = home_model.predict(X_test)
y_pred_away = away_model.predict(X_test)

print("Home Score Predictions:")
evaluate_predictions(y_test[:, 0], y_pred_home)

print("\nAway Score Predictions:")
evaluate_predictions(y_test[:, 1], y_pred_away)

NotFittedError: This GradientBoostingRegressor instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [24]:
from sklearn.ensemble import GradientBoostingRegressor

# Create and fit GradientBoostingRegressor models for home and away scores
home_model = GradientBoostingRegressor(n_estimators=100, random_state=42)
away_model = GradientBoostingRegressor(n_estimators=100, random_state=42)

home_model.fit(X, y[:, 0])
away_model.fit(X, y[:, 1])

# Example prediction
home_team = "Chelsea"
away_team = "Manchester City"

# Encode the team names
home_encoded = le_home.transform([home_team])[0]
away_encoded = le_away.transform([away_team])[0]

# Make prediction
prediction = np.array([[home_encoded, away_encoded]])
home_score = home_model.predict(prediction)[0]
away_score = away_model.predict(prediction)[0]

# Round scores to nearest integer
home_score_rounded = round(home_score)
away_score_rounded = round(away_score)

print(f"Predicted score for {home_team} vs {away_team}:")
print(f"Raw prediction: {home_score:.2f} - {away_score:.2f}")
print(f"Rounded prediction: {home_score_rounded} - {away_score_rounded}")

Predicted score for Chelsea vs Manchester City:
Raw prediction: 1.56 - 1.22
Rounded prediction: 2 - 1
