In [1]:
import json
import os
import re

import numpy as np
import pandas as pd
import requests
from pymongo import MongoClient
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [2]:
# Load every match document from MongoDB, build a feature table, fit one linear
# regression per side (home goals, away goals) and print test-set predictions.

# The connection string carries the database username and password, so it is
# read from the MONGODB_URI environment variable rather than being stored in
# the notebook.
# NOTE(review): the password that used to be hardcoded here is already exposed
# in the notebook history and should be rotated.
mongodb_uri = os.environ.get("MONGODB_URI")
if not mongodb_uri:
    raise RuntimeError(
        "Set the MONGODB_URI environment variable to the MongoDB connection string "
        "before running this notebook."
    )

client = MongoClient(mongodb_uri)
db = client["FootyForecast"]
collection = db["matches"]

# Retrieve the documents from the collection (empty filter = every document)
matches = collection.find({})

# DataFrame column -> (side of the match document, field inside that side).
# The insertion order of this mapping is the column order of `df`.
column_sources = {
    'home_scores': ('home', 'score'),
    'away_scores': ('away', 'score'),
    'home_ratings': ('home', 'rating'),
    'away_ratings': ('away', 'rating'),
    'total_shots_home': ('home', 'totalShots'),
    'total_shots_away': ('away', 'totalShots'),
    'shots_on_target_home': ('home', 'shotsOnTarget'),
    'shots_on_target_away': ('away', 'shotsOnTarget'),
    'home_team_ids': ('home', 'teamId'),
    'away_team_ids': ('away', 'teamId'),
}

# One flat record per match; a document missing any of the fields raises KeyError.
records = [
    {column: match[side][field] for column, (side, field) in column_sources.items()}
    for match in matches
]

# Passing `columns` keeps the expected columns even when the collection is empty.
df = pd.DataFrame(records, columns=list(column_sources))

# Extract features and labels
X = df[['home_ratings', 'away_ratings', 'total_shots_home', 'total_shots_away',
        'shots_on_target_home', 'shots_on_target_away']]
y_home = df['home_scores']
y_away = df['away_scores']

# Check if the number of samples is sufficient for train-test split
if len(df) < 2:
    print("Insufficient data for train-test split.")
else:
    # Split features and both targets in a single call so all three share the
    # same train/test rows; random_state makes the split reproducible.
    X_train, X_test, y_home_train, y_home_test, y_away_train, y_away_test = train_test_split(
        X,
        y_home,
        y_away,
        test_size=0.2,
        random_state=42
    )

    # Train separate regression models for home and away scores
    home_model = LinearRegression()
    home_model.fit(X_train, y_home_train)

    away_model = LinearRegression()
    away_model.fit(X_train, y_away_train)

    # Predict the scores for the test set
    home_scores_pred = home_model.predict(X_test)
    away_scores_pred = away_model.predict(X_test)

    # Predicted vs. actual scores per test match. The team IDs are looked up in
    # `df` through the test rows' index labels, so every row keeps its original
    # match index.
    results = pd.DataFrame({
        'Home TeamId': X_test.index.map(df['home_team_ids']),
        'Home Actual': y_home_test,
        'Home Predicted': home_scores_pred,
        'Away TeamId': X_test.index.map(df['away_team_ids']),
        'Away Actual': y_away_test,
        'Away Predicted': away_scores_pred
    })

    # Display the results
    print(results)


   Home TeamId  Home Actual  Home Predicted Away TeamId  Away Actual  \
17          30            0        1.225698          13            2   
13         161            0        0.471875          13            2   
4           13            2        3.073318          24            1   
29          26            2        1.998809          13            2   
35          13            0        0.251587         211            3   
25          13            3        3.195781         183            2   
6          189            0        0.641200          13            3   
26         170            0        0.251352          13            3   

    Away Predicted  
17        3.274326  
13        2.113741  
4         1.532272  
29        2.251096  
35        3.360325  
25        1.551859  
6         3.211637  
26        3.495280  


In [3]:
# Ask the server how many documents the matches collection holds
# (an empty filter matches every document).
num_matches = collection.count_documents({})

# Report the total
print(f"Number of matches: {num_matches}")


Number of matches: 37


In [82]:
# Write the current results table to a CSV file in the working directory,
# leaving the row index out of the file.
results.to_csv(path_or_buf='prediction_results.csv', index=False)

In [4]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Re-run both fitted models on the held-out rows.
home_scores_pred = home_model.predict(X_test)
away_scores_pred = away_model.predict(X_test)

# The same three metrics, in the same order, are computed for each side.
metric_functions = (mean_squared_error, mean_absolute_error, r2_score)

home_mse, home_mae, home_r2 = (
    metric(y_home_test, home_scores_pred) for metric in metric_functions
)
away_mse, away_mae, away_r2 = (
    metric(y_away_test, away_scores_pred) for metric in metric_functions
)

# Print one report section per side, separated by a blank line.
score_reports = [
    ("Home Scores:", home_mse, home_mae, home_r2),
    ("Away Scores:", away_mse, away_mae, away_r2),
]
for position, (heading, mse, mae, r2) in enumerate(score_reports):
    if position:
        print()
    print(heading)
    print("Mean Squared Error (MSE):", mse)
    print("Mean Absolute Error (MAE):", mae)
    print("R-squared Score:", r2)


Home Scores:
Mean Squared Error (MSE): 0.43161949194092974
Mean Absolute Error (MAE): 0.5140001932454612
R-squared Score: 0.6824868105262126

Away Scores:
Mean Squared Error (MSE): 0.3254952842591625
Mean Absolute Error (MAE): 0.4608521987519627
R-squared Score: 0.25601077883619994


In [5]:
# Rebuild the test-set results table with team names, absolute errors and a
# per-match "accuracy" percentage, then print it.

# Team ID -> team name lookup; the two lists are paired position by position.
team_ids = [167, 13, 23, 32, 26, 211, 30, 189, 24, 15, 31, 162, 170, 29, 14, 183, 161, 18, 19, 174]
team_names = ['Manchester City', 'Arsenal', 'Newcastle', 'Manchester United', 'Liverpool', 'Brighton',
              'Tottenham', 'Brentford', 'Aston Villa', 'Chelsea', 'Everton', 'Crystal Palace', 'Fulham',
              'West Ham', 'Leicester', 'Bournemouth', 'Wolverhampton', 'Southampton', 'Leeds', 'Nottingham Forest']

team_dict = dict(zip(team_ids, team_names))

# Team IDs of the test matches, looked up in `df` through the test rows' index
# labels and cast to int so they match the integer keys of team_dict.
home_team_id = X_test.index.map(df['home_team_ids']).astype(int)
away_team_id = X_test.index.map(df['away_team_ids']).astype(int)

# Build the table directly in its final, reader-friendly column order.
# IDs without an entry in team_dict are labelled 'Unknown'.
results = pd.DataFrame({
    'Home Team': pd.Series(home_team_id, index=X_test.index).map(team_dict).fillna('Unknown'),
    'Home TeamId': home_team_id,
    'Home Actual': y_home_test,
    'Home Predicted': home_scores_pred,
    'Away Team': pd.Series(away_team_id, index=X_test.index).map(team_dict).fillna('Unknown'),
    'Away TeamId': away_team_id,
    'Away Actual': y_away_test,
    'Away Predicted': away_scores_pred,
})

# Calculate the absolute difference between actual and predicted scores
results['Home Error'] = (results['Home Actual'] - results['Home Predicted']).abs()
results['Away Error'] = (results['Away Actual'] - results['Away Predicted']).abs()

# Accuracy percentage per match: the error relative to (actual + 1); the +1
# avoids dividing by zero when no goals were scored. The value is negative
# whenever the error exceeds actual + 1.
home_accuracy = ((1 - results['Home Error'] / (results['Home Actual'] + 1)) * 100).round(2)
away_accuracy = ((1 - results['Away Error'] / (results['Away Actual'] + 1)) * 100).round(2)

# Apply the threshold as a floor. Taking the absolute value instead would turn
# a badly wrong prediction (say -22.57) into a positive "22.57 % accurate", so
# values below the threshold are clipped up to it.
threshold = 0  # Set the threshold value here
results['Home Accuracy (%)'] = home_accuracy.clip(lower=threshold)
results['Away Accuracy (%)'] = away_accuracy.clip(lower=threshold)

# Display the updated results
print(results)



        Home Team  Home TeamId  Home Actual  Home Predicted    Away Team  \
17      Tottenham           30            0        1.225698      Arsenal   
13  Wolverhampton          161            0        0.471875      Arsenal   
4         Arsenal           13            2        3.073318  Aston Villa   
29      Liverpool           26            2        1.998809      Arsenal   
35        Arsenal           13            0        0.251587     Brighton   
25        Arsenal           13            3        3.195781  Bournemouth   
6       Brentford          189            0        0.641200      Arsenal   
26         Fulham          170            0        0.251352      Arsenal   

    Away TeamId  Away Actual  Away Predicted  Home Error  Away Error  \
17           13            2        3.274326    1.225698    1.274326   
13           13            2        2.113741    0.471875    0.113741   
4            24            1        1.532272    1.073318    0.532272   
29           13            

In [7]:
# One plain dict per results row, which is the shape insert_many accepts.
results_dict_list = results.to_dict(orient='records')

# Write all rows to the 'trained' collection; the insert result is the cell's
# displayed value.
trained_collection = db['trained']
trained_collection.insert_many(results_dict_list)

<pymongo.results.InsertManyResult at 0x213c1859e50>

In [51]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Calculate MAE for home and away teams
home_mae = mean_absolute_error(results['Home Actual'], results['Home Predicted'])
away_mae = mean_absolute_error(results['Away Actual'], results['Away Predicted'])

# Calculate RMSE for home and away teams as the square root of the MSE.
# The `squared=False` shortcut of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6; taking the root explicitly yields the
# same number on every version.
home_rmse = np.sqrt(mean_squared_error(results['Home Actual'], results['Home Predicted']))
away_rmse = np.sqrt(mean_squared_error(results['Away Actual'], results['Away Predicted']))

# Display the MAE and RMSE for home and away teams
print("Home Team MAE:", home_mae)
print("Home Team RMSE:", home_rmse)
print("Away Team MAE:", away_mae)
print("Away Team RMSE:", away_rmse)


Home Team MAE: 0.5140001932454612
Home Team RMSE: 0.6569775429502365
Away Team MAE: 0.4608521987519627
Away Team RMSE: 0.5705219402084047


In [6]:
# A prediction counts as correct when it is at most this many goals off.
tolerance = 1

# Boolean flag per match: is the absolute prediction error within the tolerance?
home_within_tolerance = (results['Home Actual'] - results['Home Predicted']).abs() <= tolerance
away_within_tolerance = (results['Away Actual'] - results['Away Predicted']).abs() <= tolerance

# The mean of a boolean column is the share of True values; scale it to a percentage.
home_correct_percentage = home_within_tolerance.mean() * 100
away_correct_percentage = away_within_tolerance.mean() * 100

# Report both percentages
print("Home Team Correct Percentage:", home_correct_percentage)
print("Away Team Correct Percentage:", away_correct_percentage)


Home Team Correct Percentage: 75.0
Away Team Correct Percentage: 87.5
