In [2]:
# Importing libraries
import pandas as pd
import os

# Importing libraries that we have created
import Utilities as ut

## Preparation

To calculate the bookmaker's accuracy, we build a single dataframe from all the Premier League CSV files, keeping only the columns strictly necessary for our calculations.

In [3]:
# Folder holding the Premier League CSV files
dataset_position_PL = "dataset/Premier League/"

# List the CSV files in that folder. os.listdir returns names in arbitrary
# order and can include non-CSV entries (e.g. .ipynb_checkpoints), so we
# filter on the extension and sort to make the run reproducible.
file_list_PL = sorted(
    file_name
    for file_name in os.listdir(dataset_position_PL)
    if file_name.lower().endswith(".csv")
)

# Fail early with a clear message instead of a KeyError on 'timestamp' later
if not file_list_PL:
    raise FileNotFoundError(
        "No CSV files found in '{}'".format(dataset_position_PL)
    )

# Load every file and add its match week. The frames are collected in a list
# and concatenated once: DataFrame.append was removed in pandas 2.0 and
# re-copied the whole accumulated frame on every iteration.
season_frames = []
for element in file_list_PL:
    # Open the csv file
    data = ut.open_csv(element, dataset_position_PL)

    # Adding the match week
    data = ut.match_week(data)

    season_frames.append(data)

# Merge into a unique dataframe with a fresh 0..n-1 index
odd_data = pd.concat(season_frames, ignore_index=True)

# Sorting the rows by time
odd_data = odd_data.sort_values(by=['timestamp'])

# Keeping only some columns; .copy() makes the result an independent frame so
# that later cells can add columns without chained-assignment warnings.
col_to_keep = ['MW','home_team_name', 'home_team_goal_count', 'away_team_name','away_team_goal_count','odds_ft_home_team_win','odds_ft_draw', 'odds_ft_away_team_win']
odd_data = odd_data[col_to_keep].copy()
odd_data.head(3)


Unnamed: 0,MW,home_team_name,home_team_goal_count,away_team_name,away_team_goal_count,odds_ft_home_team_win,odds_ft_draw,odds_ft_away_team_win
380,1,Manchester United,1,Swansea City,2,1.39,5.02,9.99
381,1,Leicester City,2,Everton,2,3.24,3.43,2.38
382,1,Stoke City,0,Aston Villa,1,2.1,3.37,4.04


# Bookmaker
### Dataset preparation
Next, we compute the actual result of every match by comparing the goals scored by the two teams: `H` (home win), `D` (draw) or `A` (away win).

In [4]:
# Derive the actual full-time outcome of every match from the goals scored:
#   "H" -> home team scored more, "D" -> same number of goals, "A" -> otherwise
goal_pairs = zip(odd_data['home_team_goal_count'], odd_data['away_team_goal_count'])
Result = [
    "H" if home_goals > away_goals
    else ("D" if home_goals == away_goals else "A")
    for home_goals, away_goals in goal_pairs
]

# Store the outcomes as a new column (row order matches the dataframe)
odd_data["Result"] = Result
odd_data.head(3)

Unnamed: 0,MW,home_team_name,home_team_goal_count,away_team_name,away_team_goal_count,odds_ft_home_team_win,odds_ft_draw,odds_ft_away_team_win,Result
380,1,Manchester United,1,Swansea City,2,1.39,5.02,9.99,A
381,1,Leicester City,2,Everton,2,3.24,3.43,2.38,D
382,1,Stoke City,0,Aston Villa,1,2.1,3.37,4.04,A


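Now we have to determine which result the bookmaker considers most likely, since this is needed to compute the bookmaker's accuracy. We assume that the outcome with the lowest odds is the most probable one. If two outcomes share the lowest odds, the home win is preferred over the draw, and the draw over the away win.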

In [5]:
# The bookmaker's implied prediction is the outcome carrying the lowest odd
Prediction_odd = []

odds_per_match = zip(
    odd_data['odds_ft_home_team_win'],
    odd_data['odds_ft_draw'],
    odd_data['odds_ft_away_team_win'],
)
for home_odd, draw_odd, away_odd in odds_per_match:
    lowest_odd = min(home_odd, draw_odd, away_odd)

    # Compared in the order home -> draw -> away, so on equal odds the home
    # win is chosen before the draw, and the draw before the away win.
    if lowest_odd == home_odd:
        favourite = "H"
    elif lowest_odd == draw_odd:
        favourite = "D"
    else:
        favourite = "A"

    Prediction_odd.append(favourite)

# Store the predictions as a new column
odd_data["Prediction_odd"] = Prediction_odd
odd_data.head(3)

Unnamed: 0,MW,home_team_name,home_team_goal_count,away_team_name,away_team_goal_count,odds_ft_home_team_win,odds_ft_draw,odds_ft_away_team_win,Result,Prediction_odd
380,1,Manchester United,1,Swansea City,2,1.39,5.02,9.99,A,H
381,1,Leicester City,2,Everton,2,3.24,3.43,2.38,D,A
382,1,Stoke City,0,Aston Villa,1,2.1,3.37,4.04,A,H


### Accuracy

Now we calculate the accuracy of the bookmaker.

In [15]:
# Flag, match by match, whether the bookmaker's favourite equals the real result
odd_data["Check"] = ut.check_the_pred(odd_data, 'Result', 'Prediction_odd')

# Accuracy = share of matches whose flag is True
correct_predictions = list(odd_data["Check"]).count(True)
total_matches = len(odd_data.index)
accuracy = correct_predictions / total_matches
print('The accuracy of bookmaker is: {:.2f} %'.format(accuracy * 100))

The accuracy of bookmaker is: 55.00 %


# Baseline 
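For our baseline we use the difference between the two teams' overalls (`Diff_ov`), rounded to the nearest integer. We assume that a team will win the match when the rounded difference is at least 1 point in its favour: a rounded difference of 1 or more predicts a home win, and a negative rounded difference predicts an away win. If the rounded difference is 0, we assume that a draw will happen.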
### Dataset preparation

In [12]:
# Load the prepared dataset and keep only the columns the baseline needs
baseline = ut.open_csv("dataset.csv", "dataset/Other/")
baseline = baseline[["MW", "Result","Diff_ov"]]

# Baseline prediction from the overall difference rounded to an integer:
#   0 -> "D", 1 or more -> "H", anything else -> "A"
rounded_diff = baseline["Diff_ov"].round()
B_res = [
    "D" if diff == 0 else ("H" if diff >= 1 else "A")
    for diff in rounded_diff
]

# Store the baseline predictions as a new column
baseline['B_res'] = B_res

### Accuracy

In [14]:
# Flag, match by match, whether the baseline prediction equals the real result
baseline["Check"] = ut.check_the_pred(baseline, 'Result', 'B_res')

# Accuracy = share of matches whose flag is True
correct_predictions = list(baseline["Check"]).count(True)
total_matches = len(baseline.index)
accuracy = correct_predictions / total_matches
print('The accuracy of baseline is: {:.2f} %'.format(accuracy * 100))

The accuracy of baseline is: 50.55 %
