# <center>Predicting Football Matches with Machine Learning</center>

we'll scrape football match data from the English Premier League.  

We'll download all of the matches for several seasons using Python and the requests library.  We'll then parse and clean the data using BeautifulSoup and pandas.  By the end, we'll have a single pandas DataFrame with all of the  EPL matches for multiple seasons.

We'll create predictors and train a machine learning model to predict the winner of each of the football matches.

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time

> ### Part I: Web Scraping Footbal Matches from the EPL

In [2]:
years = list(range(2022, 2020, -1))
all_matches = []
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

In [3]:
for year in years:
    data = requests.get(standings_url)
    soup = BeautifulSoup(data.text)
    standings_table = soup.select('table.stats_table')[0]
    links = [l.get('href') for l in standings_table.find_all('a')]
    links = [l for l in links if '/squads/' in l]
    team_urls = [f"https://fbref.com{l}" for l in links]
    
    previous_season = soup.select("a.prev")[0].get("href")
    standings_url = f"https://fbref.com/{previous_season}"
    
    for team_url in team_urls:
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")
        
        data = requests.get(team_url)
        matches = pd.read_html(data.text, match="Scores & Fixtures")[0]
        
        soup = BeautifulSoup(data.text)
        links = [l.get('href') for l in soup.find_all('a')]
        links = [l for l in links if l and 'all_comps/shooting/' in l]
        data = requests.get(f"https://fbref.com{links[0]}")
        shooting = pd.read_html(data.text, match='Shooting')[0]
        shooting.columns = shooting.columns.droplevel()
        
        try:
            team_data = matches.merge(shooting[['Date', "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]], on='Date')
        except ValueError:
            continue
            
        team_data = team_data[team_data['Comp'] == "Premier League"]
        team_data['Season'] = year
        team_data["Team"] = team_name
        all_matches.append(team_data)
        time.sleep(1)        

In [4]:
match_df = pd.concat(all_matches)
match_df

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt,Season,Team
1,2021-08-15,16:30,Premier League,Matchweek 1,Sun,Away,L,0,1,Tottenham,...,Match Report,,18.0,4.0,16.9,1.0,0.0,0.0,2022,Manchester City
2,2021-08-21,15:00,Premier League,Matchweek 2,Sat,Home,W,5,0,Norwich City,...,Match Report,,16.0,4.0,17.3,1.0,0.0,0.0,2022,Manchester City
3,2021-08-28,12:30,Premier League,Matchweek 3,Sat,Home,W,5,0,Arsenal,...,Match Report,,25.0,10.0,14.3,0.0,0.0,0.0,2022,Manchester City
4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1,0,Leicester City,...,Match Report,,25.0,8.0,14.0,0.0,0.0,0.0,2022,Manchester City
6,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Home,D,0,0,Southampton,...,Match Report,,16.0,1.0,15.7,1.0,0.0,0.0,2022,Manchester City
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38,2021-05-02,19:15,Premier League,Matchweek 34,Sun,Away,L,0,4,Tottenham,...,Match Report,,8.0,1.0,17.4,0.0,0.0,0.0,2021,Sheffield United
39,2021-05-08,15:00,Premier League,Matchweek 35,Sat,Home,L,0,2,Crystal Palace,...,Match Report,,7.0,0.0,11.4,1.0,0.0,0.0,2021,Sheffield United
40,2021-05-16,19:00,Premier League,Matchweek 36,Sun,Away,W,1,0,Everton,...,Match Report,,10.0,3.0,17.0,0.0,0.0,0.0,2021,Sheffield United
41,2021-05-19,18:00,Premier League,Matchweek 37,Wed,Away,L,0,1,Newcastle Utd,...,Match Report,,11.0,1.0,16.0,1.0,0.0,0.0,2021,Sheffield United


In [6]:
match_df.columns = [c.lower() for c in match_df.columns]
match_df.to_csv("matches.csv")

> ### Part II: Maching Learning: Predicting football match winners

In [7]:
df = pd.read_csv('matches.csv', index_col=0)
df.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
1,2021-08-15,16:30,Premier League,Matchweek 1,Sun,Away,L,0,1,Tottenham,...,Match Report,,18.0,4.0,16.9,1.0,0.0,0.0,2022,Manchester City
2,2021-08-21,15:00,Premier League,Matchweek 2,Sat,Home,W,5,0,Norwich City,...,Match Report,,16.0,4.0,17.3,1.0,0.0,0.0,2022,Manchester City
3,2021-08-28,12:30,Premier League,Matchweek 3,Sat,Home,W,5,0,Arsenal,...,Match Report,,25.0,10.0,14.3,0.0,0.0,0.0,2022,Manchester City
4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1,0,Leicester City,...,Match Report,,25.0,8.0,14.0,0.0,0.0,0.0,2022,Manchester City
6,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Home,D,0,0,Southampton,...,Match Report,,16.0,1.0,15.7,1.0,0.0,0.0,2022,Manchester City


In [8]:
df.shape

(1520, 27)

In [9]:
df['team'].value_counts()

Leicester City              76
Leeds United                76
Brighton and Hove Albion    76
Chelsea                     76
Manchester United           76
Manchester City             76
Aston Villa                 76
Burnley                     76
Crystal Palace              76
Tottenham Hotspur           76
Southampton                 76
Wolverhampton Wanderers     76
Arsenal                     76
Everton                     76
West Ham United             76
Liverpool                   76
Newcastle United            76
Norwich City                38
Fulham                      38
Sheffield United            38
Watford                     38
Brentford                   38
West Bromwich Albion        38
Name: team, dtype: int64

In [12]:
df["round"].value_counts()

Matchweek 3     40
Matchweek 19    40
Matchweek 22    40
Matchweek 1     40
Matchweek 33    40
Matchweek 4     40
Matchweek 38    40
Matchweek 2     40
Matchweek 27    40
Matchweek 7     40
Matchweek 11    40
Matchweek 24    40
Matchweek 36    40
Matchweek 18    40
Matchweek 5     40
Matchweek 10    40
Matchweek 17    40
Matchweek 30    40
Matchweek 34    40
Matchweek 6     40
Matchweek 29    40
Matchweek 16    40
Matchweek 8     40
Matchweek 26    40
Matchweek 35    40
Matchweek 14    40
Matchweek 31    40
Matchweek 12    40
Matchweek 20    40
Matchweek 9     40
Matchweek 23    40
Matchweek 32    40
Matchweek 21    40
Matchweek 25    40
Matchweek 15    40
Matchweek 13    40
Matchweek 37    40
Matchweek 28    40
Name: round, dtype: int64

In [13]:
# Cleaning the data
df.dtypes

date             object
time             object
comp             object
round            object
day              object
venue            object
result           object
gf                int64
ga                int64
opponent         object
xg              float64
xga             float64
poss            float64
attendance      float64
captain          object
formation        object
referee          object
match report     object
notes           float64
sh              float64
sot             float64
dist            float64
fk              float64
pk              float64
pkatt           float64
season            int64
team             object
dtype: object

In [15]:
# Creating predictors for ML
# Getting the date
df['date'] = pd.to_datetime(df['date'])
# Getting venue into nu for ML
df['venue_code'] = df['venue'].astype('category').cat.codes
# creating unique code
df["opp_code"] = df['opponent'].astype("category").cat.codes
# Removing the minute and just keeping the hour
df["hour"] = df['time'].str.replace(":.+", "", regex=True).astype('int')
df["day_code"] = df['date'].dt.dayofweek

In [16]:
# Getting the target variable for the model
df["target"] = (df['result'] == "W").astype('int') # getting it into a number

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

In [19]:
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

In [20]:
# Spliting our dataset into train and test dataset
train = df[df["date"] < "2022-01-01"]
test = df[df['date'] > '2022-01-01']
# We select the variables for our model
predictors = ['venue_code', "opp_code", 'hour', "day_code"]

rf.fit(train[predictors], train['target'])

preds = rf.predict(test[predictors])

In [23]:
acc = accuracy_score(test['target'], preds)
acc

0.6056701030927835

In [24]:
# Checking on the prediction of our model
combined = pd.DataFrame(dict(actual=test['target'], prediction=preds))
pd.crosstab(index=combined['actual'], columns=combined['prediction'])

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,186,47
1,106,49


In [26]:
precision_score(test['target'], preds)

0.5104166666666666

**Observation:**
- The precision is 51%, that means, when we predict a win, the team actually win 51%.

In [35]:
# Improve the accuracy of our model
grouped_matches = df.groupby('team')
#group = grouped_matches.get_group("Manchester City")

In [36]:
def rolling_averages(group, cols, new_cols):
    group = group.sort_values('date')
    rolling_stats = group[cols].rolling(3, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

In [37]:
cols = ["gf", "ga", "sh", "sot", "dist", 'fk', 'pk', "pkatt"]
new_cols = [f"{c}_rolling" for c in cols]
new_cols

['gf_rolling',
 'ga_rolling',
 'sh_rolling',
 'sot_rolling',
 'dist_rolling',
 'fk_rolling',
 'pk_rolling',
 'pkatt_rolling']

In [39]:
matches_rolling = df.groupby('team').apply(lambda x: rolling_averages(x, cols, new_cols))
matches_rolling = matches_rolling.droplevel('team')

In [41]:
matches_rolling.index = range(matches_rolling.shape[0])

In [48]:
def make_predictions(data, predictors):
    train = data[data["date"] < "2022-01-01"]
    test = data[data['date'] > '2022-01-01']
    rf.fit(train[predictors], train['target'])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test['target'], predicted=preds), index=test.index)
    precision = precision_score(test['target'], preds)
    return combined, precision

In [50]:
combined, precission = make_predictions(matches_rolling, predictors + new_cols)

In [51]:
precission

0.6027397260273972

In [52]:
combined

Unnamed: 0,actual,predicted
55,0,0
56,1,1
57,1,0
58,1,1
59,1,1
...,...,...
1443,0,0
1444,0,0
1445,0,0
1446,0,0


In [53]:
combined = combined.merge(matches_rolling[['date', 'team', 'opponent', "result"]], left_index=True, right_index=True)
combined

Unnamed: 0,actual,predicted,date,team,opponent,result
55,0,0,2022-01-23,Arsenal,Burnley,D
56,1,1,2022-02-10,Arsenal,Wolves,W
57,1,0,2022-02-19,Arsenal,Brentford,W
58,1,1,2022-02-24,Arsenal,Wolves,W
59,1,1,2022-03-06,Arsenal,Watford,W
...,...,...,...,...,...,...
1443,0,0,2022-04-30,Wolverhampton Wanderers,Brighton,L
1444,0,0,2022-05-07,Wolverhampton Wanderers,Chelsea,D
1445,0,0,2022-05-11,Wolverhampton Wanderers,Manchester City,L
1446,0,0,2022-05-15,Wolverhampton Wanderers,Norwich City,D


In [54]:
class MissingDict(dict):
    __missing__ = lambda self, key: key
    
map_values = {"Brigthon and Hove Albion": "Brighton",
             "Manchester United": "Manchester Utd",
             "Newcastle United": "Newcastle Utd",
             "Tottenham Hotspur": "Tottenham",
             "West Ham United": "West Ham",
             "Wolverhampton Wanderers": "Wolves"}
mapping = MissingDict(**map_values)

In [56]:
combined["new_team"] = combined["team"].map(mapping)

In [57]:
df_merged = combined.merge(combined, left_on=["date", "new_team"], right_on=['date', 'opponent'])

In [58]:
df_merged[(df_merged['predicted_x'] == 1) & (df_merged["predicted_y"] == 0)]["actual_x"].value_counts()

1    42
0    27
Name: actual_x, dtype: int64

In [59]:
42/69

0.6086956521739131