Author: Roel Faber

In [2]:
import time
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By



Start an automated browser and open the website with the football statistics.

In [51]:
# Launch a Selenium-controlled Chrome session and open the fixture page for
# round 11 of the 2019-2020 Eredivisie season on voetbal.com.
browser = webdriver.Chrome()
browser.get('https://www.voetbal.com/wedstrijdgegevens/ned-eredivisie-2019-2020-spieltag/11/')

Do not agree to the use of cookies.

In [53]:
# Click through the cookie dialog: first the "qc-cmp-button" element, then the
# "qc-cmp-save-and-exit" element.
# The find_element_by_* helpers were removed in Selenium 4.3; the locator-based
# find_element(By..., value) call works on both Selenium 3 and Selenium 4.
browser.find_element(By.CLASS_NAME, "qc-cmp-button").click()
browser.find_element(By.CLASS_NAME, "qc-cmp-save-and-exit").click()

In [9]:
# Download the page the browser is currently showing with `requests`; the HTML
# that gets parsed afterwards comes from this response, not from the Selenium session.
# The timeout keeps the cell from hanging indefinitely on an unresponsive server.
data = requests.get(browser.current_url, timeout=30)

In [10]:
# Parse the downloaded HTML text into a BeautifulSoup tree using 'html.parser'.
soup = BeautifulSoup(data.text,'html.parser')

In [12]:
# Collect the text of every team link (href containing "teams") found in the
# second table with class 'standard_tabelle'.
# find_all is the supported method name; findAll is a deprecated alias.
table = soup.find_all('table', { 'class' : 'standard_tabelle' })
teams = table[1].select("a[href*=teams]")
teamslist = [team.get_text() for team in teams]
teamslist

['Vitesse',
 'FC Utrecht',
 'VVV-Venlo',
 'PEC Zwolle',
 'PSV',
 'Feyenoord',
 'Heracles Almelo',
 'SBV Excelsior',
 'FC Groningen',
 'sc Heerenveen',
 'AZ Alkmaar',
 'AFC Ajax',
 'FC Twente',
 'Willem II',
 'Roda JC Kerkrade',
 'NAC Breda',
 'ADO Den Haag',
 'Sparta Rotterdam']

In [57]:
# Select every <a> element whose title attribute contains "Wedstrijddetails".
# team_identification reads the team names from that title and goals reads the
# score from the link text.
games = soup.select("a[title*=Wedstrijddetails]")

In [3]:
def get_teamslist(soup):
    """Return the team names linked from the second 'standard_tabelle' table.

    Parameters
    -------------
    soup, BeautifulSoup object
        Parsed page; it must contain at least two tables with class
        'standard_tabelle'.

    Returns
    -------------
    list of str
        Text of every link in that table whose href contains 'teams',
        in the order returned by ``select``.

    Raises
    -------------
    IndexError
        If fewer than two 'standard_tabelle' tables are found.
    """
    # find_all is the supported method name; findAll is a deprecated alias.
    tables = soup.find_all('table', { 'class' : 'standard_tabelle' })
    return [link.get_text() for link in tables[1].select("a[href*=teams]")]
    
def team_identification(game,teamslist):
    """Identify the home and away teams of a game.

    The game's title is split on ' -'. The part before the separator is searched
    for the home team and the part after it for the away team. A team matches
    when its name occurs as a substring of that part.

    Parameters
    -------------
    game
        Object supporting ``game['title']`` (a string).
    teamslist, list of str
        Candidate team names.

    Returns
    -------------
    tuple
        ``(hometeam, awayteam)``

    Raises
    -------------
    ValueError
        If no team from teamslist matches the home part or the away part
        (previously this surfaced as an UnboundLocalError).
    """
    title = game['title']
    # Split once instead of on every loop iteration.
    parts = title.split(' -')
    home_part = parts[0]
    away_part = parts[1] if len(parts) > 1 else ''

    def _best_match(text):
        # Prefer the longest matching name, so a team whose name is contained in
        # another team's name cannot shadow the real match.
        candidates = [team for team in teamslist if team in text]
        return max(candidates, key=len) if candidates else None

    hometeam = _best_match(home_part)
    awayteam = _best_match(away_part)
    if hometeam is None or awayteam is None:
        raise ValueError(f"Could not identify both teams in title {title!r}")
    return(hometeam,awayteam)
    
def goals(game):
    """Identify the goals scored and the result of the game.

    The first whitespace-separated token of the game's text is read as
    ``"<home>:<away>"``.

    Parameters
    -------------
    game
        Object whose ``get_text()`` returns a string starting with the score.

    Returns
    -------------
    list
        ``[homegoals, awaygoals, result]``. Both goal values are the strings
        taken from the text. result is 1 when the home side scored more,
        2 when the away side scored more and 3 otherwise.

    Raises
    -------------
    IndexError
        If the text is empty or the first token contains no ':'.
    """
    goals_scored = game.get_text().split()[0]
    score_parts = goals_scored.split(':')
    homegoals = score_parts[0]
    awaygoals = score_parts[1]
    try:
        # Compare numerically: as strings "10" > "9" is False, which
        # misclassified every double-digit score.
        home_value, away_value = int(homegoals), int(awaygoals)
    except ValueError:
        # NOTE(review): non-numeric scores (presumably unplayed or postponed
        # fixtures) keep the previous string comparison so a row is still
        # produced; consider filtering such games out before calling this.
        home_value, away_value = homegoals, awaygoals
    if home_value > away_value:
        result = 1
    elif away_value > home_value:
        result = 2
    else:
        result = 3
    return([homegoals,awaygoals,result])
    
def gamestats(game, teamslist):
    """Assemble the complete result record for a single game.

    Parameters
    -------------
    game
        Object accepted by both team_identification and goals.
    teamslist, list
        Team names handed on to team_identification.

    Returns
    -------------
    dict
        Keys "Home", "Away", "HomeGoals", "AwayGoals", "Result", "HomePoints"
        and "AwayPoints".
    """
    home_side, away_side = team_identification(game, teamslist)
    outcome_info = goals(game)
    home_score, away_score, outcome = outcome_info[0], outcome_info[1], outcome_info[2]
    # Result code 1 gives the home side 3 points, code 2 gives the away side
    # 3 points; any other code awards 1 point to each side.
    points_by_outcome = {1: (3, 0), 2: (0, 3)}
    home_pts, away_pts = points_by_outcome.get(outcome, (1, 1))
    return {
        "Home": home_side,
        "Away": away_side,
        "HomeGoals": home_score,
        "AwayGoals": away_score,
        "Result": outcome,
        "HomePoints": home_pts,
        "AwayPoints": away_pts,
    }

In [59]:
# Build one record per game and create the DataFrame in a single step.
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call.
columns = ["Season","Round","Home","Away","HomeGoals","AwayGoals","Result","HomePoints","AwayPoints"]
records = [gamestats(game,teamslist) for game in games]
# gamestats does not provide "Season" or "Round", so those columns stay empty here.
df = pd.DataFrame(records, columns=columns)

In [60]:
# Display the scraped match results.
df

Unnamed: 0,Season,Round,Home,Away,HomeGoals,AwayGoals,Result,HomePoints,AwayPoints
0,,,FC Twente,FC Emmen,4,1,1,3,0
1,,,Willem II,RKC Waalwijk,2,1,1,3,0
2,,,Fortuna Sittard,VVV-Venlo,4,1,1,3,0
3,,,Vitesse,ADO Den Haag,0,2,2,0,3
4,,,Heracles Almelo,PEC Zwolle,4,0,1,3,0
5,,,sc Heerenveen,FC Groningen,1,1,3,1,1
6,,,PSV,AZ Alkmaar,0,4,2,0,3
7,,,Sparta Rotterdam,FC Utrecht,1,2,2,0,3
8,,,AFC Ajax,Feyenoord,4,0,1,3,0


In [6]:
def _dismiss_cookie_dialog(browser, retry_delay=2):
    """Best-effort click through the cookie dialog, retrying once after a pause.

    Returns True when both buttons were clicked and False when the dialog could
    not be handled in either attempt (for example because it is not shown).
    """
    for attempt in range(2):
        try:
            # find_element(By..., value) replaces the find_element_by_* helpers
            # that were removed in Selenium 4.3.
            browser.find_element(By.CLASS_NAME, "qc-cmp-button").click()
            browser.find_element(By.CLASS_NAME, "qc-cmp-save-and-exit").click()
            return True
        except WebDriverException:
            # Only Selenium errors are ignored; anything else (including
            # KeyboardInterrupt) now propagates instead of being swallowed.
            if attempt == 0:
                # Presumably the dialog may still be loading; wait before the retry.
                time.sleep(retry_delay)
    return False


def get_teamslist_games(browser,season,roundnr):
    """Get the team names and all the games for a given season and roundnr

    Parameters
    -------------
    browser, Selenium browserobject

    season, str
        String with format: {firstyear}-{secondyear}

    roundnr, str
        String of the roundnr (digit between 1 and 34)

    Returns
    -------------
    teamslist, list of str
        Team names as returned by get_teamslist for the downloaded page.

    games, list
        Elements matching the selector "a[title*=Wedstrijddetails]".
    """
    url = f"https://www.voetbal.com/wedstrijdgegevens/ned-eredivisie-{season}-spieltag/{roundnr}/"
    browser.get(url)
    # Handling the cookie dialog is optional: the page is scraped either way.
    _dismiss_cookie_dialog(browser)
    # The HTML is downloaded separately with requests; the timeout keeps a stalled
    # server from blocking the whole scrape.
    data = requests.get(browser.current_url, timeout=30)
    soup = BeautifulSoup(data.text,'html.parser')
    games = soup.select("a[title*=Wedstrijddetails]")
    # Reuse the shared helper instead of repeating the table lookup here.
    teamslist = get_teamslist(soup)
    return teamslist, games

In [13]:
# Try the helpers on round 1 of season 2017-2018. The per-game results are
# discarded here; only teamslist and games are kept.
browser = webdriver.Chrome()
try:
    teamslist, games = get_teamslist_games(browser,season="2017-2018",roundnr="1")
    for game in games:
        gamestats(game,teamslist)
finally:
    # Close this Chrome session so it is not left running; the full scrape
    # further down starts its own browser.
    browser.quit()

In [14]:
# Display the current contents of teamslist.
teamslist

['Vitesse',
 'FC Utrecht',
 'VVV-Venlo',
 'PEC Zwolle',
 'PSV',
 'Feyenoord',
 'Heracles Almelo',
 'SBV Excelsior',
 'FC Groningen',
 'sc Heerenveen',
 'AZ Alkmaar',
 'AFC Ajax',
 'FC Twente',
 'Willem II',
 'Roda JC Kerkrade',
 'NAC Breda',
 'ADO Den Haag',
 'Sparta Rotterdam']

In [24]:
# Scrape every round of every season from startjaar up to (not including) 2020
# and collect one record per game.
columns = ["Season","Round","Home","Away","HomeGoals","AwayGoals","Result","HomePoints","AwayPoints"]
roundnrs = range(1,35)
startjaar = 2019
# Records are gathered in a list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
records = []
browser = webdriver.Chrome()
try:
    while startjaar < 2020:
        season = f"{startjaar}-{startjaar+1}"
        for roundnr in roundnrs:
            teamslist, games = get_teamslist_games(browser,season=season,roundnr=str(roundnr))
            for game in games:
                resultdict = gamestats(game,teamslist)
                resultdict["Season"] = season
                resultdict["Round"] = roundnr
                records.append(resultdict)
        startjaar+=1
finally:
    # Always close the Chrome session, even when a round fails to scrape.
    # Records collected up to that point remain available in `records`.
    browser.quit()
df = pd.DataFrame(records, columns=columns)

In [16]:
# Round numbers 1 through 34 (range excludes the stop value 35).
roundnrs = range(1,35)

In [21]:
# Persist the scraped matches as CSV and as a pickle in the Data directory.
# The directory is created first; without it both writes fail on a fresh checkout.
output_dir = Path('Data')
output_dir.mkdir(parents=True, exist_ok=True)
df.to_csv(output_dir / 'matches.csv')
df.to_pickle(output_dir / 'matches.pkl')