## Scrape Whoscored data

In [1]:
import pandas as pd
import numpy as np
import re
import json
import time
import pickle

from bs4 import BeautifulSoup as soup
from datetime import datetime as dt
from collections import OrderedDict
pd.options.mode.chained_assignment = None


try:
    from tqdm import trange
except ModuleNotFoundError:
    pass

from selenium import webdriver
from selenium.webdriver.common.by import By

## Get league urls

In [2]:
main_url = "https://www.whoscored.com/"
def getLeagueUrls(league_count, minimize_window=True):
    """
    league_count: count of top "X" leagues
    minimize_window: minimizes the selenium webdriver window
    """

    driver = webdriver.Chrome('chromedriver.exe')

    if minimize_window:
            driver.minimize_window()

    driver.get(main_url)

    league_names = []
    league_urls = []

    for i in range(league_count):
            league_name = driver.find_element("xpath", '//*[@id="popular-tournaments-list"]/li['+str(i+1)+']/a').text
            league_link = driver.find_element("xpath", '//*[@id="popular-tournaments-list"]/li['+str(i+1)+']/a').get_attribute('href')

            league_names.append(league_name)
            league_urls.append(league_link)

    leagues = {}
    for name,link in zip(league_names,league_urls):
        leagues[name] = link

    driver.close()

    return leagues

In [3]:
leagues = getLeagueUrls(league_count = 5)
leagues

  driver = webdriver.Chrome('chromedriver.exe')


{'Premier League': 'https://www.whoscored.com/Regions/252/Tournaments/2/England-Premier-League',
 'Serie A': 'https://www.whoscored.com/Regions/108/Tournaments/5/Italy-Serie-A',
 'LaLiga': 'https://www.whoscored.com/Regions/206/Tournaments/4/Spain-LaLiga',
 'Bundesliga': 'https://www.whoscored.com/Regions/81/Tournaments/3/Germany-Bundesliga',
 'Ligue 1': 'https://www.whoscored.com/Regions/74/Tournaments/22/France-Ligue-1'}

## Get match urls

In [4]:
def translateDate(data):
    """
    data: json data
    """

    for match in data:
        date = match['date'].split()
        match['date'] = ' '.join([TRANSLATE_DICT[date[0]], date[1], date[2]])

    return data


def getSortedData(data):

    try:
        data = sorted(data, key = lambda i: dt.strptime(i['date'], '%b %d %Y'))
        return data
    except ValueError:
        data = translateDate(data)
        data = sorted(data, key = lambda i: dt.strptime(i['date'], '%b %d %Y'))
        return data

In [5]:
def getFixtureData(driver):
    """
    driver: selenium webdriver
    """

    matches_ls = []
    table_rows = driver.find_elements(By.CLASS_NAME, 'divtable-row')
    for row in table_rows:
        match_dict = {}
        element = soup(row.get_attribute('innerHTML'))
        link_tag = element.find("a", {"class":"result-1 rc"})

        # Check type of link tag
        if type(link_tag) is type(None):
            date = row.text.split(', ')[-1]

        # if type(link_tag) is not type(None):
        else:
            match_dict['date'] = date
            match_dict['time'] = element.find('div', {'class':'col12-lg-1 col12-m-1 col12-s-0 col12-xs-0 time divtable-data'}).text
            match_dict['home'] = element.find_all("a", {"class":"team-link"})[0].text
            match_dict['away'] = element.find_all("a", {"class":"team-link"})[1].text
            match_dict['score'] = element.find("a", {"class":"result-1 rc"}).text
            match_dict['url'] = link_tag.get("href")

        # Append dictionary to matches
        matches_ls.append(match_dict)

    time.sleep(2)

    matches_ls = list(filter(None, matches_ls))

    return matches_ls

In [6]:
league_url = 'https://www.whoscored.com/Regions/206/Tournaments/4/Spain-LaLiga'

def getAllUrls(season:str):
    """
    season: enter season that you want. eg: "2022/23"
    """

    driver = webdriver.Chrome('chromedriver.exe')

    # Close webdriver window
    maximize_window = True
    if maximize_window:
        driver.maximize_window()

    # Get league url
    driver.get(league_url)
    time.sleep(5)

    # Get all seasons
    seasons = driver.find_element("xpath", '//*[@id="seasons"]').get_attribute('innerHTML').split(sep='\n')
    seasons = [i for i in seasons if i]

    # Select Specific season
    year = season
    for i in range(1, len(seasons)+1):
            if driver.find_element("xpath", '//*[@id="seasons"]/option['+str(i)+']').text == year:
                season = driver.find_element("xpath", '//*[@id="seasons"]/option['+str(i)+']').click()

    time.sleep(5)

    # Navigate to the fixtures page
    fixtures_page = driver.find_element("xpath", '//*[@id="link-fixtures"]')
    driver.execute_script("arguments[0].scrollIntoView();", fixtures_page)
    driver.execute_script("arguments[0].click();", fixtures_page)
    time.sleep(5)

    # Get month and store it in list
    months = []
    all_match_urls = []
    for i in range(12):
        month = driver.find_element("xpath", '//*[@id="date-config-toggle-button"]/span[1]').get_attribute('textContent')

        if month not in months:
            months.append(month)

            # Get match links
            match_urls = getFixtureData(driver=driver)

            # Get all urls
            for url in match_urls:
                if url not in all_match_urls:
                    all_match_urls.append(url)


            # Get previous month
            prev_month_btn = driver.find_element("xpath", '//*[@id="date-controller"]/a[1]').click()
            time.sleep(8)

        else:
            break

    return all_match_urls

In [7]:
all_match_urls = getAllUrls(season="2022/23")

  driver = webdriver.Chrome('chromedriver.exe')


In [8]:
all_match_urls

[{'date': 'Apr 1 2023',
  'time': '13:00',
  'home': 'Girona',
  'away': 'Espanyol',
  'score': '2 : 1',
  'url': '/Matches/1650825/Live/Spain-LaLiga-2022-2023-Girona-Espanyol'},
 {'date': 'Apr 1 2023',
  'time': '15:15',
  'home': 'Athletic Bilbao',
  'away': 'Getafe',
  'score': '0 : 0',
  'url': '/Matches/1650819/Live/Spain-LaLiga-2022-2023-Athletic-Bilbao-Getafe'},
 {'date': 'Apr 1 2023',
  'time': '17:30',
  'home': 'Cadiz',
  'away': 'Sevilla',
  'score': '0 : 2',
  'url': '/Matches/1650822/Live/Spain-LaLiga-2022-2023-Cadiz-Sevilla'},
 {'date': 'Apr 1 2023',
  'time': '20:00',
  'home': 'Elche',
  'away': 'Barcelona',
  'score': '0 : 4',
  'url': '/Matches/1650824/Live/Spain-LaLiga-2022-2023-Elche-Barcelona'},
 {'date': 'Apr 2 2023',
  'time': '13:00',
  'home': 'Celta Vigo',
  'away': 'Almeria',
  'score': '2 : 2',
  'url': '/Matches/1650828/Live/Spain-LaLiga-2022-2023-Celta-Vigo-Almeria'},
 {'date': 'Apr 2 2023',
  'time': '15:15',
  'home': 'Real Madrid',
  'away': 'Real Valla

## Get team urls

In [9]:
def getTeamUrls(team: str, data: list):
    """
    team: name of the team
    data: list of all json data
    """

    team_urls = []
    team = team
    for match in data:
        if match["home"] == team or match["away"] == team:
            team_urls.append(match)

    if team_urls:
        return team_urls
    else:
        print("Please check team name and try again :)")

In [10]:
team_urls = getTeamUrls(team = "Barcelona", data = all_match_urls)
team_urls

[{'date': 'Apr 1 2023',
  'time': '20:00',
  'home': 'Elche',
  'away': 'Barcelona',
  'score': '0 : 4',
  'url': '/Matches/1650824/Live/Spain-LaLiga-2022-2023-Elche-Barcelona'},
 {'date': 'Apr 10 2023',
  'time': '20:00',
  'home': 'Barcelona',
  'away': 'Girona',
  'score': '0 : 0',
  'url': '/Matches/1650829/Live/Spain-LaLiga-2022-2023-Barcelona-Girona'},
 {'date': 'Mar 5 2023',
  'time': '15:15',
  'home': 'Barcelona',
  'away': 'Valencia',
  'score': '1 : 0',
  'url': '/Matches/1650790/Live/Spain-LaLiga-2022-2023-Barcelona-Valencia'},
 {'date': 'Mar 12 2023',
  'time': '20:00',
  'home': 'Athletic Bilbao',
  'away': 'Barcelona',
  'score': '0 : 1',
  'url': '/Matches/1650799/Live/Spain-LaLiga-2022-2023-Athletic-Bilbao-Barcelona'},
 {'date': 'Mar 19 2023',
  'time': '20:00',
  'home': 'Barcelona',
  'away': 'Real Madrid',
  'score': '2 : 1',
  'url': '/Matches/1650810/Live/Spain-LaLiga-2022-2023-Barcelona-Real-Madrid'},
 {'date': 'Feb 1 2023',
  'time': '20:00',
  'home': 'Real Bet

## Get event data

In [11]:
def getMatchData(driver, url, display=True, close_window=True):
    driver.get(url)

    # get script data from page source
    script_content = driver.find_element("xpath", '//*[@id="layout-wrapper"]/script[1]').get_attribute('innerHTML')


    # clean script content
    script_content = re.sub(r"[\n\t]*", "", script_content)
    script_content = script_content[script_content.index("matchId"):script_content.rindex("}")]


    # this will give script content in list form
    script_content_list = list(filter(None, script_content.strip().split(',            ')))
    metadata = script_content_list.pop(1)


    # string format to json format
    match_data = json.loads(metadata[metadata.index('{'):])
    keys = [item[:item.index(':')].strip() for item in script_content_list]
    values = [item[item.index(':')+1:].strip() for item in script_content_list]
    for key,val in zip(keys, values):
        match_data[key] = json.loads(val)


    # get additional match details
    region = driver.find_element("xpath", '//*[@id="breadcrumb-nav"]/span[1]').text
    league = driver.find_element("xpath", '//*[@id="breadcrumb-nav"]/a').text.split(' - ')[0]
    season = driver.find_element("xpath", '//*[@id="breadcrumb-nav"]/a').text.split(' - ')[1]

    if len(driver.find_element("xpath", '//*[@id="breadcrumb-nav"]/a').text.split(' - ')) == 2:
        competition_type = 'League'
        competition_stage = ''

    elif len(driver.find_element("xpath", '//*[@id="breadcrumb-nav"]/a').text.split(' - ')) == 3:
        competition_type = 'Knock Out'
        competition_stage = driver.find_element("xpath", '//*[@id="breadcrumb-nav"]/a').text.split(' - ')[-1]

    else:
        print('Getting more than 3 types of information about the competition.')

    match_data['region'] = region
    match_data['league'] = league
    match_data['season'] = season
    match_data['competitionType'] = competition_type
    match_data['competitionStage'] = competition_stage


    # sort match_data dictionary alphabetically
    match_data = OrderedDict(sorted(match_data.items()))
    # match_data = dict(match_data)

    if display:
        print('Region: {}, League: {}, Season: {}, Match Id: {}'.format(region, league, season, match_data['matchId']))


    if close_window:
        driver.close()

    return match_data

In [12]:
# Get match links
links = []
for val in team_urls:
    if val["url"] not in links:
        links.append(val["url"])

links = [main_url[:-1]+i for i in links]
links

['https://www.whoscored.com/Matches/1650824/Live/Spain-LaLiga-2022-2023-Elche-Barcelona',
 'https://www.whoscored.com/Matches/1650829/Live/Spain-LaLiga-2022-2023-Barcelona-Girona',
 'https://www.whoscored.com/Matches/1650790/Live/Spain-LaLiga-2022-2023-Barcelona-Valencia',
 'https://www.whoscored.com/Matches/1650799/Live/Spain-LaLiga-2022-2023-Athletic-Bilbao-Barcelona',
 'https://www.whoscored.com/Matches/1650810/Live/Spain-LaLiga-2022-2023-Barcelona-Real-Madrid',
 'https://www.whoscored.com/Matches/1650725/Live/Spain-LaLiga-2022-2023-Real-Betis-Barcelona',
 'https://www.whoscored.com/Matches/1650751/Live/Spain-LaLiga-2022-2023-Barcelona-Sevilla',
 'https://www.whoscored.com/Matches/1650765/Live/Spain-LaLiga-2022-2023-Villarreal-Barcelona',
 'https://www.whoscored.com/Matches/1650769/Live/Spain-LaLiga-2022-2023-Barcelona-Cadiz',
 'https://www.whoscored.com/Matches/1650785/Live/Spain-LaLiga-2022-2023-Almeria-Barcelona',
 'https://www.whoscored.com/Matches/1650710/Live/Spain-LaLiga-2022

In [21]:
with open("event-data-2023-04-11 20:57:50.511532.pkl", "rb") as f:
    event_data = pickle.load(f)

    for link in links:
        print(link)

https://www.whoscored.com/Matches/1650824/Live/Spain-LaLiga-2022-2023-Elche-Barcelona
https://www.whoscored.com/Matches/1650829/Live/Spain-LaLiga-2022-2023-Barcelona-Girona
https://www.whoscored.com/Matches/1650790/Live/Spain-LaLiga-2022-2023-Barcelona-Valencia
https://www.whoscored.com/Matches/1650799/Live/Spain-LaLiga-2022-2023-Athletic-Bilbao-Barcelona
https://www.whoscored.com/Matches/1650810/Live/Spain-LaLiga-2022-2023-Barcelona-Real-Madrid
https://www.whoscored.com/Matches/1650725/Live/Spain-LaLiga-2022-2023-Real-Betis-Barcelona
https://www.whoscored.com/Matches/1650751/Live/Spain-LaLiga-2022-2023-Barcelona-Sevilla
https://www.whoscored.com/Matches/1650765/Live/Spain-LaLiga-2022-2023-Villarreal-Barcelona
https://www.whoscored.com/Matches/1650769/Live/Spain-LaLiga-2022-2023-Barcelona-Cadiz
https://www.whoscored.com/Matches/1650785/Live/Spain-LaLiga-2022-2023-Almeria-Barcelona
https://www.whoscored.com/Matches/1650710/Live/Spain-LaLiga-2022-2023-Atletico-Madrid-Barcelona
https://ww

In [22]:
# Get event data

with open("event-data-2023-04-11 20:57:50.511532.pkl", "rb") as f:
    event_data = pickle.load(f)


    count = len(links)
    data_crawl = False

    for link in links:

        match_id = re.findall(r'\d+', link)[0]
        count -= 1

        if match_id not in event_data.keys():
            # Scrape data
            driver = webdriver.Chrome('chromedriver.exe')
            data = getMatchData(driver = driver, url = link)

            # Convert collections to dictionary
            event_data[match_id] = dict(data)


            # Cache most recent crawl using timestamp
            if count == 0:

                print("----------- All matches scraped -----------")

                timestamp = dt.now()

                with open(f"event-data-{timestamp}.pkl", "wb")  as f:
                    pickle.dump(event_data, f)

            time.sleep(10)

        else:
            data_crawl = True

    if data_crawl == True:
        print("All data has been obtained!")


All data has been obtained!


## Create event data

In [340]:
def createEventsDF(data):

    events = data['events']
    for event in events:
        event.update({'matchId' : data['matchId'],
                     'startDate' : data['startDate'],
                     'startTime' : data['startTime'],
                     'score' : data['score'],
                     'ftScore' : data['ftScore'],
                     'htScore' : data['htScore'],
                     'etScore' : data['etScore'],
                     'venueName' : data['venueName'],
                     'maxMinute' : data['maxMinute']})
    events_df = pd.DataFrame(events)


    # clean period column
    events_df['period'] = pd.json_normalize(events_df['period'])['displayName']

    # clean type column
    events_df['type'] = pd.json_normalize(events_df['type'])['displayName']

    # clean outcomeType column
    events_df['outcomeType'] = pd.json_normalize(events_df['outcomeType'])['displayName']

    # clean outcomeType column
    try:
        x = events_df['cardType'].fillna({i: {} for i in events_df.index})
        events_df['cardType'] = pd.json_normalize(x)['displayName'].fillna(False)
    except KeyError:
        events_df['cardType'] = False

    # clean satisfiedEventTypes column
    eventTypeDict = data['matchCentreEventTypeJson']
    for i in range(len(events_df)):
        row = events_df.loc[i, 'satisfiedEventsTypes'].copy()
        events_df['satisfiedEventsTypes'].loc[i] = [list(eventTypeDict.keys())[list(eventTypeDict.values()).index(event)] for event in row]

    # clean qualifiers column
    try:
        for i in events_df.index:
            row = events_df.loc[i, 'qualifiers'].copy()
            if len(row) != 0:
                for irow in range(len(row)):
                    row[irow]['type'] = row[irow]['type']['displayName']
    except TypeError:
        pass

    # clean isShot column
    if 'isShot' in events_df.columns:
        events_df['isShot'] = events_df['isShot'].replace(np.nan, False)
    else:
        events_df['isShot'] = False

    # clean isGoal column
    if 'isGoal' in events_df.columns:
        events_df['isGoal'] = events_df['isGoal'].replace(np.nan, False)
    else:
        events_df['isGoal'] = False

    # add player name column
    events_df.loc[events_df.playerId.notna(), 'playerId'] = events_df.loc[events_df.playerId.notna(), 'playerId'].astype(int).astype(str)
    player_name_col = events_df.loc[:, 'playerId'].map(data['playerIdNameDictionary'])
    events_df.insert(loc=events_df.columns.get_loc("playerId")+1, column='playerName', value=player_name_col)

    # add home/away column
    h_a_col = events_df['teamId'].map({data['home']['teamId']:'h', data['away']['teamId']:'a'})
    events_df.insert(loc=events_df.columns.get_loc("teamId")+1, column='h_a', value=h_a_col)

    # adding shot body part column
    events_df['shotBodyType'] =  np.nan
    for i in events_df.loc[events_df.isShot==True].index:
        for j in events_df.loc[events_df.isShot==True].qualifiers.loc[i]:
            if j['type'] == 'RightFoot' or j['type'] == 'LeftFoot' or j['type'] == 'Head' or j['type'] == 'OtherBodyPart':
                events_df['shotBodyType'].loc[i] = j['type']

    # adding shot situation column
    events_df['situation'] =  np.nan
    for i in events_df.loc[events_df.isShot==True].index:
        for j in events_df.loc[events_df.isShot==True].qualifiers.loc[i]:
            if j['type'] == 'FromCorner' or j['type'] == 'SetPiece' or j['type'] == 'DirectFreekick':
                events_df['situation'].loc[i] = j['type']
            if j['type'] == 'RegularPlay':
                events_df['situation'].loc[i] = 'OpenPlay'

    # adding other event types columns
    event_types = list(data['matchCentreEventTypeJson'].keys())
    for event_type in event_types:
        events_df[event_type] = pd.Series([event_type in row for row in list(events_df['satisfiedEventsTypes'])])

    return events_df

In [341]:
# Get final dataframe
match_id_list = []
df_final = pd.DataFrame()

for key,val in event_data.items():

    if key not in match_id_list:
        df = createEventsDF(event_data[key])
        df_final = pd.concat([df_final, df]).reset_index(drop = True)

        match_id_list.append(key)

# Add scraped match ids to pickle file
with open('match_ids.pkl', 'wb') as f:
    pickle.dump(match_id_list, f)


  events_df[event_type] = pd.Series([event_type in row for row in list(events_df['satisfiedEventsTypes'])])
  events_df[event_type] = pd.Series([event_type in row for row in list(events_df['satisfiedEventsTypes'])])
  events_df[event_type] = pd.Series([event_type in row for row in list(events_df['satisfiedEventsTypes'])])
  events_df[event_type] = pd.Series([event_type in row for row in list(events_df['satisfiedEventsTypes'])])
  events_df[event_type] = pd.Series([event_type in row for row in list(events_df['satisfiedEventsTypes'])])
  events_df[event_type] = pd.Series([event_type in row for row in list(events_df['satisfiedEventsTypes'])])
  events_df[event_type] = pd.Series([event_type in row for row in list(events_df['satisfiedEventsTypes'])])
  events_df[event_type] = pd.Series([event_type in row for row in list(events_df['satisfiedEventsTypes'])])
  events_df[event_type] = pd.Series([event_type in row for row in list(events_df['satisfiedEventsTypes'])])
  events_df[event_type] = pd

## Save dataframe

In [346]:
def saveDF(prompt:"no"):

    if prompt.upper() == "YES":
        df_final.to_pickle("./barca.pkl")

    elif prompt.upper() == "NO":
        pass

    else:
        print("Please provide YES/NO prompt only.")

# saveDF(prompt="yes")