In [8]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
import numpy as np
import openpyxl
import math

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [9]:
# list of team names currently in associated leagues (including relegated teams from last season to fill DF), as they are displayed on Understat.

epl_rank = ['Manchester City', 'Arsenal', 'Liverpool', 'Manchester United', 'Newcastle United', 'Tottenham', 'Brighton',
            'Aston Villa', 'Brentford', 'Chelsea', 'Fulham', 'Crystal Palace', 'Wolverhampton Wanderers', 'Everton',
            'West Ham', 'Nottingham Forest', 'Bournemouth', 'Leicester', 'Southampton','Burnley','Luton','Sheffield United',
            'Leeds']
ligue1_rank = ['Ajaccio', 'Marseille', 'Auxerre', 'Lens', 'Monaco', 'Toulouse', 'Nantes', 'Angers', 'Lorient',
               'Strasbourg', 'Nice', 'Lyon', 'Paris Saint Germain', 'Clermont Foot', 'Brest', 'Rennes', 'Reims',
               'Montpellier', 'Troyes', 'Lille','Metz','Le Havre']
bundesliga_rank = ['Borussia Dortmund', 'Mainz 05', 'RasenBallsport Leipzig', 'Schalke 04', 'Union Berlin',
                   'Werder Bremen', 'FC Cologne', 'Bayern Munich', 'Borussia M.Gladbach', 'Augsburg',
                   'Eintracht Frankfurt', 'Freiburg', 'Wolfsburg', 'Hertha Berlin', 'Bochum', 'Bayer Leverkusen',
                   'VfB Stuttgart', 'Hoffenheim','FC Heidenheim','Darmstadt']
serieA_rank = ['Sassuolo', 'Fiorentina', 'Torino', 'Inter', 'Cremonese', 'Salernitana', 'Empoli', 'Lazio', 'Napoli',
               'Sampdoria', 'Atalanta', 'Monza', 'Lecce', 'Bologna', 'AC Milan', 'Verona', 'Roma', 'Spezia', 'Udinese',
               'Juventus','Cagliari','Frosinone','Genoa']
laliga_rank = ['Real Madrid', 'Athletic Club', 'Villarreal', 'Athletico Madrid', 'Celta Vigo', 'Barcelona',
               'Real Valladolid', 'Getafe', 'Elche', 'Cadiz', 'Espanyol', 'Almeria', 'Osasuna', 'Girona', 'Real Betis',
               'Valencia', 'Mallorca', 'Rayo Vallecano', 'Real Sociedad', 'Sevilla','Granada','Alaves','Las Palmas']

In [163]:
# Due to irregularities of ID patterns (gaps, ignored leagues etc.) manual ID selection may be best choice for now, rather that automated refresh

# LAST ID (15/02/2024):
# 2022/23
# ENG - 18202 - 18581 (380, 38*10)
eng22_23 = [a + 18202 for a in range(0,380)]
eng22_23_count = 0
# FRA - 19648 - 20027 (380, 38*10)
fra22_23 = [a + 19648 for a in range(0,380)] #20 teams last season
fra22_23_count = 0
# GER - 19342 - 19647 (306, 34*9)
ger22_23 = [a + 19342 for a in range(0,306)]
ger22_23_count = 0
# ITA - 18582 - 18961 (380, 38*10)
ita22_23 = [a + 18582 for a in range(0,380)]
ita22_23_count = 0
# SPA - 18962 - 19341 (380, 38*10)
spa22_23 = [a + 18962 for a in range(0,380)]
spa22_23_count = 0
# RUS - 20028 - 20267 (240, 30*8) - data ignored for now

#2023/24
# ENG - 21925 - 22304
eng23_24 = [a + 21925 for a in range(0,380)]
eng23_24_count = 0
# FRA - 23371 - 23676
fra23_24 = [a + 23371 for a in range(0,306)] #18 teams this season
fra23_24_count = 0
# GER - 23065 - 23370
ger23_24 = [a + 23065 for a in range(0,306)]
ger23_24_count = 0
# ITA - 22305 - 22684
ita23_24 = [a + 22305 for a in range(0,380)]
ita23_24_count = 0
# SPA - 22685 - 23064
spa23_24 = [a + 22685 for a in range(0,380)]
spa23_24_count = 0
# RUS - 21685 - 21924 - data ignored for now

eng_df = pd.DataFrame()
fra_df = pd.DataFrame()
ger_df = pd.DataFrame()
ita_df = pd.DataFrame()
spa_df = pd.DataFrame()

# Current ID (15/02/2024)
# ENG: 22126
# EPL count = 236
# FRA: 23555
# LG1 count = 189
# GER: 23253
# Bun count = 189
# ITA: 22538
# SeA count = 237
# SPA: 22915
# LLA count = 240

                            


In [160]:
# webscraping understat for match data

def matchstat_generator(league_count,ov_df):
    
    # reset df
    ov_df = ov_df.drop(ov_df.index, inplace=True)

    for match_id in league_count:
    
        base_url ='https://understat.com/match/'
        url = base_url+str(match_id)
        
        res = requests.get(url)
        soup = BeautifulSoup(res.content, 'lxml')
        scripts = soup.find_all('script')

        strings = scripts[1].string

        ind_start = strings.index("('")+2
        ind_end = strings.index("')")
        json_data = strings[ind_start:ind_end]
        json_data = json_data.encode('utf8').decode('unicode_escape')

        data = json.loads(json_data)


        x = []
        y = []
        xG = []
        result = []
        team = []
        minute = []
        data_away = data['a']
        data_home = data['h']

        for index in range(len(data_home)):
            for key in data_home[index]:
                # print(key)
                if key == 'X':
                    x.append(data_home[index][key])
                if key == 'Y':
                    y.append(data_home[index][key])
                if key == 'h_team':
                    team.append(data_home[index][key])
                if key == 'xG':
                    xG.append(float(data_home[index][key]))
                if key == 'result':
                    result.append(data_home[index][key])
                if key == 'minute':
                    minute.append(data_home[index][key])

        for index in range(len(data_away)):
            for key in data_away[index]:
                if key == 'X':
                    x.append(data_away[index][key])
                if key == 'Y':
                    y.append(data_away[index][key])
                if key == 'a_team':
                    team.append(data_away[index][key])
                if key == 'xG':
                    xG.append(float(data_away[index][key]))
                if key == 'result':
                    result.append(data_away[index][key])
                if key == 'minute':
                    minute.append(data_away[index][key])

        col_names = ['x','y','xG','result','team','minute']

        df_orig = pd.DataFrame([x,y,xG,result,team,minute],index=col_names)

        df = df_orig.T
        df['minute'] = df['minute'].astype(int)

        # team retrieval
        team_list = []
        sorted_team_list = []
        team_list.append(df.team.unique())
        home_team = team_list[0][0]
        away_team = team_list[0][1]
        sorted_team_list.append(home_team)
        sorted_team_list.append(away_team)
        sorted_team_list.sort()

        # xG and xGC sum
        expected_data_sums = df.groupby([team]).xG.sum().reset_index()

        if sorted_team_list[0] == home_team:
            xG_home = round(expected_data_sums.xG[0],3)
            xG_away = round(expected_data_sums.xG[1],3)
        else:
            xG_home = round(expected_data_sums.xG[1],3)
            xG_away = round(expected_data_sums.xG[0],3)

        # Shots faced sum
        shot_data_sums = df.groupby([team]).xG.count().reset_index()

        if sorted_team_list[0] == home_team:
            shots_h = shot_data_sums.xG[0]
            shots_a = shot_data_sums.xG[1]
        else:
            shots_h = shot_data_sums.xG[1]
            shots_a = shot_data_sums.xG[0]

        # Quality shots faced sum (over .25 xG)
        h_quality_shot_data_sums = len(df[(df["team"] == home_team) & (df["xG"] >= 0.25)])
        a_quality_shot_data_sums = len(df[(df["team"] == away_team) & (df["xG"] >= 0.25)])

        #goals
        home_array = [home_team]
        away_array = [away_team]
        home_goal_loc = (df.loc[(df['result'] == 'Goal') & df['team'].isin(home_array)]) + (df.loc[(df['result'] == 'OwnGoal') & df['team'].isin(away_array)])
        away_goal_loc = (df.loc[(df['result'] == 'Goal') & df['team'].isin(away_array)]) + (df.loc[(df['result'] == 'OwnGoal') & df['team'].isin(home_array)])

        first_goal_h = home_goal_loc.minute.min()
        first_goal_a = away_goal_loc.minute.min()

        if math.isnan(first_goal_h) is False and math.isnan(first_goal_a) is False:
            if first_goal_h > first_goal_a:
                first_h = 0
                first_a = 1
            elif first_goal_h < first_goal_a:
                first_h = 1
                first_a = 0
            else:
                first_h = 2
                first_a = 2
        elif math.isnan(first_goal_h) is True and math.isnan(first_goal_a) is False:
            first_h = 0
            first_a = 1
        elif math.isnan(first_goal_h) is False and math.isnan(first_goal_a) is True:
            first_h = 1
            first_a = 0
        else:
            first_h = 0
            first_a = 0


        home_goals = len(home_goal_loc)
        away_goals = len(away_goal_loc)

        # points
        if home_goals > away_goals:
            points_home = 3
            points_away = 0
        elif home_goals < away_goals:
            points_home = 0
            points_away = 3
        else:
            points_home = 1
            points_away = 1
        
        # id
        game_id_h = home_team+str(match_id)
        game_id_a = away_team+str(match_id)

        # btts
        if home_goals >= 1 and away_goals >= 1:
            btts = 1
            cs = 0
            blank = 0
        elif home_goals >= 1 and away_goals == 0:
            btts = 0
            cs = 1
            blank = 0
        elif home_goals == 0 and away_goals >= 1:
            btts = 0
            cs = 0
            blank = 1
        else:
            btts = 0
            cs = 1
            blank = 1

        df2_1 = pd.DataFrame([[game_id_h,home_team,"h",xG_home,xG_away,home_goals,away_goals,shots_h,shots_a,h_quality_shot_data_sums,a_quality_shot_data_sums,points_home,away_team,btts,blank,cs,first_h,first_a]],columns=['ID','Team','H/A','xG','xGC','G','GC','S','SC','QS','QSC','Pts','Opp','BTTS','Blank','CS','first_goal','first_conc'])
        # print(df2_1)
        df2_2 = pd.DataFrame([[game_id_a,away_team,"a",xG_away,xG_home,away_goals,home_goals,shots_a,shots_h,a_quality_shot_data_sums,h_quality_shot_data_sums,points_away,home_team,btts,cs,blank,first_a,first_h]],columns=['ID','Team','H/A','xG','xGC','G','GC','S','SC','QS','QSC','Pts','Opp','BTTS','Blank','CS','first_goal','first_conc'])
        # print(df2_2)
        df3 = pd.concat([df2_1,df2_2])
        ov_df = pd.concat([ov_df,df3])

    return ov_df          

In [155]:
eng_ov = sorted(eng22_23 + eng23_24)
eng_active = []
eng_count = 0

fra_ov = sorted(fra22_23 + fra23_24)
fra_active = []
fra_count = 0

ger_ov = sorted(ger22_23 + ger23_24)
ger_active = []
ger_count = 0

ita_ov = sorted(ita22_23 + ita23_24)
ita_active = []
ita_count = 0

spa_ov = sorted(spa22_23 + spa23_24)
spa_active = []
spa_count = 0

def id_list_update_380(ov,active,counter):
    while counter < 380:
        for item in reversed(ov):
            base_url ='https://understat.com/match/'
            url = base_url+str(item)
            res = requests.get(url)
            soup = BeautifulSoup(res.content, 'lxml')
            scripts = soup.find_all('script')
            if counter == 380:
                break
            else:
                if len(scripts) != 0:
                    active.append(item)
                    counter +=1
    return active

def id_list_update_306(ov,active,counter):
    while counter < 306:
        for item in reversed(ov):
            base_url ='https://understat.com/match/'
            url = base_url+str(item)
            res = requests.get(url)
            soup = BeautifulSoup(res.content, 'lxml')
            scripts = soup.find_all('script')
            if counter == 306:
                break
            else:
                if len(scripts) != 0:
                    active.append(item)
                    counter +=1
    return active

eng_active = id_list_update_380(eng_ov,eng_active,eng_count)
fra_active = id_list_update_380(fra_ov,fra_active,fra_count)
ger_active = id_list_update_306(ger_ov,ger_active,ger_count)
ita_active = id_list_update_380(ita_ov,ita_active,ita_count)
spa_active = id_list_update_380(spa_ov,spa_active,spa_count)

                
        

Exception ignored in: <function ZipFile.__del__ at 0x000001D7E65A50D0>
Traceback (most recent call last):
  File "c:\Users\User\AppData\Local\Programs\Python\Python39\lib\zipfile.py", line 1807, in __del__
    self.close()
  File "c:\Users\User\AppData\Local\Programs\Python\Python39\lib\zipfile.py", line 1824, in close
    self.fp.seek(self.start_dir)
ValueError: seek of closed file


In [168]:
eng_df = matchstat_generator(eng_active, eng_df)
with pd.ExcelWriter("../Excel Dataframes/EPL_understat.xlsx") as writer:
    eng_df.to_excel(writer, sheet_name="gamesheet")
eng_df

Unnamed: 0,ID,Team,H/A,xG,xGC,G,GC,S,SC,QS,QSC,Pts,Opp,BTTS,Blank,CS,first_goal,first_conc
0,Newcastle United22304,Newcastle United,h,1.622,0.856,1,2,23,9,1,1,0,Liverpool,1,0,0,0,0
0,Liverpool22304,Liverpool,a,0.856,1.622,2,1,9,23,1,1,3,Newcastle United,1,0,0,0,0
0,Sheffield United22303,Sheffield United,h,0.762,3.848,1,2,6,30,1,4,0,Manchester City,1,0,0,0,0
0,Manchester City22303,Manchester City,a,3.848,0.762,2,1,30,6,4,1,3,Sheffield United,1,0,0,0,0
0,Burnley22302,Burnley,h,0.631,3.086,1,3,9,16,1,4,0,Aston Villa,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,Manchester City18442,Manchester City,a,3.092,1.121,4,1,20,14,3,1,3,Bournemouth,1,0,0,0,0
0,Wolverhampton Wanderers18441,Wolverhampton Wanderers,h,0.911,1.153,0,1,15,5,0,1,0,Bournemouth,0,1,0,0,0
0,Bournemouth18441,Bournemouth,a,1.153,0.911,1,0,5,15,1,0,3,Wolverhampton Wanderers,0,0,1,0,0
0,Tottenham18440,Tottenham,h,1.432,0.365,2,0,16,6,3,0,3,West Ham,0,0,1,0,0


In [164]:
fra_df = matchstat_generator(fra_active, fra_df)
with pd.ExcelWriter("../Excel Dataframes/Ligue1_understat.xlsx") as writer:
    fra_df.to_excel(writer, sheet_name="gamesheet")
fra_df

Unnamed: 0,ID,Team,H/A,xG,xGC,G,GC,S,SC,QS,QSC,Pts,Opp,BTTS,Blank,CS,first_goal,first_conc
0,Le Havre23559,Le Havre,h,1.374,1.187,0,1,13,11,2,2,0,Rennes,0,1,0,0,0
0,Rennes23559,Rennes,a,1.187,1.374,1,0,11,13,2,2,3,Le Havre,0,0,1,0,0
0,Lens23558,Lens,h,1.920,1.681,3,1,10,19,3,2,3,Strasbourg,1,0,0,0,0
0,Strasbourg23558,Strasbourg,a,1.681,1.920,1,3,19,10,2,3,0,Lens,1,0,0,0,0
0,Clermont Foot23557,Clermont Foot,h,1.197,1.357,1,1,7,15,2,1,1,Brest,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,Montpellier19839,Montpellier,a,0.787,1.785,2,0,4,16,1,2,3,Auxerre,0,0,1,0,0
0,Ajaccio19838,Ajaccio,h,1.474,2.027,0,2,16,8,2,3,0,Lyon,0,1,0,0,0
0,Lyon19838,Lyon,a,2.027,1.474,2,0,8,16,3,2,3,Ajaccio,0,0,1,0,0
0,Toulouse19837,Toulouse,h,1.643,0.755,1,1,16,17,3,0,1,Brest,1,0,0,0,0


In [165]:
ger_df = matchstat_generator(ger_active, ger_df)
with pd.ExcelWriter("../Excel Dataframes/Bundesliga_understat.xlsx") as writer:
    ger_df.to_excel(writer, sheet_name="gamesheet")
ger_df

Unnamed: 0,ID,Team,H/A,xG,xGC,G,GC,S,SC,QS,QSC,Pts,Opp,BTTS,Blank,CS,first_goal,first_conc
0,VfB Stuttgart23253,VfB Stuttgart,h,2.170,1.615,3,1,11,17,4,3,3,Mainz 05,1,0,0,0,0
0,Mainz 0523253,Mainz 05,a,1.615,2.170,1,3,17,11,3,4,0,VfB Stuttgart,1,0,0,0,0
0,Eintracht Frankfurt23252,Eintracht Frankfurt,h,2.442,1.277,1,1,9,14,4,2,1,Bochum,1,0,0,0,0
0,Bochum23252,Bochum,a,1.277,2.442,1,1,14,9,2,4,1,Eintracht Frankfurt,1,0,0,0,0
0,Augsburg23251,Augsburg,h,0.967,1.789,2,2,11,9,1,3,1,RasenBallsport Leipzig,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,Bayer Leverkusen19533,Bayer Leverkusen,a,2.084,1.714,1,1,18,9,3,2,1,Freiburg,1,0,0,0,0
0,RasenBallsport Leipzig19532,RasenBallsport Leipzig,h,1.900,0.856,2,1,10,7,4,1,3,Eintracht Frankfurt,1,0,0,0,0
0,Eintracht Frankfurt19532,Eintracht Frankfurt,a,0.856,1.900,1,2,7,10,1,4,0,RasenBallsport Leipzig,1,0,0,0,0
0,Bayern Munich19531,Bayern Munich,h,3.864,0.213,3,0,20,3,6,0,3,Union Berlin,0,0,1,0,0


In [166]:
ita_df = matchstat_generator(ita_active, ita_df)
with pd.ExcelWriter("../Excel Dataframes/SerieA_understat.xlsx") as writer:
    ita_df.to_excel(writer, sheet_name="gamesheet")
ita_df

Unnamed: 0,ID,Team,H/A,xG,xGC,G,GC,S,SC,QS,QSC,Pts,Opp,BTTS,Blank,CS,first_goal,first_conc
0,Roma22544,Roma,h,1.009,1.242,2,4,9,10,1,2,0,Inter,1,0,0,0,0
0,Inter22544,Inter,a,1.242,1.009,4,2,10,9,2,1,3,Roma,1,0,0,0,0
0,Monza22543,Monza,h,0.864,0.577,0,0,11,10,1,0,1,Verona,0,1,1,0,0
0,Verona22543,Verona,a,0.577,0.864,0,0,10,11,0,1,1,Monza,0,1,1,0,0
0,Genoa22542,Genoa,h,0.916,3.162,1,4,15,14,1,5,0,Atalanta,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,Spezia18821,Spezia,a,2.055,2.826,2,2,9,15,4,4,1,Udinese,1,0,0,0,0
0,Juventus18820,Juventus,h,1.520,1.006,4,2,13,8,2,1,3,Torino,1,0,0,0,0
0,Torino18820,Torino,a,1.006,1.520,2,4,8,13,1,2,0,Juventus,1,0,0,0,0
0,Verona18819,Verona,h,1.797,1.918,0,3,12,16,2,2,0,Fiorentina,0,1,0,0,0


In [167]:
spa_df = matchstat_generator(spa_active, spa_df)
with pd.ExcelWriter("../Excel Dataframes/LaLiga_understat.xlsx") as writer:
    spa_df.to_excel(writer, sheet_name="gamesheet")
spa_df

Unnamed: 0,ID,Team,H/A,xG,xGC,G,GC,S,SC,QS,QSC,Pts,Opp,BTTS,Blank,CS,first_goal,first_conc
0,Real Sociedad22924,Real Sociedad,h,2.511,0.705,0,1,21,9,4,1,0,Osasuna,0,1,0,0,0
0,Osasuna22924,Osasuna,a,0.705,2.511,1,0,9,21,1,4,3,Real Sociedad,0,0,1,0,0
0,Real Madrid22923,Real Madrid,h,3.412,0.180,4,0,16,5,4,0,3,Girona,0,0,1,0,0
0,Girona22923,Girona,a,0.180,3.412,0,4,5,16,0,4,0,Real Madrid,0,1,0,0,0
0,Getafe22922,Getafe,h,2.408,1.273,3,2,10,8,2,3,3,Celta Vigo,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,Real Valladolid19204,Real Valladolid,a,0.247,2.803,1,1,5,24,0,5,1,Elche,1,0,0,0,0
0,Celta Vigo19203,Celta Vigo,h,2.138,0.697,3,0,15,13,4,0,3,Rayo Vallecano,0,0,1,0,0
0,Rayo Vallecano19203,Rayo Vallecano,a,0.697,2.138,0,3,13,15,0,4,0,Celta Vigo,0,1,0,0,0
0,Athletic Club19202,Athletic Club,h,1.153,0.725,0,1,15,10,2,1,0,Barcelona,0,1,0,0,0
