# Team Data Scrape

We now scrape the team data.

In [39]:
from bs4 import BeautifulSoup
import requests
import pandas as pd 
import os
import time

In [28]:
def get_url_final(code, year_range, team):
    """Download a team's Premier League match-log page and return its table body.

    Parameters
    ----------
    code : str
        Squad identifier inserted into the FBref URL (e.g. ``'822bd0ba'``).
    year_range : str
        Season inserted into the URL (e.g. ``'2023-2024'``).
    team : str
        Team name inserted into the URL slug (e.g. ``'Liverpool'``).

    Returns
    -------
    The first ``<tbody>`` element of the page, or ``None`` when the page
    contains no ``<tbody>``.

    Raises
    ------
    requests.exceptions.Timeout
        If the server does not answer within 30 seconds.
    requests.exceptions.HTTPError
        If the server answers with 429 (rate limited) or a 5xx error.
    """
    base_url = 'https://fbref.com/en/squads/{}/{}/matchlogs/c9/schedule/{}-Scores-and-Fixtures-Premier-League'
    url = base_url.format(code, year_range, team)

    # A timeout keeps a stalled connection from blocking the scrape forever.
    response = requests.get(url, timeout=30)

    # A rate-limit or server-error page has no match table; without this check
    # it would be indistinguishable from "no data for this season" and the
    # season would be skipped silently. Other statuses keep the old behaviour.
    if response.status_code == 429 or response.status_code >= 500:
        response.raise_for_status()

    soup = BeautifulSoup(response.text, 'lxml')
    return soup.find('tbody')

In [7]:
#updated functions
def get_dat(dat, col):
    """Return the text of one ``<td>`` column for every attribute-free row.

    Parameters
    ----------
    dat : object exposing ``find_all`` (here, the scraped table body)
    col : str
        Value of the ``data-stat`` attribute identifying the column.

    Returns
    -------
    list of str, one entry per ``<tr>`` that carries no attributes.
    """
    # Rows that carry any attribute are skipped (presumably header/spacer
    # rows in the table; verify against the page markup).
    plain_rows = (tr for tr in dat.find_all('tr') if not tr.attrs)
    return [tr.find('td', {'data-stat': col}).text for tr in plain_rows]

def get_date(dat, col):
    """Return the text of one ``<th>`` column for every attribute-free row.

    Same contract as ``get_dat``, except that the cell is looked up as a
    ``<th>`` element instead of a ``<td>``.

    Parameters
    ----------
    dat : object exposing ``find_all`` (here, the scraped table body)
    col : str
        Value of the ``data-stat`` attribute identifying the column.

    Returns
    -------
    list of str, one entry per ``<tr>`` that carries no attributes.
    """
    header_cells = []
    for tr in dat.find_all('tr'):
        if tr.attrs:
            # Rows with attributes are not read.
            continue
        header_cells.append(tr.find('th', {'data-stat': col}))
    return [cell.text for cell in header_cells]

def get_matchweek(dat, col):
    """Return one ``<td>`` column with the ``'Matchweek '`` prefix removed.

    Parameters
    ----------
    dat : object exposing ``find_all`` (here, the scraped table body)
    col : str
        Value of the ``data-stat`` attribute identifying the column.

    Returns
    -------
    list of str, one entry per ``<tr>`` that carries no attributes, e.g.
    ``'Matchweek 7'`` becomes ``'7'``.
    """
    raw_labels = [
        tr.find('td', {'data-stat': col}).text
        for tr in dat.find_all('tr')
        if not tr.attrs
    ]
    # Every occurrence of the literal 'Matchweek ' is stripped from each label.
    return [label.replace('Matchweek ', '') for label in raw_labels]

In [29]:
def get_data(code, year_range, team):
    """Scrape one team-season and return it as a DataFrame, one row per match.

    Parameters
    ----------
    code : str
        Squad identifier passed through to ``get_url_final``.
    year_range : str
        Season passed through to ``get_url_final`` (e.g. ``'2023-2024'``).
    team : str
        Team name passed through to ``get_url_final``.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: Date, Time, Matchweek, Day, Venue, Result,
        Goals Scored, Goals Conceded, Opponent, xG, xGA, Possession,
        Attendance, Captain, Formation, Opposition Formation, Referee.
        Every value is the cell text as scraped (a string).
    """
    table_body = get_url_final(code, year_range, team)

    # The first three columns each need their own extractor: the date lives in
    # a <th> cell and the matchweek label has its prefix stripped.
    columns = {
        'Date': get_date(table_body, 'date'),
        'Time': get_dat(table_body, 'start_time'),
        'Matchweek': get_matchweek(table_body, 'round'),
    }

    # Remaining columns are plain <td> cells: (output name, data-stat key),
    # listed in the order they appear in the final dataframe.
    plain_columns = (
        ('Day', 'dayofweek'),
        ('Venue', 'venue'),
        ('Result', 'result'),
        ('Goals Scored', 'goals_for'),
        ('Goals Conceded', 'goals_against'),
        ('Opponent', 'opponent'),
        ('xG', 'xg_for'),
        ('xGA', 'xg_against'),
        ('Possession', 'possession'),
        ('Attendance', 'attendance'),
        ('Captain', 'captain'),
        ('Formation', 'formation'),
        ('Opposition Formation', 'opp_formation'),
        ('Referee', 'referee'),
    )
    for label, stat_key in plain_columns:
        columns[label] = get_dat(table_body, stat_key)

    #create dataframe
    return pd.DataFrame(columns)
    

In [34]:
# Quick check of get_data on a single team-season (Liverpool, 2023-2024).
test = get_data(code='822bd0ba', year_range='2023-2024', team='Liverpool')

Now we write the `compile_dat` function, so we can get the data for every season of a team in one fell swoop.

In [55]:
def compile_dat(code, team, seasons=None, pause=60):
    """Scrape every requested season for one team and stack them into one frame.

    Parameters
    ----------
    code : str
        Squad identifier passed to ``get_url_final`` / ``get_data``.
    team : str
        Team name passed to ``get_url_final`` / ``get_data``.
    seasons : iterable of str, optional
        Seasons to scrape, e.g. ``('2023-2024', '2022-2023')``. Defaults to
        the seven seasons from 2023-2024 back to 2017-2018.
    pause : int or float, optional
        Seconds to wait after each season, so that FBref does not ban us.
        Defaults to 60.

    Returns
    -------
    pandas.DataFrame
        All seasons that returned a table, concatenated in the order given,
        with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If none of the requested seasons returned a table.
    """
    if seasons is None:
        #these are the seasons for which we have data
        seasons = ('2023-2024', '2022-2023', '2021-2022', '2020-2021',
                   '2019-2020', '2018-2019', '2017-2018')

    #season -> dataframe for every season the team has data for
    dataframes = {}

    for year in seasons:
        # get_url_final returns None when the page has no table (the team was
        # not in the league that season); such seasons are skipped.
        # NOTE: get_data downloads the same page again, so a season with data
        # costs two requests.
        if get_url_final(code, year, team) is not None:
            dataframes[year] = get_data(code, year, team)

        # Pause after every season, including skipped ones: the probe above is
        # a real request too, and unpaced requests risk a ban from FBref.
        time.sleep(pause)

    if not dataframes:
        # pd.concat would raise an unhelpful "No objects to concatenate".
        raise ValueError(
            'No data found for team {!r} (code {!r}) in seasons {}'.format(
                team, code, list(seasons)))

    #concatenate all of the dataframes, and return one final concatenated dataframe
    return pd.concat(dataframes.values(), join="inner", ignore_index=True)

In [40]:
# Liverpool: scrape every available season into a single dataframe.
liverpool_teamdat = compile_dat(code='822bd0ba', team='Liverpool')

So the code above works to get data for teams that have been in the league from 2017 until 2024. However, it won't work if we want to get data for teams that have only been in the league for some of the seasons that we have data for. Because of this, we need to build this into the code. 

In [56]:
# compile_dat now checks whether the scraped page is None (it is None when the
# team was not in the Premier League for that particular season). We verify
# that this works by getting the data for Nottingham Forest, who were not in
# the league for all 7 seasons.

nfo_teamdat = compile_dat(code='e4a775cb', team='Nottingham Forest')

In [None]:
# The code above worked, so we now create a folder to hold the per-team CSV
# files (no error if the folder already exists).

nest_folder_team = "Team_Data"
os.makedirs(name=nest_folder_team, exist_ok=True)

# Save the two teams scraped so far; each is written only if it is a DataFrame.
if isinstance(liverpool_teamdat, pd.DataFrame):
    liverpool_teamdat.to_csv(
        path_or_buf=os.path.join(nest_folder_team, "liverpool_teamdat.csv")
    )

if isinstance(nfo_teamdat, pd.DataFrame):
    nfo_teamdat.to_csv(
        path_or_buf=os.path.join(nest_folder_team, "nfo_teamdat.csv")
    )

In [61]:
# Arsenal: scrape all seasons, then save the result as a CSV file.
arsenal_teamdat = compile_dat(code='18bb7c10', team='Arsenal')
if isinstance(arsenal_teamdat, pd.DataFrame):
    arsenal_teamdat.to_csv(
        path_or_buf=os.path.join(nest_folder_team, "arsenal_teamdat.csv")
    )

In [62]:
# Aston Villa: scrape all seasons, then save the result as a CSV file.
astonvilla_teamdat = compile_dat(code='8602292d', team='Aston Villa')
if isinstance(astonvilla_teamdat, pd.DataFrame):
    astonvilla_teamdat.to_csv(
        path_or_buf=os.path.join(nest_folder_team, "astonvilla_teamdat.csv")
    )

In [63]:
# Bournemouth: scrape all seasons, then save the result as a CSV file.
bournemouth_teamdat = compile_dat(code='4ba7cbea', team='Bournemouth')
if isinstance(bournemouth_teamdat, pd.DataFrame):
    bournemouth_teamdat.to_csv(
        path_or_buf=os.path.join(nest_folder_team, "bournemouth_teamdat.csv")
    )

In [64]:
# Brighton and Hove Albion: scrape all seasons, then save the result as a CSV file.
bha_teamdat = compile_dat(code='d07537b9', team='Brighton and Hove Albion')
if isinstance(bha_teamdat, pd.DataFrame):
    bha_teamdat.to_csv(
        path_or_buf=os.path.join(nest_folder_team, "bha_teamdat.csv")
    )

In [65]:
# Burnley: scrape all seasons, then save the result as a CSV file.
burnley_teamdat = compile_dat(code='943e8050', team='Burnley')
if isinstance(burnley_teamdat, pd.DataFrame):
    burnley_teamdat.to_csv(
        path_or_buf=os.path.join(nest_folder_team, "burnley_teamdat.csv")
    )

In [66]:
# Chelsea: scrape all seasons, then save the result as a CSV file.
chelsea_teamdat = compile_dat(code='cff3d9bb', team='Chelsea')
if isinstance(chelsea_teamdat, pd.DataFrame):
    chelsea_teamdat.to_csv(
        path_or_buf=os.path.join(nest_folder_team, "chelsea_teamdat.csv")
    )

In [67]:
# Crystal Palace: scrape all seasons, then save the result as a CSV file.
cpa_teamdat = compile_dat(code='47c64c55', team='Crystal Palace')
if isinstance(cpa_teamdat, pd.DataFrame):
    cpa_teamdat.to_csv(
        path_or_buf=os.path.join(nest_folder_team, "cpa_teamdat.csv")
    )

In [68]:
# Everton: scrape all seasons, then save the result as a CSV file.
everton_teamdat = compile_dat(code='d3fd31cc', team='Everton')
if isinstance(everton_teamdat, pd.DataFrame):
    everton_teamdat.to_csv(
        path_or_buf=os.path.join(nest_folder_team, "everton_teamdat.csv")
    )

In [69]:
# Huddersfield Town: scrape all seasons, then save the result as a CSV file.
huddersfield_teamdat = compile_dat(code='f5922ca5', team='Huddersfield Town')
if isinstance(huddersfield_teamdat, pd.DataFrame):
    huddersfield_teamdat.to_csv(
        path_or_buf=os.path.join(nest_folder_team, "huddersfield_teamdat.csv")
    )

In [70]:
# Leicester City: scrape all seasons, then save the result as a CSV file.
leicester_teamdat = compile_dat(code='a2d435b3', team='Leicester City')
if isinstance(leicester_teamdat, pd.DataFrame):
    leicester_teamdat.to_csv(
        path_or_buf=os.path.join(nest_folder_team, "leicester_teamdat.csv")
    )

In [71]:
# Manchester City: scrape all seasons, then save the result as a CSV file.
mancity_teamdat = compile_dat(code='b8fd03ef', team='Manchester City')
if isinstance(mancity_teamdat, pd.DataFrame):
    mancity_teamdat.to_csv(
        path_or_buf=os.path.join(nest_folder_team, "mancity_teamdat.csv")
    )

In [72]:
# Manchester United: scrape all seasons, then save the result as a CSV file.
manunited_teamdat = compile_dat(code='19538871', team='Manchester United')
if isinstance(manunited_teamdat, pd.DataFrame):
    manunited_teamdat.to_csv(
        path_or_buf=os.path.join(nest_folder_team, "manunited_teamdat.csv")
    )

In [73]:
# Newcastle United: scrape all seasons, then save the result as a CSV file.
newcastle_teamdat = compile_dat(code='b2b47a98', team='Newcastle United')
if isinstance(newcastle_teamdat, pd.DataFrame):
    newcastle_teamdat.to_csv(
        path_or_buf=os.path.join(nest_folder_team, "newcastle_teamdat.csv")
    )

In [74]:
# Southampton: scrape all seasons, then save the result as a CSV file.
southampton_teamdat = compile_dat(code='33c895d4', team='Southampton')
if isinstance(southampton_teamdat, pd.DataFrame):
    southampton_teamdat.to_csv(
        path_or_buf=os.path.join(nest_folder_team, "southampton_teamdat.csv")
    )

In [75]:
# Stoke City: scrape all seasons, then save the result as a CSV file.
stoke_teamdat = compile_dat(code='17892952', team='Stoke City')
if isinstance(stoke_teamdat, pd.DataFrame):
    stoke_teamdat.to_csv(
        path_or_buf=os.path.join(nest_folder_team, "stoke_teamdat.csv")
    )

In [76]:
# Swansea City: scrape all seasons, then save the result as a CSV file.
swansea_teamdat = compile_dat(code='fb10988f', team='Swansea City')
if isinstance(swansea_teamdat, pd.DataFrame):
    swansea_teamdat.to_csv(
        path_or_buf=os.path.join(nest_folder_team, "swansea_teamdat.csv")
    )

In [77]:
# Tottenham Hotspur: scrape all seasons, then save the result as a CSV file.
spurs_teamdat = compile_dat(code='361ca564', team='Tottenham Hotspur')
if isinstance(spurs_teamdat, pd.DataFrame):
    spurs_teamdat.to_csv(
        path_or_buf=os.path.join(nest_folder_team, "spurs_teamdat.csv")
    )

In [78]:
# Watford: scrape all seasons, then save the result as a CSV file.
watford_teamdat = compile_dat(code='2abfe087', team='Watford')
if isinstance(watford_teamdat, pd.DataFrame):
    watford_teamdat.to_csv(
        path_or_buf=os.path.join(nest_folder_team, "watford_teamdat.csv")
    )

In [79]:
# West Bromwich Albion: scrape all seasons, then save the result as a CSV file.
westbrom_teamdat = compile_dat(code='60c6b05f', team='West Bromwich Albion')
if isinstance(westbrom_teamdat, pd.DataFrame):
    westbrom_teamdat.to_csv(
        path_or_buf=os.path.join(nest_folder_team, "westbrom_teamdat.csv")
    )

In [80]:
# West Ham United: scrape all seasons, then save the result as a CSV file.
westham_teamdat = compile_dat(code='7c21e445', team='West Ham United')
if isinstance(westham_teamdat, pd.DataFrame):
    westham_teamdat.to_csv(
        path_or_buf=os.path.join(nest_folder_team, "westham_teamdat.csv")
    )

In [81]:
# Wolverhampton Wanderers: scrape all seasons, then save the result as a CSV file.
wolves_teamdat = compile_dat(code='8cec06e1', team='Wolverhampton Wanderers')
if isinstance(wolves_teamdat, pd.DataFrame):
    wolves_teamdat.to_csv(
        path_or_buf=os.path.join(nest_folder_team, "wolves_teamdat.csv")
    )

In [82]:
# Fulham: scrape all seasons, then save the result as a CSV file.
fulham_teamdat = compile_dat(code='fd962109', team='Fulham')
if isinstance(fulham_teamdat, pd.DataFrame):
    fulham_teamdat.to_csv(
        path_or_buf=os.path.join(nest_folder_team, "fulham_teamdat.csv")
    )

In [83]:
# Cardiff City: scrape all seasons, then save the result as a CSV file.
cardiff_teamdat = compile_dat(code='75fae011', team='Cardiff-City')
if isinstance(cardiff_teamdat, pd.DataFrame):
    cardiff_teamdat.to_csv(
        path_or_buf=os.path.join(nest_folder_team, "cardiff_teamdat.csv")
    )

In [84]:
# Norwich City: scrape all seasons, then save the result as a CSV file.
norwich_teamdat = compile_dat(code='1c781004', team='Norwich City')
if isinstance(norwich_teamdat, pd.DataFrame):
    norwich_teamdat.to_csv(
        path_or_buf=os.path.join(nest_folder_team, "norwich_teamdat.csv")
    )

In [85]:
# Sheffield United: scrape all seasons, then save the result as a CSV file.
sheffield_teamdat = compile_dat(code='1df6b87e', team='Sheffield United')
if isinstance(sheffield_teamdat, pd.DataFrame):
    sheffield_teamdat.to_csv(
        path_or_buf=os.path.join(nest_folder_team, "sheffield_teamdat.csv")
    )

In [86]:
# Leeds United: scrape all seasons, then save the result as a CSV file.
leeds_teamdat = compile_dat(code='5bfb9659', team='Leeds United')
if isinstance(leeds_teamdat, pd.DataFrame):
    leeds_teamdat.to_csv(
        path_or_buf=os.path.join(nest_folder_team, "leeds_teamdat.csv")
    )

In [88]:
# Brentford: scrape all seasons, then save the result as a CSV file.
brentford_teamdat = compile_dat(code='cd051869', team='Brentford')
if isinstance(brentford_teamdat, pd.DataFrame):
    brentford_teamdat.to_csv(
        path_or_buf=os.path.join(nest_folder_team, "brentford_teamdat.csv")
    )

In [89]:
# Luton Town: scrape all seasons, then save the result as a CSV file.
luton_teamdat = compile_dat(code='e297cd13', team='Luton Town')
if isinstance(luton_teamdat, pd.DataFrame):
    luton_teamdat.to_csv(
        path_or_buf=os.path.join(nest_folder_team, "luton_teamdat.csv")
    )

Now we need to combine all of the per-team CSV files into a single file.

In [90]:
# Combine every per-team CSV in Team_Data into one dataframe and write it to
# team_finaldat.csv in the working directory.
team_path = 'Team_Data'

# Fail with a clear message if the folder is missing; previously this surfaced
# later as a NameError on final_df.
if not os.path.isdir(team_path):
    raise FileNotFoundError(
        'Folder {!r} not found; run the per-team scrape first'.format(team_path))

team_dataframe = []

# Files are read in os.listdir order and deliberately NOT sorted: the cell
# that adds the 'Team' column relies on the rows being in directory order.
for csv_name in os.listdir(team_path):
    if csv_name.endswith('.csv'):
        team_dataframe.append(pd.read_csv(os.path.join(team_path, csv_name)))

if not team_dataframe:
    # pd.concat would raise an unhelpful "No objects to concatenate".
    raise ValueError('No .csv files found in {!r}'.format(team_path))

final_df = pd.concat(team_dataframe, ignore_index=True)

final_df.to_csv(os.path.join("", "team_finaldat.csv"))

In [101]:
# Load team_finaldat.csv back in, using its first column as the index.
team_finaldat = pd.read_csv('team_finaldat.csv', index_col=0)
# `df` is kept as a second name for the same dataframe.
df = team_finaldat

In [102]:
# Preview the first five rows.
team_finaldat.head(5)

Unnamed: 0.1,Unnamed: 0,Date,Time,Matchweek,Day,Venue,Result,Goals Scored,Goals Conceded,Opponent,xG,xGA,Possession,Attendance,Captain,Formation,Opposition Formation,Referee
0,0,2023-08-12,15:00,1,Sat,Away,D,1,1,Bournemouth,1.1,1.3,38,11245,Kurt Zouma,4-2-3-1,4-2-3-1,Peter Bankes
1,1,2023-08-20,16:30,2,Sun,Home,W,3,1,Chelsea,1.8,2.5,25,62451,Kurt Zouma,4-2-3-1,3-4-3,John Brooks
2,2,2023-08-26,17:30,3,Sat,Away,W,3,1,Brighton,3.0,1.5,22,31508,Kurt Zouma,4-1-4-1,4-2-3-1,Anthony Taylor
3,3,2023-09-01,20:00,4,Fri,Away,W,2,1,Luton Town,1.0,1.4,61,10802,Kurt Zouma,4-2-3-1,5-3-2,Paul Tierney
4,4,2023-09-16,15:00,5,Sat,Home,L,1,3,Manchester City,0.9,3.6,32,62475,Kurt Zouma,4-2-3-1,4-2-3-1,Andy Madley


In [None]:
# Remove the leftover 'Unnamed: 0' column.
team_finaldat = team_finaldat.drop(labels=['Unnamed: 0'], axis='columns')

In [106]:
# Double-check that the dataframe has the expected number of rows.
len(team_finaldat)

# The dataframe has 5320 rows. Each season has 380 games and we have 7 seasons
# of data, which gives 2660 games. Every game appears twice, once with each
# side as the 'Team' and the other as the 'Opponent', so 2 * 2660 = 5320
# observations is what we expect.

5320

We also notice that there is no column in the dataframe corresponding to the team. We should add this in to make it easier to integrate this data back into the player data we have. 

In [None]:
# List the files in the order os.listdir reports them; the concatenation loop
# iterated os.listdir(team_path) too, so this is the order in which the files
# were appended to the final dataframe.
os.listdir(path=team_path)

['westham_teamdat.csv',
 'norwich_teamdat.csv',
 'everton_teamdat.csv',
 'fulham_teamdat.csv',
 'nfo_teamdat.csv',
 'cpa_teamdat.csv',
 'burnley_teamdat.csv',
 'southampton_teamdat.csv',
 'stoke_teamdat.csv',
 'cardiff_teamdat.csv',
 'newcastle_teamdat.csv',
 'brentford_teamdat.csv',
 'astonvilla_teamdat.csv',
 'westbrom_teamdat.csv',
 'liverpool_teamdat.csv',
 'wolves_teamdat.csv',
 'huddersfield_teamdat.csv',
 'chelsea_teamdat.csv',
 'watford_teamdat.csv',
 'spurs_teamdat.csv',
 'bournemouth_teamdat.csv',
 'leicester_teamdat.csv',
 'leeds_teamdat.csv',
 'mancity_teamdat.csv',
 'bha_teamdat.csv',
 'manunited_teamdat.csv',
 'luton_teamdat.csv',
 'sheffield_teamdat.csv',
 'swansea_teamdat.csv',
 'arsenal_teamdat.csv']

In [133]:
# Add a 'Team' column to team_finaldat, labelling each block of rows with the
# team whose CSV file the rows came from.

# Team name to store in the 'Team' feature, keyed by per-team CSV file name.
# Keying by file name (rather than by position in a hand-typed list) keeps
# the labels correct even if the directory listing order changes.
team_names_by_file = {
    'westham_teamdat.csv': 'West Ham',
    'norwich_teamdat.csv': 'Norwich City',
    'everton_teamdat.csv': 'Everton',
    'fulham_teamdat.csv': 'Fulham',
    'nfo_teamdat.csv': "Nott'ham Forest",
    'cpa_teamdat.csv': 'Crystal Palace',
    'burnley_teamdat.csv': 'Burnley',
    'southampton_teamdat.csv': 'Southampton',
    'stoke_teamdat.csv': 'Stoke City',
    'cardiff_teamdat.csv': 'Cardiff City',
    'newcastle_teamdat.csv': 'Newcastle Utd',
    'brentford_teamdat.csv': 'Brentford',
    'astonvilla_teamdat.csv': 'Aston Villa',
    'westbrom_teamdat.csv': 'West Brom',
    'liverpool_teamdat.csv': 'Liverpool',
    'wolves_teamdat.csv': 'Wolves',
    'huddersfield_teamdat.csv': 'Huddersfield',
    'chelsea_teamdat.csv': 'Chelsea',
    'watford_teamdat.csv': 'Watford',
    'spurs_teamdat.csv': 'Tottenham',
    'bournemouth_teamdat.csv': 'Bournemouth',
    'leicester_teamdat.csv': 'Leicester City',
    'leeds_teamdat.csv': 'Leeds United',
    'mancity_teamdat.csv': 'Manchester City',
    'bha_teamdat.csv': 'Brighton',
    'manunited_teamdat.csv': 'Manchester Utd',
    'luton_teamdat.csv': 'Luton Town',
    'sheffield_teamdat.csv': 'Sheffield Utd',
    'swansea_teamdat.csv': 'Swansea City',
    'arsenal_teamdat.csv': 'Arsenal',
}

# Only .csv files are considered, matching the filter used when the files
# were concatenated. NOTE: this still assumes os.listdir returns the files in
# the same order as it did at concatenation time.
team_files = [name for name in os.listdir(team_path) if name.endswith('.csv')]

unknown_files = [name for name in team_files if name not in team_names_by_file]
if unknown_files:
    raise KeyError('No team name known for file(s): {}'.format(unknown_files))

#get the number of rows in each of the csv files above
team_rows = [
    pd.read_csv(os.path.join(team_path, name)).shape[0] for name in team_files
]

#team names to input as values in the 'Team' feature, in file order
team_list = [team_names_by_file[name] for name in team_files]

# The per-file row counts must add up to the combined dataframe; otherwise
# the labels would be shifted onto the wrong rows without any warning.
if sum(team_rows) != len(team_finaldat):
    raise ValueError(
        'Row counts of the team files ({}) do not match team_finaldat ({})'.format(
            sum(team_rows), len(team_finaldat)))

#now we construct the 'Team' feature: each team name repeated once per row of its file
team_labels = []
for team, n in zip(team_list, team_rows):
    team_labels.extend([team] * n)

# Assigning a list fills the column by position, independent of index labels.
team_finaldat['Team'] = team_labels

In [136]:
# Export the final version, now including the 'Team' feature.
team_finaldat.to_csv(path_or_buf=os.path.join("", "team_finaldat.csv"))