In [159]:
import pandas as pd
import numpy as np
import requests
import time
from io import StringIO
from bs4 import BeautifulSoup

FBref Data: 2023-2024 Premier League standard player stats

In [147]:
# Scrape the "stats_standard" table from the FBref 2023-2024 Premier League
# stats page and attach the per-player value of the data-append-csv attribute.
url = "https://fbref.com/en/comps/9/2023-2024/stats/2023-2024-Premier-League-Stats"
with requests.Session() as session:
    response = session.get(url, timeout=30)
    # Fail loudly on 4xx/5xx responses instead of parsing an error page.
    response.raise_for_status()
    # Strip HTML comment markers so a table wrapped in <!-- ... --> is visible to the parser.
    html = response.text.replace("<!--", "").replace("-->", "")

soup = BeautifulSoup(html, "html.parser")
table = soup.find("table", {"id": "stats_standard"})
if table is None:
    # Without this guard the next line fails with an opaque AttributeError on None.
    raise ValueError(f"Table 'stats_standard' not found at {url}")

# One ID per player cell, in document order.
ids = [cell["data-append-csv"] for cell in table.find_all("td", {"data-stat": "player"})]

df = pd.read_html(StringIO(str(table)))[0]
# Flatten the two-level header: "<group> <stat>", or just "<stat>" when the group is an "Unnamed" placeholder.
df.columns = [f"{group} {stat}" if "Unnamed" not in group else stat for group, stat in df.columns]
# Drop the header rows repeated inside the table body; copy so the assignment
# below writes to an independent frame rather than to a slice.
df = df.loc[df["Rk"] != "Rk"].copy()
df["IDs"] = ids

   Rk              Player   Nation    Pos            Squad Age  Born  \
0   1          Max Aarons  eng ENG     DF      Bournemouth  23  2000   
1   2   Joshua Acheampong  eng ENG     DF          Chelsea  17  2006   
2   3  Bénie Adama Traore   ci CIV  FW,MF    Sheffield Utd  20  2002   
3   4         Tyler Adams   us USA     MF      Bournemouth  24  1999   
4   5    Tosin Adarabioyo  eng ENG     DF           Fulham  25  1997   
5   6      Elijah Adebayo  eng ENG     FW       Luton Town  25  1998   
6   7       Simon Adingra   ci CIV     FW         Brighton  21  2002   
7   8        Nayef Aguerd   ma MAR     DF         West Ham  27  1996   
8   9    Brandon Aguilera   cr CRC     FW  Nott'ham Forest  20  2003   
9  10    Naouirou Ahamada   fr FRA  MF,FW   Crystal Palace  21  2002   

  Playing Time MP Playing Time Starts Playing Time Min  ...  \
0              20                  13             1237  ...   
1               1                   0                6  ...   
2               8 

In [148]:
# Output locations for the season-level player and team CSV files.
player_csv_file_path = "C:/Users/erknud3/fpl-optimization/model/data/player_season_data.csv"
teams_csv_file_path = "C:/Users/erknud3/fpl-optimization/model/data/team_season_data.csv"

# Write the player frame without its row index; the team export stays disabled.
df.to_csv(path_or_buf=player_csv_file_path, index=False)
#teams.to_csv(teams_csv_file_path, index=False)

FPL Data

In [166]:
# Download the FPL bootstrap payload and load its "elements" list into a DataFrame.
r = requests.get("https://fantasy.premierleague.com/api/bootstrap-static/", timeout=30)
# Surface HTTP errors here rather than as a confusing JSON decode failure below.
r.raise_for_status()
fpl_data = r.json()
elements = pd.DataFrame(fpl_data["elements"])

In [167]:
# Keep only the listed columns. The explicit copy makes `elements` an
# independent frame, so later column assignments do not write into a slice of
# the original frame (which triggers pandas' SettingWithCopyWarning).
elements = elements[['id', 'web_name', 'element_type', 'now_cost', 'selected_by_percent']].copy()

In [168]:
# Translate the numeric element_type code into a position label in one pass.
# Codes that are not 1-4 have no entry in the mapping and are left as NaN.
elements['position'] = elements['element_type'].map(
    {1: 'GKP', 2: 'DEF', 3: 'MID', 4: 'FWD'}
)

In [170]:
# Shorten the "selected_by_percent" column name to "tsb".
elements = elements.rename(columns={"selected_by_percent": "tsb"})

In [171]:
# Write the trimmed FPL player frame to disk without its row index.
fpl_csv_file_path = "C:/Users/erknud3/fpl-optimization/model/data/fpl_data.csv"
elements.to_csv(path_or_buf=fpl_csv_file_path, index=False)

Season Data: FBref player and squad tables for 2023-2024

In [108]:
def fetch_data(url, table_id, timeout=30):
    """Download a page and return the HTML table with the given ``id`` as a DataFrame.

    HTML comment markers are stripped from the page text before parsing, so a
    table wrapped in ``<!-- ... -->`` is still found. The table's first column
    is read as the index and then reset back into a regular column; that column
    is renamed to ``"ID"`` when it carries the default name ``"index"``.

    Args:
        url: Address of the page to download.
        table_id: Value of the ``id`` attribute of the ``<table>`` to extract.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The first table matching ``table_id``, as a ``pandas.DataFrame``.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
        ValueError: Raised by ``pandas.read_html`` when no matching table is found.
    """
    # perf_counter is monotonic, so the reported duration cannot be skewed by clock changes.
    start_time = time.perf_counter()

    response = requests.get(url, timeout=timeout)
    # Stop on an HTTP error instead of trying to parse an error page.
    response.raise_for_status()
    html_text = response.text.replace('<!--', '').replace('-->', '')
    df = pd.read_html(StringIO(html_text), attrs={'id': table_id}, index_col=[0])[0]
    df = df.reset_index().rename(columns={"index": "ID"})
    end_time = time.perf_counter()
    print(f"Done fetching FBREF data from table_id: {table_id} in {end_time - start_time:.2f} seconds")

    return df

In [109]:
# Source page holding both season-level tables.
url = "https://fbref.com/en/comps/9/2023-2024/stats/2023-2024-Premier-League-Stats"

# Pull the player table first, then the squad "against" table, from that page.
players = fetch_data(url=url, table_id="stats_standard")
teams = fetch_data(url=url, table_id="stats_squads_standard_against")

Done fetching FBREF data from table_id: stats_standard in 0.38 seconds
Done fetching FBREF data from table_id: stats_squads_standard_against in 0.38 seconds


In [115]:
# Download the page at `url`.
page = requests.get(url, timeout=30)
# Stop here on an HTTP error rather than parsing an error page.
page.raise_for_status()
# Remove HTML comment markers so tables wrapped in comments are exposed to the parser.
html_text = page.text.replace('<!--', '').replace('-->', '')

In [119]:
# Parse the comment-stripped page text with the built-in HTML parser.
soup = BeautifulSoup(html_text, features='html.parser')

In [131]:
# Collect every <table> element whose id attribute is "stats_standard".
# NOTE(review): `data` is not used by any later cell visible here; confirm it is still needed.
data = soup.select('table[id="stats_standard"]')

In [91]:
# Drop rows whose ID value is the literal header text 'Rk'.
is_data_row = players['ID'].ne('Rk')
players = players[is_data_row]

In [92]:
# Top-level column groups to keep for each frame.
player_column_groups = ['Unnamed: 1_level_0', 'Playing Time', 'Per 90 Minutes']
team_column_groups = ['ID', 'Per 90 Minutes']

players = players[player_column_groups]
teams = teams[team_column_groups]

In [93]:
# Collapse each frame's header to its second level.
# NOTE(review): a column whose second-level label is empty ends up with an empty name - confirm 'ID' in `teams` is unaffected.
for frame in (players, teams):
    frame.columns = frame.columns.get_level_values(1)

In [94]:
# Output locations for the season-level player and team CSV files.
player_csv_file_path = "C:/Users/erknud3/fpl-optimization/model/data/player_season_data.csv"
teams_csv_file_path = "C:/Users/erknud3/fpl-optimization/model/data/team_season_data.csv"

# Write both frames without their row index.
players.to_csv(path_or_buf=player_csv_file_path, index=False)
teams.to_csv(path_or_buf=teams_csv_file_path, index=False)

Match Logs Data

In [None]:
# Match-log page to fetch next; the URL points at Bukayo Saka's 2023-2024 logs.
# Note: this rebinds the `url` name used by the season-data cells.
url = "https://fbref.com/en/players/bc7dc64d/matchlogs/2023-2024/c9/Bukayo-Saka-Match-Logs"

In [7]:
# Fetch the table with id "matchlogs_2023-2024_9" from the match-log page.
df = fetch_data(url=url, table_id="matchlogs_2023-2024_9")

Done fetching FBREF data from table_id: matchlogs_2023-2024_9 in 0.10 seconds


  df = pd.read_html(html_text, attrs={'id': table_id}, index_col=[0])[0]


In [9]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how="any")

In [17]:
# Keep only the listed match-log columns, then display the result.
match_log_columns = ['Round', 'Start', 'Min', 'Gls', 'Ast', 'PK', 'xG', 'npxG', 'xAG']
df = df[match_log_columns]
df

Date,Round,Start,Min,Gls,Ast,PK,xG,npxG,xAG
2023-08-12,Matchweek 1,Y,90.0,1.0,0.0,0.0,0.2,0.2,0.1
2023-08-21,Matchweek 2,Y,88.0,0.0,0.0,0.0,0.2,0.2,0.2
2023-08-26,Matchweek 3,Y,90.0,1.0,0.0,1.0,1.4,0.6,0.5
2023-09-03,Matchweek 4,Y,90.0,0.0,1.0,0.0,0.4,0.4,0.4
2023-09-17,Matchweek 5,Y,90.0,0.0,1.0,0.0,0.0,0.0,0.1
2023-09-24,Matchweek 6,Y,89.0,1.0,0.0,1.0,0.9,0.1,0.5
2023-09-30,Matchweek 7,Y,75.0,1.0,0.0,0.0,1.0,1.0,0.0
2023-10-21,Matchweek 9,Y,90.0,0.0,1.0,0.0,0.0,0.0,0.3
2023-10-28,Matchweek 10,Y*,72.0,0.0,0.0,0.0,0.0,0.0,0.1
2023-11-04,Matchweek 11,Y,90.0,0.0,0.0,0.0,0.0,0.0,0.0
