In [4]:
import json
import os
import pickle
import requests
from thefuzz import fuzz
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt



# Scraping Data
Scrape the data from understat for players and teams for the 2022/2023 season.


In [None]:
BASE_LEAGUE_URL = "https://understat.com/league/EPL/"
BASE_TEAM_URL = "https://understat.com/team/"
teams_list = ['Manchester_City', 'Arsenal','Manchester_United','Newcastle_United','Liverpool','Brighton','Aston_Villa', 'Tottenham','Brentford','Fulham','Crystal_Palace', 'Chelsea','Wolverhampton_Wanderers','West_Ham','Bournemouth','Nottingham_Forest','Everton', 'Leeds', 'Leicester', 'Southampton'   ]


def scrape_player_tags_season(season: str):
    """
    Takes a string with a league/season combination, e.g., "EPL_2020"
    to define a URL to be scraped.
    Returns a ResultSet object, class implemented by BeautifulSoup, with all
    the script tags in the URL.
    """
    URL = BASE_LEAGUE_URL + season
    response = requests.get(URL)
    print(response)
    soup = BeautifulSoup(response.content, "lxml")
    soup_scripts = soup.find_all("script")
    return soup_scripts

def scrape_team_tags_season(season: str, team: str):
    """
    Takes a string with a league/season combination, e.g., "EPL_2020"
    to define a URL to be scraped.
    Returns a ResultSet object, class implemented by BeautifulSoup, with all
    the script tags in the URL.
    """
    URL = BASE_TEAM_URL + team + '/' + season
    response = requests.get(URL)
    print(response)
    soup = BeautifulSoup(response.content, "lxml")
    soup_scripts = soup.find_all("script")
    return soup_scripts

def scrape_all_teams(season: str, teams: list):
    """
    Takes a string with a league/season combination, e.g., "EPL_2020"
    to define a URL to be scraped.
    Returns a ResultSet object, class implemented by BeautifulSoup, with all
    the script tags in the URL.
    """
    team_dict = {}
    for team in teams:
        team_tags = scrape_team_tags_season(season, team)
        team_dict[team] = generate_team_dict(team_tags)
    return team_dict

all_team_df = pd.DataFrame().from_dict(scrape_all_teams('2022', teams_list))
all_team_df = all_team_df.transpose()
print(all_team_df['situation'][0])

player_tags = scrape_player_tags_season("2022")

# Parse the data
Parse the data into a dataframe for players and teams for the 2022/2023 season.

In [None]:
def generate_players_dict(data):
    
    script = data[3].string
    start_index = script.index("('")+2
    end_index = script.index("')")
    json_string = script[start_index:end_index]
    json_string = json_string.encode("utf8").decode("unicode_escape")
    players_dict = json.loads(json_string)
    return players_dict

def generate_team_dict(data):
    
    script = data[2].string
    start_index = script.index("('")+2
    end_index = script.index("')")
    json_string = script[start_index:end_index]
    json_string = json_string.encode("utf8").decode("unicode_escape")
    team_dict = json.loads(json_string)
    return team_dict

def generate_team_df(all_team_df):
    situations = ['OpenPlay','FromCorner','SetPiece','DirectFreekick', 'Penalty']
    stats = ['shots', 'goals', 'xG']
    stats_dict = {}
    for team in all_team_df.transpose():
        stats_dict[team] = {'shots': 0, 'goals': 0, 'xG': 0, 'against_shots': 0, 'against_goals': 0, 'against_xG': 0}
        for situation in situations:
            for stat in stats:
                stats_dict[team][stat] += all_team_df['situation'][team][situation][stat]
                stats_dict[team]['against_' + stat] += all_team_df['situation'][team][situation]['against'][stat]
    
    team_df = pd.DataFrame().from_dict(stats_dict)
    team_df = team_df.transpose()
    return team_df

# Export and save the data

In [None]:
player_df = pd.DataFrame().from_dict(generate_players_dict(player_tags))
team_df = generate_team_df(all_team_df)

player_df.to_csv('player_overall_2022.csv')
team_df.to_csv('teams_overall_2022.csv')


# Format the data

Several columns in fpl_all_gw.csv need to be dropped. 
Add the corresponding player stats from player_overall_2022.csv and the corresponding opponnent stats from team_overall_2022.csv


In [61]:
player_df = pd.read_csv('player_overall_2022.csv')
team_df = pd.read_csv('teams_overall_2022.csv')
fpl_df = pd.read_csv('fpl_all_gw.csv')
id_df = pd.read_csv('team_ids.csv')

# Drop unwanted columns from the FPL dataframe
cols = fpl_df.columns.to_list()
wanted_cols = ['name', 'position', 'team', 'total_points','GW', 'opponent_team', 'bps']

for i in range(len(cols)):
    if cols[i] not in wanted_cols:
        fpl_df = fpl_df.drop(cols[i], axis=1)


# Replace FPL team names with Understat team names
ids = dict(zip(id_df['id'], id_df['team']))

for team in ids:
    fpl_df.replace({'opponent_team': team}, ids[team], inplace=True)



In [62]:
# Create a new dataframe with all the data we want to train on
training_df = pd.DataFrame()
fpl_cols = fpl_df.columns.to_list()
player_cols = player_df.columns.to_list()
team_cols = team_df.columns.to_list()

# Add the FPL data to the training dataframe
for i in range(len(fpl_cols)):
    training_df[fpl_cols[i]] = fpl_df[fpl_cols[i]]

# Create columns for the player data
for i in range(len(player_cols)):
    training_df[player_cols[i]] = 0

# Add the player data to the training dataframe for the corresponding player
# This is probably the least efficient way to do this but I'm tired and it worked
for i in range(len(training_df['name'])):
    for j in range(len(player_df['player_name'])):
        if fuzz.ratio(training_df['name'][i], player_df['player_name'][j]) > 90 and training_df['position'][i] != 'GK':
            for k in range(len(player_cols)):
                training_df[player_cols[k]][i] = player_df[player_cols[k]][j]

training_df.head(30)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_df[player_cols[k]][i] = player_df[player_cols[k]][j]


Unnamed: 0.1,name,position,team,bps,opponent_team,total_points,GW,Unnamed: 0,id,player_name,...,xA,shots,key_passes,yellow_cards,red_cards,team_title,npg,npxG,xGChain,xGBuildup
0,Nathan Redmond,S,Southampton,3,Tottenham,1,1,308,790,Nathan Redmond,...,0.0,0,0,0,0,Southampton,0,0.0,0.0,0.0
1,Junior Stanislas,S,Bournemouth,3,Aston_Villa,1,1,275,463,Junior Stanislas,...,0.084641,1,2,0,0,Bournemouth,0,0.026749,0.259928,0.219473
2,Armando Broja,F S,Chelsea,3,Everton,1,1,245,8384,Armando Broja,...,0.086674,6,1,0,0,Chelsea,1,0.857246,0.897836,0.0
3,Fabian Schär,D,Newcastle,43,Nottingham_Forest,15,1,170,76,Fabian Schär,...,2.429651,56,19,7,0,Newcastle United,1,5.092751,10.184463,8.898999
4,Jonny Evans,D S,Leicester,15,Brentford,1,1,309,807,Jonny Evans,...,0.092742,1,1,1,0,Leicester,0,0.084322,3.380403,3.287661
5,Brennan Johnson,F M S,Nott'm Forest,3,Newcastle_United,2,1,32,10760,Brennan Johnson,...,3.743296,55,28,6,0,Nottingham Forest,7,5.983012,12.170722,2.800761
6,Cheick Doucouré,0,Crystal Palace,16,Arsenal,2,1,0,0,0,...,0.0,0,0,0,0,0,0,0.0,0.0,0.0
7,Oliver Hammond,0,Nott'm Forest,0,Newcastle_United,0,1,0,0,0,...,0.0,0,0,0,0,0,0,0.0,0.0,0.0
8,Luke Cundle,0,Wolves,0,Leeds,0,1,0,0,0,...,0.0,0,0,0,0,0,0,0.0,0.0,0.0
9,Fin Stevens,0,Brentford,0,Leicester,0,1,0,0,0,...,0.0,0,0,0,0,0,0,0.0,0.0,0.0


In [63]:
# Rename the columns to not interfere with fpl_df columns
team_df = team_df.rename(columns={'team': 'player_team', 'shots': 'team_shots', 'goals': 'team_goals', 'xG': 'team_xG', 'against_shots': 'team_against_shots', 'against_goals': 'team_against_goals', 'against_xG': 'team_against_xG'})
team_cols = team_df.columns.to_list()

# Create columns for the team data
for i in range(len(team_cols)):
    training_df[team_cols[i]] = 0

# Add the team data to the training dataframe for the corresponding opponent team
for i in range(len(training_df['total_points'])):
    for j in range(len(team_df['team_shots'])):
        if training_df['team'][i] == team_df['player_team'][j]:
            for k in range(len(team_cols)):
                training_df[team_cols[k]][i] = team_df[team_cols[k]][j]

team_df = team_df.rename(columns={'player_team': 'opp_team', 'team_shots': 'opp_shots', 'team_goals': 'opp_goals', 'team_xG': 'opp_xG', 'team_against_shots': 'opp_against_shots', 'team_against_goals': 'opp_against_goals', 'team_against_xG': 'opp_against_xG'})
team_cols = team_df.columns.to_list()

# Create columns for the team data
for i in range(len(team_cols)):
    training_df[team_cols[i]] = 0

# Add the team data to the training dataframe for the corresponding opponent team
for i in range(len(training_df['total_points'])):
    for j in range(len(team_df['opp_team'])):
        if training_df['opponent_team'][i] == team_df['opp_team'][j]:
            for k in range(len(team_cols)):
                training_df[team_cols[k]][i] = team_df[team_cols[k]][j]

training_df.head(30)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_df[team_cols[k]][i] = team_df[team_cols[k]][j]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_df[team_cols[k]][i] = team_df[team_cols[k]][j]


Unnamed: 0.1,name,position,team,bps,opponent_team,total_points,GW,Unnamed: 0,id,player_name,...,team_against_shots,team_against_goals,team_against_xG,opp_team,opp_shots,opp_goals,opp_xG,opp_against_shots,opp_against_goals,opp_against_xG
0,Nathan Redmond,S,Southampton,3,Tottenham,1,1,308,790,Nathan Redmond,...,530,73,69.86163,Tottenham,522,70,60.43707,520,63,53.026401
1,Junior Stanislas,S,Bournemouth,3,Aston_Villa,1,1,275,463,Junior Stanislas,...,627,71,72.094952,Aston_Villa,432,51,53.065132,433,46,59.011824
2,Armando Broja,F S,Chelsea,3,Everton,1,1,245,8384,Armando Broja,...,441,47,58.295355,Everton,430,34,51.041358,573,57,69.68157
3,Fabian Schär,D,Newcastle,43,Nottingham_Forest,15,1,170,76,Fabian Schär,...,0,0,0.0,Nottingham_Forest,370,38,42.292273,557,68,69.678757
4,Jonny Evans,D S,Leicester,15,Brentford,1,1,309,807,Jonny Evans,...,578,68,65.143198,Brentford,410,58,62.640523,562,46,55.609945
5,Brennan Johnson,F M S,Nott'm Forest,3,Newcastle_United,2,1,32,10760,Brennan Johnson,...,0,0,0.0,Newcastle_United,575,68,83.387276,390,33,44.095025
6,Cheick Doucouré,0,Crystal Palace,16,Arsenal,2,1,0,0,0,...,0,0,0.0,Arsenal,596,88,81.363067,342,43,46.439517
7,Oliver Hammond,0,Nott'm Forest,0,Newcastle_United,0,1,0,0,0,...,0,0,0.0,Newcastle_United,575,68,83.387276,390,33,44.095025
8,Luke Cundle,0,Wolves,0,Leeds,0,1,0,0,0,...,0,0,0.0,Leeds,464,48,51.998012,529,78,71.887911
9,Fin Stevens,0,Brentford,0,Leicester,0,1,0,0,0,...,562,46,55.609945,Leicester,419,51,51.532965,578,68,65.143198


In [48]:
# Drop the columns we don't want to train on
cols_to_drop = ['name', 'position', 'team', 'opponent_team','GW', 'Unnamed: 0', 'id', 'player_name','team_title','yellow_cards', 'red_cards','opp_team', 'player_team']

for i in range(len(cols_to_drop)):
    training_df = training_df.drop(cols_to_drop[i], axis=1)

# Drop the rows where there is no data
for i in range(len(training_df['time'])):
    if training_df['time'][i] / training_df['games'][i] < 60:
        training_df = training_df.drop(i, axis=0)
        continue
    if training_df['xG'][i] == 0 and training_df['xA'][i] == 0:
        training_df = training_df.drop(i, axis=0)

training_df.info()


  if training_df['time'][i] / training_df['games'][i] < 60:
  if training_df['time'][i] / training_df['games'][i] < 60:
  if training_df['time'][i] / training_df['games'][i] < 60:
  if training_df['time'][i] / training_df['games'][i] < 60:
  if training_df['time'][i] / training_df['games'][i] < 60:
  if training_df['time'][i] / training_df['games'][i] < 60:
  if training_df['time'][i] / training_df['games'][i] < 60:
  if training_df['time'][i] / training_df['games'][i] < 60:
  if training_df['time'][i] / training_df['games'][i] < 60:
  if training_df['time'][i] / training_df['games'][i] < 60:
  if training_df['time'][i] / training_df['games'][i] < 60:
  if training_df['time'][i] / training_df['games'][i] < 60:
  if training_df['time'][i] / training_df['games'][i] < 60:
  if training_df['time'][i] / training_df['games'][i] < 60:
  if training_df['time'][i] / training_df['games'][i] < 60:
  if training_df['time'][i] / training_df['games'][i] < 60:
  if training_df['time'][i] / training_d

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7667 entries, 3 to 26504
Data columns (total 26 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   bps                 7667 non-null   int64  
 1   total_points        7667 non-null   int64  
 2   games               7667 non-null   int64  
 3   time                7667 non-null   int64  
 4   goals               7667 non-null   int64  
 5   xG                  7667 non-null   float64
 6   assists             7667 non-null   int64  
 7   xA                  7667 non-null   float64
 8   shots               7667 non-null   int64  
 9   key_passes          7667 non-null   int64  
 10  npg                 7667 non-null   int64  
 11  npxG                7667 non-null   float64
 12  xGChain             7667 non-null   float64
 13  xGBuildup           7667 non-null   float64
 14  team_shots          7667 non-null   int64  
 15  team_goals          7667 non-null   int64  
 16  team_

  if training_df['time'][i] / training_df['games'][i] < 60:
  if training_df['time'][i] / training_df['games'][i] < 60:
  if training_df['time'][i] / training_df['games'][i] < 60:
  if training_df['time'][i] / training_df['games'][i] < 60:
  if training_df['time'][i] / training_df['games'][i] < 60:
  if training_df['time'][i] / training_df['games'][i] < 60:
  if training_df['time'][i] / training_df['games'][i] < 60:
  if training_df['time'][i] / training_df['games'][i] < 60:
  if training_df['time'][i] / training_df['games'][i] < 60:
  if training_df['time'][i] / training_df['games'][i] < 60:
  if training_df['time'][i] / training_df['games'][i] < 60:
  if training_df['time'][i] / training_df['games'][i] < 60:
  if training_df['time'][i] / training_df['games'][i] < 60:
  if training_df['time'][i] / training_df['games'][i] < 60:
  if training_df['time'][i] / training_df['games'][i] < 60:
  if training_df['time'][i] / training_df['games'][i] < 60:
  if training_df['time'][i] / training_d

In [49]:
# Export the training dataframe to a csv file
training_df.to_csv('training_data_4.csv')