In [208]:
%reload_ext autoreload
import pandas as pd
from bs4 import BeautifulSoup
import requests
import json
from pipetools import pipe, utils, where
from fastcore.basics import typed
import requests
from fastcore.foundation import L
import pandas as pd
from typing import List
from pathlib import Path
import re
from datetime import datetime, date

In [2]:
team_urls = {
    "liverpool": "https://fbref.com/en/squads/822bd0ba/Liverpool-Stats",
    "aston_villa": "https://fbref.com/en/squads/8602292d/Aston-Villa-Stats",
    "leeds_united": "https://fbref.com/en/squads/5bfb9659/Leeds-United-Stats",
    "crystal_palace": "https://fbref.com/en/squads/47c64c55/Crystal-Palace-Stats",
    "chelsea": "https://fbref.com/en/squads/cff3d9bb/Chelsea-Stats",
    "leicester_city": "https://fbref.com/en/squads/a2d435b3/Leicester-City-Stats",
    "wolves": "https://fbref.com/en/squads/8cec06e1/Wolverhampton-Wanderers-Stats",
    "tottenham_hotspurs": "https://fbref.com/en/squads/361ca564/Tottenham-Hotspur-Stats",
    "westham_united": "https://fbref.com/en/squads/7c21e445/West-Ham-United-Stats",
    "manchester_city": "https://fbref.com/en/squads/b8fd03ef/Manchester-City-Stats",
    "everton": "https://fbref.com/en/squads/d3fd31cc/Everton-Stats",
    "southampton": "https://fbref.com/en/squads/33c895d4/Southampton-Stats",
    "newcastle_united": "https://fbref.com/en/squads/b2b47a98/Newcastle-United-Stats",
    "manchester_united": "https://fbref.com/en/squads/19538871/Manchester-United-Stats",
    "brighton": "https://fbref.com/en/squads/d07537b9/Brighton-and-Hove-Albion-Stats",
    "westbrom": "https://fbref.com/en/squads/60c6b05f/West-Bromwich-Albion-Stats",
    "burnley": "https://fbref.com/en/squads/943e8050/Burnley-Stats",
    "sheffield_united": "https://fbref.com/en/squads/1df6b87e/Sheffield-United-Stats",
    "fulham": "https://fbref.com/en/squads/fd962109/Fulham-Stats",
    "arsenal": "https://fbref.com/en/squads/18bb7c10/Arsenal-Stats"
}

In [3]:
len(team_urls)

20

### Brewing the soup

In [4]:
liv_content = requests.get(team_urls["arsenal"])

In [5]:
liv_content.status_code

200

In [6]:
liv_soup = BeautifulSoup(liv_content.content)

### Utilities

In [7]:
def format_age(age: str) -> str:
    spl = age.split("-")
    return f"{spl[0]} years and {spl[1]} days"

In [8]:
def to_float(text: str) -> float:
    if not text:
        return 0.0
    else:
        return text > pipe | str | float

In [9]:
type(to_float('0.222'))

float

### Shooting Stats

In [10]:
shooting = BeautifulSoup(liv_content.content).find(attrs={"id": "all_stats_shooting_10728"})

In [11]:
len(BeautifulSoup(shooting.contents[5]).find_all("tr")[2:-1])

19

In [12]:
shooting = BeautifulSoup(shooting.contents[5]).find_all("tr")[2:-1]

In [13]:
import collections

ShootingStats = collections.namedtuple(
    'ShootingStats', 
    'name profile_url nationality position age minutes_90s goals shots_total shots_on_target \
    shots_on_target_pct shots_total_per90 goals_per_shot goals_per_shot_on_target \
    shots_free_kicks pens_made pens_att xg npxg npxg_per_shot xg_net npxg_net')

In [14]:
@typed
def extract_shooting_stats(shooting: list) -> list:
    shooting_stats = []
    for stat in shooting:
        shooting_stats.append(ShootingStats(
            name=(stat.find("th").text),
            profile_url=f'https://fbref.com{stat.find("a").attrs["href"]}',
            nationality=stat.find(attrs={"data-stat": "nationality"}).text.split(" "),
            position=stat.find(attrs={"data-stat": "position"}).text,
            age=format_age(stat.find(attrs={"data-stat": "age"}).text),
            minutes_90s=float(stat.find(attrs={"data-stat": "minutes_90s"}).text),
            goals=int(stat.find(attrs={"data-stat": "goals"}).text),
            shots_total=int(stat.find(attrs={"data-stat": "shots_total"}).text),
            shots_on_target=int(stat.find(attrs={"data-stat": "shots_on_target"}).text),
            shots_on_target_pct=to_float(stat.find(attrs={"data-stat": "shots_on_target_pct"}).text),
            shots_total_per90=to_float(stat.find(attrs={"data-stat": "shots_total_per90"}).text),
            goals_per_shot=to_float(stat.find(attrs={"data-stat": "goals_per_shot"}).text),
            goals_per_shot_on_target=to_float(stat.find(attrs={"data-stat": "goals_per_shot_on_target"}).text),
            shots_free_kicks=to_float(stat.find(attrs={"data-stat": "shots_free_kicks"}).text),
            pens_made=int(stat.find(attrs={"data-stat": "pens_made"}).text),
            pens_att=int(stat.find(attrs={"data-stat": "pens_att"}).text),
            xg=to_float(stat.find(attrs={"data-stat": "xg"}).text),
            npxg=to_float(stat.find(attrs={"data-stat": "npxg"}).text),
            npxg_per_shot=to_float(stat.find(attrs={"data-stat": "npxg_per_shot"}).text),
            xg_net=stat.find(attrs={"data-stat": "xg_net"}).text,
            npxg_net=stat.find(attrs={"data-stat": "npxg_net"}).text
        ))
    return shooting_stats

In [15]:
extract_shooting_stats(shooting=shooting)[-1]

ShootingStats(name='Shkodran Mustafi', profile_url='https://fbref.com/en/players/3f2d59fe/Shkodran-Mustafi', nationality=['de', 'GER'], position='DF', age='28 years and 197 days', minutes_90s=0.5, goals=0, shots_total=0, shots_on_target=0, shots_on_target_pct=0.0, shots_total_per90=0.0, goals_per_shot=0.0, goals_per_shot_on_target=0.0, shots_free_kicks=0.0, pens_made=0, pens_att=0, xg=0.0, npxg=0.0, npxg_per_shot=0.0, xg_net='0.0', npxg_net='0.0')

### Passing Stats

In [16]:
passing = BeautifulSoup(liv_content.content).find(attrs={"id": "all_stats_passing_10728"})

In [17]:
len(BeautifulSoup(passing.contents[5]).find_all("tr")[2:-1])

19

In [18]:
passing = BeautifulSoup(passing.contents[5]).find_all("tr")[2:-1]

In [19]:
PassingStats = collections.namedtuple(
    'PassingStats',
    'name passes_completed passes passes_pct passes_total_distance passes_progressive_distance \
    passes_completed_short passes_short passes_pct_short passes_completed_medium passes_medium \
    passes_pct_medium passes_completed_long passes_long passes_pct_long assists xa xa_net assisted_shots \
    passes_into_final_third passes_into_penalty_area crosses_into_penalty_area progressive_passes'
)

In [20]:
@typed
def extract_passing_stats(passing: list) -> list:
    passing_stats = []
    for stat in passing:
        passing_stats.append(
            PassingStats(
                name=stat.find("th").text,
                passes_completed=int(stat.find(attrs={"data-stat": "passes_completed"}).text),
                passes=int(stat.find(attrs={"data-stat": "passes"}).text),
                passes_pct=to_float(stat.find(attrs={"data-stat": "passes_pct"}).text),
                passes_total_distance=to_float(stat.find(attrs={"data-stat": "passes_total_distance"}).text),
                passes_progressive_distance=to_float(stat.find(attrs={"data-stat": "passes_progressive_distance"}).text),
                passes_completed_short=int(stat.find(attrs={"data-stat": "passes_completed_short"}).text),
                passes_short=int(stat.find(attrs={"data-stat": "passes_short"}).text),
                passes_pct_short=to_float(stat.find(attrs={"data-stat": "passes_pct_short"}).text),
                passes_completed_medium=int(stat.find(attrs={"data-stat": "passes_completed_medium"}).text),
                passes_medium=int(stat.find(attrs={"data-stat": "passes_medium"}).text),
                passes_pct_medium=to_float(stat.find(attrs={"data-stat": "passes_pct_medium"}).text),
                passes_completed_long=int(stat.find(attrs={"data-stat": "passes_completed_long"}).text),
                passes_long=int(stat.find(attrs={"data-stat": "passes_long"}).text),
                passes_pct_long=to_float(stat.find(attrs={"data-stat": "passes_pct_long"}).text),
                assists=int(stat.find(attrs={"data-stat": "assists"}).text),
                xa=to_float(stat.find(attrs={"data-stat": "xa"}).text),
                xa_net=stat.find(attrs={"data-stat": "xa_net"}).text,
                assisted_shots=int(stat.find(attrs={"data-stat": "assisted_shots"}).text),
                passes_into_final_third=int(stat.find(attrs={"data-stat": "passes_into_final_third"}).text),
                passes_into_penalty_area=int(stat.find(attrs={"data-stat": "passes_into_penalty_area"}).text),
                crosses_into_penalty_area=int(stat.find(attrs={"data-stat": "crosses_into_penalty_area"}).text),
                progressive_passes=int(stat.find(attrs={"data-stat": "progressive_passes"}).text)
            )
        )
    return passing_stats

In [21]:
extract_passing_stats(passing=passing)[0]

PassingStats(name='Pierre-Emerick Aubameyang', passes_completed=116, passes=162, passes_pct=71.6, passes_total_distance=1865.0, passes_progressive_distance=404.0, passes_completed_short=64, passes_short=73, passes_pct_short=87.7, passes_completed_medium=43, passes_medium=60, passes_pct_medium=71.7, passes_completed_long=9, passes_long=22, passes_pct_long=40.9, assists=1, xa=1.1, xa_net='-0.1', assisted_shots=8, passes_into_final_third=5, passes_into_penalty_area=6, crosses_into_penalty_area=3, progressive_passes=17)

### Extra Passing stats

In [22]:
extra_passing = BeautifulSoup(liv_content.content).find(attrs={"id": "all_stats_passing_types_10728"})

In [23]:
len(BeautifulSoup(extra_passing.contents[5]).find_all("tr")[2:-1])

19

In [24]:
extra_passing = BeautifulSoup(extra_passing.contents[5]).find_all("tr")[2:-1]

In [25]:
ExtraPassingStats = collections.namedtuple(
    'ExtraPassingStats',
    'name passes_live passes_dead passes_free_kicks through_balls passes_pressure passes_switches\
    crosses corner_kicks corner_kicks_in corner_kicks_out corner_kicks_straight passes_ground\
    passes_low passes_high passes_left_foot passes_right_foot passes_head throw_ins passes_other_body\
    passes_completed passes_offsides passes_oob passes_intercepted passes_blocked'
)

In [26]:
@typed
def extract_extra_passing_stats(extra_passing: list) -> list:
    extra_passing_stats = []
    for stat in extra_passing:
        extra_passing_stats.append(
            ExtraPassingStats(
                name=stat.find("th").text,
                passes_live=int(stat.find(attrs={"data-stat": "passes_live"}).text),
                passes_dead=int(stat.find(attrs={"data-stat": "passes_dead"}).text),
                passes_free_kicks=int(stat.find(attrs={"data-stat": "passes_free_kicks"}).text),
                through_balls=int(stat.find(attrs={"data-stat": "through_balls"}).text),
                passes_pressure=int(stat.find(attrs={"data-stat": "passes_pressure"}).text),
                passes_switches=int(stat.find(attrs={"data-stat": "passes_switches"}).text),
                crosses=int(stat.find(attrs={"data-stat": "crosses"}).text),
                corner_kicks=int(stat.find(attrs={"data-stat": "corner_kicks"}).text),
                corner_kicks_in=int(stat.find(attrs={"data-stat": "corner_kicks_in"}).text),
                corner_kicks_out=int(stat.find(attrs={"data-stat": "corner_kicks_out"}).text),
                corner_kicks_straight=int(stat.find(attrs={"data-stat": "corner_kicks_straight"}).text),
                passes_ground=int(stat.find(attrs={"data-stat": "passes_ground"}).text),
                passes_low=int(stat.find(attrs={"data-stat": "passes_low"}).text),
                passes_high=int(stat.find(attrs={"data-stat": "passes_high"}).text),
                passes_left_foot=int(stat.find(attrs={"data-stat": "passes_left_foot"}).text),
                passes_right_foot=int(stat.find(attrs={"data-stat": "passes_right_foot"}).text),
                passes_head=int(stat.find(attrs={"data-stat": "passes_head"}).text),
                throw_ins=int(stat.find(attrs={"data-stat": "throw_ins"}).text),
                passes_other_body=int(stat.find(attrs={"data-stat": "passes_other_body"}).text),
                passes_completed=int(stat.find(attrs={"data-stat": "passes_completed"}).text),
                passes_offsides=int(stat.find(attrs={"data-stat": "passes_offsides"}).text),
                passes_oob=int(stat.find(attrs={"data-stat": "passes_oob"}).text),
                passes_intercepted=int(stat.find(attrs={"data-stat": "passes_intercepted"}).text),
                passes_blocked=int(stat.find(attrs={"data-stat": "passes_blocked"}).text)
            )
        )
    return extra_passing_stats

In [27]:
extract_extra_passing_stats(extra_passing=extra_passing)[0]

ExtraPassingStats(name='Pierre-Emerick Aubameyang', passes_live=160, passes_dead=2, passes_free_kicks=0, through_balls=4, passes_pressure=36, passes_switches=2, crosses=16, corner_kicks=0, corner_kicks_in=0, corner_kicks_out=0, corner_kicks_straight=0, passes_ground=104, passes_low=30, passes_high=28, passes_left_foot=17, passes_right_foot=130, passes_head=6, throw_ins=0, passes_other_body=3, passes_completed=116, passes_offsides=2, passes_oob=3, passes_intercepted=3, passes_blocked=5)

### Goal Creation Stats

In [28]:
gca = BeautifulSoup(liv_content.content).find(attrs={"id": "all_stats_gca_10728"})

In [29]:
len(BeautifulSoup(gca.contents[5]).find_all("tr")[2:-1])

19

In [30]:
gca = BeautifulSoup(gca.contents[5]).find_all("tr")[2:-1]

In [31]:
GCAStats = collections.namedtuple(
    'GCAStats',
    'name sca sca_per90 sca_passes_live sca_passes_dead sca_dribbles sca_shots sca_fouled gca\
    gca_per90 gca_passes_live gca_passes_dead gca_dribbles gca_shots gca_fouled gca_og_for'
)

In [32]:
@typed
def extract_gca_stats(gca: list) -> list:
    gca_stats = []
    for stat in gca:
        gca_stats.append(
            GCAStats(
                name=stat.find("th").text,
                sca=int(stat.find(attrs={"data-stat": "sca"}).text),
                sca_per90=to_float(stat.find(attrs={"data-stat": "sca_per90"}).text),
                sca_passes_live=int(stat.find(attrs={"data-stat": "sca_passes_live"}).text),
                sca_passes_dead=int(stat.find(attrs={"data-stat": "sca_passes_dead"}).text),
                sca_dribbles=int(stat.find(attrs={"data-stat": "sca_dribbles"}).text),
                sca_shots=int(stat.find(attrs={"data-stat": "sca_shots"}).text),
                sca_fouled=int(stat.find(attrs={"data-stat": "sca_fouled"}).text),
                gca=int(stat.find(attrs={"data-stat": "gca"}).text),
                gca_per90=to_float(stat.find(attrs={"data-stat": "gca_per90"}).text),
                gca_passes_live=int(stat.find(attrs={"data-stat": "gca_passes_live"}).text),
                gca_passes_dead=int(stat.find(attrs={"data-stat": "gca_passes_dead"}).text),
                gca_dribbles=int(stat.find(attrs={"data-stat": "gca_dribbles"}).text),
                gca_shots=int(stat.find(attrs={"data-stat": "gca_shots"}).text),
                gca_fouled=int(stat.find(attrs={"data-stat": "gca_fouled"}).text),
                gca_og_for=int(stat.find(attrs={"data-stat": "gca_og_for"}).text)
            )
        )
    return gca_stats

In [33]:
extract_gca_stats(gca=gca)[0]

GCAStats(name='Pierre-Emerick Aubameyang', sca=16, sca_per90=2.67, sca_passes_live=12, sca_passes_dead=0, sca_dribbles=1, sca_shots=1, sca_fouled=2, gca=2, gca_per90=0.33, gca_passes_live=2, gca_passes_dead=0, gca_dribbles=0, gca_shots=0, gca_fouled=0, gca_og_for=0)

### Defensive Stats

In [34]:
defensive = BeautifulSoup(liv_content.content).find(attrs={"id": "all_stats_defense_10728"})

In [35]:
len(BeautifulSoup(defensive.contents[5]).find_all("tr")[2:-1])

19

In [36]:
defensive = BeautifulSoup(defensive.contents[5]).find_all("tr")[2:-1]

In [37]:
DefensiveActions = collections.namedtuple(
    'DefensiveActions',
    'name tackles tackles_won tackles_def_3rd tackles_mid_3rd tackles_att_3rd dribble_tackles dribbles_vs\
    dribble_tackles_pct dribbled_past pressures pressure_regains pressure_regain_pct pressures_def_3rd\
    pressures_mid_3rd pressures_att_3rd blocks blocked_shots blocked_shots_saves blocked_passes interceptions\
    tackles_interceptions clearances errors'
)

In [38]:
@typed
def extract_defensive_actions(defensive_actions: list) -> list:
    defensive_actions = []
    for stat in defensive:
        defensive_actions.append(
            DefensiveActions(
                name=stat.find("th").text,
                tackles=int(stat.find(attrs={"data-stat": "tackles"}).text),
                tackles_won=int(stat.find(attrs={"data-stat": "tackles_won"}).text),
                tackles_def_3rd=int(stat.find(attrs={"data-stat": "tackles_def_3rd"}).text),
                tackles_mid_3rd=int(stat.find(attrs={"data-stat": "tackles_mid_3rd"}).text),
                tackles_att_3rd=int(stat.find(attrs={"data-stat": "tackles_att_3rd"}).text),
                dribble_tackles=int(stat.find(attrs={"data-stat": "dribble_tackles"}).text),
                dribbles_vs=int(stat.find(attrs={"data-stat": "dribbles_vs"}).text),
                dribble_tackles_pct=to_float(stat.find(attrs={"data-stat": "dribble_tackles_pct"}).text),
                dribbled_past=int(stat.find(attrs={"data-stat": "dribbled_past"}).text),
                pressures=int(stat.find(attrs={"data-stat": "pressures"}).text),
                pressure_regains=int(stat.find(attrs={"data-stat": "pressure_regains"}).text),
                pressure_regain_pct=to_float(stat.find(attrs={"data-stat": "pressure_regain_pct"}).text),
                pressures_def_3rd=int(stat.find(attrs={"data-stat": "pressures_def_3rd"}).text),
                pressures_mid_3rd=int(stat.find(attrs={"data-stat": "pressures_mid_3rd"}).text),
                pressures_att_3rd=int(stat.find(attrs={"data-stat": "pressures_att_3rd"}).text),
                blocks=int(stat.find(attrs={"data-stat": "blocks"}).text),
                blocked_shots=int(stat.find(attrs={"data-stat": "blocked_shots"}).text),
                blocked_shots_saves=int(stat.find(attrs={"data-stat": "blocked_shots_saves"}).text),
                blocked_passes=int(stat.find(attrs={"data-stat": "blocked_passes"}).text),
                interceptions=int(stat.find(attrs={"data-stat": "interceptions"}).text),
                tackles_interceptions=int(stat.find(attrs={"data-stat": "tackles_interceptions"}).text),
                clearances=int(stat.find(attrs={"data-stat": "clearances"}).text),
                errors=int(stat.find(attrs={"data-stat": "errors"}).text),
            )
        )
    return defensive_actions

In [39]:
extract_defensive_actions(defensive_actions=defensive)[0]

DefensiveActions(name='Pierre-Emerick Aubameyang', tackles=5, tackles_won=2, tackles_def_3rd=0, tackles_mid_3rd=4, tackles_att_3rd=1, dribble_tackles=2, dribbles_vs=5, dribble_tackles_pct=40.0, dribbled_past=3, pressures=84, pressure_regains=20, pressure_regain_pct=23.8, pressures_def_3rd=15, pressures_mid_3rd=37, pressures_att_3rd=32, blocks=6, blocked_shots=0, blocked_shots_saves=0, blocked_passes=6, interceptions=0, tackles_interceptions=5, clearances=3, errors=0)

### Possession Stats

In [40]:
possesion = BeautifulSoup(liv_content.content).find(attrs={"id": "all_stats_possession_10728"})

In [41]:
len(BeautifulSoup(possesion.contents[5]).find_all("tr")[2:-1])

19

In [42]:
possesion = BeautifulSoup(possesion.contents[5]).find_all("tr")[2:-1]

In [43]:
PossessionStats = collections.namedtuple(
    'PossessionStats',
    'name touches touches_def_pen_area touches_def_3rd touches_mid_3rd touches_att_3rd touches_att_pen_area\
    touches_live_ball dribbles_completed dribbles dribbles_completed_pct players_dribbled_past nutmegs\
    carries carry_distance carry_progressive_distance pass_targets passes_received passes_received_pct\
    miscontrols dispossessed'
)

In [44]:
@typed
def extract_possession_stats(possesion: list) -> list:
    possesion_stats = []
    for stat in possesion:
        possesion_stats.append(
            PossessionStats(
                name=stat.find("th").text,
                touches=int(stat.find(attrs={"data-stat": "touches"}).text),
                touches_def_pen_area=int(stat.find(attrs={"data-stat": "touches_def_pen_area"}).text),
                touches_def_3rd=int(stat.find(attrs={"data-stat": "touches_def_3rd"}).text),
                touches_mid_3rd=int(stat.find(attrs={"data-stat": "touches_mid_3rd"}).text),
                touches_att_3rd=int(stat.find(attrs={"data-stat": "touches_att_3rd"}).text),
                touches_att_pen_area=int(stat.find(attrs={"data-stat": "touches_att_pen_area"}).text),
                touches_live_ball=int(stat.find(attrs={"data-stat": "touches_live_ball"}).text),
                dribbles_completed=int(stat.find(attrs={"data-stat": "dribbles_completed"}).text),
                dribbles=int(stat.find(attrs={"data-stat": "dribbles"}).text),
                dribbles_completed_pct=to_float(stat.find(attrs={"data-stat": "dribbles_completed_pct"}).text),
                players_dribbled_past=int(stat.find(attrs={"data-stat": "players_dribbled_past"}).text),
                nutmegs=int(stat.find(attrs={"data-stat": "nutmegs"}).text),
                carries=int(stat.find(attrs={"data-stat": "carries"}).text),
                carry_distance=int(stat.find(attrs={"data-stat": "carry_distance"}).text),
                carry_progressive_distance=int(stat.find(attrs={"data-stat": "carry_progressive_distance"}).text),
                pass_targets=int(stat.find(attrs={"data-stat": "pass_targets"}).text),
                passes_received=int(stat.find(attrs={"data-stat": "passes_received"}).text),
                passes_received_pct=to_float(stat.find(attrs={"data-stat": "passes_received_pct"}).text),
                miscontrols=int(stat.find(attrs={"data-stat": "miscontrols"}).text),
                dispossessed=int(stat.find(attrs={"data-stat": "dispossessed"}).text),
            )
        )
    return possesion_stats

In [45]:
extract_possession_stats(possesion=possesion)[0]

PossessionStats(name='Pierre-Emerick Aubameyang', touches=204, touches_def_pen_area=3, touches_def_3rd=26, touches_mid_3rd=85, touches_att_3rd=106, touches_att_pen_area=18, touches_live_ball=202, dribbles_completed=3, dribbles=8, dribbles_completed_pct=37.5, players_dribbled_past=4, nutmegs=0, carries=130, carry_distance=921, carry_progressive_distance=529, pass_targets=270, passes_received=165, passes_received_pct=61.1, miscontrols=5, dispossessed=6)

### Playing Time Stats

In [46]:
playing_time = BeautifulSoup(liv_content.content).find(attrs={"id": "all_stats_playing_time_10728"})

In [47]:
len(BeautifulSoup(playing_time.contents[5]).find_all("tr")[2:-1])

22

In [48]:
playing_time = BeautifulSoup(playing_time.contents[5]).find_all("tr")[2:-1]

In [49]:
PlayingTimeStats = collections.namedtuple(
    'PlayingTimeStats',
    'name games minutes minutes_per_game minutes_pct minutes_90s games_starts minutes_per_start\
    games_subs minutes_per_sub unused_subs points_per_match on_goals_for on_goals_against plus_minus\
    plus_minus_per90 plus_minus_wowy on_xg_for on_xg_against xg_plus_minus xg_plus_minus_per90\
    xg_plus_minus_wowy'
)

In [50]:
@typed
def extract_playing_time_stats(playing_time: list) -> list:
    playing_time_stats = []
    for stat in playing_time:
        playing_time_stats.append(
            PlayingTimeStats(
                name=stat.find("th").text,
                games=int(stat.find(attrs={"data-stat": "games"}).text),
                minutes=to_float(stat.find(attrs={"data-stat": "minutes"}).text),
                minutes_per_game=to_float(stat.find(attrs={"data-stat": "minutes_per_game"}).text),
                minutes_pct=to_float(stat.find(attrs={"data-stat": "minutes_pct"}).text),
                minutes_90s=to_float(stat.find(attrs={"data-stat": "minutes_90s"}).text),
                games_starts=int(stat.find(attrs={"data-stat": "games_starts"}).text),
                minutes_per_start=to_float(stat.find(attrs={"data-stat": "minutes_per_start"}).text),
                games_subs=int(stat.find(attrs={"data-stat": "games_subs"}).text),
                minutes_per_sub=to_float(stat.find(attrs={"data-stat": "minutes_per_sub"}).text),
                unused_subs=int(stat.find(attrs={"data-stat": "unused_subs"}).text),
                points_per_match=to_float(stat.find(attrs={"data-stat": "points_per_match"}).text),
                on_goals_for=to_float(stat.find(attrs={"data-stat": "on_goals_for"}).text),
                on_goals_against=to_float(stat.find(attrs={"data-stat": "on_goals_against"}).text),
                plus_minus=stat.find(attrs={"data-stat": "plus_minus"}).text,
                plus_minus_per90=stat.find(attrs={"data-stat": "plus_minus_per90"}).text,
                plus_minus_wowy=stat.find(attrs={"data-stat": "plus_minus_wowy"}).text,
                on_xg_for=to_float(stat.find(attrs={"data-stat": "on_xg_for"}).text),
                on_xg_against=to_float(stat.find(attrs={"data-stat": "on_xg_against"}).text),
                xg_plus_minus=stat.find(attrs={"data-stat": "xg_plus_minus"}).text,
                xg_plus_minus_per90=stat.find(attrs={"data-stat": "xg_plus_minus_per90"}).text,
                xg_plus_minus_wowy=stat.find(attrs={"data-stat": "xg_plus_minus_wowy"}).text,
            )
        )
    return playing_time_stats

In [51]:
extract_playing_time_stats(playing_time=playing_time)[0]

PlayingTimeStats(name='Pierre-Emerick Aubameyang', games=6, minutes=540.0, minutes_per_game=90.0, minutes_pct=100.0, minutes_90s=6.0, games_starts=6, minutes_per_start=90.0, games_subs=0, minutes_per_sub=0.0, unused_subs=0, points_per_match=1.5, on_goals_for=8.0, on_goals_against=7.0, plus_minus='+1', plus_minus_per90='+0.17', plus_minus_wowy='', on_xg_for=6.5, on_xg_against=7.9, xg_plus_minus='-1.3', xg_plus_minus_per90='-0.22', xg_plus_minus_wowy='')

### Misc Stats

In [52]:
misc = BeautifulSoup(liv_content.content).find(attrs={"id": "all_stats_misc_10728"})

In [53]:
len(BeautifulSoup(misc.contents[5]).find_all("tr")[2:-1])

19

In [54]:
misc = BeautifulSoup(misc.contents[5]).find_all("tr")[2:-1]

In [55]:
MiscStats = collections.namedtuple(
    'MiscStats',
    'name cards_yellow cards_red cards_yellow_red fouls fouled offsides crosses interceptions\
    tackles_won pens_won pens_conceded own_goals ball_recoveries aerials_won aerials_lost aerials_won_pct'
)

In [56]:
@typed
def extract_misc_stats(misc: list) -> list:
    misc_stats = []
    for stat in misc:
        misc_stats.append(
            MiscStats(
                name=stat.find("th").text,
                cards_yellow=int(stat.find(attrs={"data-stat": "cards_yellow"}).text),
                cards_red=int(stat.find(attrs={"data-stat": "cards_red"}).text),
                cards_yellow_red=int(stat.find(attrs={"data-stat": "cards_yellow_red"}).text),
                fouls=int(stat.find(attrs={"data-stat": "fouls"}).text),
                fouled=int(stat.find(attrs={"data-stat": "fouled"}).text),
                offsides=int(stat.find(attrs={"data-stat": "offsides"}).text),
                crosses=int(stat.find(attrs={"data-stat": "crosses"}).text),
                interceptions=int(stat.find(attrs={"data-stat": "interceptions"}).text),
                tackles_won=int(stat.find(attrs={"data-stat": "tackles_won"}).text),
                pens_won=int(stat.find(attrs={"data-stat": "pens_won"}).text),
                pens_conceded=int(stat.find(attrs={"data-stat": "pens_conceded"}).text),
                own_goals=int(stat.find(attrs={"data-stat": "own_goals"}).text),
                ball_recoveries=int(stat.find(attrs={"data-stat": "ball_recoveries"}).text),
                aerials_won=int(stat.find(attrs={"data-stat": "aerials_won"}).text),
                aerials_lost=int(stat.find(attrs={"data-stat": "aerials_lost"}).text),
                aerials_won_pct=to_float(stat.find(attrs={"data-stat": "aerials_won_pct"}).text),
            )
        )
    return misc_stats

In [57]:
extract_misc_stats(misc=misc)[0]

MiscStats(name='Pierre-Emerick Aubameyang', cards_yellow=1, cards_red=0, cards_yellow_red=0, fouls=4, fouled=5, offsides=6, crosses=16, interceptions=0, tackles_won=2, pens_won=0, pens_conceded=0, own_goals=0, ball_recoveries=21, aerials_won=6, aerials_lost=9, aerials_won_pct=40.0)

### Goalkeeping stats

## FPL Data

In [58]:
fpl_request = requests.get("https://fantasy.premierleague.com/api/bootstrap-static/")

In [59]:
all_fpl_data = fpl_request.content

In [60]:
all_fpl_data = json.loads(all_fpl_data)

In [61]:
all_fpl_data.keys()

dict_keys(['events', 'game_settings', 'phases', 'teams', 'total_players', 'elements', 'element_stats', 'element_types'])

In [62]:
all_fpl_data["total_players"]

7360165

In [160]:
fpl_players = L(all_fpl_data["elements"])

In [161]:
fpl_teams = L(all_fpl_data["teams"])

In [162]:
fpl_teams[0]

{'code': 3,
 'draw': 0,
 'form': None,
 'id': 1,
 'loss': 0,
 'name': 'Arsenal',
 'played': 0,
 'points': 0,
 'position': 0,
 'short_name': 'ARS',
 'strength': 4,
 'team_division': None,
 'unavailable': False,
 'win': 0,
 'strength_overall_home': 1240,
 'strength_overall_away': 1250,
 'strength_attack_home': 1160,
 'strength_attack_away': 1210,
 'strength_defence_home': 1190,
 'strength_defence_away': 1230,
 'pulse_id': 1}

In [163]:
@typed
def extract_team_details(team: dict) -> dict:
    return {str(team["code"]): {"name": team["name"].lower().replace(" ", "_"), "slug": team["short_name"]}}

In [166]:
fpl_team_mapper = {}
[fpl_team_mapper.update(extract_team_details(team)) for team in fpl_teams]

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [167]:
fpl_team_mapper

{'3': {'name': 'arsenal', 'slug': 'ARS'},
 '7': {'name': 'aston_villa', 'slug': 'AVL'},
 '36': {'name': 'brighton', 'slug': 'BHA'},
 '90': {'name': 'burnley', 'slug': 'BUR'},
 '8': {'name': 'chelsea', 'slug': 'CHE'},
 '31': {'name': 'crystal_palace', 'slug': 'CRY'},
 '11': {'name': 'everton', 'slug': 'EVE'},
 '54': {'name': 'fulham', 'slug': 'FUL'},
 '13': {'name': 'leicester', 'slug': 'LEI'},
 '2': {'name': 'leeds', 'slug': 'LEE'},
 '14': {'name': 'liverpool', 'slug': 'LIV'},
 '43': {'name': 'man_city', 'slug': 'MCI'},
 '1': {'name': 'man_utd', 'slug': 'MUN'},
 '4': {'name': 'newcastle', 'slug': 'NEW'},
 '49': {'name': 'sheffield_utd', 'slug': 'SHU'},
 '20': {'name': 'southampton', 'slug': 'SOU'},
 '6': {'name': 'spurs', 'slug': 'TOT'},
 '35': {'name': 'west_brom', 'slug': 'WBA'},
 '21': {'name': 'west_ham', 'slug': 'WHU'},
 '39': {'name': 'wolves', 'slug': 'WOL'}}

In [133]:
all_fpl_data["events"][0]

{'id': 1,
 'name': 'Gameweek 1',
 'deadline_time': '2020-09-12T10:00:00Z',
 'average_entry_score': 50,
 'finished': True,
 'data_checked': True,
 'highest_scoring_entry': 4761681,
 'deadline_time_epoch': 1599904800,
 'deadline_time_game_offset': 0,
 'highest_score': 142,
 'is_previous': False,
 'is_current': False,
 'is_next': False,
 'chip_plays': [{'chip_name': 'bboost', 'num_played': 112843},
  {'chip_name': '3xc', 'num_played': 225426}],
 'most_selected': 259,
 'most_transferred_in': 12,
 'top_element': 254,
 'top_element_info': {'id': 254, 'points': 20},
 'transfers_made': 0,
 'most_captained': 4,
 'most_vice_captained': 4}

In [134]:
all_fpl_data["phases"][3]

{'id': 4, 'name': 'November', 'start_event': 8, 'stop_event': 10}

In [146]:
category_types = {}
[category_types.update({str(player_cat["id"]): player_cat["singular_name_short"]}) for player_cat in all_fpl_data["element_types"]]

[None, None, None, None]

In [147]:
category_types

{'1': 'GKP', '2': 'DEF', '3': 'MID', '4': 'FWD'}

In [137]:
all_fpl_data["element_stats"]

[{'label': 'Minutes played', 'name': 'minutes'},
 {'label': 'Goals scored', 'name': 'goals_scored'},
 {'label': 'Assists', 'name': 'assists'},
 {'label': 'Clean sheets', 'name': 'clean_sheets'},
 {'label': 'Goals conceded', 'name': 'goals_conceded'},
 {'label': 'Own goals', 'name': 'own_goals'},
 {'label': 'Penalties saved', 'name': 'penalties_saved'},
 {'label': 'Penalties missed', 'name': 'penalties_missed'},
 {'label': 'Yellow cards', 'name': 'yellow_cards'},
 {'label': 'Red cards', 'name': 'red_cards'},
 {'label': 'Saves', 'name': 'saves'},
 {'label': 'Bonus', 'name': 'bonus'},
 {'label': 'Bonus Points System', 'name': 'bps'},
 {'label': 'Influence', 'name': 'influence'},
 {'label': 'Creativity', 'name': 'creativity'},
 {'label': 'Threat', 'name': 'threat'},
 {'label': 'ICT Index', 'name': 'ict_index'}]

In [182]:
@typed
def extract_player_fpl_stats(player: dict) -> dict:
    return {
        "player_id": player["id"],
        "name": f"{player['first_name']} {player['second_name']}",
        "display_slug": player["web_name"],
        "team_id": player["team"],
        "team": fpl_team_mapper.get(str(player["team_code"])).get("name"),
        "team_slug": fpl_team_mapper.get(str(player["team_code"])).get("slug"),
        "position": category_types.get(str(player["element_type"])),
        "points_per_game": to_float(player["points_per_game"]),
        "form": to_float(player["form"]),
        "news": player["news"],
        "news_added_at": player["news_added"],
        "status": player["status"],
        "total_points": player["total_points"],
        "value_form": to_float(player["value_form"]),
        "value_season": to_float(player["value_season"]),
        "selected_by_percent": to_float(player["selected_by_percent"]),
        "points_per_game": to_float(player["points_per_game"]),
        "influence_rank_overall": player["influence_rank"],
        "creativity_rank_overall": player["creativity_rank"],
        "threat_rank_overall": player["threat_rank"],
        "influence_rank_by_position": player["influence_rank_type"],
        "creativity_rank_by_position": player["creativity_rank_type"],
        "threat_rank_by_position:": player["threat_rank_type"],
        "ict_index": player["ict_index_rank"],
        "ict_index_by_position": player["ict_index_rank_type"],
        "dreamteam_count": player["dreamteam_count"],
        "transfers_in": player["transfers_in"],
        "transfers_out": player["transfers_out"],
        "chance_of_playing_next_round": player["chance_of_playing_next_round"],
        "chance_of_playing_this_round": player["chance_of_playing_this_round"],
        "cost_change_event": player["cost_change_event"],
        "cost_change_event_fall": player["cost_change_event_fall"],
        "cost_change_start": player["cost_change_start"],
        "cost_change_start_fall": player["cost_change_start_fall"],
        "ep_next": player["ep_next"],
        "ep_this": player["ep_this"],
        "now_cost": player["now_cost"],
        "transfers_in": player["transfers_in"],
        "transfers_out": player["transfers_out"],
        "transfers_in_event": player["transfers_in_event"],
        "transfers_out_event": player["transfers_out_event"],
        "goals_scored": player["goals_scored"],
        "assists": player["assists"],
        "clean_sheets": player["clean_sheets"],
        "goals_conceded": player["goals_conceded"],
        "penalties_saved": player["penalties_saved"],
        "penalties_missed": player["penalties_missed"],
        "saves": player["saves"],
        "bonus": player["bonus"],
        "bps": player["bps"],
        "corners_and_indirect_freekicks_order": player["corners_and_indirect_freekicks_order"],
        "corners_and_indirect_freekicks_text": player["corners_and_indirect_freekicks_text"],
        "direct_freekicks_order": player["direct_freekicks_order"],
        "direct_freekicks_text": player["direct_freekicks_text"],
        "penalties_order": player["penalties_order"],
        "penalties_text": player["penalties_text"],
    }

In [183]:
%time fpl_player_stats = L([extract_player_fpl_stats(player) for player in fpl_players])

CPU times: user 30.3 ms, sys: 0 ns, total: 30.3 ms
Wall time: 29.6 ms


In [186]:
fpl_player_stats_df = pd.DataFrame.from_dict(fpl_player_stats)

In [188]:
FPL_DATA = Path("../../data/fpl-data")

In [204]:
date.today()

datetime.date(2020, 10, 31)

In [205]:
t_now = re.sub(r'\D', '', str(date.today()))

In [207]:
FPL_DATA.mkdir(parents=True, exist_ok=True)
fpl_player_stats_df.to_csv(FPL_DATA / f"{t_now}_player_stats.csv")

### Basic EDA

In [212]:
fpl_player_stats_df.head()

Unnamed: 0,player_id,name,display_slug,team_id,team,team_slug,position,points_per_game,form,news,...,penalties_missed,saves,bonus,bps,corners_and_indirect_freekicks_order,corners_and_indirect_freekicks_text,direct_freekicks_order,direct_freekicks_text,penalties_order,penalties_text
0,1,Mesut Özil,Özil,1,arsenal,ARS,MID,0.0,0.0,Not included in Arsenal's 25-man Premier Leagu...,...,0,0,0,0,,,,,,
1,2,Sokratis Papastathopoulos,Sokratis,1,arsenal,ARS,DEF,0.0,0.0,Not included in Arsenal's 25-man Premier Leagu...,...,0,0,0,0,,,,,,
2,3,David Luiz Moreira Marinho,David Luiz,1,arsenal,ARS,DEF,1.4,1.2,Thigh injury - 50% chance of playing,...,0,0,0,51,,,4.0,,,
3,4,Pierre-Emerick Aubameyang,Aubameyang,1,arsenal,ARS,MID,3.3,1.5,,...,0,0,0,73,,,6.0,,1.0,
4,5,Cédric Soares,Cédric,1,arsenal,ARS,DEF,0.0,0.0,,...,0,0,0,0,,,,,,


In [219]:
fpl_player_stats_df.groupby(["team_slug", "position"])["player_id"].count().reset_index().rename(
    columns={'player_id': 'count'})

Unnamed: 0,team_slug,position,count
0,ARS,DEF,13
1,ARS,FWD,2
2,ARS,GKP,3
3,ARS,MID,15
4,AVL,DEF,10
...,...,...,...
75,WHU,MID,11
76,WOL,DEF,12
77,WOL,FWD,4
78,WOL,GKP,3


### Reading on statistics

- [ICT Index](https://www.premierleague.com/news/65567)