# Barclay's Premiere League: Player Stats

#### Import Python tools

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import altair as alt
import altair_stiles as altstiles

In [3]:
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100
pd.set_option("display.max_colwidth", None)
alt.themes.register("stiles", altstiles.theme)
alt.themes.enable("stiles")

ThemeRegistry.enable('stiles')

In [4]:
today = pd.Timestamp("today").strftime("%m/%d/%Y")

---

## Get data

#### Read stats from 2000-2023

In [5]:
src_df = pd.read_csv("data/processed/all_players_stats_archive_current.csv")

---

In [6]:
nations_lookup = pd.read_html("https://fbref.com/en/countries/")[0][
    ["Country", "Governing Body", "Flag"]
]
nations_lookup.columns = nations_lookup.columns.str.lower().str.replace(" ", "_")

In [7]:
nations_lookup.set_index("flag", inplace=True)
nation_dict = nations_lookup.to_dict()["country"]

---

## Clean

#### Tidy up column names, dtypes, etc. 

In [8]:
src_df.columns = (
    src_df.columns.str.lower()
    .str.replace(".", "_", regex=False)
    .str.replace("+", "_", regex=False)
)

In [9]:
src_df["flag"] = src_df["nation"].str.split(" ", expand=True)[0]
src_df["nation"] = src_df["nation"].str.split(" ", expand=True)[1]

In [10]:
src_slim = src_df[
    [
        "player",
        "nation",
        "flag",
        "pos",
        "age",
        "mp",
        "starts",
        "min",
        "90s",
        "gls",
        "ast",
        "g-pk",
        "pk",
        "pkatt",
        "crdy",
        "crdr",
        "gls_1",
        "ast_1",
        "g_a",
        "g-pk_1",
        "g_a-pk",
        "squad",
        "season",
    ]
]

In [11]:
src_slim.columns = [
    "player",
    "nation",
    "flag",
    "position",
    "age",
    "matches",
    "starts",
    "minutes",
    "full_matches",
    "yellow_cards",
    "red_cards",
    "goals",
    "assists",
    "goals_no_pk",
    "penalty_kicks",
    "penalty_kicks_attempted",
    "goals_90m",
    "assists_90m",
    "goals_assists_90m",
    "goals_no_pk_90m",
    "goals_assists_no_pk_90m",
    "squad",
    "season",
]

In [12]:
df = src_slim.copy()

In [13]:
df["country"] = df["flag"].map(nation_dict)

In [14]:
df

Unnamed: 0,player,nation,flag,position,age,matches,starts,minutes,full_matches,yellow_cards,red_cards,goals,assists,goals_no_pk,penalty_kicks,penalty_kicks_attempted,goals_90m,assists_90m,goals_assists_90m,goals_no_pk_90m,goals_assists_no_pk_90m,squad,season,country
0,Gary Neville,ENG,eng,DF,25.0,32,32,2849.0,31.7,1.0,1.0,1.0,0.0,0.0,4.0,0.0,0.03,0.03,0.06,0.03,0.06,Manchester United,2000-2001,England
1,Fabien Barthez,FRA,fr,GK,29.0,30,30,2675.0,29.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.00,0.00,Manchester United,2000-2001,France
2,David Beckham,ENG,eng,MF,25.0,31,29,2648.0,29.4,9.0,12.0,8.0,1.0,1.0,3.0,0.0,0.31,0.41,0.71,0.27,0.68,Manchester United,2000-2001,England
3,Paul Scholes,ENG,eng,MF,25.0,32,28,2450.0,27.2,6.0,5.0,6.0,0.0,1.0,3.0,0.0,0.22,0.18,0.40,0.22,0.40,Manchester United,2000-2001,England
4,Roy Keane,IRL,ie,MF,28.0,28,28,2380.0,26.4,2.0,7.0,2.0,0.0,0.0,2.0,1.0,0.08,0.26,0.34,0.08,0.34,Manchester United,2000-2001,Republic of Ireland
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13355,Ryan Finnigan,ENG,eng,MF,19-115,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.00,0.00,Southampton,2022-2023,England
13356,Alex McCarthy,ENG,eng,GK,33-044,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.00,0.00,Southampton,2022-2023,England
13357,Jimmy Morgan,ENG,eng,FW,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.00,0.00,Southampton,2022-2023,England
13358,Mislav Oršić,CRO,hr,"FW,MF",30-018,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.00,0.00,Southampton,2022-2023,Croatia


---

## Aggregate

#### What share of the league is from the UK?

In [15]:
uk = ["England", "Wales", "Scotland", "Northern Ireland"]

In [16]:
df["uk"] = df["country"].isin(uk)

In [17]:
df.head()

Unnamed: 0,player,nation,flag,position,age,matches,starts,minutes,full_matches,yellow_cards,red_cards,goals,assists,goals_no_pk,penalty_kicks,penalty_kicks_attempted,goals_90m,assists_90m,goals_assists_90m,goals_no_pk_90m,goals_assists_no_pk_90m,squad,season,country,uk
0,Gary Neville,ENG,eng,DF,25.0,32,32,2849.0,31.7,1.0,1.0,1.0,0.0,0.0,4.0,0.0,0.03,0.03,0.06,0.03,0.06,Manchester United,2000-2001,England,True
1,Fabien Barthez,FRA,fr,GK,29.0,30,30,2675.0,29.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Manchester United,2000-2001,France,False
2,David Beckham,ENG,eng,MF,25.0,31,29,2648.0,29.4,9.0,12.0,8.0,1.0,1.0,3.0,0.0,0.31,0.41,0.71,0.27,0.68,Manchester United,2000-2001,England,True
3,Paul Scholes,ENG,eng,MF,25.0,32,28,2450.0,27.2,6.0,5.0,6.0,0.0,1.0,3.0,0.0,0.22,0.18,0.4,0.22,0.4,Manchester United,2000-2001,England,True
4,Roy Keane,IRL,ie,MF,28.0,28,28,2380.0,26.4,2.0,7.0,2.0,0.0,0.0,2.0,1.0,0.08,0.26,0.34,0.08,0.34,Manchester United,2000-2001,Republic of Ireland,False


In [18]:
uk_seasons = (
    df.groupby(["uk", "season"])
    .agg({"player": "count"})
    .reset_index()
    .rename(columns={"player": "count"})
)

In [19]:
uk_seasons.head()

Unnamed: 0,uk,season,count
0,False,2000-2001,241
1,False,2001-2002,262
2,False,2002-2003,280
3,False,2003-2004,277
4,False,2004-2005,281


In [20]:
uk_seasons_wide = (
    uk_seasons.pivot_table(index="season", columns="uk", values="count")
    .reset_index()
    .rename(columns={False: "not_uk", True: "uk"})
)

In [21]:
uk_seasons_wide["total"] = uk_seasons_wide["not_uk"] + uk_seasons_wide["uk"]

In [22]:
uk_seasons_wide["share"] = (
    (uk_seasons_wide["uk"] / uk_seasons_wide["total"]) * 100
).round(2)

In [23]:
alt.Chart(uk_seasons).mark_bar().encode(
    x=alt.X("count", stack="normalize"), y="season", color="uk"
)

  for col_name, dtype in df.dtypes.iteritems():


---

In [24]:
country_totals_all_seasons = (
    df.groupby(["country"])
    .agg({"player": "count"})
    .reset_index()
    .rename(columns={"player": "players"})
)

In [51]:
top_countries = (
    country_totals_all_seasons.sort_values("players", ascending=False)
    .head(21)["country"]
    .to_list()
)

In [52]:
total_players = (
    df.groupby(["season"])
    .agg({"player": "count"})
    .reset_index()
    .rename(columns={"player": "all_players"})
)

In [53]:
players_country = (
    df.groupby(["country", "season"])
    .agg({"player": "count"})
    .reset_index()
    .rename(columns={"player": "players"})
)

In [54]:
players_country["perc"] = players_country.groupby("season", group_keys=False)[
    "players"
].apply(lambda x: (x * 100 / x.sum()).round(2))

In [55]:
seasons = ["2000-2001", "2022-2023"]

In [56]:
then_now_top_countries = players_country[
    (players_country["country"].isin(top_countries))
    & (players_country["season"].isin(seasons))
    & (players_country["country"] != "England")
].copy()

In [57]:
then_now_top_countries["season_short"] = then_now_top_countries["season"].str[:4]

In [85]:
then_now_top_countries.pivot_table(
    index="country", columns="season_short", values="perc"
).reset_index()

season_short,country,2000,2022
0,Argentina,1.29,1.61
1,Australia,1.84,0.16
2,Belgium,0.74,1.93
3,Brazil,0.55,5.14
4,Denmark,0.74,2.25
5,France,4.23,5.63
6,Germany,1.47,1.93
7,Italy,2.02,0.8
8,Netherlands,2.94,2.25
9,Nigeria,0.92,1.13


In [76]:
alt.Chart(then_now_top_countries).mark_line().encode(
    x=alt.Y("season_short", title=""),
    y=alt.Y("perc", title=""),
    facet=alt.Facet("country", columns=5),
).properties(width=100, height=50)

  for col_name, dtype in df.dtypes.iteritems():


In [33]:
players_country

Unnamed: 0,country,season,players,perc
0,Afghanistan,2016-2017,1,0.16
1,Albania,2009-2010,1,0.18
2,Albania,2019-2020,1,0.16
3,Albania,2020-2021,1,0.15
4,Albania,2021-2022,1,0.14
...,...,...,...,...
1572,Zimbabwe,2016-2017,3,0.47
1573,Zimbabwe,2019-2020,1,0.16
1574,Zimbabwe,2020-2021,2,0.31
1575,Zimbabwe,2021-2022,1,0.14


---

#### USA

In [34]:
usa_df = pd.merge(
    players_country[players_country["country"] == "United States"],
    total_players,
    on="season",
)

In [35]:
usa_df["share"] = ((usa_df["players"] / usa_df["all_players"]) * 100).round(2)

In [36]:
bars = alt.Chart(usa_df).mark_bar().encode(x="share", y="season")

text = bars.mark_text(
    align="left",
    baseline="middle",
    dx=3,  # Nudges text to right so it doesn't appear on top of the bar
).encode(text="share:Q")

(bars + text).properties(height=500)

  for col_name, dtype in df.dtypes.iteritems():


---

In [37]:
# players_country[players_country["season"] == "2022-2023"].sort_values(
#     "players", ascending=False
# )

In [38]:
# players_country[players_country["country"] == "Brazil"].sort_values(
#     "season", ascending=False
# )