# Wikipedia pageviews API: English football clubs

### Import Python tools and Jupyter configuration

In [1]:
%load_ext lab_black

In [1]:
import pandas as pd
import requests
import pageviewapi
import datetime as dt
from bs4 import BeautifulSoup
import altair as alt
import altair_latimes as lat

In [4]:
# Display settings: widen pandas' notebook output limits and lift Altair's row cap.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_colwidth", None)
alt.data_transformers.disable_max_rows()

In [5]:
# Current local date as a compact YYYYMMDD string (used as the API end date).
today = f"{dt.datetime.today():%Y%m%d}"

### Get Premier League clubs

In [6]:
# https://en.wikipedia.org/wiki/2020%E2%80%9321_Premier_League

In [7]:
# Wikipedia article titles (URL form, underscores for spaces) for the 20 clubs
# listed on the 2020-21 Premier League page. "%26" is the percent-encoded "&".
clubs = [
    "Arsenal_F.C.",
    "Aston_Villa_F.C.",
    "Brighton_%26_Hove_Albion_F.C.",
    "Burnley_F.C.",
    "Chelsea_F.C.",
    "Crystal_Palace_F.C.",
    "Everton_F.C.",
    "Fulham_F.C.",
    "Leeds_United_F.C.",
    "Leicester_City_F.C.",
    "Liverpool_F.C.",
    "Manchester_City_F.C.",
    "Manchester_United_F.C.",
    "Newcastle_United_F.C.",
    "Sheffield_United_F.C.",
    "Southampton_F.C.",
    "Tottenham_Hotspur_F.C.",
    "West_Bromwich_Albion_F.C.",
    "West_Ham_United_F.C.",
    "Wolverhampton_Wanderers_F.C.",
]

In [8]:
# Shallow copy of the club titles; these are the articles requested from the API.
articles = list(clubs)

In [9]:
# https://github.com/Commonists/pageview-api

### Pull daily data from the start of the season to today

In [10]:
# Date window for the request. NOTE: `end` is assigned here, but the call below
# passes `today` as the end date, so the pull runs up to the current day.
begin = "20200912"
end = "20210523"

# One API response per article, at daily granularity, across all access
# methods and agent types.
term_dict = [
    pageviewapi.per_article(
        "en.wikipedia",
        article,
        begin,
        today,
        access="all-access",
        agent="all-agents",
        granularity="daily",
    )
    for article in articles
]

### Create a dataframe and stack each club's data into it

In [11]:
# Stack every club's daily records into a single DataFrame.
# `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0, and calling
# it inside a loop re-copies the accumulated frame on every iteration. Build one
# frame per API response and concatenate them in a single pass instead.
frames = [pd.DataFrame(response["items"]) for response in term_dict]

# Each club keeps its own 0..n-1 row index, exactly as the append loop produced.
# `pd.concat` raises on an empty list, so fall back to an empty frame when
# nothing was fetched (the original loop also yielded an empty DataFrame).
df = pd.concat(frames) if frames else pd.DataFrame()

In [12]:
# Preview the first rows of the stacked pageview records.
df.head()

Unnamed: 0,project,article,granularity,timestamp,access,agent,views
0,en.wikipedia,Arsenal_F.C.,daily,2020091200,all-access,all-agents,33516
1,en.wikipedia,Arsenal_F.C.,daily,2020091300,all-access,all-agents,17922
2,en.wikipedia,Arsenal_F.C.,daily,2020091400,all-access,all-agents,14769
3,en.wikipedia,Arsenal_F.C.,daily,2020091500,all-access,all-agents,15256
4,en.wikipedia,Arsenal_F.C.,daily,2020091600,all-access,all-agents,16964


### Clean up date and club names

In [13]:
# Timestamps arrive as "YYYYMMDDHH" strings (e.g. "2020091200"). Take the first
# eight characters instead of stripping zeros: `.str.strip("00")` removes *every*
# leading/trailing "0" character, so "2020091000" (10 Sept) collapsed to "2020091"
# and was parsed as 1 Sept -- the 10th, 20th and 30th of each month were mis-dated.
df["date"] = pd.to_datetime(df["timestamp"].str[:8], format="%Y%m%d")

# Turn the article title into a readable club name:
# underscores become spaces and the trailing " F.C." is dropped.
df["club"] = (
    df["article"]
    .str.replace("_", " ", regex=False)
    .str.replace(" F.C.", "", regex=False)
)

### Lose the fields we don't need

In [14]:
# Keep only the columns used by the charts, aggregation and exports.
keep_cols = ["date", "club", "views"]
clubs_df = df[keep_cols]

In [15]:
# Preview the trimmed frame (date, club, views).
clubs_df.head()

Unnamed: 0,date,club,views
0,2020-09-12,Arsenal,33516
1,2020-09-13,Arsenal,17922
2,2020-09-14,Arsenal,14769
3,2020-09-15,Arsenal,15256
4,2020-09-16,Arsenal,16964


### Chart the daily page views, by team

In [16]:
# Axis definitions for the small-multiples area chart.
date_axis = alt.Axis(tickCount=2, format="%b. %Y", grid=False)
views_axis = alt.Axis(
    tickSize=0,
    domainOpacity=0,
    tickCount=3,
    offset=4,
    gridWidth=0.6,
    gridColor="#dddddd",
)

# One area chart per club, laid out five to a row.
daily_chart = (
    alt.Chart(clubs_df)
    .mark_area(color="#82c6df")
    .encode(
        x=alt.X("date", axis=date_axis, title=" "),
        y=alt.Y("views", title=" ", axis=views_axis),
        facet=alt.Facet("club", columns=5, title=" "),
    )
    .properties(
        width=200,
        height=100,
        title="Daily pageviews on Wikipedia pages for Premier League clubs",
    )
    .configure_view(strokeOpacity=0)
    .configure_axis(labelFontSize=12)
)

# Last expression in the cell, so the chart is rendered.
daily_chart

---

### Aggregate by club

In [17]:
# Total pageviews per club across the whole pull.
# Use the string "sum" rather than the Python builtin `sum`: passing the builtin
# to `.agg` emits a FutureWarning in pandas >= 2.1 and is slated to change behaviour.
clubs_grp = clubs_df.groupby(["club"]).agg({"views": "sum"}).reset_index()

In [18]:
# Display clubs ranked by total views, highest first (does not modify clubs_grp).
clubs_grp.sort_values("views", ascending=False)

Unnamed: 0,club,views
12,Manchester United,4561526
4,Chelsea,3928596
10,Liverpool,3212136
11,Manchester City,3044509
0,Arsenal,3036657
16,Tottenham Hotspur,2806915
9,Leicester City,1768631
8,Leeds United,1552345
6,Everton,1359939
1,Aston Villa,1280574


### Which clubs got the most page views during the season?

In [19]:
# Horizontal bar chart of total pageviews per club, sorted from most to least
# viewed, with Leeds United drawn in a different colour from the rest.
alt.Chart(clubs_grp).mark_bar(color="#82c6df").encode(
    x=alt.X(
        "views",
        title=" ",
        axis=alt.Axis(
            tickSize=0,
            domainOpacity=0,
            tickCount=3,
            offset=4,
            gridWidth=0.6,
            gridColor="#dddddd",
        ),
    ),
    y=alt.Y(
        "club",
        sort="-x",  # order bars by descending x (views)
        title=" ",
        axis=alt.Axis(
            tickSize=0,
            domainOpacity=0,
            tickCount=3,
            offset=4,
            gridWidth=0.6,
            gridColor="#dddddd",
        ),
    ),
    color=alt.condition(
        alt.datum.club == "Leeds United",
        alt.value("steelblue"),  # highlight colour when the club is Leeds United
        alt.value("#82c6df"),  # light blue for every other club
    ),
).properties(
    width=600,
    height=400,
    # This chart plots per-club totals (clubs_grp), not daily values.
    title="Total pageviews on Wikipedia pages for Premier League clubs",
).configure_view(
    strokeOpacity=0
).configure_axis(
    labelFontSize=12
)

---

### Wikipedia views vs. actual points in league

In [20]:
# Parse the HTML tables on the 2020-21 Premier League Wikipedia page into a list of DataFrames.
tables = pd.read_html("https://en.wikipedia.org/wiki/2020%E2%80%9321_Premier_League")

In [21]:
# Select the table at position 4, which the preview below shows to be the league standings.
# NOTE(review): this positional index depends on the page's current layout -- confirm it
# still points at the standings table if the article is edited.
table = tables[4]

In [22]:
# Lower-case the column labels so they can be referenced as "team", "pts", etc.
table.columns = table.columns.str.lower()

In [23]:
# Remove the literal " (C)" and " (R)" markers from team names so they line up
# with the club names derived from the article titles.
team_names = table["team"]
for marker in (" (C)", " (R)"):
    team_names = team_names.str.replace(marker, "", regex=False)
table["team"] = team_names

In [24]:
# Preview the cleaned standings table.
table.head()

Unnamed: 0,pos,team,pld,w,d,l,gf,ga,gd,pts,qualification or relegation
0,1,Manchester City,38,27,5,6,83,32,51,86,Qualification for the Champions League group stage
1,2,Manchester United,38,21,11,6,73,44,29,74,Qualification for the Champions League group stage
2,3,Liverpool,38,20,9,9,68,42,26,69,Qualification for the Champions League group stage
3,4,Chelsea,38,19,10,9,58,36,22,67,Qualification for the Champions League group stage
4,5,Leicester City,38,20,6,12,68,50,18,66,Qualification for the Europa League group stage[a]


In [25]:
# Join standings to pageview totals on the club name. This uses the default
# inner join, so rows whose names do not match on both sides are dropped.
merge = table.merge(clubs_grp, left_on="team", right_on="club")

In [26]:
# Pearson correlation matrix between league points and total pageviews.
merge[["pts", "views"]].corr(method="pearson")

Unnamed: 0,pts,views
pts,1.0,0.775934
views,0.775934,1.0


In [27]:
# Check the dtypes of both columns (the output below shows int64 for each).
merge[["pts", "views"]].dtypes

pts      int64
views    int64
dtype: object

In [28]:
# Scatter plot: league points (x) against total Wikipedia pageviews (y), one dot per club.
points = (
    alt.Chart(merge)
    .mark_circle(size=100)
    .encode(
        x=alt.X(
            "pts:Q",
            title="League points",
            axis=alt.Axis(
                tickSize=0,
                domainOpacity=0,
                tickCount=6,
                offset=4,
                grid=False,
                gridColor="#dddddd",
            ),
        ),
        y=alt.Y(
            "views:Q",
            title=" ",
            axis=alt.Axis(
                tickSize=0,
                domainOpacity=0,
                tickCount=6,
                offset=4,
                gridWidth=0.6,
                gridColor="#dddddd",
            ),
        ),
    )
)

# Reuse the same encodings with a text mark to label each dot with its team name,
# nudged 7px to the right of the point.
text = points.mark_text(align="left", baseline="middle", dx=7).encode(text="team")

# Layer dots and labels. The y values are per-club totals from clubs_grp, so the
# title says "Total" rather than "Daily".
(points + text).properties(
    width=800,
    height=600,
    title="Total pageviews on Wikipedia pages vs. Premier League points",
).configure_view(strokeOpacity=0).configure_axis(labelFontSize=12)

### Exports

In [29]:
# Export the daily per-club pageviews as a JSON array of row records, indented by 2.
# NOTE(review): with pandas' default `date_format`, the datetime column is likely
# written as epoch milliseconds rather than ISO strings -- confirm that is intended.
clubs_df.to_json(
    "data/processed/clubs_pageviews_daily.json", indent=2, orient="records"
)

In [30]:
# Export the daily per-club pageviews as CSV, omitting the row index.
clubs_df.to_csv("data/processed/clubs_pageviews_daily.csv", index=False)

In [31]:
# Export the per-club pageview totals as CSV, omitting the row index.
clubs_grp.to_csv("data/processed/clubs_pageviews_all_season.csv", index=False)