# Wikipedia pageviews API: Members of the U.S. House of Representatives

### Import Python tools and Jupyter configuration

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import requests
import pageviewapi
import datetime as dt
from bs4 import BeautifulSoup
import altair as alt
import altair_latimes as lat
import re

In [3]:
pd.options.display.max_columns = 100
pd.options.display.max_rows = 1000
alt.data_transformers.disable_max_rows()
pd.options.display.max_colwidth = None

In [4]:
today = dt.datetime.today().strftime("%Y%m%d")

---

### First, get a list of all members and their wiki article links

In [5]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

df = []

html = urlopen(
    "https://en.wikipedia.org/wiki/List_of_current_members_of_the_United_States_House_of_Representatives"
)
soup = BeautifulSoup(html, "html.parser")
table = soup.find_all("table", {"class": "wikitable sortable"})[2]

for row in table.find_all("tr")[1:]:
    col = row.find_all("td")
    df.append(dict(district=col[0], name=col[1]))

In [6]:
src = pd.DataFrame(df)
src.head()

Unnamed: 0,district,name
0,"[[[Alabama 1]], \n]","[[[]], [], [[Jerry Carl]], \n]"
1,"[[[Alabama 2]], \n]","[[[]], [], [[Barry Moore]], \n]"
2,"[[[Alabama 3]], \n]","[[[]], [], [[Mike Rogers]], \n]"
3,"[[[Alabama 4]], \n]","[[[]], [], [[Robert Aderholt]], \n]"
4,"[[[Alabama 5]], \n]","[[[]], [], [[Mo Brooks]], \n]"


In [7]:
df[0]

{'district': <td><span data-sort-value="Alabama01 !"><a href="/wiki/Alabama%27s_1st_congressional_district" title="Alabama's 1st congressional district">Alabama 1</a></span>
 </td>,
 'name': <td data-sort-value="Carl, Jerry"><a class="image" href="/wiki/File:Jerry_Carl_117th_U.S_Congress_(cropped).jpg"><img alt="Jerry Carl 117th U.S Congress (cropped).jpg" data-file-height="1298" data-file-width="1166" decoding="async" height="83" src="//upload.wikimedia.org/wikipedia/commons/thumb/1/1d/Jerry_Carl_117th_U.S_Congress_%28cropped%29.jpg/75px-Jerry_Carl_117th_U.S_Congress_%28cropped%29.jpg" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/1/1d/Jerry_Carl_117th_U.S_Congress_%28cropped%29.jpg/113px-Jerry_Carl_117th_U.S_Congress_%28cropped%29.jpg 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/1/1d/Jerry_Carl_117th_U.S_Congress_%28cropped%29.jpg/150px-Jerry_Carl_117th_U.S_Congress_%28cropped%29.jpg 2x" width="75"/></a><br/><b><a href="/wiki/Jerry_Carl" title="Jerry Carl">Jerry Carl

### Then get the urls into a list

In [36]:
articles = []

for l in src["name"]:
    if l.find("a") == None:
        """"""
    else:
        articles.append(l.find_all("a")[1]["href"].strip("/wiki/"))

In [37]:
for a in articles[0:10]:
    print(a)

Jerry_Carl
Barry_Moore_(Alabama_politician)
Mike_Rogers_(Alabama_politician)
Robert_Aderholt
Mo_Brooks
Gary_Palmer_(politician)
Terri_Sewell
Don_Young
Tom_O%27Halleran
Ann_Kirkpatric


### Pull daily data for this year

In [49]:
begin = "20210101"
end = "20210523"

term_dict = []

for a in articles:
    try:
        term_dict.append(
            pageviewapi.per_article(
                "en.wikipedia",
                a,
                begin,
                today,
                access="all-access",
                agent="all-agents",
                granularity="daily",
            )
        )
    except pageviewapi.client.ZeroOrDataNotLoadedException as e:
        views = 0

### Create a dataframe and stack each member's data into it

In [50]:
df = pd.DataFrame()

for t in term_dict:
    df = df.append(t["items"])

In [51]:
df.sort_values("views", ascending=False)

Unnamed: 0,project,article,granularity,timestamp,access,agent,views
89,en.wikipedia,Matt_Gaetz,daily,2021033100,all-access,all-agents,540592
12,en.wikipedia,Liz_Cheney,daily,2021011300,all-access,all-agents,366312
131,en.wikipedia,Liz_Cheney,daily,2021051200,all-access,all-agents,365664
35,en.wikipedia,Marjorie_Taylor_Greene,daily,2021020500,all-access,all-agents,359265
91,en.wikipedia,Matt_Gaetz,daily,2021040200,all-access,all-agents,308809
...,...,...,...,...,...,...,...
65,en.wikipedia,Jim_Baird_(American_politician),daily,2021030700,all-access,all-agents,9
146,en.wikipedia,Lizzie_Pannill_Fletcher,daily,2021052700,all-access,all-agents,8
144,en.wikipedia,Jim_Baird_(American_politician),daily,2021052500,all-access,all-agents,7
78,en.wikipedia,Lizzie_Pannill_Fletcher,daily,2021032000,all-access,all-agents,7


### Clean up date and member names

In [52]:
df["date"] = pd.to_datetime(df["timestamp"].str.strip("00"), format="%Y%m%d")
df["member"] = df["article"].str.replace("_", " ", regex=False)

### Lose the fields we don't need

In [53]:
members_df = df[["date", "member", "views"]].copy()

In [54]:
members_df["member"] = (
    members_df["member"]
    .str.replace("'s", "", regex=False)
    .str.replace(" congressional district", "", regex=False)
)

In [55]:
members_df.head()

Unnamed: 0,date,member,views
0,2021-01-01,Jerry Carl,167
1,2021-01-02,Jerry Carl,259
2,2021-01-03,Jerry Carl,843
3,2021-01-04,Jerry Carl,771
4,2021-01-05,Jerry Carl,415


### Chart the daily page views, by district

In [56]:
alt.Chart(members_df[members_df["member"] != "Texas 6th"]).mark_area(
    color="#82c6df"
).encode(
    x=alt.X("date", axis=alt.Axis(tickCount=2, format="%b. %Y", grid=False), title=" "),
    y=alt.Y(
        "views",
        title=" ",
        axis=alt.Axis(
            tickSize=0,
            domainOpacity=0,
            tickCount=3,
            offset=4,
            gridWidth=0.6,
            gridColor="#dddddd",
        ),
    ),
    facet=alt.Facet("member", columns=5, title=" "),
).properties(
    width=200,
    height=100,
    title="Daily pageviews on Wikipedia pages for U.S. House members",
).configure_view(
    strokeOpacity=0
).configure_axis(
    labelFontSize=12
)

---

### Aggregate by member

In [57]:
members_grp = members_df.groupby(["member"]).agg({"views": sum}).reset_index()

In [58]:
members_grp.sort_values("views", ascending=False)

Unnamed: 0,member,views
268,Marjorie Taylor Greene,3942113
277,Matt Gaetz,3258647
249,Liz Cheney,3061846
11,Alexandria Ocasio-Cortez,2507926
243,Lauren Boebert,1907759
260,Madison Cawthorn,1675934
179,Jamie Raskin,1404630
1,Adam Kinzinger,991997
237,Kevin McCarthy (California politician),887854
169,Ilhan Omar,739689


### Which members got the most page views during the season?

In [80]:
alt.Chart(members_grp.sort_values("views", ascending=True).head(10)).mark_bar(
    color="#82c6df"
).encode(
    x=alt.X(
        "views",
        title=" ",
        axis=alt.Axis(
            tickSize=0,
            domainOpacity=0,
            tickCount=3,
            offset=4,
            gridWidth=0.6,
            gridColor="#dddddd",
        ),
    ),
    y=alt.Y(
        "member",
        sort="-x",
        title=" ",
        axis=alt.Axis(
            tickSize=0,
            domainOpacity=0,
            tickCount=3,
            offset=4,
            gridWidth=0.6,
            gridColor="#dddddd",
        ),
    ),
    color=alt.condition(
        alt.datum.member == "",
        alt.value("steelblue"),  # which sets the bar orange.
        alt.value("#82c6df"),  # And if it's not true it sets the bar steelblue.
    ),
).properties(
    width=600,
    height=300,
    title="Top 10: Least pageviews on Wikipedia articles about U.S. House members in 2021",
).configure_view(
    strokeOpacity=0
).configure_axis(
    labelFontSize=12
)

---

### Exports

In [None]:
# daily to json
members_df.to_json(
    "data/processed/members_pageviews_daily.json", indent=2, orient="records"
)

In [None]:
# daily to csv
members_df.to_csv("data/processed/members_pageviews_daily.csv", index=False)

In [None]:
# season total
members_grp.to_csv("data/processed/members_pageviews_all_season.csv", index=False)

In [None]:
df = pd.DataFrame()

for t in term_dict:
    df = df.append(t["items"])

In [None]:
df.head()

### Clean up date and member names

In [None]:
df["date"] = pd.to_datetime(df["timestamp"].str.strip("00"), format="%Y%m%d")
df["member"] = (
    df["article"]
    .str.replace("_", " ", regex=False)
    .str.replace(" F.C.", "", regex=False)
)

### Lose the fields we don't need

In [None]:
members_df = df[["date", "member", "views"]]

In [None]:
members_df.head()

### Chart the daily page views, by team

In [None]:
alt.Chart(members_df).mark_area(color="#82c6df").encode(
    x=alt.X("date", axis=alt.Axis(tickCount=2, format="%b. %Y", grid=False), title=" "),
    y=alt.Y(
        "views",
        title=" ",
        axis=alt.Axis(
            tickSize=0,
            domainOpacity=0,
            tickCount=3,
            offset=4,
            gridWidth=0.6,
            gridColor="#dddddd",
        ),
    ),
    facet=alt.Facet("member", columns=5, title=" "),
).properties(
    width=200,
    height=100,
    title="Daily pageviews on Wikipedia pages for Premier League members",
).configure_view(
    strokeOpacity=0
).configure_axis(
    labelFontSize=12
)

---

### Aggregate by member

In [None]:
members_grp = members_df.groupby(["member"]).agg({"views": sum}).reset_index()

In [None]:
members_grp.sort_values("views", ascending=False)

### Which members got the most page views during the season?

In [None]:
alt.Chart(members_grp).mark_bar(color="#82c6df").encode(
    x=alt.X(
        "views",
        title=" ",
        axis=alt.Axis(
            tickSize=0,
            domainOpacity=0,
            tickCount=3,
            offset=4,
            gridWidth=0.6,
            gridColor="#dddddd",
        ),
    ),
    y=alt.Y(
        "member",
        sort="-x",
        title=" ",
        axis=alt.Axis(
            tickSize=0,
            domainOpacity=0,
            tickCount=3,
            offset=4,
            gridWidth=0.6,
            gridColor="#dddddd",
        ),
    ),
    color=alt.condition(
        alt.datum.member == "Leeds United",
        alt.value("steelblue"),  # which sets the bar orange.
        alt.value("#82c6df"),  # And if it's not true it sets the bar steelblue.
    ),
).properties(
    width=600,
    height=400,
    title="Daily pageviews on Wikipedia pages for Premier League members",
).configure_view(
    strokeOpacity=0
).configure_axis(
    labelFontSize=12
)

---

### Wikipedia views vs. actual points in league

In [None]:
tables = pd.read_html("https://en.wikipedia.org/wiki/2020%E2%80%9321_Premier_League")

In [None]:
table = tables[4]

In [None]:
table.columns = table.columns.str.lower()

In [None]:
table["team"] = (
    table["team"]
    .str.replace(" (C)", "", regex=False)
    .str.replace(" (R)", "", regex=False)
)

In [None]:
table.head()

In [None]:
merge = pd.merge(table, members_grp, left_on="team", right_on="member")

In [None]:
merge[["pts", "views"]].corr(method="pearson")

In [None]:
merge[["pts", "views"]].dtypes

In [None]:
points = (
    alt.Chart(merge)
    .mark_circle(size=100)
    .encode(
        x=alt.X(
            "pts:Q",
            title="League points",
            axis=alt.Axis(
                tickSize=0,
                domainOpacity=0,
                tickCount=6,
                offset=4,
                grid=False,
                gridColor="#dddddd",
            ),
        ),
        y=alt.Y(
            "views:Q",
            title=" ",
            axis=alt.Axis(
                tickSize=0,
                domainOpacity=0,
                tickCount=6,
                offset=4,
                gridWidth=0.6,
                gridColor="#dddddd",
            ),
        ),
    )
)

text = points.mark_text(align="left", baseline="middle", dx=7).encode(text="team")

(points + text).properties(
    width=800,
    height=600,
    title="Daily pageviews on Wikipedia pages vs. Premier League points",
).configure_view(strokeOpacity=0).configure_axis(labelFontSize=12)

### Exports

In [None]:
# daily to json
members_df.to_json(
    "data/processed/members_pageviews_daily.json", indent=2, orient="records"
)

In [None]:
# daily to csv
members_df.to_csv("data/processed/members_pageviews_daily.csv", index=False)

In [None]:
# season total
members_grp.to_csv("data/processed/members_pageviews_all_season.csv", index=False)