# Wikipedia pageviews API: Members of the U.S. House

### Import Python tools and Jupyter configuration

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import requests
import pageviewapi
import datetime as dt
from bs4 import BeautifulSoup
import altair as alt

In [3]:
pd.options.display.max_columns = 100
pd.options.display.max_rows = 1000
pd.options.display.max_colwidth = None

In [4]:
begin = "20210101"
today = dt.datetime.today().strftime("%Y%m%d")

---

### First, get a list of all members and their wiki article links

In [5]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

data_list = []

html = urlopen(
    "https://en.wikipedia.org/wiki/List_of_current_members_of_the_United_States_House_of_Representatives"
)
soup = BeautifulSoup(html, "html.parser")
table = soup.find_all("table", {"class": "wikitable sortable"})[2]

for row in table.find_all("tr")[1:]:
    col = row.find_all("td")
    link = row.find_all("a")

    if (len(col) < 3) | (len(link) < 2):
        data_list.append(
            dict(district=col[0].text, name=col[1], party="Vacant", slug="none")
        )
    else:
        data_list.append(
            dict(
                district=col[0].text.strip("\n"),
                name=col[1],
                party=col[3].text.strip("\n"),
                slug=link[2]["href"].strip("/wiki/"),
            )
        )

IndexError: list index out of range

In [None]:
src = pd.DataFrame(data_list)

### Pull daily data for this year

In [None]:
term_dict = []

for a in src["slug"]:
    try:
        term_dict.append(
            pageviewapi.per_article(
                "en.wikipedia",
                a,
                begin,
                today,
                access="all-access",
                agent="all-agents",
                granularity="daily",
            )
        )
    except pageviewapi.client.ZeroOrDataNotLoadedException as e:
        views = 0

### Create a dataframe and stack each member's data into it

In [None]:
src_df = pd.DataFrame()

for t in term_dict:
    src_df = src_df.append(t["items"])

### Who's had the most views? 

In [None]:
src_df.sort_values("views", ascending=False).head()

### Find one member's figures?

In [None]:
src_df[src_df["article"] == "Liz_Cheney"].head()

### Clean up date and member names

In [None]:
src_df["date"] = pd.to_datetime(src_df["timestamp"].str.strip("00"), format="%Y%m%d")
src_df["member"] = src_df["article"].str.replace("_", " ", regex=False)

In [None]:
merged = src_df.merge(src[["slug", "party"]], left_on="article", right_on="slug")

### Lose the fields we don't need

In [None]:
merged[["member", "trash"]] = merged["member"].str.split(pat="(", expand=True)

In [None]:
df = merged[["date", "member", "views", "party"]].copy()

In [None]:
df["member"] = (
    df["member"]
    .str.replace("'s", "", regex=False)
    .str.replace(" congressional district", "", regex=False)
)

### MTG

In [None]:
mtg = df[df["member"] == "Marjorie Taylor Greene"].reset_index(drop=True)
gaetz = df[df["member"] == "Matt Gaetz"].reset_index(drop=True)

In [None]:
mtg.head()

In [None]:
gaetz.head()

---

### Aggregate by member

In [None]:
df.head()

In [None]:
members_grp = df.groupby(["member", "party"]).agg({"views": sum}).reset_index()

In [None]:
members_grp.sort_values("views", ascending=False).head()

In [None]:
most_discussed = (
    members_grp.sort_values("views", ascending=False)["member"].head(10).to_list()
)

In [None]:
most_discussed

---

### Aggregate by party

In [None]:
party_grp = df.groupby(["date", "party"]).agg({"views": sum}).reset_index()

In [None]:
party_grp_pivot = party_grp.pivot_table(
    columns="party", index="date", values="views"
).reset_index()

In [None]:
party_grp_pivot.columns = party_grp_pivot.columns.str.lower()

In [None]:
party_grp_pivot.drop("vacant", axis=1, inplace=True)

---

### Exports

In [None]:
# daily to json
df.to_json("output/members_pageviews_daily.json", indent=2, orient="records")

In [None]:
# daily to csv
df.to_csv("output/members_pageviews_daily.csv", index=False)

In [None]:
# all of 2021 to csv
members_grp.to_csv("output/members_pageviews_totals.csv", index=False)

In [None]:
# all of 2021 to json
members_grp.to_json("output/members_pageviews_totals.json", indent=2, orient="records")

In [None]:
# all of 2021 to csv > 500,000 views
members_grp[members_grp["views"] > 500000].to_csv(
    "output/members_pageviews_totals_500k.csv", index=False
)

In [None]:
party_grp_pivot.to_csv("output/party_pageviews_by_day.csv", index=False)

In [None]:
mtg.to_csv("output/mtg_pageviews_by_day.csv", index=False)