# Wikipedia pageviews API: Members of the U.S. House

### Import Python tools and Jupyter configuration

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import requests
import pageviewapi
import datetime as dt
from bs4 import BeautifulSoup
import altair as alt

In [3]:
pd.options.display.max_columns = 100
pd.options.display.max_rows = 1000
pd.options.display.max_colwidth = None

In [4]:
begin = "20210101"
today = dt.datetime.today().strftime("%Y%m%d")

---

### First, get a list of all members and their wiki article links

In [5]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

data_list = []

html = urlopen(
    "https://en.wikipedia.org/wiki/List_of_current_members_of_the_United_States_House_of_Representatives"
)
soup = BeautifulSoup(html, "html.parser")
table = soup.find_all("table", {"class": "wikitable sortable"})[2]

for row in table.find_all("tr")[1:]:
    col = row.find_all("td")
    link = row.find_all("a")

    if (len(col) < 3) | (len(link) < 2):
        data_list.append(
            dict(district=col[0].text, name=col[1], party="Vacant", slug="none")
        )
    else:
        data_list.append(
            dict(
                district=col[0].text.strip("\n"),
                name=col[1],
                party=col[3].text.strip("\n"),
                slug=link[2]["href"].strip("/wiki/"),
            )
        )

In [6]:
src = pd.DataFrame(data_list)

### Pull daily data for this year

In [7]:
term_dict = []

for a in src["slug"]:
    try:
        term_dict.append(
            pageviewapi.per_article(
                "en.wikipedia",
                a,
                begin,
                today,
                access="all-access",
                agent="all-agents",
                granularity="daily",
            )
        )
    except pageviewapi.client.ZeroOrDataNotLoadedException as e:
        views = 0

### Create a dataframe and stack each member's data into it

In [8]:
src_df = pd.DataFrame()

for t in term_dict:
    src_df = src_df.append(t["items"])

### Who's had the most views? 

In [9]:
src_df.sort_values("views", ascending=False).head()

Unnamed: 0,project,article,granularity,timestamp,access,agent,views
89,en.wikipedia,Matt_Gaetz,daily,2021033100,all-access,all-agents,540592
12,en.wikipedia,Liz_Cheney,daily,2021011300,all-access,all-agents,366312
131,en.wikipedia,Liz_Cheney,daily,2021051200,all-access,all-agents,365664
35,en.wikipedia,Marjorie_Taylor_Greene,daily,2021020500,all-access,all-agents,359265
91,en.wikipedia,Matt_Gaetz,daily,2021040200,all-access,all-agents,308809


### Find one member's figures?

In [10]:
src_df[src_df["article"] == "Liz_Cheney"].head()

Unnamed: 0,project,article,granularity,timestamp,access,agent,views
0,en.wikipedia,Liz_Cheney,daily,2021010100,all-access,all-agents,1646
1,en.wikipedia,Liz_Cheney,daily,2021010200,all-access,all-agents,2044
2,en.wikipedia,Liz_Cheney,daily,2021010300,all-access,all-agents,5193
3,en.wikipedia,Liz_Cheney,daily,2021010400,all-access,all-agents,14474
4,en.wikipedia,Liz_Cheney,daily,2021010500,all-access,all-agents,10838


### Clean up date and member names

In [11]:
src_df["date"] = pd.to_datetime(src_df["timestamp"].str.strip("00"), format="%Y%m%d")
src_df["member"] = src_df["article"].str.replace("_", " ", regex=False)

In [12]:
merged = src_df.merge(src[["slug", "party"]], left_on="article", right_on="slug")

### Lose the fields we don't need

In [13]:
merged[["member", "trash"]] = merged["member"].str.split(pat="(", expand=True)

In [14]:
df = merged[["date", "member", "views", "party"]].copy()

In [15]:
df["member"] = (
    df["member"]
    .str.replace("'s", "", regex=False)
    .str.replace(" congressional district", "", regex=False)
)

### MTG

In [47]:
mtg = df[df["member"] == "Marjorie Taylor Greene"].reset_index(drop=True)
gaetz = df[df["member"] == "Matt Gaetz"].reset_index(drop=True)

In [54]:
mtg.head()

Unnamed: 0,date,member,views,party
0,2021-01-01,Marjorie Taylor Greene,1629,Republican
1,2021-01-02,Marjorie Taylor Greene,1785,Republican
2,2021-01-03,Marjorie Taylor Greene,9230,Republican
3,2021-01-04,Marjorie Taylor Greene,13623,Republican
4,2021-01-05,Marjorie Taylor Greene,18968,Republican


In [48]:
gaetz.head()

Unnamed: 0,date,member,views,party
0,2021-01-01,Matt Gaetz,5003,Republican
1,2021-01-02,Matt Gaetz,3510,Republican
2,2021-01-03,Matt Gaetz,2288,Republican
3,2021-01-04,Matt Gaetz,3027,Republican
4,2021-01-05,Matt Gaetz,3693,Republican


---

### Aggregate by member

In [16]:
df.head()

In [17]:
members_grp = df.groupby(["member", "party"]).agg({"views": sum}).reset_index()

In [18]:
members_grp.sort_values("views", ascending=False).head()

Unnamed: 0,member,party,views
264,Marjorie Taylor Greene,Republican,4875798
273,Matt Gaetz,Republican,4010896
11,Alexandria Ocasio-Cortez,Democratic,3918370
246,Liz Cheney,Republican,3658324
241,Lauren Boebert,Republican,2840268


In [19]:
most_discussed = (
    members_grp.sort_values("views", ascending=False)["member"].head(10).to_list()
)

In [20]:
most_discussed

['Marjorie Taylor Greene',
 'Matt Gaetz',
 'Alexandria Ocasio-Cortez',
 'Liz Cheney',
 'Lauren Boebert',
 'Madison Cawthorn',
 'Jamie Raskin',
 'Adam Kinzinger',
 'Ilhan Omar',
 'Kevin McCarthy ']

---

### Aggregate by party

In [32]:
party_grp = df.groupby(["date", "party"]).agg({"views": sum}).reset_index()

In [38]:
party_grp_pivot = party_grp.pivot_table(
    columns="party", index="date", values="views"
).reset_index()

In [40]:
party_grp_pivot.columns = party_grp_pivot.columns.str.lower()

In [42]:
party_grp_pivot.drop("vacant", axis=1, inplace=True)

---

### Exports

In [22]:
# daily to json
df.to_json("output/members_pageviews_daily.json", indent=2, orient="records")

In [23]:
# daily to csv
df.to_csv("output/members_pageviews_daily.csv", index=False)

In [24]:
# all of 2021 to csv
members_grp.to_csv("output/members_pageviews_totals.csv", index=False)

In [25]:
# all of 2021 to json
members_grp.to_json("output/members_pageviews_totals.json", indent=2, orient="records")

In [26]:
# all of 2021 to csv > 500,000 views
members_grp[members_grp["views"] > 500000].to_csv(
    "output/members_pageviews_totals_500k.csv", index=False
)

In [44]:
party_grp_pivot.to_csv("output/party_pageviews_by_day.csv", index=False)

In [43]:
mtg.to_csv("output/mtg_pageviews_by_day.csv", index=False)