# Wikipedia pageviews API: Members of the U.S. House

#### Import Python tools and Jupyter configuration

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import requests
import pageviewapi
import datetime as dt
from bs4 import BeautifulSoup
import altair as alt
from tqdm.notebook import tqdm, trange

In [3]:
pd.options.display.max_columns = 100
pd.options.display.max_rows = 1000
pd.options.display.max_colwidth = None

In [4]:
begin = "20200101"
today = dt.datetime.today().strftime("%Y%m%d")

---

## Scrape

#### First, get a list of all members and their wiki article links

In [5]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

data_list = []

html = urlopen(
    "https://en.wikipedia.org/wiki/List_of_current_members_of_the_United_States_House_of_Representatives"
)
soup = BeautifulSoup(html, "html.parser")
table = soup.find("table", {"id": "votingmembers"})

for row in table.find_all("tr")[1:]:
    col = row.find_all("td")
    link = row.find_all("a")

    if (len(col) < 3) | (len(link) < 2):
        data_list.append(
            dict(
                # district=col[0].text,
                name=col[0],
                party="Vacant",
                slug="none",
            )
        )
    else:
        data_list.append(
            dict(
                # district=col[1].text.strip("\n"),
                name=col[0].text.strip("\n"),
                party=col[2].text.strip("\n"),
                slug=link[2]["href"].strip("/wiki/"),
            )
        )

#### Into a dataframe

In [6]:
src = pd.DataFrame(data_list)

#### Get their districts from the wikitable (they have a diffent structure on the page)

In [7]:
district_list = []

for row in table.find_all("th", {"scope": "row"}):
    links = row.find_all("a")[0].text.replace("\xa0", " ")
    district_list.append(links)

#### Add the districts to the members dataframe

In [8]:
src["district"] = district_list

#### Next, pull monthly pageviews data using the wikipedia API

In [10]:
term_dict = []

for a in tqdm(src["slug"]):
    try:
        term_dict.append(
            pageviewapi.per_article(
                "en.wikipedia",
                a,
                begin,
                today,
                access="all-access",
                agent="all-agents",
                granularity="monthly",
            )
        )
    except pageviewapi.client.ZeroOrDataNotLoadedException as e:
        views = 0

---

## Process

#### Create a dataframe and stack each member's data into it

In [11]:
views_list = []

for term in term_dict:
    for t in term["items"]:
        views_list.append(t)

In [12]:
views = pd.DataFrame(views_list)

#### Clean up date

In [58]:
views["date"] = pd.to_datetime(views["timestamp"].str.strip("00"), format="%Y%m%d")

#### Clean dataframe

In [59]:
df = (
    pd.merge(src, views, left_on="slug", right_on="article")
    .drop(["article", "agent", "access", "project", "timestamp"], axis=1)
    .sort_values("views", ascending=False)
    .reset_index(drop=True)
)

#### Find one member's figures

In [61]:
df["name"] = df["name"].str.strip()

In [62]:
df[df["name"] == "Lauren Boebert"].head()

Unnamed: 0,name,party,slug,district,granularity,views,date
7,Lauren Boebert,Republican,Lauren_Boebert,Colorado 3,monthly,1232625,2022-11-01
8,Lauren Boebert,Republican,Lauren_Boebert,Colorado 3,monthly,1222941,2023-09-01
10,Lauren Boebert,Republican,Lauren_Boebert,Colorado 3,monthly,1213544,2021-01-01
26,Lauren Boebert,Republican,Lauren_Boebert,Colorado 3,monthly,593522,2023-01-01
41,Lauren Boebert,Republican,Lauren_Boebert,Colorado 3,monthly,412014,2022-03-01


---

## Aggregate 

#### By member

In [54]:
members_grp = (
    df.groupby(["name", "party"])
    .agg({"views": sum})
    .sort_values("views", ascending=False)
    .reset_index()
)

In [55]:
members_grp.head(10)

Unnamed: 0,name,party,views
0,Alexandria Ocasio-Cortez,Democratic,17689897
1,Marjorie Taylor Greene,Republican,11260832
2,Lauren Boebert,Republican,10390784
3,Matt Gaetz,Republican,9216695
4,Ilhan Omar,Democratic,5716492
5,Kevin McCarthy,Republican,4831307
6,Hakeem Jeffries,Democratic,3519914
7,Adam Schiff,Democratic,3244530
8,Jamie Raskin,Democratic,3059760
9,Rashida Tlaib,Democratic,2908607


In [63]:
most_discussed = (
    members_grp.sort_values("views", ascending=False)["name"].head(10).to_list()
)
most_discussed

['Alexandria Ocasio-Cortez',
 'Marjorie Taylor Greene',
 'Lauren Boebert',
 'Matt Gaetz',
 'Ilhan Omar',
 'Kevin McCarthy',
 'Hakeem Jeffries',
 'Adam Schiff',
 'Jamie Raskin',
 'Rashida Tlaib']

#### By party

In [67]:
party_grp = df.groupby(["date", "party"]).agg({"views": sum}).reset_index()

In [68]:
party_grp_pivot = party_grp.pivot_table(
    columns="party", index="date", values="views"
).reset_index()

In [69]:
party_grp_pivot.columns = party_grp_pivot.columns.str.lower()

In [73]:
party_grp_pivot

In [77]:
mean_democrat = party_grp_pivot["democratic"].mean().round()
mean_democrat

2389791.0

In [78]:
mean_republican = party_grp_pivot["republican"].mean().round()
mean_republican

2176325.0

---

## Exports

#### Monthly to json

In [None]:
df.to_json("output/members_pageviews_daily.json", indent=2, orient="records")

#### Monthly to csv

In [None]:
df.to_csv("output/members_pageviews_daily.csv", index=False)