In [16]:
from httpx import AsyncClient
import os
from os import path
import pandas as pd
import sqlite3
from contextlib import contextmanager
from typing import Optional
import re

In [17]:
# Session identifiers in "<parliament>-<session>" form, listed newest first.
PARLIMENTARY_SESSIONS = (
    "44-1",
    "43-2", "43-1",
    "42-1",
    "41-2", "41-1",
    "40-3", "40-2", "40-1",
    "39-2", "39-1",
    "38-1",
)

# Directory where downloaded CSVs are kept, and the SQLite file this notebook writes.
DATA_DIR = "backend/data"
EXPORT_DB = "repsheet.sqlite"

# Table names; the two vote tables double as sub-directory names under DATA_DIR.
MEMBERS_TABLE = "members"
VOTES_HELD_TABLE = "votes_held"
MEMBER_VOTES_TABLE = "member_votes"

os.makedirs(DATA_DIR, exist_ok=True)

# Shared async HTTP client for every download below.
# Note: `httpx` here is an AsyncClient instance, not the httpx module.
httpx = AsyncClient()

@contextmanager
def db_connect():
    """Context manager yielding a connection to the export database.

    Yields:
        A ``sqlite3.Connection`` to ``EXPORT_DB`` whose ``row_factory`` is
        ``sqlite3.Row``, so columns can be read by name or by position.

    The pending transaction is committed only when the ``with`` body finishes
    normally. If the body raises, the transaction is rolled back and the
    exception propagates, so a failed cell does not persist half-finished
    writes. The connection is closed in every case.
    """
    db = sqlite3.connect(EXPORT_DB)
    db.row_factory = sqlite3.Row
    try:
        yield db
    except BaseException:
        # Also covers KeyboardInterrupt (kernel interrupt) so nothing partial is kept.
        db.rollback()
        raise
    else:
        db.commit()
    finally:
        db.close()

In [18]:
def print_table_schema(table_name):
    """Print one ``<column name>: <declared type>`` line per column of ``table_name``.

    Opens its own connection through ``db_connect``. The table name is
    interpolated directly into the PRAGMA statement, so pass trusted names only.
    """
    with db_connect() as conn:
        columns = conn.cursor().execute(f"PRAGMA table_info({table_name})").fetchall()
    # Positions 1 and 2 of each table_info row are what gets printed as name and type.
    for column in columns:
        column_name, declared_type = column[1], column[2]
        print(f"{column_name}: {declared_type}")

def parse_parl_datetime(date_str: str) -> Optional[pd.Timestamp]:
    """Convert a parliamentary datetime string into a Canada/Eastern timestamp.

    Example input: ``2024-12-17 3:50:01 p.m.``

    Args:
        date_str: Text in ``YYYY-MM-DD H:MM:SS a.m./p.m.`` form. Empty strings,
            ``None`` and NaN (as pandas produces for blank CSV cells) are accepted.

    Returns:
        A timezone-aware ``pd.Timestamp`` localized to Canada/Eastern, or ``None``
        when the input is missing.
    """
    is_missing = not date_str or pd.isna(date_str)
    if is_missing:
        return None

    # strptime's %p understands "AM"/"PM", not the dotted "a.m."/"p.m." spelling.
    normalized = date_str
    for dotted, plain in (("p.m.", "PM"), ("a.m.", "AM")):
        normalized = normalized.replace(dotted, plain)

    naive = pd.to_datetime(normalized, format="%Y-%m-%d %I:%M:%S %p")
    return naive.tz_localize("Canada/Eastern")

# Build the sqlite db

## Members of Parliament

### Download members CSV

In [19]:
latest_parliament = max(PARLIMENTARY_SESSIONS).split("-")[0]
assert latest_parliament == "44"

filename = f"members-{latest_parliament}.csv"
filepath = path.join(DATA_DIR, filename)
if not path.exists(filepath):
    resp = await httpx.get(f"https://www.ourcommons.ca/Members/en/search/csv?parliament={latest_parliament}&caucusId=all&province=all&gender=all")
    resp.raise_for_status()
    with open(filepath, "wb") as f:
        f.write(resp.content)
    print(f"Downloaded {filename}")

Downloaded members-44.csv


### Insert into sqlite

In [None]:
# Re-derive the members CSV path instead of reading the `filepath` global: the
# vote cells further down rebind `filepath`, so running cells out of order
# would otherwise load the wrong file here.
members_csv_path = path.join(DATA_DIR, f"members-{latest_parliament}.csv")
assert path.exists(members_csv_path), f"File {members_csv_path} does not exist"

members = pd.read_csv(members_csv_path)
members["Start Date"] = members["Start Date"].apply(parse_parl_datetime)
members["End Date"] = members["End Date"].apply(parse_parl_datetime)
# Synthetic key of the form "First Last (Constituency)".
# Single quotes inside the f-string keep this valid on Python versions before 3.12.
members["Member ID"] = members.apply(
    lambda row: f"{row['First Name']} {row['Last Name']} ({row['Constituency']})",
    axis=1,
)

# Check uniqueness before writing, so a clash is reported by name instead of
# surfacing as a failed CREATE UNIQUE INDEX after the table was already replaced.
duplicate_member_ids = members.loc[members["Member ID"].duplicated(), "Member ID"].tolist()
assert not duplicate_member_ids, f"Duplicate member IDs: {duplicate_member_ids}"

with db_connect() as db:
    members.to_sql(MEMBERS_TABLE, db, if_exists="replace", index=False)
    db.execute(f"CREATE UNIQUE INDEX IF NOT EXISTS idx_member_id ON {MEMBERS_TABLE} ([Member ID])")
    print(f"Inserted {len(members)} members into {MEMBERS_TABLE} table.")

print_table_schema(MEMBERS_TABLE)

Inserted 349 members into members table.
Honorific Title: TEXT
First Name: TEXT
Last Name: TEXT
Constituency: TEXT
Province / Territory: TEXT
Political Affiliation: TEXT
Start Date: TIMESTAMP
End Date: TIMESTAMP
Member ID: TEXT


In [63]:
# Captures honorific, name and constituency from "<honorific>. <name> (<constituency>)".
FULL_MEMBER_NAME_REGEX = re.compile(r"^([^\.]+)\. ([^\(]+)\(([^\)]+)\)$")

def find_member_id(full_member_name: str) -> Optional[str]:
    """Look up a member's ID from a full name such as ``Mr. Justin Trudeau (Papineau)``.

    The match is deliberately loose: the first word of the name must prefix the
    stored first name, the last word must suffix the stored last name, and the
    constituency must match exactly.

    Args:
        full_member_name: Text in ``<honorific>. <name> (<constituency>)`` form.

    Returns:
        The matching ``Member ID``, or ``None`` when no member matches.

    Raises:
        ValueError: If the text does not fit the expected form, or if more than
            one member matches.
    """
    parsed = FULL_MEMBER_NAME_REGEX.match(full_member_name)
    if parsed is None:
        raise ValueError(f"Failed to match full member name: {full_member_name}")

    _honorific, raw_name, constituency = parsed.groups()
    name_parts = raw_name.strip().split(" ")
    first_name, last_name = name_parts[0], name_parts[-1]

    query = (
        f"SELECT [Member ID] FROM {MEMBERS_TABLE} "
        "WHERE [First Name] LIKE ? AND [Last Name] LIKE ? AND Constituency = ?"
    )
    # Prefix match on the first name, suffix match on the last name.
    params = (f"{first_name}%", f"%{last_name}", constituency)
    with db_connect() as conn:
        candidates = conn.cursor().execute(query, params).fetchall()

    if not candidates:
        return None
    if len(candidates) > 1:
        raise ValueError(f"Found multiple member IDs for {full_member_name}: {candidates}")
    return candidates[0][0]
    
# Smoke checks against the members table: a two-word name, a name with a
# middle initial, and a three-word name.
for sample_name in (
    "Mr. Justin Trudeau (Papineau)",
    "Mr. Harjit S. Sajjan (Vancouver South)",
    "Ms. Soraya Martinez Ferrada (Hochelaga)",
):
    assert find_member_id(sample_name) is not None

## Votes held across all parliamentary sessions

### Download the vote CSV files

In [32]:
os.makedirs(path.join(DATA_DIR, VOTES_HELD_TABLE), exist_ok=True)

for session in PARLIMENTARY_SESSIONS:
    filename = f"votes-{session}.csv"
    filepath = path.join(DATA_DIR, VOTES_HELD_TABLE, filename)
    if path.exists(filepath):
        # print(f"File {filename} already exists, skipping download.")
        continue
    resp = await httpx.get(f"https://www.ourcommons.ca/Members/en/votes/csv?parlSession={session}")
    resp.raise_for_status()
    with open(filepath, "wb") as f:
        f.write(resp.content)
    print(f"Downloaded {filename}")

### Format and insert into the sqlite db

In [35]:
# Parse every session's CSV first and write the table in a single call, so a
# missing or malformed file cannot leave the table half rebuilt (replaced for
# the first session but not yet appended for the rest, and without its index).
session_frames = []
for session in PARLIMENTARY_SESSIONS:
    filename = f"votes-{session}.csv"
    filepath = path.join(DATA_DIR, VOTES_HELD_TABLE, filename)
    assert path.exists(filepath), f"File {filename} does not exist"

    v = pd.read_csv(filepath)

    v["Vote Subject"] = v["Vote Subject"].astype("string")
    v["Vote Result"] = v["Vote Result"].astype("string")
    # bool() keeps the original behaviour of raising on a missing vote result.
    v["Agreed To"] = v["Vote Result"].apply(lambda result: bool(result == "Agreed To"))
    v["Bill Number"] = v["Bill Number"].astype("string")
    v["Date"] = v["Date"].apply(parse_parl_datetime)
    # Key of the form "<parliament>-<session>-<vote number>".
    v["Vote ID"] = v["Parliament"].astype("string") + "-" + v["Session"].astype("string") + "-" + v["Vote Number"].astype("string")

    # Every column must have an explicit dtype before it is written to SQLite.
    for c in v.columns:
        assert v[c].dtype != "object", f"Column {c} is still an object type"

    session_frames.append(v)

votes_held_df = pd.concat(session_frames, ignore_index=True)
# Checked up front so a clash is reported clearly rather than as a failed index build.
assert votes_held_df["Vote ID"].is_unique, "Duplicate Vote IDs across sessions"

with db_connect() as db:
    votes_held_df.to_sql(VOTES_HELD_TABLE, db, if_exists="replace", index=False)
    db.execute(
        f"CREATE UNIQUE INDEX IF NOT EXISTS idx_session_vote_id ON {VOTES_HELD_TABLE} ([Vote ID])"
    )

# Printed after the connection above is closed; print_table_schema opens its own.
print_table_schema(VOTES_HELD_TABLE)

Parliament: INTEGER
Session: INTEGER
Date: TIMESTAMP
Vote Number: INTEGER
Vote Subject: TEXT
Vote Result: TEXT
Yeas: INTEGER
Nays: INTEGER
Paired: INTEGER
Bill Number: TEXT
Agreed To: INTEGER
Vote ID: TEXT


## Who voted for what

### Download member voting data

In [None]:
with db_connect() as db:
    cursor = db.cursor()
    cursor.execute(f"SELECT Parliament, Session, [Vote Number] FROM {VOTES_HELD_TABLE}")
    rows = cursor.fetchall()
    votes_held = [tuple(row) for row in rows]
    print(f"Total number of votes held: {len(votes_held)}")

os.makedirs(path.join(DATA_DIR, MEMBER_VOTES_TABLE), exist_ok=True)

for parliament, session, vote_number in votes_held:
    url = f"https://www.ourcommons.ca/Members/en/votes/{parliament}/{session}/{vote_number}/csv"
    filename = f"member-votes-{parliament}-{session}-{vote_number}.csv"
    filepath = path.join(DATA_DIR, MEMBER_VOTES_TABLE, filename)
    if path.exists(filepath):
        # print(f"File {filename} already exists, skipping download.")
        continue
    resp = await httpx.get(url)
    resp.raise_for_status()
    with open(filepath, "wb") as f:
        f.write(resp.content)
    print(f"Downloaded {filename}")


Total number of votes held: 4678


### Insert member voting data into the sqlite db

In [49]:
# Spot-check: show every stored column for the member(s) of one constituency.
with db_connect() as db:
    constituency_rows = db.execute(f"SELECT * FROM {MEMBERS_TABLE} WHERE Constituency = 'Nanaimo—Ladysmith'").fetchall()
    for member_row in constituency_rows:
        print(dict(member_row))

{'Honorific Title': None, 'First Name': 'Lisa Marie', 'Last Name': 'Barron', 'Constituency': 'Nanaimo—Ladysmith', 'Province / Territory': 'British Columbia', 'Political Affiliation': 'NDP', 'Start Date': '2021-09-20 00:00:00-04:00', 'End Date': None, 'Member ID': 'Lisa Marie Barron (Nanaimo—Ladysmith)'}


In [64]:
# Only the query needs this connection; find_member_id opens its own for each
# lookup, so the connection is released before the per-vote loop starts.
with db_connect() as db:
    bill_vote_ids = [
        row["Vote ID"]
        for row in db.execute(
            f"SELECT [Vote ID] FROM {VOTES_HELD_TABLE} WHERE [Bill Number] IS NOT NULL"
        ).fetchall()
    ][:1]  # NOTE(review): only the first vote is processed; looks like a debugging limit, confirm before removing.

# NOTE(review): nothing is appended to this list in the visible code; the cell appears unfinished.
member_vote_rows = []
for vote_id in bill_vote_ids:
    filename = f"member-votes-{vote_id}.csv"
    filepath = path.join(DATA_DIR, MEMBER_VOTES_TABLE, filename)
    assert path.exists(filepath), f"File {filename} does not exist"
    v = pd.read_csv(filepath)
    v["Member ID"] = v["Member of Parliament"].apply(find_member_id)

    # The members table only holds the latest parliament, so unmatched names
    # are treated as an error for that parliament only.
    parliament = vote_id.split("-")[0]
    if parliament == latest_parliament:
        unmatched = v[v["Member ID"].isna()]
        if len(unmatched) > 0:
            # Built from a local so the f-string needs no nested double quotes
            # (reusing the quote character is a SyntaxError before Python 3.12).
            raise ValueError(f"Found members of latest Parliament we could not match to an ID: {unmatched}")
        


6

## Optimize the sqlite db

In [18]:
# Run SQLite's maintenance statements in order on the finished database.
with db_connect() as db:
    for maintenance_sql in ("VACUUM", "PRAGMA optimize", "ANALYZE"):
        db.execute(maintenance_sql)