In [22]:
from httpx import AsyncClient
import os
from os import path
import pandas as pd
import sqlite3
from contextlib import contextmanager
from typing import Optional

In [28]:
PARLIMENTARY_SESSIONS = (
    "44-1",
    "43-2",
    "43-1",
    "42-1",
    "41-2",
    "41-1",
    "40-3",
    "40-2",
    "40-1",
    "39-2",
    "39-1",
    "38-1",
)

DATA_DIR = "backend/data"
EXPORT_DB = "repsheet.sqlite"

VOTES_HELD_TABLE = "votes_held"
MEMBER_VOTES_TABLE = "member_votes"
MEMBERS_TABLE = "members"

os.makedirs(DATA_DIR, exist_ok=True)

httpx = AsyncClient()

@contextmanager
def db_connect():
    """Context manager for database connection."""
    db = sqlite3.connect(EXPORT_DB)
    db.row_factory = sqlite3.Row
    try:
        yield db
    finally:
        db.commit()
        db.close()

In [26]:
def print_table_schema(table_name):
    """Print the schema of a given table."""
    with db_connect() as db:
        cursor = db.cursor()
        cursor.execute(f"PRAGMA table_info({table_name})")
        rows = cursor.fetchall()
        for row in rows:
            print(f"{row[1]}: {row[2]}")

def parse_parl_datetime(date_str: str) -> Optional[pd.Timestamp]:
    """Parses strings in parliamentary datetime format, e.g. 2024-12-17 3:50:01 p.m."""
    if not date_str or pd.isna(date_str):
        return None
    date_str = date_str.replace("p.m.", "PM").replace("a.m.", "AM")
    return pd.to_datetime(date_str, format="%Y-%m-%d %I:%M:%S %p").tz_localize("Canada/Eastern")

# Build the sqlite db

## Members of Parliament

### Download members csv

In [19]:
latest_parliament = max(PARLIMENTARY_SESSIONS).split("-")[0]
assert latest_parliament == "44"

filename = f"members-{latest_parliament}.csv"
filepath = path.join(DATA_DIR, filename)
if not path.exists(filepath):
    resp = await httpx.get(f"https://www.ourcommons.ca/Members/en/search/csv?parliament={latest_parliament}&caucusId=all&province=all&gender=all")
    resp.raise_for_status()
    with open(filepath, "wb") as f:
        f.write(resp.content)
    print(f"Downloaded {filename}")

Downloaded members-44.csv


### Insert into sqlite

In [32]:
members = pd.read_csv(filepath)
members["Start Date"] = members["Start Date"].apply(parse_parl_datetime)
members["End Date"] = members["End Date"].apply(parse_parl_datetime)
members["Member ID"] = members.apply(lambda row: f"{row['First Name']}-{row["Last Name"]}-{row["Constituency"]}", axis=1)

with db_connect() as db:
    members.to_sql(MEMBERS_TABLE, db, if_exists="replace", index=False)
    db.execute(f"CREATE UNIQUE INDEX IF NOT EXISTS idx_member_id ON {MEMBERS_TABLE} ([Member ID])")
    print(f"Inserted {len(members)} members into {MEMBERS_TABLE} table.")
    
print_table_schema(MEMBERS_TABLE)

Inserted 349 members into members table.
Honorific Title: TEXT
First Name: TEXT
Last Name: TEXT
Constituency: TEXT
Province / Territory: TEXT
Political Affiliation: TEXT
Start Date: TIMESTAMP
End Date: TIMESTAMP
Member ID: TEXT


In [None]:
def find_member_id(full_member_name: str) -> Optional[str]:
    """Find a member ID from their full name (e.g. Mr. Justin Trudeau (Papineau)). 
    Really flakey matching but if it works it works."""
    honorific, member_name = member_name.split(". ", 1)
    assert len(honorific) > 1
    assert "." not in member_name
    first_name, last_name = member_name.split(" ", 1)
    with db_connect() as db:
        cursor = db.cursor()
        cursor.execute(f"SELECT [Member ID] FROM {MEMBERS_TABLE} WHERE [First Name] = ? AND [Last Name] = ?", (first_name, last_name))
        rows = cursor.fetchall()
    assert len(rows) == 1
    return rows[0][0]

In [None]:
find_member_id()

## Votes held across all parliamentary sessions

### Pull files as csvs

In [11]:
os.makedirs(path.join(DATA_DIR, VOTES_HELD_TABLE), exist_ok=True)

for session in PARLIMENTARY_SESSIONS:
    filename = f"votes-{session}.csv"
    filepath = path.join(DATA_DIR, VOTES_HELD_TABLE, filename)
    if path.exists(filepath):
        # print(f"File {filename} already exists, skipping download.")
        continue
    resp = await httpx.get(f"https://www.ourcommons.ca/Members/en/votes/csv?parlSession={session}")
    resp.raise_for_status()
    with open(filepath, "wb") as f:
        f.write(resp.content)
    print(f"Downloaded {filename}")

### Format and insert into the sqlite db

In [13]:
is_first = True

with db_connect() as db:
    for session in PARLIMENTARY_SESSIONS:
        filename = f"votes-{session}.csv"
        filepath = path.join(DATA_DIR, VOTES_HELD_TABLE, filename)
        assert path.exists(filepath), f"File {filename} does not exist"

        v = pd.read_csv(filepath)

        v["Vote Subject"] = v["Vote Subject"].astype("string")
        v["Vote Result"] = v["Vote Result"].astype("string")
        v["Agreed To"] = v["Vote Result"].apply(lambda x: True if x == "Agreed To" else False)
        v["Bill Number"] = v["Bill Number"].astype("string")
        v["Date"] = v["Date"].apply(parse_parl_datetime)
        v["Vote ID"] = v["Parliament"].astype("string") + "-" + v["Session"].astype("string") + "-" + v["Vote Number"].astype("string")

        for c in v.columns:
            assert v[c].dtype != "object", f"Column {c} is still an object type"

        v.to_sql(VOTES_HELD_TABLE, db, if_exists="replace" if is_first else "append", index=False)
        is_first = False

    db.execute(
        f"CREATE UNIQUE INDEX IF NOT EXISTS idx_session_vote_id ON {VOTES_HELD_TABLE} ([Vote ID])"
    )

    print_table_schema(VOTES_HELD_TABLE)

Parliament: INTEGER
Session: INTEGER
Date: TIMESTAMP
Vote Number: INTEGER
Vote Subject: TEXT
Vote Result: TEXT
Yeas: INTEGER
Nays: INTEGER
Paired: INTEGER
Bill Number: TEXT
Agreed To: INTEGER
Vote ID: TEXT


## Who voted for what

### Download member voting data

In [14]:
with db_connect() as db:
    cursor = db.cursor()
    cursor.execute(f"SELECT Parliament, Session, [Vote Number] FROM {VOTES_HELD_TABLE}")
    rows = cursor.fetchall()
    votes_held = [tuple(row) for row in rows]
    print(f"Total number of votes held: {len(votes_held)}")


os.makedirs(path.join(DATA_DIR, MEMBER_VOTES_TABLE), exist_ok=True)

for parliament, session, vote_number in votes_held:
    url = f"https://www.ourcommons.ca/Members/en/votes/{parliament}/{session}/{vote_number}/csv"
    filename = f"member-votes-{parliament}-{session}-{vote_number}.csv"
    filepath = path.join(DATA_DIR, MEMBER_VOTES_TABLE, filename)
    if path.exists(filepath):
        # print(f"File {filename} already exists, skipping download.")
        continue
    resp = await httpx.get(url)
    resp.raise_for_status()
    with open(filepath, "wb") as f:
        f.write(resp.content)
    print(f"Downloaded {filename}")


Total number of votes held: 4678


### Insert member voting data into the sqlite db

In [15]:
with db_connect() as db:
    bill_vote_ids = [row["Vote ID"] for row in db.execute(f"SELECT [Vote ID] FROM {VOTES_HELD_TABLE} WHERE [Bill Number] IS NOT NULL").fetchall()]
    member_vote_rows = []
    for vote_id in bill_vote_ids:
        filename = f"member-votes-{vote_id}.csv"
        filepath = path.join(DATA_DIR, MEMBER_VOTES_TABLE, filename)
        assert path.exists(filepath), f"File {filename} does not exist"
        v = pd.read_csv(filepath)
        print(vote_id)


44-1-923
44-1-922
44-1-921
44-1-909
44-1-904
44-1-864
44-1-863
44-1-862
44-1-861
44-1-859
44-1-855
44-1-853
44-1-852
44-1-851
44-1-849
44-1-848
44-1-847
44-1-846
44-1-845
44-1-844
44-1-843
44-1-842
44-1-841
44-1-840
44-1-839
44-1-838
44-1-837
44-1-836
44-1-835
44-1-834
44-1-833
44-1-832
44-1-831
44-1-829
44-1-827
44-1-826
44-1-825
44-1-824
44-1-823
44-1-822
44-1-820
44-1-819
44-1-818
44-1-814
44-1-813
44-1-812
44-1-811
44-1-809
44-1-806
44-1-805
44-1-802
44-1-801
44-1-800
44-1-799
44-1-795
44-1-794
44-1-792
44-1-791
44-1-790
44-1-789
44-1-788
44-1-787
44-1-785
44-1-784
44-1-783
44-1-782
44-1-781
44-1-780
44-1-779
44-1-778
44-1-777
44-1-776
44-1-774
44-1-772
44-1-767
44-1-766
44-1-765
44-1-764
44-1-763
44-1-761
44-1-759
44-1-758
44-1-756
44-1-753
44-1-752
44-1-751
44-1-750
44-1-745
44-1-741
44-1-740
44-1-738
44-1-737
44-1-736
44-1-735
44-1-734
44-1-733
44-1-732
44-1-731
44-1-730
44-1-729
44-1-728
44-1-727
44-1-726
44-1-725
44-1-724
44-1-723
44-1-722
44-1-721
44-1-720
44-1-719
44-1-718
4

In [16]:
filename = "member-votes-44-1-922.csv"
filepath = path.join(DATA_DIR, MEMBER_VOTES_TABLE, filename)
v = pd.read_csv(filepath)
print(v.columns)

Index(['Member of Parliament', 'Political Affiliation', 'Member Voted',
       'Paired'],
      dtype='object')


In [17]:
is_first = True

with db_connect() as db:
    for session in PARLIMENTARY_SESSIONS:
        

        filename = f"votes-{session}.csv"
        filepath = path.join(DATA_DIR, VOTES_HELD_TABLE, filename)
        assert path.exists(filepath), f"File {filename} does not exist"

        v = pd.read_csv(filepath)

        v["Vote Subject"] = v["Vote Subject"].astype("string")
        v["Vote Result"] = v["Vote Result"].astype("string")
        v["Agreed To"] = v["Vote Result"].apply(lambda x: True if x == "Agreed To" else False)
        v["Bill Number"] = v["Bill Number"].astype("string")
        v["Date"] = v["Date"].apply(parse_parl_datetime)

        for c in v.columns:
            assert v[c].dtype != "object", f"Column {c} is still an object type"

        v.to_sql(VOTES_HELD_TABLE, db, if_exists="replace" if is_first else "append", index=False)
        is_first = False

    db.execute(
        f"CREATE UNIQUE INDEX IF NOT EXISTS idx_session_vote_id ON {VOTES_HELD_TABLE} (Parliament, Session, [Vote Number])"
    )

    print_table_schema(VOTES_HELD_TABLE)

Parliament: INTEGER
Session: INTEGER
Date: TIMESTAMP
Vote Number: INTEGER
Vote Subject: TEXT
Vote Result: TEXT
Yeas: INTEGER
Nays: INTEGER
Paired: INTEGER
Bill Number: TEXT
Agreed To: INTEGER


## Optimize the sqlite db

In [18]:
with db_connect() as db:
    db.execute("VACUUM")
    db.execute("PRAGMA optimize")
    db.execute("ANALYZE")