In [None]:
from httpx import AsyncClient
import os
from os import path
import pandas as pd
import sqlite3
from contextlib import contextmanager

In [None]:
# Sessions to fetch, listed newest first. Each value is passed verbatim as the
# `parlSession` query parameter when downloading the vote listings.
# (The constant name is misspelled, but other cells reference it as-is.)
PARLIMENTARY_SESSIONS = (
    "44-1",
    "43-2",
    "43-1",
    "42-1",
    "41-2",
    "41-1",
    "40-3",
    "40-2",
    "40-1",
    "39-2",
    "39-1",
    "38-1",
)

# Root directory for cached CSV downloads and the SQLite file that gets written.
# Both paths are relative, so they resolve against the kernel's working directory.
DATA_DIR = "backend/data"
EXPORT_DB = "repsheet.sqlite"

# Table names; the same strings are reused as sub-directory names under DATA_DIR.
VOTES_HELD_TABLE = "votes_held"
MEMBER_VOTES_TABLE = "member_votes"

os.makedirs(DATA_DIR, exist_ok=True)

# NOTE: `httpx` here is an AsyncClient *instance*, not the httpx module (only
# AsyncClient is imported from it), so calls on it must be awaited.
httpx = AsyncClient()

@contextmanager
def db_connect():
    """Open the export database and yield the connection.

    Rows are returned as ``sqlite3.Row``. When the ``with`` body finishes
    normally, pending changes are committed. When it raises, uncommitted
    changes are rolled back and the exception propagates. The connection is
    closed in both cases.

    Yields:
        sqlite3.Connection: connection to ``EXPORT_DB``.
    """
    db = sqlite3.connect(EXPORT_DB)
    db.row_factory = sqlite3.Row
    try:
        yield db
    except BaseException:
        # Includes KeyboardInterrupt (kernel interrupt): never persist a
        # half-finished write just because the block was abandoned.
        db.rollback()
        raise
    else:
        db.commit()
    finally:
        db.close()

In [108]:
def print_table_schema(table_name):
    """Print one ``<column name>: <declared type>`` line per column of a table.

    The column list comes from ``PRAGMA table_info``; nothing is printed when
    the pragma returns no rows.
    """
    with db_connect() as db:
        columns = db.cursor().execute(f"PRAGMA table_info({table_name})").fetchall()
        # Each row starts with (cid, name, type, ...); only name and type are shown.
        for _cid, column_name, column_type, *_rest in columns:
            print(f"{column_name}: {column_type}")

## Download all votes across all parliamentary sessions

### Pull files as CSVs

In [None]:
os.makedirs(path.join(DATA_DIR, VOTES_HELD_TABLE), exist_ok=True)

for session in PARLIMENTARY_SESSIONS:
    filename = f"votes-{session}.csv"
    filepath = path.join(DATA_DIR, VOTES_HELD_TABLE, filename)
    if path.exists(filepath):
        # print(f"File {filename} already exists, skipping download.")
        continue
    resp = await httpx.get(f"https://www.ourcommons.ca/Members/en/votes/csv?parlSession={session}")
    resp.raise_for_status()
    with open(filepath, "wb") as f:
        f.write(resp.content)
    print(f"Downloaded {filename}")

File votes-44-1.csv already exists, skipping download.
File votes-43-2.csv already exists, skipping download.
File votes-43-1.csv already exists, skipping download.
File votes-42-1.csv already exists, skipping download.
File votes-41-2.csv already exists, skipping download.
File votes-41-1.csv already exists, skipping download.
File votes-40-3.csv already exists, skipping download.
File votes-40-2.csv already exists, skipping download.
File votes-40-1.csv already exists, skipping download.
File votes-39-2.csv already exists, skipping download.
File votes-39-1.csv already exists, skipping download.
File votes-38-1.csv already exists, skipping download.


### Format and insert into the SQLite database

In [None]:
def parse_parl_datetime(date_str: str) -> "pd.Timestamp | None":
    """Parse a parliamentary datetime string, e.g. ``2024-12-17 3:50:01 p.m.``.

    Args:
        date_str: Text in ``YYYY-MM-DD H:MM:SS a.m.|p.m.`` form. ``None``, NaN /
            ``pd.NA`` (what pandas yields for an empty CSV cell) and the empty
            string are all treated as "no value".

    Returns:
        The timestamp localized to ``Canada/Eastern``, or ``None`` when there
        is no value to parse.

    Raises:
        ValueError: if a non-empty string does not match the expected format.
    """
    # NaN is truthy and has no .replace(), so a plain `not date_str` check is
    # not enough for values coming out of pd.read_csv.
    if date_str is None or pd.isna(date_str) or not date_str:
        return None
    # strptime's %p understands "AM"/"PM", not the dotted "a.m."/"p.m." form.
    normalized = date_str.replace("p.m.", "PM").replace("a.m.", "AM")
    parsed = pd.to_datetime(normalized, format="%Y-%m-%d %I:%M:%S %p")
    return parsed.tz_localize("Canada/Eastern")

In [114]:
# Load every downloaded session CSV into VOTES_HELD_TABLE. The first session
# replaces any existing table; the remaining sessions are appended to it.
is_first = True

with db_connect() as db:
    for session in PARLIMENTARY_SESSIONS:
        filename = f"votes-{session}.csv"
        filepath = path.join(DATA_DIR, VOTES_HELD_TABLE, filename)
        assert path.exists(filepath), f"File {filename} does not exist"

        v = pd.read_csv(filepath)

        v["Vote Subject"] = v["Vote Subject"].astype("string")
        v["Vote Result"] = v["Vote Result"].astype("string")
        # Vectorised and NA-safe: a missing result compares as <NA>, which is
        # mapped to False here. (`True if x == ... else False` raises TypeError
        # on pd.NA because NA has no truth value.)
        v["Agreed To"] = v["Vote Result"].eq("Agreed To").fillna(False).astype(bool)
        v["Bill Number"] = v["Bill Number"].astype("string")
        v["Date"] = v["Date"].apply(parse_parl_datetime)

        # Guard against columns silently falling back to the generic object dtype.
        for c in v.columns:
            assert v[c].dtype != "object", f"Column {c} is still an object type"

        v.to_sql(VOTES_HELD_TABLE, db, if_exists="replace" if is_first else "append", index=False)
        is_first = False

    # One row per (Parliament, Session, Vote Number); index creation fails if
    # the loaded data contains duplicates.
    db.execute(
        f"CREATE UNIQUE INDEX IF NOT EXISTS idx_session_vote_id ON {VOTES_HELD_TABLE} (Parliament, Session, [Vote Number])"
    )

# Inspect the schema only after the write connection has committed and closed,
# rather than opening a second connection inside the first one.
print_table_schema(VOTES_HELD_TABLE)

Parliament: INTEGER
Session: INTEGER
Date: TIMESTAMP
Vote Number: INTEGER
Vote Subject: TEXT
Vote Result: TEXT
Yeas: INTEGER
Nays: INTEGER
Paired: INTEGER
Bill Number: TEXT
Agreed To: INTEGER


## Download who voted for what

In [None]:
# Read back the (Parliament, Session, Vote Number) key of every stored vote.
with db_connect() as db:
    cursor = db.execute(
        f"SELECT Parliament, Session, [Vote Number] FROM {VOTES_HELD_TABLE}"
    )
    rows = cursor.fetchall()
    # Plain tuples so each key can be unpacked directly when iterating.
    votes_held = list(map(tuple, rows))
    print(f"Total number of votes held: {len(votes_held)}")


os.makedirs(path.join(DATA_DIR, MEMBER_VOTES_TABLE), exist_ok=True)

for parliament, session, vote_number in votes_held:
    url = f"https://www.ourcommons.ca/Members/en/votes/{parliament}/{session}/{vote_number}/csv"
    filename = f"member-votes-{parliament}-{session}-{vote_number}.csv"
    filepath = path.join(DATA_DIR, MEMBER_VOTES_TABLE, filename)
    if path.exists(filepath):
        # print(f"File {filename} already exists, skipping download.")
        continue
    resp = await httpx.get(url)
    resp.raise_for_status()
    with open(filepath, "wb") as f:
        f.write(resp.content)
    print(f"Downloaded {filename}")


Total number of votes held: 4678
File member-votes-38-1-1.csv already exists, skipping download.
File member-votes-38-1-2.csv already exists, skipping download.
File member-votes-38-1-3.csv already exists, skipping download.
File member-votes-38-1-4.csv already exists, skipping download.
File member-votes-38-1-5.csv already exists, skipping download.
File member-votes-38-1-6.csv already exists, skipping download.
File member-votes-38-1-7.csv already exists, skipping download.
File member-votes-38-1-8.csv already exists, skipping download.
File member-votes-38-1-9.csv already exists, skipping download.
File member-votes-38-1-10.csv already exists, skipping download.
File member-votes-38-1-11.csv already exists, skipping download.
File member-votes-38-1-12.csv already exists, skipping download.
File member-votes-38-1-13.csv already exists, skipping download.
File member-votes-38-1-14.csv already exists, skipping download.
File member-votes-38-1-15.csv already exists, skipping download.
F