In [36]:
import os
import re
import pandas as pd
import numpy as np

In [18]:
CONVOTE_DATA_DIR = "../convote_v1.1/data_stage_one/"
MODE = "training"  # or "test" or "development"

In [41]:
def parse_convote_data(base_path):
    file_names = os.listdir(base_path)
    data = []
    for fn in file_names:
        # Get metadata; https://www.cs.cornell.edu/home/llee/data/convote/README.v1.1.txt
        m = re.match(r"(?P<bill>\d\d\d)_(?P<speaker>\d\d\d\d\d\d)_"
                     + r"(?P<page_num>\d\d\d\d)(?P<speech_num>\d\d\d)_"
                     + r"(?P<party>\w)(?P<mentioned>\w)(?P<vote>\w)\.txt", fn)
        bill = int(m.group("bill"))
        speaker = int(m.group("speaker"))
        bill_directly_mentioned = m.group("mentioned")
        vote = m.group("vote")

        # Get text
        with open(base_path + fn) as f:
            text = f.read()

        # Save to dict
        data.append([bill, speaker, bill_directly_mentioned, text, vote])  # base features

    df = pd.DataFrame(np.array(data), columns=["Bill number", "Speaker", "Bill mentioned", "Text", "Vote"])
    return df

In [43]:
base_path = CONVOTE_DATA_DIR + MODE + "_set/"
df = parse_convote_data(base_path)

In [44]:
print(len(df))

5660


In [45]:
# Per-debate stats
unique_bills = np.unique(df["Bill number"])
print("Number of debates:", len(unique_bills))
for ub in unique_bills[:3]:
    print("Bill #:", ub)
    speeches = df[df["Bill number"] == ub]
    print("# of speeches:", len(speeches))
    unique_speakers = np.unique(speeches["Speaker"])
    print("# of speakers:", len(unique_speakers))

Number of debates: 38
Bill #: 108
# of speeches: 95
# of speakers: 18
Bill #: 13
# of speeches: 72
# of speakers: 10
Bill #: 132
# of speeches: 752
# of speakers: 164


In [46]:
class Model:
    def __init__(self):
        pass
    
    def train(self, df : pd.DataFrame):
        raise NotImplementedError()

    def predict_votes(self, df : pd.DataFrame):
        raise NotImplementedError()

In [48]:
class Baseline(Model):
    pass

In [49]:
MODE = "development"  # can change to test
base_path = CONVOTE_DATA_DIR + MODE + "_set/"
testing_df = parse_convote_data(base_path)

In [None]:
model = new Model()
model.train(df)
model.predict