In [1]:
import os
import re
import pandas as pd
import numpy as np
import sklearn

In [2]:
def parse_convote_data(base_path):
    file_names = os.listdir(base_path)
    data = []
    for fn in file_names:
        # Get metadata; https://www.cs.cornell.edu/home/llee/data/convote/README.v1.1.txt
        m = re.match(r"(?P<bill>\d\d\d)_(?P<speaker>\d\d\d\d\d\d)_"
                     + r"(?P<page_num>\d\d\d\d)(?P<speech_num>\d\d\d)_"
                     + r"(?P<party>\w)(?P<mentioned>\w)(?P<vote>\w)\.txt", fn)
        bill = int(m.group("bill"))
        speaker = int(m.group("speaker"))
        bill_directly_mentioned = m.group("mentioned")
        vote = m.group("vote")
        # Get text
        with open(base_path + fn) as f:
            text = f.read()
        punctuation = 0
        if ("?" in text): 
            punctuation = 1
        if ("!" in text): 
            punctuation = 2

        # Save to dict
        data.append([bill, speaker, bill_directly_mentioned, text, vote, punctuation])  # base features

    df = pd.DataFrame(np.array(data), columns=["Bill number", "Speaker", "Bill mentioned",
                                               "Text", "Vote", "Punctuation"])
    return df

In [3]:
CONVOTE_DATA_DIR = "../convote_v1.1/data_stage_one/"
MODE = "training"  # or "test" or "development"

In [4]:
base_path = CONVOTE_DATA_DIR + MODE + "_set/"
df = parse_convote_data(base_path)

In [5]:
df.head()

Unnamed: 0,Bill number,Speaker,Bill mentioned,Text,Vote,Punctuation
0,282,400436,M,"mr. speaker , i would like to say a word about...",N,0
1,88,400272,O,"mr. speaker , today we have some very clear ch...",N,0
2,38,400080,O,"mr. speaker , i yield myself such time as i ma...",N,0
3,132,400227,O,"mr. chairman , i yield back the balance of my ...",N,0
4,282,400380,O,"madam chairman , will the gentleman yield ? \n",Y,1


In [6]:
print(len(df))

5660


In [7]:
# Per-debate stats
unique_bills = np.unique(df["Bill number"])
print("Number of debates:", len(unique_bills))
for ub in unique_bills[:3]:
    print("Bill #:", ub)
    speeches = df[df["Bill number"] == ub]
    print("# of speeches:", len(speeches))
    unique_speakers = np.unique(speeches["Speaker"])
    print("# of speakers:", len(unique_speakers))

Number of debates: 38
Bill #: 108
# of speeches: 95
# of speakers: 18
Bill #: 13
# of speeches: 72
# of speakers: 10
Bill #: 132
# of speeches: 752
# of speakers: 164


In [8]:
MODE = "development"  # can change to test
new_base_path = CONVOTE_DATA_DIR + MODE + "_set/"
testing_df = parse_convote_data(new_base_path)
testing_df.head()

Unnamed: 0,Bill number,Speaker,Bill mentioned,Text,Vote,Punctuation
0,493,400036,O,"mr. chairman , i yield myself such time as i m...",Y,0
1,493,400321,M,"mr. chairman , i rise in strong support of the...",Y,0
2,199,400322,O,"mr. chairman , i make a point of order against...",Y,0
3,493,400103,O,"mr. chairman , i yield myself such time as i m...",N,0
4,199,400300,O,"mr. speaker , i yield back the balance of my t...",N,0


In [9]:
from Model import *
from Baseline import *
from SimpleNN import *
from SimpleSVM import *

# TODO: add more...

In [None]:
# Train model
model = SimpleNN()
model.train(df)
predicted = model.predict_votes(testing_df.drop("Vote", axis=1))

In [None]:
# Evaluate model
num_corr = 0
num_tot = 0
assert(len(predicted) == len(testing_df["Vote"]))
for i in range(len(predicted)):
    num_tot += 1
    if predicted[i] == testing_df["Vote"][i]: num_corr += 1
print("Accuracy of SimpleNN:", num_corr / num_tot)