In [1]:
import os
import re
import pandas as pd
import numpy as np
import sklearn
from sklearn.preprocessing import StandardScaler
import nltk
from nltk.util import ngrams
from collections import Counter

In [2]:
def parse_convote_data(base_path):
    file_names = os.listdir(base_path)
    data = []
    for fn in file_names:
        # Get metadata; https://www.cs.cornell.edu/home/llee/data/convote/README.v1.1.txt
        m = re.match(r"(?P<bill>\d\d\d)_(?P<speaker>\d\d\d\d\d\d)_"
                     + r"(?P<page_num>\d\d\d\d)(?P<speech_num>\d\d\d)_"
                     + r"(?P<party>\w)(?P<mentioned>\w)(?P<vote>\w)\.txt", fn)
        bill = int(m.group("bill"))
        speaker = int(m.group("speaker"))
        bill_directly_mentioned = m.group("mentioned")
        vote = m.group("vote")
        # Get text
        with open(base_path + fn) as f:
            text = f.read()
        punctuation = 0
        if ("?" in text): 
            punctuation = 1
        if ("!" in text): 
            punctuation = 2
        token = nltk.word_tokenize(text)
        unigrams = Counter(ngrams(token,1)).most_common(10)
        bigrams = Counter(ngrams(token,2)).most_common(10)


        # Save to dict
        data.append([bill, speaker, bill_directly_mentioned, text, vote, punctuation, unigrams, bigrams])  # base features

    df = pd.DataFrame(np.array(data), columns=["Bill number", "Speaker", "Bill mentioned",
                                               "Text", "Vote", "Punctuation", "Unigrams", "Bigrams"])
    sc = StandardScaler()
    df[["Punctuation"]] = sc.fit_transform(df[["Punctuation"]])
    return df

In [3]:
CONVOTE_DATA_DIR = "../convote_v1.1/data_stage_one/"
MODE = "training"  # or "test" or "development"

In [4]:
base_path = CONVOTE_DATA_DIR + MODE + "_set/"
df = parse_convote_data(base_path)

  df = pd.DataFrame(np.array(data), columns=["Bill number", "Speaker", "Bill mentioned",


In [5]:
df.head()

Unnamed: 0,Bill number,Speaker,Bill mentioned,Text,Vote,Punctuation,Unigrams,Bigrams
0,282,400436,M,"mr. speaker , i would like to say a word about...",N,-0.509268,"[((,,), 20), ((the,), 19), ((.,), 19), ((to,),...","[((the, world), 4), ((for, women), 3), ((women..."
1,88,400272,O,"mr. speaker , today we have some very clear ch...",N,-0.509268,"[((.,), 16), ((to,), 15), ((the,), 14), ((,,),...","[((this, bill), 6), ((we, can), 5), ((mr., spe..."
2,38,400080,O,"mr. speaker , i yield myself such time as i ma...",N,-0.509268,"[((the,), 23), ((.,), 15), ((,,), 13), ((to,),...","[((., the), 7), ((,, i), 4), ((civil, rights),..."
3,132,400227,O,"mr. chairman , i yield back the balance of my ...",N,-0.509268,"[((mr.,), 1), ((chairman,), 1), ((,,), 1), ((i...","[((mr., chairman), 1), ((chairman, ,), 1), ((,..."
4,282,400380,O,"madam chairman , will the gentleman yield ? \n",Y,1.85146,"[((madam,), 1), ((chairman,), 1), ((,,), 1), (...","[((madam, chairman), 1), ((chairman, ,), 1), (..."


In [6]:
print(len(df))

5660


In [7]:
# Per-debate stats
unique_bills = np.unique(df["Bill number"])
print("Number of debates:", len(unique_bills))
for ub in unique_bills[:3]:
    print("Bill #:", ub)
    speeches = df[df["Bill number"] == ub]
    print("# of speeches:", len(speeches))
    unique_speakers = np.unique(speeches["Speaker"])
    print("# of speakers:", len(unique_speakers))

Number of debates: 38
Bill #: 6
# of speeches: 70
# of speakers: 28
Bill #: 13
# of speeches: 72
# of speakers: 10
Bill #: 16
# of speeches: 53
# of speakers: 21


In [8]:
MODE = "development"  # can change to test
new_base_path = CONVOTE_DATA_DIR + MODE + "_set/"
testing_df = parse_convote_data(new_base_path)
testing_df.head()

  df = pd.DataFrame(np.array(data), columns=["Bill number", "Speaker", "Bill mentioned",


Unnamed: 0,Bill number,Speaker,Bill mentioned,Text,Vote,Punctuation,Unigrams,Bigrams
0,493,400036,O,"mr. chairman , i yield myself such time as i m...",Y,-0.494715,"[((,,), 13), ((the,), 11), ((.,), 7), ((in,), ...","[((the, 1964), 4), ((1964, civil), 4), ((civil..."
1,493,400321,M,"mr. chairman , i rise in strong support of the...",Y,-0.494715,"[((,,), 18), ((the,), 18), ((and,), 18), ((of,...","[((head, start), 11), ((early, head), 5), ((of..."
2,199,400322,O,"mr. chairman , i make a point of order against...",Y,-0.494715,"[((the,), 11), ((.,), 8), ((of,), 6), ((to,), ...","[((the, amendment), 3), ((point, of), 2), ((of..."
3,493,400103,O,"mr. chairman , i yield myself such time as i m...",N,-0.494715,"[((,,), 23), ((.,), 21), ((to,), 18), ((the,),...","[((head, start), 13), ((mr., chairman), 4), ((..."
4,199,400300,O,"mr. speaker , i yield back the balance of my t...",N,-0.494715,"[((mr.,), 1), ((speaker,), 1), ((,,), 1), ((i,...","[((mr., speaker), 1), ((speaker, ,), 1), ((,, ..."


In [13]:
from Model import *
from Baseline import *
from SimpleNN import *
from SimpleSVM import *
from bert import *

# TODO: add more...

In [14]:
# Train model
model = Bert()
model.train(df)
predicted = model.predict_votes(testing_df.drop("Vote", axis=1))

TypeError: transform_df() takes 1 positional argument but 2 were given

In [15]:
df.head()

Unnamed: 0,Bill number,Speaker,Bill mentioned,Text,Vote,Punctuation,Unigrams,Bigrams
0,282,400436,M,"mr. speaker , i would like to say a word about...",N,-0.509268,"[((,,), 20), ((the,), 19), ((.,), 19), ((to,),...","[((the, world), 4), ((for, women), 3), ((women..."
1,88,400272,O,"mr. speaker , today we have some very clear ch...",N,-0.509268,"[((.,), 16), ((to,), 15), ((the,), 14), ((,,),...","[((this, bill), 6), ((we, can), 5), ((mr., spe..."
2,38,400080,O,"mr. speaker , i yield myself such time as i ma...",N,-0.509268,"[((the,), 23), ((.,), 15), ((,,), 13), ((to,),...","[((., the), 7), ((,, i), 4), ((civil, rights),..."
3,132,400227,O,"mr. chairman , i yield back the balance of my ...",N,-0.509268,"[((mr.,), 1), ((chairman,), 1), ((,,), 1), ((i...","[((mr., chairman), 1), ((chairman, ,), 1), ((,..."
4,282,400380,O,"madam chairman , will the gentleman yield ? \n",Y,1.85146,"[((madam,), 1), ((chairman,), 1), ((,,), 1), (...","[((madam, chairman), 1), ((chairman, ,), 1), (..."


In [None]:
# Evaluate model
num_corr = 0
num_tot = 0
assert(len(predicted) == len(testing_df["Vote"]))
for i in range(len(predicted)):
    num_tot += 1
    if predicted[i] == testing_df["Vote"][i]: num_corr += 1
print("Accuracy of SimpleNN:", num_corr / num_tot)