In [39]:
import os
import re
import pandas as pd
import numpy as np
import sklearn

In [211]:
CONVOTE_DATA_DIR = "../convote_v1.1/data_stage_one/"
MODE = "training"  # or "test" or "development"

In [212]:
def parse_convote_data(base_path):
    file_names = os.listdir(base_path)
    data = []
    for fn in file_names:
        # Get metadata; https://www.cs.cornell.edu/home/llee/data/convote/README.v1.1.txt
        m = re.match(r"(?P<bill>\d\d\d)_(?P<speaker>\d\d\d\d\d\d)_"
                     + r"(?P<page_num>\d\d\d\d)(?P<speech_num>\d\d\d)_"
                     + r"(?P<party>\w)(?P<mentioned>\w)(?P<vote>\w)\.txt", fn)
        bill = int(m.group("bill"))
        speaker = int(m.group("speaker"))
        bill_directly_mentioned = m.group("mentioned")
        vote = m.group("vote")
        punctuation = 0
        # Get text
        with open(base_path + fn) as f:
            text = f.read()
        if ("?" in text or "!" in text): 
            punctuation = 1

        # Save to dict
        data.append([bill, speaker, bill_directly_mentioned, text, vote])  # base features

    df = pd.DataFrame(np.array(data), columns=["Bill number", "Speaker", "Bill mentioned", "Text", "Vote"])
    return df

In [213]:
base_path = CONVOTE_DATA_DIR + MODE + "_set/"
df = parse_convote_data(base_path)

In [214]:
print(len(df))

5660


In [215]:
# Per-debate stats
unique_bills = np.unique(df["Bill number"])
print("Number of debates:", len(unique_bills))
for ub in unique_bills[:3]:
    print("Bill #:", ub)
    speeches = df[df["Bill number"] == ub]
    print("# of speeches:", len(speeches))
    unique_speakers = np.unique(speeches["Speaker"])
    print("# of speakers:", len(unique_speakers))

Number of debates: 38
Bill #: 108
# of speeches: 95
# of speakers: 18
Bill #: 13
# of speeches: 72
# of speakers: 10
Bill #: 132
# of speeches: 752
# of speakers: 164


In [216]:
class Model:
    def __init__(self):
        pass
    
    def train(self, df : pd.DataFrame):
        raise NotImplementedError()

    def predict_votes(self, df : pd.DataFrame):
        raise NotImplementedError()

In [217]:
class Baseline(Model):
    def __init__(self):
        self.vectorizer = None
        self.clf = None
    
    def train(self, df : pd.DataFrame):
        corpus = df["Text"]
        self.vectorizer = TfidfVectorizer()
        X = self.vectorizer.fit_transform(corpus)
        self.clf = MultinomialNB().fit(X, df["Vote"])

    def predict_votes(self, df : pd.DataFrame):
        new_corpus = df["Text"]
        new_X = self.vectorizer.transform(new_corpus)
        predicted = self.clf.predict(new_X)
        return predicted

In [218]:
class SimpleNN(Model):
    def __init__(self):
        self.vectorizer = None
        self.clf = MLPClassifier(hidden_layer_sizes=(50, 10, 2), solver='adam', tol=1e-5)

    def train(self, df : pd.DataFrame):
        corpus = df["Text"]
        self.vectorizer = TfidfVectorizer()
        X = self.vectorizer.fit_transform(corpus)
        self.clf = self.clf.fit(X, df["Vote"])
        print("nn fitted")

    def predict_votes(self, df : pd.DataFrame):
        new_corpus = df["Text"]
        new_X = self.vectorizer.transform(new_corpus)
        predicted = self.clf.predict(new_X)
        return predicted

In [219]:
class SimpleSVM(Model):
    def __init__(self):
        self.vectorizer = None
        self.clf = make_pipeline(StandardScaler(), LinearSVC(random_state=0, tol=1e-5))

    def train(self, df : pd.DataFrame):
        corpus = df["Text"]
        self.vectorizer = TfidfVectorizer()
        X = self.vectorizer.fit_transform(corpus)
        self.clf = self.clf.fit(X, df["Vote"])

    def predict_votes(self, df : pd.DataFrame):
        new_corpus = df["Text"]
        new_X = self.vectorizer.transform(new_corpus)
        predicted = self.clf.predict(new_X)
        return predicted

In [220]:
MODE = "development"  # can change to test
base_path = CONVOTE_DATA_DIR + MODE + "_set/"
testing_df = parse_convote_data(base_path)
testing_df.head()

Unnamed: 0,Bill number,Speaker,Bill mentioned,Text,Vote
0,493,400036,O,"mr. chairman , i yield myself such time as i m...",Y
1,493,400321,M,"mr. chairman , i rise in strong support of the...",Y
2,199,400322,O,"mr. chairman , i make a point of order against...",Y
3,493,400103,O,"mr. chairman , i yield myself such time as i m...",N
4,199,400300,O,"mr. speaker , i yield back the balance of my t...",N


In [221]:
from Model import *
from Baseline import *
from SimpleNN import *
from SimpleSVM import *

# TODO: add more...

In [222]:
# Train model
model = SimpleNN()
model.train(df)
predicted = model.predict_votes(testing_df.drop("Vote", axis=1))

nn fitted




In [224]:
# Evaluate model
num_corr = 0
num_tot = 0
assert(len(predicted) == len(testing_df["Vote"]))
for i in range(len(predicted)):
    num_tot += 1
    if predicted[i] == testing_df["Vote"][i]: num_corr += 1
print("Accuracy of SimpleNN without punctuation:", num_corr / num_tot)

Accuracy of SimpleNN without punctuation: 0.6851851851851852
