In [34]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt

import spacy
import en_core_web_sm

nlp = en_core_web_sm.load()

from nltk.stem import WordNetLemmatizer 

from keras.utils.np_utils import to_categorical
from keras.layers import Embedding, LSTM, Dense, Conv1D, MaxPooling1D, Dropout, Activation
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint


In [8]:
data_dir = "/Users/brucerowan/Documents/capstone/DigitalDemocracyCapstone/data/training/"
target_col = 'transition_value'
training_output_filename = data_dir  + "training_utterances_binary.csv"

In [9]:
# split dataset evenly based on labels
def split_test_train(total, stratify_col):
    transition_rows = total[total[stratify_col] != 0]
    non_transition_rows = total[total[stratify_col] == 0]
    
    # first split transitions into training/testing
    X_train1, X_test1, y_train1, y_test1 = train_test_split(transition_rows, 
                                                    transition_rows[target_col], 
                                                    test_size=0.30, random_state=42)
    
    # assert there are only transition labels in this dataframe
    assert len(X_train1[X_train1[target_col] == 0]) == 0
    assert len(X_test1[X_test1[target_col] == 0]) == 0
    
    train_len = len(X_train1) # number of non-transitions to add to training set
    test_len = len(X_test1) # number of non-transitions to add to testing set
    print(train_len)
    
    # next split non-transitions into training/testing
    X_train2, X_test2, y_train2, y_test2 = train_test_split(non_transition_rows, 
                                                    non_transition_rows[target_col], 
                                                    test_size=0.30, random_state=42)
    
    # pick train_len random rows from non-transition training set
    ###change n = train_len
   
    X_train2 = X_train2.sample(n = train_len*3, axis=0)
    
    # pick test_len random rows from non_transitions testing set
    X_test2 = X_test2.sample(n = test_len, axis=0)
    
    # assert there are no transition utterances in non-transition training and testing set
    assert len(X_train2[X_train2[target_col] != 0]) == 0
    assert len(X_test2[X_test2[target_col] != 0]) == 0
    
    # final result, concat the dataframe
    X_train_final = pd.concat([X_train1, X_train2])
    X_test_final = pd.concat([X_test1, X_test2])
    return X_train_final['text'], X_test_final['text'], X_train_final[target_col], X_test_final[target_col]
    

In [10]:
train = pd.read_table(training_output_filename, sep="~")[['text', target_col]]
x_train, x_test, y_train, y_test = split_test_train(train, target_col)

4282


## Tokenize and pad word len

In [11]:
tokenizer = Tokenizer(num_words=40000)
tokenizer.fit_on_texts(x_train)
sequences = tokenizer.texts_to_sequences(x_train)

padded = pad_sequences(sequences, maxlen = 44)


In [12]:
pred = to_categorical(y_train)

## Model

In [29]:
model = Sequential()
model.add(Embedding(40000, 150, input_length=44))
model.add(Conv1D(64, 5, activation='relu'))
model.add(MaxPooling1D(pool_size=4))
model.add(LSTM(150, dropout=0.2, recurrent_dropout=0.5))
model.add(Dense(2, activation='sigmoid')) #fully connected layer
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])


In [35]:

filepath="weights.best.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

model.fit(padded, pred,validation_split=0.5, epochs = 50, callbacks = callbacks_list)


Train on 8564 samples, validate on 8564 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50


Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x11f3ce4a8>

## testing 


In [36]:
sequences = tokenizer.texts_to_sequences(x_test)
test_padded = pad_sequences(sequences, maxlen = 44)
print(test_padded.shape)

(3672, 44)


In [37]:
model.load_weights("weights.best.hdf5")
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])


predictions = model.predict(test_padded)
predictions = np.argmax(predictions, axis =1)
accuracy_score(predictions,y_test)

0.5403050108932462

In [38]:
wrong = 0
missed_transition = 0
for i in range(len(predictions)):
    if predictions[i] != y_test.iloc[i]:
        wrong= wrong+1
        print(predictions[i],y_test.iloc[i],x_test.iloc[i])
        if predictions[i] == 0:
            missed_transition = missed_transition +1 

0 1 okay, we're going to to bring geo to order as a sub committee.
0 1 second, 1160 reduces barriers to injured workers seeking physical and occupational therapy.
0 1 members, we will now take up <BILL_ID> hill without reference to file, the clerk will read.
0 1 and it's important that i note this that each year there are <BILL_ID> children
0 1 thank you very much madam chair and members.
0 1 that's that's a really important component in doing this.
0 1 this committee to order.
0 1 item 9, <BILL_ID> by runner.
0 1 were either born in or spent years in these camps during world war ii and shortly after world war ii.
0 1 aye. >> aye, de leon, fuller.
0 1 we also think that it really will help in hands public trust by insuring that the information that
0 1 they are csu eu employees that are running our common management system.
0 1 aye, lara.
0 1 public schools, asian lunar, new year day.
0 1 there he is.
0 1 we want to hear how people cast their votes, so that's important, right?
0 1 and 

0 1 number real no-win.
0 1 ...i am voting no on this bill and i urge you to searcher conferences and vote knows well.
0 1 and we're here on this item and
0 1 culture-specific celebrations in that particular part of the island.
0 1 counseling, parenting classes, supplies for the babies.
0 1 asian-american mail to anchor at daily national and cable news show where
0 1 move forward to the senate third reading, file item 46.
0 1 thank you mr. chair and members.
0 1 coupled with lpc, they come up with something, we went through a very thoughtful process
0 1 on their own, many cities have proactively adopted ordinances to
0 1 it's not per vehicle.
0 1 assemblymember waldron, you are recognized.
0 1 emotionally i was absent within my body.
0 1 ...want to recognize them i also want to recognize our guests in the rear
0 1 file items 40 through 48, pass and retain.
0 1 file item 70 through 72 pass and retain.
0 1 i've practiced immigration law for over 20 years and immigration law is very compl

0 1 something the current board member can do with one majority vote and ratification if 16 member city councils.
0 1 mr. dodd, your bill, <BILL_ID>
0 1 aye.
0 1 because they just figure well, if we can just continue to put pressure on you, eventually you'll just agree to settle.
0 1 thank you, ms. gonzalez.
0 1 channel the theory chapel is one of california's oldest historical buildings
0 1 has an attendance policy that prohibits the child from attending or
0 1 i do however have a concern as i know a lot of people do that this action would negatively affect the energy stability of southern california.
0 1 file items 12 and 13 pass and retain.
0 1 again item 5180, department of social services,
0 1 jewish culture has been attempted to be suppressed from the bar kokhba revolt 2000 years ago to today.
0 1 i'm a real.
0 1 senator, i-
0 1 we do not need your officials to do so.
0 1 aye 67, noes 0, measure passes.
0 1 and with smoke coming out of the tailpipe, it would create black cloud.
0

0 1 aye, anderson?
0 1 sailor.
0 1 when it comes to the exercising of a constitutionally enumerated right, it's not a question of whether or not you need to.
0 1 do we know if there's been any research that does suggest
0 1 so this bill, [inaudible] streamlining.
0 1 senator hall, you're on here, you're on the list, item eight.
0 1 second
0 1 the next committee, before the floor, as soon as we can get it done, we'll get it done.
0 1 seeing and hearing no further debate, the clerk will open the roll.
0 1 so, any other questions from committee members?
0 1 an aye vote on this very, very ambitious proposal, yet very well thought out,
0 1 clerk will open the roll.
0 1 and i think that it is incumbent upon the judges to
0 1 aye.
0 1 the arguments, and the control of ceqa in a lot of ways.
0 1 there were extensive negotiations between labor and
0 1 anderson?
0 1 again, good afternoon members, mr. chair.
0 1 thank you.
0 1 to do what is required of them in the words of your mighty profit mica

1 0 mr. david chiu, you are recognized.
1 0 and i respectfully request an aye vote from all of you, thank you.
1 0 decision makers are free to question staff findings and recommendations and
1 0 but we have not done a great job of making sure that the drinking water that they drink every day is clean.
1 0 to clarify that there are more efficient means to maintain elephants.
1 0 thousand california kids with negligible cost associated with the transition
1 0 the natural resources committee and file item twenty eight that's
1 0 terms of the california environmental quality act is this delicate balance.
1 0 several other proposals within the next three weeks.
1 0 but i value women's lives.
1 0 respect among the folks who deal with the budget committee and the veteran's committee.
1 0 julian bond, the story of the georgia
1 0 it is not our intent to have this be a siloed group.
1 0 <BILL_ID> by assemblymember wood, an act relating to medical.
1 0 prior to becoming press secretary in the pr