In [14]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [15]:
data_dir = "/Users/brucerowan/Documents/capstone/DigitalDemocracyCapstone/"
target_col = 'utterance_transition_values'

In [26]:
training_output_filename = data_dir  + "training_utterances.csv"

In [27]:
# split dataset evenly based on labels
def split_test_train(total, stratify_col):
    transition_rows = total[total[stratify_col] != 0]
    non_transition_rows = total[total[stratify_col] == 0]
    
    # first split transitions into training/testing
    X_train1, X_test1, y_train1, y_test1 = train_test_split(transition_rows, 
                                                    transition_rows[target_col], 
                                                    test_size=0.30, random_state=42)
    
    # assert there are only transition labels in this dataframe
    assert len(X_train1[X_train1[target_col] == 0]) == 0
    assert len(X_test1[X_test1[target_col] == 0]) == 0
    
    train_len = len(X_train1) # number of non-transitions to add to training set
    test_len = len(X_test1) # number of non-transitions to add to testing set
    
    
    # next split non-transitions into training/testing
    X_train2, X_test2, y_train2, y_test2 = train_test_split(non_transition_rows, 
                                                    non_transition_rows[target_col], 
                                                    test_size=0.30, random_state=42)
    
    # pick train_len random rows from non-transition training set
    X_train2 = X_train2.sample(n = train_len, axis=0)
    
    # pick test_len random rows from non_transitions testing set
    X_test2 = X_test2.sample(n = test_len, axis=0)
    
    # assert there are no transition utterances in non-transition training and testing set
    assert len(X_train2[X_train2[target_col] != 0]) == 0
    assert len(X_test2[X_test2[target_col] != 0]) == 0
    
    # final result, concat the dataframe
    X_train_final = pd.concat([X_train1, X_train2])
    X_test_final = pd.concat([X_test1, X_test2])
    
    return X_train_final['text'], X_test_final['text'], X_train_final[target_col], X_test_final[target_col]
    

In [28]:
train = pd.read_table(training_output_filename, sep="~")[['text', target_col]]

In [29]:
train.head()

Unnamed: 0,text,utterance_transition_values
0,We don't have a quorum yet I don't believe.,1
1,We don't have a quorum yet.,0
2,We'll ask the sergeants to please call the mem...,0
3,that we can establish a quorum for this partic...,0
4,This is the Assembly's 2nd Extraordinary Sessi...,0


In [30]:
x_train, x_test, y_train, y_test = split_test_train(train, target_col)

In [31]:
transition_rows = train[train[target_col] != 0]

### Assert training and testing splits are the correct dimensions
### After splitting, training and testing sets should each have 50% transitions and 50% non-transitions
### training dimensions should be 2 * 70% of the number of transitions in the data set
### testing dimensions should be 2 * 30% of the number of transitions in the data set

In [32]:
assert len(x_train) == len(y_train)

In [33]:
assert len(x_test) == len(y_test)

In [34]:
assert len(x_train) == int(len(transition_rows) * 0.7) * 2

In [35]:
assert len(x_test) == (len(transition_rows) * 2) - (int(len(transition_rows) * 0.7) * 2)

In [36]:
assert len(y_train[y_train == 0]) == len(y_train[y_train != 0])

In [37]:
assert len(y_test[y_test == 0]) == len(y_test[y_test != 0])

### Vectorize utterances with bag of words features

In [38]:
count_vect = CountVectorizer()
count_vect.fit(np.hstack((x_train, x_test)))
X_train_counts = count_vect.transform(x_train)
X_test_counts = count_vect.transform(x_test)

In [39]:
assert X_train_counts.shape[1] == X_test_counts.shape[1]

### Pass vectorized utterances into a Naive Bayes model

In [40]:
clf = MultinomialNB()
clf.fit(X_train_counts, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

### Output accuracy on testing set

In [41]:
assert X_test_counts.shape[0] == y_test.shape[0]

In [42]:
clf.score(X_test_counts, y_test, sample_weight=None)

0.7466517857142857

## keras

In [49]:
from keras.utils.np_utils import to_categorical
from keras.layers import Embedding, LSTM, Dense, Conv1D, MaxPooling1D, Dropout, Activation
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.
  return f(*args, **kwds)


In [61]:
train['utterance_transition_values'].unique()

array([1, 0, 2])

In [73]:
x_train.head()

149011                     Senate Bill 304 by Senator Lara.
214753    Assembly Bill 44 by Assembly member Mullen, an...
139586    File item 75, 76, and 77 we will pass retain, ...
164930    Assembly concurrent resolution 101 by Assembly...
13540     File arabs seventy nine and eighty pass and re...
Name: text, dtype: object

## tokenize words

In [111]:
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(x_train)
sequences = tokenizer.texts_to_sequences(x_train)

In [112]:
#so all utterences are same length
padded = pad_sequences(sequences, maxlen = 44)
print(padded.shape)

(10452, 44)


In [113]:
pred = to_categorical(y_train)
pred


array([[ 0.,  1.,  0.],
       [ 0.,  1.,  0.],
       [ 0.,  1.,  0.],
       ..., 
       [ 1.,  0.,  0.],
       [ 1.,  0.,  0.],
       [ 1.,  0.,  0.]])

In [114]:
model = Sequential()
model.add(Embedding(20000, 150, input_length=44))
model.add(Dropout(0.2))
model.add(Conv1D(64, 5, activation='relu'))
model.add(MaxPooling1D(pool_size=4))
model.add(LSTM(150, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(3, activation='softmax')) #fully connected layer
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [115]:
model.fit(padded, pred, validation_split = 0.3, epochs = 6)

Train on 7316 samples, validate on 3136 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x154500080>

## testing 


In [116]:
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(x_test)
sequences = tokenizer.texts_to_sequences(x_test)
test_padded = pad_sequences(sequences, maxlen = 44)
print(test_padded.shape)

(4480, 44)


In [126]:
predictions = model.predict(test_padded)
predictions = np.argmax(predictions, axis =1)
print(predictions)

[1 1 0 ..., 1 0 2]


In [154]:
for i in range(len(predictions)):
    if predictions[i] != y_test.iloc[i]:
        print(predictions[i],y_test.iloc[i],x_test.iloc[i])

1 2 We will now begin our special ceremony to commemorate LGBT Pride Month.
0 1 [INAUDIBLE] Great.
0 1 Yes please begin.
0 1 date of California's determinant sentencing statutes until January 1,
2 1 File item 35, Senate Concurrent Resolution 47.
1 2 File item 111, AB 2546, clerk will read.
2 1 We're gonna pass the retained file items 281 and 282, moving to file item 283, AB 1269.
0 1 We want to hear how people cast their votes, so that's important, right?
2 1 We're good, thank you.
1 2 The clerk will read.
0 1 we're going to be addressing 1965.
1 2 Senate amendments are concurred.
0 1 Assembly Bill 885 by Assembly Member Lopez an act relating to foster youth.
0 2 Ayes 69, nos 0, the measure passes.
0 2 The consent calendar, AB 685 and AB 2331, please call the absent members.
1 2 Garcia expands permissible activities of the Commission for Economic Development Holding Committee.
1 2 Circuitry please...senate bill for fifty-six
0 1 I see Miss Gonzalez is here.
2 1 Yes, good morning Madam 

1 2 Thank you Mr. Chairman, colleagues.
0 2 Members, now we're going to move in file order to hear the items that members have waived presentation,
0 2 and give that provider what is appropriate and fair under those circumstances.
0 1 Senate Bill 1323 by Senator Bates, an act relating to controlled substances.
0 1 Yeah, we do have a quorum so we'll go ahead and call quorum.
0 1 All right, Mr. Obernolte.
0 1 And I think, I would really encourage you to move forward.
0 1 Senate Bill 823 by Senator Block, an act lead to Human Trafficking.
0 1 that both sides dislike the most and we've arrived
1 2 Moving to File Item 33, that's AJR 13.
0 1 As a subcommittee, today we have nine items before
0 2 Senator Monning, file item 126, Mr. Secretary please read.
1 2 Thank you, Mr. Chair and senators.
0 1 I discussed with pro Tem de Leon when he came in to see Ms.
0 1 The clerk will closer ...tie the votes I seventy two zero measure
0 1 Mister Harper, come on up.
0 1 Yes, thank you, Mr. Chair, members

1 0 2091. >> 2091.
1 0 I strongly oppose this bill, no.
1 0 Mr. Speaker, members, SB 399 will give the city of Los Angeles the ability to grant franchises,
1 0 some guidelines to be able to get to the departments so that they can upgrade more efficiently.
1 0 Another daily ...fuller gains.
1 0 So this is more, probably most of these students,
2 0 ...reason to object to this particular provision let me just say
1 0 I'm opposed to it for two reasons.
1 0 In spite of only receiving approximately 2% of the state's entire higher education budget.
1 0 Mr. Chiu, you may close.
1 0 A study in 2010 by the CPUC found that 10% of urban water, 875,000 acre-feet,
1 0 Senator Vidak.
2 0 So, our experience post that decision was that rates reimbursed to emergency physicians have declined.
1 0 But we feel that the nexus is so strong that this is an appropriate resource.
1 0 There is great flexibility at the local level right now.
1 0 It is worth noting that this bill only seeks to address the contents

1 0 Let me have a look.
1 0 Chair, if I may, the nonprofit industry is
1 0 their own constituents residence ...voters over long periods of time meant
1 0 And how thankful we for their fantastic work.
1 0 I'm Nancy Bargeman, I am director of the Department of the Developmental Services and today I have with me John Doyle the chief Deputy Director and
1 0 Mullin?
1 0 president to work to try to come to a solution so we can get past.
1 0 2781, 2790, 2794, 2822,
1 0 Thank you very much.
1 0 File Item 231 AB 2835.
1 0 We will go through this.
1 0 Jackson?
1 0 So, there are reasons.
2 0 Mr. Secretary, please call the roll.
1 0 Moving on to SB1215.
1 0 So, with that said and speaking as a former board member, the American Red Cross of Santa Monica,
1 0 an authorization sentence essentially tying back here to say the county's authorized to report to the court,
1 0 It imposes another burden on small businesses operating in the coast.
1 0 Madam Secretary.
1 0 save many Californians' lives with t

1 0 In her time here in Sacramento, Jenny reported on the state energy crisis,
1 0 a sergeant that was in charge of the pawn detail for the Los Angeles County Sheriff's Department.
2 0 I think the intent of it more has been a phone call, but-
1 0 Assembly Bill 3, by Assemblymember Frazier and others, an act relating to transportation.
1 0 that no one made a big deal out of beating us you guys are such a class act
1 0 Unfortunately, this bill doesn't help us.
1 0 I'd like to see it really more localized in the community,
1 0 And we did this for charitable raffles.
1 0 Motion is to measure be referred to appropriations committee.
1 0 that they are honestly being considered as a viable placement.
1 0 Current disclosure of participation in this and other outside boards is insufficient,
2 0 And it goes to what Senator Pavley mentioned about, we're middle class.
1 0 trailer bill is 10 million one time funding.
1 0 that the courts are trying to utilize to address this very problem that this m

## Notes

model with mse and RMSprop 1 epoch got 98% "accuracy"? (no validation set used)
model with binary_crossentropy, RMSprop and 2 epochs got "validation accuracy" 