In [59]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [60]:
data_dir = "/Users/brucerowan/Documents/capstone/DigitalDemocracyCapstone/"
target_col = 'transition_value'

In [61]:
training_output_filename = data_dir  + "training_utterances_n_range_collapsed.csv"


In [62]:
# split dataset evenly based on labels
def split_test_train(total, stratify_col):
    transition_rows = total[total[stratify_col] != 0]
    non_transition_rows = total[total[stratify_col] == 0]
    
    # first split transitions into training/testing
    X_train1, X_test1, y_train1, y_test1 = train_test_split(transition_rows, 
                                                    transition_rows[target_col], 
                                                    test_size=0.30, random_state=42)
    
    # assert there are only transition labels in this dataframe
    assert len(X_train1[X_train1[target_col] == 0]) == 0
    assert len(X_test1[X_test1[target_col] == 0]) == 0
    
    train_len = len(X_train1) # number of non-transitions to add to training set
    test_len = len(X_test1) # number of non-transitions to add to testing set
    
    
    # next split non-transitions into training/testing
    X_train2, X_test2, y_train2, y_test2 = train_test_split(non_transition_rows, 
                                                    non_transition_rows[target_col], 
                                                    test_size=0.30, random_state=42)
    
    # pick train_len random rows from non-transition training set
    ###change n = train_len
    print(train_len)
    X_train2 = X_train2.sample(n = train_len*4, axis=0)
    
    # pick test_len random rows from non_transitions testing set
    X_test2 = X_test2.sample(n = test_len, axis=0)
    
    # assert there are no transition utterances in non-transition training and testing set
    assert len(X_train2[X_train2[target_col] != 0]) == 0
    assert len(X_test2[X_test2[target_col] != 0]) == 0
    
    # final result, concat the dataframe
    X_train_final = pd.concat([X_train1, X_train2])
    X_test_final = pd.concat([X_test1, X_test2])
    return X_train_final['text'], X_test_final['text'], X_train_final[target_col], X_test_final[target_col]
    

In [63]:
data = pd.read_table(training_output_filename, sep = "~")

In [64]:
train = pd.read_table(training_output_filename, sep="~")[['text', target_col]]

In [65]:
#make all 2's into 1's
train.loc[train['transition_value'] > 0, 'transition_value'] = 1
train.columns

Index(['text', 'transition_value'], dtype='object')

In [66]:
x_train, x_test, y_train, y_test = split_test_train(train, target_col)

2044


In [91]:
x_train

48865                                                    1 
303181                                                   1 
582147                                                 1 1 
259517                                                   1 
12024                                                    1 
309185                                                   1 
116326                                                 1 1 
141389                                                   1 
81597                                                    1 
371396                                                   1 
30038                                                  1 1 
307931                                                   1 
32113                                                1 1 1 
345028                                                 1 1 
602493                                                   1 
643778                                                   1 
42904                                   

In [92]:
# there are 5226 transition phrases and 5226*6 = 31356 non transition phrases
# 36582 total

In [93]:
transition_rows = train[train[target_col] != 0]


### Assert training and testing splits are the correct dimensions
### After splitting, training and testing sets should each have 50% transitions and 50% non-transitions
### training dimensions should be 2 * 70% of the number of transitions in the data set
### testing dimensions should be 2 * 30% of the number of transitions in the data set

In [94]:
assert len(x_train) == len(y_train)

In [95]:
assert len(x_test) == len(y_test)

In [96]:
assert len(x_train) == int(len(transition_rows) * 0.7) * 2

AssertionError: 

In [None]:
assert len(x_test) == (len(transition_rows) * 2) - (int(len(transition_rows) * 0.7) * 2)

In [97]:
assert len(y_train[y_train == 0]) == len(y_train[y_train != 0])

AssertionError: 

In [98]:
assert len(y_test[y_test == 0]) == len(y_test[y_test != 0])

### Vectorize utterances with bag of words features

In [99]:
count_vect = CountVectorizer()
count_vect.fit(np.hstack((x_train, x_test)))
X_train_counts = count_vect.transform(x_train)
X_test_counts = count_vect.transform(x_test)

In [100]:
assert X_train_counts.shape[1] == X_test_counts.shape[1]

### Pass vectorized utterances into a Naive Bayes model

In [101]:
clf = MultinomialNB()
clf.fit(X_train_counts, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

### Output accuracy on testing set

In [102]:
assert X_test_counts.shape[0] == y_test.shape[0]

In [103]:
clf.score(X_test_counts, y_test, sample_weight=None)

0.4908779931584949

## keras

In [104]:
from keras.utils.np_utils import to_categorical
from keras.layers import Embedding, LSTM, Dense, Conv1D, MaxPooling1D, Dropout, Activation
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

## tokenize words

In [105]:
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(x_train)
sequences = tokenizer.texts_to_sequences(x_train)

In [106]:
#so all utterences are same length
padded = pad_sequences(sequences, maxlen = 44)
print(padded.shape)

(10220, 44)


In [107]:
pred = to_categorical(y_train)
pred


array([[ 0.,  1.],
       [ 0.,  1.],
       [ 0.,  1.],
       ..., 
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.]])

In [108]:
model = Sequential()
model.add(Embedding(20000, 150, input_length=44))
model.add(Conv1D(64, 5, activation='relu'))
model.add(MaxPooling1D(pool_size=4))
model.add(LSTM(150, dropout=0.2, recurrent_dropout=0.5))
model.add(Dense(25,activation = 'sigmoid'))
model.add(Dense(2, activation='sigmoid')) #fully connected layer
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])



In [109]:
#class weights 
class_weight = {
    0 : 1.,
    1: 4.,
    }

In [110]:
model.fit(padded, pred, epochs = 1,class_weight = class_weight)


Epoch 1/1
  640/10220 [>.............................] - ETA: 1:08 - loss: 0.9490 - acc: 0.6992

KeyboardInterrupt: 

## testing 


In [111]:
tokenizer.fit_on_texts(x_test)
sequences = tokenizer.texts_to_sequences(x_test)
test_padded = pad_sequences(sequences, maxlen = 44)
print(test_padded.shape)

(1754, 44)


In [112]:
predictions = model.predict(test_padded)
predictions = np.argmax(predictions, axis =1)
print(len(predictions))

1754


In [113]:
accuracy_score(predictions,y_test)


0.86031927023945265

In [117]:
print(x_test.head(100))


423002      1 
379313      1 
457374      1 
34923       1 
567062      1 
560620      1 
158522      1 
451998      1 
560860      1 
345489      1 
288509      1 
65833       1 
283041      1 
573038      1 
22864       1 
121702    1 1 
387985    1 1 
148984      1 
190710      1 
68107       1 
359052      1 
261872      1 
359109      1 
68198       1 
68518       1 
272029      1 
98234     1 1 
40679     1 1 
44579       1 
621290      1 
          ... 
120465      1 
200630      1 
394745      1 
646888      1 
562062      1 
9267        1 
119185      1 
592359      1 
590931      1 
439758      1 
147533      1 
619218    1 1 
33183       1 
568309      1 
372555      1 
97352       1 
636649      1 
371350      1 
287380      1 
304555      1 
286125      1 
493307    1 1 
356875      1 
50609       1 
291410      1 
400873      1 
435686      1 
279151      1 
339481      1 
67150       1 
Name: text, dtype: object


In [116]:
wrong = 0
missed_transition = 0
for i in range(len(predictions)):
    if predictions[i] != y_test.iloc[i]:
        wrong= wrong+1
        print(predictions[i],y_test.iloc[i],x_test.iloc[i])
        if predictions[i] == 0:
            missed_transition = missed_transition +1 

0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 1 1 
0 1 

In [90]:
print(wrong)
print(missed_transition)
print(missed_transition/wrong)


877
877
1.0


## Notes
3 epochs seems to be the sweet spot

(using 6 times more non transitions)

model 1: 1 epochs, adam optimzer, class weights(0:1,1:6)
accuracy score: 58.06%, (61% guessed 0 when correct was 1)

model 1: 4 epochs, adam optimzer, class weights(0:1,1:6)
accuracy score: 53.46%, (72% guessed 0 when correct was 1)

model 1: 10 epochs, adam optimzer, class weights (0:1,1:6)
accuracy score: 53.31%, (61% guessed 0 when correct was 1)

model 1: 15 epochs, adam optimzer, class weights (0:1,1:6)
accuracy score: 53.15%, (71% guessed 0 when correct was 1)

model 1: 20 epochs, adam optimzer, class weights (0:1,1:6)
accuracy score: 51.36%, (69% guessed 0 when correct was 1)

model 1: 25 epochs, adam optimzer, class weights (0:1,1:6)
accuracy score: 52.83%, (67% guessed 0 when correct was 1)

model 1: 30 epochs, adam optimzer, class weights (0:1,1:6)
accuracy score: 51.65%, (73% guessed 0 when correct was 1)

model 1: 1 epochs, rmsprop, class weights (0:1,1:6)
accuracy score: 45.5%, (74% guessed 0 when correct was 1)

###model 2 

1 epoch, adam
accuracy score: 51.4%, (67.8% guessed 0 when correct was 1)

4 epochs, adam
accuracy score: 52.0%, (71.8% guessed 0 when correct was 1


### model1 
model = Sequential()
model.add(Embedding(20000, 150, input_length=44))
model.add(Conv1D(64, 5, activation='relu'))
model.add(MaxPooling1D(pool_size=4))
model.add(LSTM(150, dropout=0.2, recurrent_dropout=0.5))
model.add(Dense(2, activation='sigmoid')) #fully connected layer
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])


### model2 (remove conv layer and pooling layer)
model = Sequential()
model.add(Embedding(20000, 150, input_length=44))

model.add(LSTM(150, dropout=0.2, recurrent_dropout=0.5))
model.add(Dense(2, activation='sigmoid')) #fully connected layer
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])



## model 3(model 1 with another dense layer) 59% acc on n_range data
model = Sequential()
model.add(Embedding(20000, 150, input_length=44))
model.add(Conv1D(64, 5, activation='relu'))
model.add(MaxPooling1D(pool_size=4))
model.add(LSTM(150, dropout=0.2, recurrent_dropout=0.5))
model.add(Dense(25,activation = 'sigmoid'))
model.add(Dense(2, activation='sigmoid')) #fully connected layer
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
