In [2]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt

from nltk.stem import WordNetLemmatizer 

from keras.utils.np_utils import to_categorical
from keras.layers import Embedding, LSTM, Dense, Conv1D, MaxPooling1D, Dropout, Activation
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint


Using TensorFlow backend.


In [10]:
data_dir = "Model Training/"
target_col = 'transition_value'
training_output_filename = data_dir  + "training_output_binary_filename.csv"

In [11]:
# split dataset evenly based on labels
def split_test_train(total, stratify_col):
    transition_rows = total[total[stratify_col] != 0]
    non_transition_rows = total[total[stratify_col] == 0]
    
    # first split transitions into training/testing
    X_train1, X_test1, y_train1, y_test1 = train_test_split(transition_rows, 
                                                    transition_rows[target_col], 
                                                    test_size=0.30, random_state=42)
    
    # assert there are only transition labels in this dataframe
    assert len(X_train1[X_train1[target_col] == 0]) == 0
    assert len(X_test1[X_test1[target_col] == 0]) == 0
    
    train_len = len(X_train1) # number of non-transitions to add to training set
    test_len = len(X_test1) # number of non-transitions to add to testing set
    print(train_len)
    
    # next split non-transitions into training/testing
    X_train2, X_test2, y_train2, y_test2 = train_test_split(non_transition_rows, 
                                                    non_transition_rows[target_col], 
                                                    test_size=0.30, random_state=42)
    
    # pick train_len random rows from non-transition training set
    ###change n = train_len
   
    X_train2 = X_train2.sample(n = train_len, axis=0)
    
    # pick test_len random rows from non_transitions testing set
    X_test2 = X_test2.sample(n = test_len, axis=0)
    
    # assert there are no transition utterances in non-transition training and testing set
    assert len(X_train2[X_train2[target_col] != 0]) == 0
    assert len(X_test2[X_test2[target_col] != 0]) == 0
    
    # final result, concat the dataframe
    X_train_final = pd.concat([X_train1, X_train2])
    X_test_final = pd.concat([X_test1, X_test2])
    return X_train_final['text'], X_test_final['text'], X_train_final[target_col], X_test_final[target_col]
    

In [14]:
train = pd.read_table(training_output_filename, sep="~")[['text', target_col]]
x_train, x_test, y_train, y_test = split_test_train(train, target_col)
ids = train['text']
train['text'] = train['video_id']
train['video_id'] = ids

2008


KeyError: 'video_id'

## Tokenize and pad word len

In [None]:
tokenizer = Tokenizer(num_words=40000)
tokenizer.fit_on_texts(x_train)
sequences = tokenizer.texts_to_sequences(x_train)

padded = pad_sequences(sequences, maxlen = 44)


In [None]:
pred = to_categorical(y_train)

## Model

In [None]:
model = Sequential()
model.add(Embedding(40000, 150, input_length=44))
model.add(Conv1D(64, 5, activation='relu'))
model.add(MaxPooling1D(pool_size=4))
model.add(LSTM(150, dropout=0.2, recurrent_dropout=0.5))
model.add(Dense(2, activation='sigmoid')) #fully connected layer
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])


In [None]:

filepath="weights.best.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

model.fit(padded, pred,validation_split=0.5, epochs = 50, callbacks = callbacks_list)


## testing 


In [None]:
sequences = tokenizer.texts_to_sequences(x_test)
test_padded = pad_sequences(sequences, maxlen = 44)
print(test_padded.shape)

In [None]:
model.load_weights("weights.best.hdf5")
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])


predictions = model.predict(test_padded)
predictions = np.argmax(predictions, axis =1)
accuracy_score(predictions,y_test)

In [None]:
wrong = 0
missed_transition = 0
for i in range(len(predictions)):
    if predictions[i] != y_test.iloc[i]:
        wrong= wrong+1
        print(predictions[i],y_test.iloc[i],x_test.iloc[i])
        if predictions[i] == 0:
            missed_transition = missed_transition +1 