In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules (e.g. tf_nlp) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Question answering demo

### Loading the data

In [2]:
from tf_nlp.data  import qa_qc
from tf_nlp.utils import validation_split, find_common_examples

In [3]:
# Load the two corpora: one used for training + validation, one held out for testing.
train_and_validate, test = qa_qc("data/input4.txt", "data/test.txt")

print("Train+Validate set size: ", len(train_and_validate))
print("Test  set size: ", len(test))
print("Example piece of data", train_and_validate[0])
print("Intriguing properties: ")
# Duplicates = total examples minus distinct examples.
print("       a) quite a few duplicates detected: ", len(train_and_validate) - len(set(train_and_validate)))
# Examples that appear in both the training data and the test data.
common = list(find_common_examples(train_and_validate, test))
print("       b) Number of examples shared between train and test: ", len(common))
# Only show a sample when there is one; indexing an empty list would raise IndexError.
if common:
    print("          e.g: ", common[0])

Train+Validate set size:  5452
Test  set size:  500
Example piece of data QaQcDatum(question=('How', 'did', 'serfdom', 'develop', 'in', 'and', 'then', 'leave', 'Russia', '?'), main_cat='DESC', sub_cat='manner')
Intriguing properties: 
       a) quite a few duplicates detected:  70
       b) Number of examples shared between train and test:  10
          e.g:  QaQcDatum(question=('What', 'is', 'viscosity', '?'), main_cat='DESC', sub_cat='def')


In [4]:
# Deduplication and validation split.
# Dedupe while keeping first-seen order: iterating a set of string-bearing tuples
# yields an order that depends on per-process hash randomisation, which would make
# the input to validation_split (and hence the split) differ between kernel restarts.
_seen_examples = set()
_unique_examples = []
for _example in train_and_validate:
    if _example not in _seen_examples:
        _seen_examples.add(_example)
        _unique_examples.append(_example)
train_and_validate = _unique_examples

# 0.1 is passed as the validation fraction argument of validation_split.
train, validate = validation_split(train_and_validate, 0.1)

### Building the vocabulary

In [5]:
from tf_nlp       import Vocab 
from tf_nlp.utils import flatten

In [6]:
def extract_questions(dataset):
    """Return a list holding the ``question`` attribute of every item in ``dataset``.

    The result has one entry per item, in iteration order.
    """
    questions = []
    for item in dataset:
        questions.append(item.question)
    return questions

In [7]:
# Baseline vocabulary built from every training question; by construction it
# should leave nothing unknown on train, so the interesting number is validate.
naive_vocab = Vocab(extract_questions(train))
naive_unknown_train    = naive_vocab.fraction_unknown(extract_questions(train))
naive_unknown_validate = naive_vocab.fraction_unknown(extract_questions(validate))
print("Percentage of unknown words in train: ", 100.0 * naive_unknown_train)
print("Percentage of unknown words in validate: ", 100.0 * naive_unknown_validate)

Percentage of unknown words in train:  0.0
Percentage of unknown words in validate:  11.575221238938054


In [8]:
# Shrink the vocabulary to 90% of the naive vocabulary's size, keeping the most
# frequent training tokens (per the helper's name), so that some training words
# also map to "unknown".
train_tokens  = flatten(extract_questions(train))
words_to_keep = round(0.9 * len(naive_vocab))
popular_words = Vocab.keep_n_most_frequent(train_tokens, words_to_keep)
vocab = Vocab(popular_words)
pruned_unknown_train    = vocab.fraction_unknown(extract_questions(train))
pruned_unknown_validate = vocab.fraction_unknown(extract_questions(validate))
print("Percentage of unknown words in train: ", 100.0 * pruned_unknown_train)
print("Percentage of unknown words in validate: ", 100.0 * pruned_unknown_validate)

Percentage of unknown words in train:  1.7854244440837528
Percentage of unknown words in validate:  12.283185840707965


### Building the model

Bidirectional stacked GRU!

In [9]:
import numpy as np
import random
import tensorflow as tf

from tf_nlp.glove import load_glove
from tf_nlp.models import Linear, StackedGRU, Embedding, BidirectionalSentenceParser, Classifier
from tf_nlp.utils  import get_pb, make_batches

In [28]:
# Make this cell safe to re-run: close the session left over from a previous
# execution (if any) and drop the old graph before building a new one.
if 'session' in globals():
    session.close()
tf.reset_default_graph()

# Seed the RNGs so that weight initialisation and any sampling done through
# these libraries is repeatable across runs.
RANDOM_SEED = 1234
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
# Graph-level seed: it applies to the current default graph, so it has to be set
# after reset_default_graph() and before any ops are created.
tf.set_random_seed(RANDOM_SEED)

session = tf.InteractiveSession()

In [29]:
# Label inventories. Sorted so that the class-index -> label mapping used by
# batch_examples (via .index) is identical on every run; plain list(set(...))
# ordering depends on per-process string hash randomisation.
MAIN_CATEGORIES = sorted(set(ex.main_cat for ex in train_and_validate))
SUB_CATEGORIES  = sorted(set(ex.sub_cat  for ex in train_and_validate))

HIDDEN_SIZES    = [40, 40]   # one entry per stacked GRU layer
BATCH_SIZE      = 5          # training batch size
USE_GLOVE       = False      # switch between learned and pretrained GloVe embeddings
# Location of the pretrained vectors; only read when USE_GLOVE is True.
GLOVE_PATH      = "/home/sidor/projects/dali/Dali/data/glove/glove.6B.300d.txt"
GLOVE_MAX_WORDS = 100000

if not USE_GLOVE:
    # Embedding is created later from these sizes.
    EMBEDDING_SIZE = 20
    VOCAB_SIZE     = len(vocab)
else:
    # NOTE: this replaces the `vocab` built from the training questions with the
    # vocabulary that comes with the GloVe file.
    vocab, embedding = load_glove(GLOVE_PATH, max_words=GLOVE_MAX_WORDS)
    EMBEDDING_SIZE = embedding.embedding_size
    VOCAB_SIZE     = embedding.vocab_size

In [30]:
# Define model components.
# When GloVe is enabled, `embedding` was already produced by load_glove in the
# configuration cell, so a fresh Embedding is only created otherwise.
if not USE_GLOVE:
    embedding         = Embedding(VOCAB_SIZE, EMBEDDING_SIZE)
# Two independent stacked GRUs (separate variable scopes), one per reading direction.
gru_cell_forward  = StackedGRU([EMBEDDING_SIZE,], HIDDEN_SIZES, final_nonlinearity=tf.nn.relu6, scope="gru_forward")
gru_cell_backward = StackedGRU([EMBEDDING_SIZE,], HIDDEN_SIZES, final_nonlinearity=tf.nn.relu6, scope="gru_backward")
sentence_parser   = BidirectionalSentenceParser(embedding, gru_cell_forward, gru_cell_backward)
# Classifier input width is 2 * sum(HIDDEN_SIZES) -- presumably the hidden states of
# every layer from both directions concatenated; TODO confirm against final_hidden.
classifier        = Classifier(2 * sum(HIDDEN_SIZES), len(MAIN_CATEGORIES))

# Define inputs (both dimensions of input_idxes are left dynamic)
input_idxes    = tf.placeholder(tf.int32,   shape=(None, None,),        name="input_idxes")    # TIMESTEP  x BATCHSIZE
output_onehots = tf.placeholder(tf.float32, shape=(None, len(MAIN_CATEGORIES)), name="output_onehots") # BATCHSIZE x NUM_CLASSES

# Execute the model: sentence representation -> training objective and correct-count op
sentence_hidden  = sentence_parser.final_hidden(input_idxes)
error           = classifier.error(sentence_hidden, output_onehots)
num_correct     = classifier.num_correct(sentence_hidden, output_onehots)

# Define optimizer: Adam minimising the classifier's error
optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
train_op  = optimizer.minimize(error)

In [31]:
# Initialise every variable in the graph; must run after the graph (including the
# optimizer) has been built and before any training or evaluation step.
# NOTE(review): tf.initialize_all_variables is deprecated in later TF 1.x releases in
# favour of tf.global_variables_initializer -- confirm the installed version before switching.
session.run(tf.initialize_all_variables())

### Accuracy

In [32]:
def accuracy(examples, batch_size, dataset_name="dataset"):
    """Return the ratio of ``num_correct`` totals to examples seen over ``examples``.

    The examples are split into batches (ordered by question length via
    ``make_batches``), each batch is fed through the ``num_correct`` op, and
    the per-batch results are summed.

    Args:
        examples: examples exposing a ``question`` attribute, accepted by
            ``batch_examples``.
        batch_size: batch size handed to ``make_batches``.
        dataset_name: label shown in the progress bar title.

    Returns:
        Summed ``num_correct`` results divided by the number of examples seen.
    """
    bar = get_pb("Accuracy on %s: " % (dataset_name,))
    length_sorted_batches = make_batches(examples, batch_size, sorting_key=lambda ex: len(ex.question))

    total_correct = 0
    total_seen = 0
    for chunk in bar(length_sorted_batches):
        token_ids, main_labels, _ = batch_examples(chunk)
        feed = {input_idxes: token_ids, output_onehots: main_labels}
        total_correct += session.run(num_correct, feed)
        total_seen += len(chunk)
    return total_correct / total_seen

### Training

In [10]:
def batch_examples(examples):
    """Pack a batch of examples into dense arrays ready to be fed to the graph.

    Args:
        examples: non-empty sequence of examples exposing ``question``,
            ``main_cat`` and ``sub_cat`` attributes.

    Returns:
        A tuple ``(X, Ymain, Ysub)``:
          * ``X``: int32 array of shape (longest question + 1, num examples)
            holding the encoded questions, one example per column.
          * ``Ymain``: float32 one-hot array of shape (num examples, len(MAIN_CATEGORIES)).
          * ``Ysub``: float32 one-hot array of shape (num examples, len(SUB_CATEGORIES)).

    Raises:
        ValueError: if ``examples`` is empty, or a category is not present in
            MAIN_CATEGORIES / SUB_CATEGORIES (raised by ``list.index``).
    """
    num_examples = len(examples)
    if num_examples == 0:
        raise ValueError("batch_examples() requires at least one example")

    # Every column is padded to the same length: one slot more than the longest
    # question (presumably room for an end-of-sentence marker, given the
    # `pad_eos` argument name -- confirm against Vocab.encode).
    padded_length = max(len(example.question) for example in examples) + 1

    # np.empty is fine for X: every column is overwritten in the loop below.
    X     = np.empty((padded_length, num_examples        ), dtype=np.int32)
    Ymain = np.zeros((num_examples,  len(MAIN_CATEGORIES)), dtype=np.float32)
    Ysub  = np.zeros((num_examples,  len(SUB_CATEGORIES) ), dtype=np.float32)
    for i, example in enumerate(examples):
        X[:, i] = vocab.encode(example.question, pad_eos=padded_length)
        Ymain[i, MAIN_CATEGORIES.index(example.main_cat)] = 1.0
        Ysub [i, SUB_CATEGORIES.index(example.sub_cat)]   = 1.0
    return X, Ymain, Ysub

In [33]:
# Train for 10 epochs; after each one, report accuracy on the train and validate splits.
for epoch in range(10):
    # Batches are formed with questions ordered by length (sorting_key).
    train_batches = make_batches(train, BATCH_SIZE, sorting_key=lambda ex: len(ex.question))
    with_progress = get_pb("Epoch %d: " % (epoch,))
    for train_batch in with_progress(train_batches):
        token_ids, main_labels, _ = batch_examples(train_batch)
        feed = {input_idxes: token_ids, output_onehots: main_labels}
        session.run(train_op, feed)
    # Evaluation uses a larger batch size (100) than training.
    train_pct    = 100.0 * accuracy(train,    100, "train")
    validate_pct = 100.0 * accuracy(validate, 100, "validate")
    print("Epoch %d: accuracy on train: %.1f %%, validate: %.1f %%" % (epoch, train_pct, validate_pct))

Epoch 0: Time: 0:01:19 |######################################################| 100%
Accuracy on train: Time: 0:00:01 |############################################| 100%
Accuracy on validate: Time: 0:00:00 |#########################################| 100%
Epoch 1: Time: 0:01:18 |######################################################| 100%
Accuracy on train: Time: 0:00:01 |############################################| 100%
Accuracy on validate: ETA:  0:00:00 |###########################              |  66%

Epoch 0: accuracy on train: 80.7 %, validate: 73.2 %
Epoch 1: accuracy on train: 94.6 %, validate: 78.8 %

Accuracy on validate: Time: 0:00:00 |#########################################| 100%
Epoch 2: Time: 0:01:24 |######################################################| 100%
Accuracy on train: Time: 0:00:02 |############################################| 100%
Accuracy on validate: ETA:  0:00:00 |##################################       |  83%


Epoch 2: accuracy on train: 96.5 %, validate: 81.6 %

Accuracy on validate: Time: 0:00:00 |#########################################| 100%
Epoch 3: Time: 0:01:41 |######################################################| 100%
Accuracy on train: Time: 0:00:02 |############################################| 100%
Accuracy on validate: ETA:  0:00:00 |##################################       |  83%


Epoch 3: accuracy on train: 99.0 %, validate: 83.6 %

Accuracy on validate: Time: 0:00:00 |#########################################| 100%
Epoch 4: Time: 0:01:47 |######################################################| 100%
Accuracy on train: Time: 0:00:02 |############################################| 100%
Accuracy on validate: ETA:  0:00:00 |###########################              |  66%


Epoch 4: accuracy on train: 99.5 %, validate: 82.0 %

Accuracy on validate: Time: 0:00:00 |#########################################| 100%
Epoch 5: Time: 0:01:38 |######################################################| 100%
Accuracy on train: Time: 0:00:02 |############################################| 100%
Accuracy on validate: Time: 0:00:00 |#########################################| 100%


Epoch 5: accuracy on train: 96.2 %, validate: 73.6 %


Epoch 6: Time: 0:01:37 |######################################################| 100%
Accuracy on train: Time: 0:00:02 |############################################| 100%
Accuracy on validate: Time: 0:00:00 |#########################################| 100%


Epoch 6: accuracy on train: 99.8 %, validate: 83.3 %


Epoch 7: Time: 0:01:39 |######################################################| 100%
Accuracy on train: Time: 0:00:02 |############################################| 100%
Accuracy on validate: Time: 0:00:00 |#########################################| 100%


Epoch 7: accuracy on train: 98.3 %, validate: 78.4 %


Epoch 8: Time: 0:02:02 |######################################################| 100%
Accuracy on train: Time: 0:00:02 |############################################| 100%
Accuracy on validate: ETA:  0:00:00 |###########################              |  66%


Epoch 8: accuracy on train: 99.9 %, validate: 82.7 %

Accuracy on validate: Time: 0:00:00 |#########################################| 100%
Epoch 9: Time: 0:01:49 |######################################################| 100%
Accuracy on train: Time: 0:00:02 |############################################| 100%
Accuracy on validate: ETA:  0:00:00 |###########################              |  66%


Epoch 9: accuracy on train: 99.9 %, validate: 82.3 %


Accuracy on validate: ETA:  0:00:00 |##################################       |  83%Accuracy on validate: Time: 0:00:00 |#########################################| 100%


In [35]:
# Final evaluation on the held-out test set. The value is a fraction in [0, 1],
# not a percentage. The dataset label is passed so the progress bar is titled
# like the train/validate ones instead of the generic "dataset".
accuracy(test, 100, "test")

Accuracy on dataset: Time: 0:00:00 |##########################################| 100%


0.86399999999999999