In [1]:
%load_ext autoreload
%autoreload 2

# Question answering demo

### Loading the data

In [29]:
from tf_nlp.data  import qa_qc
from tf_nlp.utils import validation_split, find_common_examples

In [30]:
train_and_validate, test = qa_qc("data/input4.txt", "data/test.txt")

print("Train set size: ", len(train))
print("Validate set size: ", len(validate))
print("Test  set size: ", len(test))
print("Example piece of data", train[0])
print("Intriguing properties: ",)
print("       a) quite a few duplicates detected: ", len(train_and_validate) - len(set(train_and_validate)), )
common = list(find_common_examples(train_and_validate, test))
print("       b) Number of examples shared between train and test: ", len(common))
print("          e.g: ", common[0])

Train set size:  4844
Validate set size:  538
Test  set size:  500
Example piece of data (('What', 'is', 'the', 'main', 'language', 'of', 'Sao', 'Paulo', ',', 'Brazil', '?'), 'ENTY', 'lang')
Intriguing properties: 
       a) quite a few duplicates detected:  70
       b) Number of examples shared between train and test:  10
          e.g:  QaQcDatum(question=('What', 'is', 'viscosity', '?'), main_cat='DESC', sub_cat='def')


In [31]:
# deduplication and validation split
train_and_validate = list(set(train_and_validate))
train, validate = validation_split(train_and_validate, 0.1)

##### Building the Vocab

In [72]:
from tf_nlp       import Vocab 
from tf_nlp.utils import flatten

In [73]:
def extract_questions(dataset):
    return [example.question for example in dataset]

In [74]:
naive_vocab = Vocab(extract_questions(train))
print("Percentage of unknown words in train: ", 100.0 * naive_vocab.fraction_unknown(extract_questions(train)))
print("Percentage of unknown words in validate: ", 100.0 * naive_vocab.fraction_unknown(extract_questions(validate)))

Percentage of unknown words in train:  0.0
Percentage of unknown words in validate:  11.273257935060196


In [83]:
popular_words = Vocab.keep_n_most_frequent(flatten(extract_questions(train)), round(0.9 * len(naive_vocab)))
vocab = Vocab(popular_words)
print("Percentage of unknown words in train: ", 100.0 * vocab.fraction_unknown(extract_questions(train)))
print("Percentage of unknown words in validate: ", 100.0 * vocab.fraction_unknown(extract_questions(validate)))

Percentage of unknown words in train:  1.785425428663863
Percentage of unknown words in validate:  12.331265961327983


### Building the model

In [217]:
import numpy as np
import random
import tensorflow as tf

from tensorflow.python.ops import functional_ops # for scan!
from tf_nlp.models import Linear, GRU, Embedding
from tf_nlp.utils  import get_pb, make_batches

In [157]:
VOCAB_SIZE = len(vocab)

MAIN_CATEGORIES = list(set(ex.main_cat for ex in train_and_validate))
SUB_CATEGORIES  = list(set(ex.sub_cat  for ex in train_and_validate))

EMBEDDING_SIZE = 50
HIDDEN_SIZE    = 100
BATCH_SIZE     = 5

In [158]:
def batch_examples(examples):
    num_examples  = len(examples)
    longest_question = max(len(example.question) for example in examples)
    X     = np.empty((longest_question + 1, num_examples        ), dtype=np.int32)
    Ymain = np.zeros((num_examples,         len(MAIN_CATEGORIES)), dtype=np.float32)
    Ysub  = np.zeros((num_examples,         len(SUB_CATEGORIES) ), dtype=np.float32)
    for i, example in enumerate(examples):
        question, main_cat, sub_cat = example
        X[:, i] = vocab.encode(example.question, pad_eos=longest_question + 1)
        Ymain[i, MAIN_CATEGORIES.index(example.main_cat)] = 1.0
        Ysub [i, SUB_CATEGORIES.index(sub_cat)]  = 1.0
    return X, Ymain, Ysub

In [207]:
if 'session' in globals():
    session.close()
tf.reset_default_graph()
session = tf.InteractiveSession()

In [208]:
# Define model componnents
embedding       = Embedding(VOCAB_SIZE, EMBEDDING_SIZE)
gru_cell        = GRU([EMBEDDING_SIZE,], HIDDEN_SIZE, final_nonlinearity=tf.nn.relu6)
sentence_parser = SentenceParser(embedding, gru_cell)
classifier      = Classifier(HIDDEN_SIZE, len(MAIN_CATEGORIES))

# Define inputs
input_idxes    = tf.placeholder(tf.int32,   shape=(None, None,),        name="input_idxes")    # TIMESTEP  x BATCHSIZE
output_onehots = tf.placeholder(tf.float32, shape=(None, NUM_MAIN_CAT), name="output_onehots") # BATCHSIZE x NUM_CLASSES

# execute the model
sentence_hidden = sentence_parser.final_hidden(input_idxes)
error           = classifier.error(sentence_hidden, output_onehots)
num_correct     = classifier.num_correct(sentence_hidden, output_onehots)

# define optimizer
optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
train_op  = optimizer.minimize(error)

In [209]:
session.run(tf.initialize_all_variables())

### Accuracy

In [218]:
def accuracy(examples, batch_size, dataset_name="dataset"):
    acc_num_correct, acc_num_total = 0, 0
    progress = get_pb("Accuracy on %s: " % (dataset_name,))
    batches = make_batches(examples, batch_size, sorting_key=lambda x:len(x.question))
    
    for batch in progress(batches):
        X, Ymain, _ = batch_examples(batch)
        batch_correct = session.run(num_correct, { input_idxes: X, output_onehots: Ymain})
        acc_num_correct += batch_correct
        acc_num_total   += len(batch)
    return acc_num_correct / acc_num_total

### Training

In [219]:
for epoch in range(10):
    batches = make_batches(train, BATCH_SIZE, sorting_key=lambda x:len(x.question))
    progress = get_pb("Epoch %d: " % (epoch,))
    for batch in progress(batches):
        X, Ymain, _ = batch_examples(batch)
        session.run(train_op, { input_idxes: X, output_onehots: Ymain})
    acc_train    = 100.0 * accuracy(train,    100, "train")
    acc_validate = 100.0 * accuracy(validate, 100, "validate")
    print("Epoch %d: accuracy on train: %.1f %%, validate: %.1f %%" % (epoch, acc_train, acc_validate))

Epoch 0: Time: 0:00:30 |######################################################| 100%
Accuracy on train: Time: 0:00:00 |############################################| 100%
Accuracy on validate: Time: 0:00:00 |#########################################| 100%
Epoch 1: Time: 0:00:31 |######################################################| 100%
Accuracy on train: Time: 0:00:00 |############################################| 100%
Accuracy on validate: ETA:  0:00:00 |####################                     |  50%

Epoch 0: accuracy on train: 99.1 %, validate: 84.8 %
Epoch 1: accuracy on train: 99.6 %, validate: 86.1 %

Accuracy on validate: Time: 0:00:00 |#########################################| 100%
Epoch 2: Time: 0:00:30 |######################################################| 100%
Accuracy on train: Time: 0:00:00 |############################################| 100%
Accuracy on validate: Time: 0:00:00 |#########################################| 100%


Epoch 2: accuracy on train: 99.9 %, validate: 85.5 %


Epoch 3: Time: 0:00:31 |######################################################| 100%
Accuracy on train: Time: 0:00:00 |############################################| 100%
Accuracy on validate: ETA:  0:00:00 |####################                     |  50%


Epoch 3: accuracy on train: 99.7 %, validate: 84.8 %

Accuracy on validate: Time: 0:00:00 |#########################################| 100%
Epoch 4: ETA:  0:00:26 |########                                              |  15%




KeyboardInterrupt: 

In [None]:
accuracy(test)