In [1]:
import numpy as np
import pandas as pd
import re
import os
from functools import reduce 
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.autograd as autograd
import torch.nn.functional as F

In [2]:
text_files = os.listdir('./tasks')
text_files = [i for i in text_files if '.txt' in i]


In [3]:
# making sure of all tasks have single word answers
for f in text_files:
    text1 = pd.read_csv('./tasks/'+f, sep="\n", header=None)
    text1.columns = ['text']
    ans = []
    for t in text1.text:
        if '?' in t:
            match = re.search(r'[a-zA-z0-9?\ ]*\t([\w \ ]+)', t)
            if match:
                ans.append(match.group(1)) 
                
    ans = [i.split(' ') for i in ans]
    for i in ans:
        if len(i)>1:
            print(i)

In [7]:
def tokenize(sent):
    return [ x.strip() for x in re.split(r'(\W+)', sent) if x.strip()]

def parse_stories(lines):
    data = []
    story = []
    for line in lines:
        #line = line.decode('utf-8').strip()
        nid, line = line.split(' ', 1)
        nid = int(nid)
        if nid == 1:
            # reset story when line ID=1 (start of new story)
            story = []
        if '\t' in line:
            # this line is tab separated Q, A &amp;amp;amp;amp;amp; support fact ID
            q, a, supporting = line.split('\t')
            # tokenize the words of question
            q = tokenize(q)
            # Provide all the sub-stories till this question
            substory = [x for x in story if x]
            # A story ends and is appended to global story data-set
            data.append((substory, q, a))
            story.append('')
        else:
            # this line is a sentence of story
            sent = tokenize(line)
            story.append(sent)
    return data

def get_stories(f):
    # read the data file and parse 10k stories
    data = parse_stories(f.readlines())
    # lambda func to flatten the list of sentences into one list
    flatten = lambda data: reduce(lambda x, y: x + y, data)
    # creating list of tuples for each story
    data = [(flatten(story), q, answer) for story, q, answer in data]
    #data = [((story), q, answer) for story, q, answer in data]
    return data

In [5]:
with open('./tasks/task_20.txt') as f:
    all_stories = get_stories(f)
    
train_stories, test_stories = train_test_split(all_stories, test_size=0.2)

In [6]:
train_stories[-1]

([['Yann', 'is', 'tired', '.'],
  ['Antoine', 'is', 'hungry', '.'],
  ['Jason', 'is', 'thirsty', '.'],
  ['Jason', 'went', 'to', 'the', 'kitchen', '.'],
  ['Jason', 'got', 'the', 'milk', '.'],
  ['Sumit', 'is', 'bored', '.'],
  ['Antoine', 'went', 'to', 'the', 'kitchen', '.']],
 ['Why', 'did', 'Antoine', 'go', 'to', 'the', 'kitchen', '?'],
 'hungry')

In [194]:
def padding_tensor(sequences):
    """
    :param sequences: list of tensors
    :return:
    """
    num = len(sequences)
    max_len = max([len(s) for s in sequences])
    out_dims = (num, max_len)
    out_tensor = np.zeros((num, max_len))
    for i, tensor in enumerate(sequences):
        length = len(tensor)
        out_tensor[i, :length] = tensor
    return out_tensor

In [195]:
def vectorize_stories(data, word_idx, story_maxlen, query_maxlen):
    # story vector initialization
    X = []
    # query vector initialization
    Xq = []
    # answer vector intialization
    Y = []
    for story, query, answer in data:
        # creating list of story word indices
        x = [word_idx[w] for w in story]
        # creating list of query word indices
        xq = [word_idx[w] for w in query]
        # let's not forget that index 0 is reserved
        y = np.zeros(len(word_idx))
        # creating label 1 for the answer word index
        y[word_idx[answer]] = 1
        X.append(x)
        Xq.append(xq)
        Y.append(y)
    return (padding_tensor(X),
            padding_tensor(Xq), np.array(Y))

In [196]:
vocab = set()
for story, q, answer in train_stories + test_stories:
    vocab |= set(story + q + [answer])
    
vocab = sorted(vocab)
vocab_size = len(vocab) + 1
story_maxlen = max(map(len, (x for x, _, _ in train_stories + test_stories)))
query_maxlen = max(map(len, (x for _, x, _ in train_stories + test_stories)))
vocab = list(vocab)
vocab = ['<pad>'] + vocab
word_idx = dict((c, i) for i, c in enumerate(vocab))
idx_word = dict((i, c) for i,c in enumerate(vocab))

inputs_train, queries_train, answers_train = vectorize_stories(train_stories,
                                                               word_idx,
                                                               story_maxlen,
                                                               query_maxlen)

inputs_test, queries_test, answers_test = vectorize_stories(test_stories,
                                                            word_idx,
                                                            story_maxlen,
                                                            query_maxlen)

In [198]:
class LSTM_no_attention(nn.Module):
    def __init__(self, vocab_size, story_embed_size, query_embed_size, story_hidden_dim, query_hidden_dim, dropout=0.2):
        super().__init__()
        self.vocab_size = vocab_size
        self.story_embed_size = story_embed_size
        self.query_embed_size = query_embed_size
        self.query_maxlen = query_maxlen
        self.story_hidden_dim = story_hidden_dim
        self.query_hidden_dim = query_hidden_dim
        self.story_embeddings = nn.Embedding(vocab_size, story_embed_size)
        self.query_embeddings = nn.Embedding(vocab_size, query_embed_size)
        self.story_lstm = nn.LSTM(story_embed_size, story_hidden_dim)
        self.query_lstm = nn.LSTM(query_embed_size, query_hidden_dim)
        self.hidden2label = nn.Linear(story_hidden_dim*story_maxlen+query_hidden_dim*query_maxlen, vocab_size)
        self.story_hidden = self.init_story_hidden(story_maxlen)  
        self.query_hidden = self.init_query_hidden(query_maxlen)  
        
    def forward(self, input_sequence, question):
        story_embed = self.story_embeddings(input_sequence)
        query_embed = self.query_embeddings(question)
        
        lstm_story_out, _ = self.story_lstm(story_embed, self.story_hidden)
        lstm_query_out, _ = self.query_lstm(query_embed, self.query_hidden)
        s = lstm_story_out.view(len(lstm_story_out),-1)
        q = lstm_query_out.view(len(lstm_query_out),-1)
        
        c = torch.cat([s,q], dim=1)
        y  = self.hidden2label(c)
        log_probs = F.log_softmax(y)
        return log_probs
    
        
    def init_story_hidden(self, max_len):
        # the first is the hidden h
        # the second is the cell  c
        return (autograd.Variable(torch.zeros(1, max_len, self.story_hidden_dim)),
                autograd.Variable(torch.zeros(1, max_len, self.story_hidden_dim)))
    
    def init_query_hidden(self, max_len):
        # the first is the hidden h
        # the second is the cell  c
        return (autograd.Variable(torch.zeros(1, max_len, self.query_hidden_dim)),
                autograd.Variable(torch.zeros(1, max_len, self.query_hidden_dim)))


In [199]:
mem = LSTM_no_attention(len(vocab), 10, 2, 5, 2)
x=mem(torch.Tensor(inputs_train).long(), torch.Tensor(queries_train).long())
x.shape



torch.Size([9600, 34])

In [200]:
torch.autograd.set_detect_anomaly(False)
def train(model, optimizer, epochs, story, query, answers):
    pad = word_idx['<pad>']
    for epoch in range(epochs):
        #scheduler.step()
        epoch_loss = 0
        optimizer.zero_grad()
        outputs = model(story, query)
        loss = F.nll_loss(outputs, answers.view(-1), ignore_index=pad, reduction='sum')
        loss.backward(retain_graph=True)
        optimizer.step()
        if epoch%5==0:
            eval(model)
        #print(f"Epoch {epoch+1}: {loss.item()/story.shape[0]}")
     

In [201]:
def eval(model):
    outputs = model(torch.Tensor(inputs_test).long(), torch.Tensor(queries_test).long())
    ans = np.argmax(answers_test, axis=1)
    predicted_ans = np.argmax(outputs.detach().numpy(), axis=1)
    print(f"test accuracy: {np.mean(predicted_ans == ans)}")

In [202]:
mem = LSTM_no_attention(len(vocab), 20, 2, 5, 2)
optimizer = torch.optim.Adam(mem.parameters(), 0.1)
train(mem, optimizer, 80, torch.Tensor(inputs_train).long(), torch.Tensor(queries_train).long(), torch.Tensor(np.argmax(answers_train, axis=1)).long())





test accuracy: 0.2575
test accuracy: 0.31166666666666665
test accuracy: 0.59125
test accuracy: 0.68875
test accuracy: 0.7545833333333334
test accuracy: 0.765
test accuracy: 0.78625
test accuracy: 0.80875
test accuracy: 0.8295833333333333
test accuracy: 0.8491666666666666
test accuracy: 0.8616666666666667
test accuracy: 0.86875
test accuracy: 0.87125
test accuracy: 0.8866666666666667
test accuracy: 0.8920833333333333
test accuracy: 0.8979166666666667


In [111]:
np.argmax(answers_train, axis=1)

(4000,)

In [96]:
torch.Tensor(queries_train).shape
len(vocab)

20