In [1]:
import numpy as np
import pandas as pd
import re
import os
from functools import reduce 
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.autograd as autograd
import torch.nn.functional as F

In [2]:
text_files = os.listdir('./')
text_files = [i for i in text_files if '.txt' in i]


In [3]:
# making sure of all tasks have single word answers
for f in text_files:
    text1 = pd.read_csv('./'+f, sep="\n", header=None)
    text1.columns = ['text']
    ans = []
    for t in text1.text:
        if '?' in t:
            match = re.search(r'[a-zA-z0-9?\ ]*\t([\w \ ]+)', t)
            if match:
                ans.append(match.group(1)) 
                
    ans = [i.split(' ') for i in ans]
    for i in ans:
        if len(i)>1:
            print(i)

In [5]:
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')

def tokenize(sent):
    return tokenizer.tokenize(sent)

def parse_stories(lines):
    data = []
    story = []
    nid_arr = []
    for line in lines:
        #line = line.decode('utf-8').strip()
        nid, line = line.split(' ', 1)
        nid = int(nid)
        if nid == 1:
            # reset story when line ID=1 (start of new story)
            story = []
            nid_arr = []
        if '\t' in line:
            # this line is tab separated Q, A &amp;amp;amp;amp;amp; support fact ID
            q, a, supporting = line.split('\t')
            # tokenize the words of question
            q = tokenize(q)
            # Provide all the sub-stories till this question
            substory = [x for x in story if x]
            # A story ends and is appended to global story data-set
            supporting = supporting.split()
            supporting = [int(i) for i in supporting]
            s = [i for i in range(len(nid_arr)) if nid_arr[i] in supporting]
            data.append((substory, q, a, s))
            story.append('')
        else:
            # this line is a sentence of story
            sent = tokenize(line)
            story.append(sent)
            nid_arr.append(nid)
    return data

def get_stories(f):
    # read the data file and parse 10k stories
    data = parse_stories(f.readlines())
    # lambda func to flatten the list of sentences into one list
    flatten = lambda data: reduce(lambda x, y: x + y, data)
    # creating list of tuples for each story
    data = [((story), q, answer, s) for story, q, answer, s in data]
    #data = [((story), q, answer) for story, q, answer in data]
    return data

In [17]:
all_stories[1]

([['John', 'moved', 'to', 'the', 'garden'],
  ['John', 'moved', 'to', 'the', 'bedroom'],
  ['Mary', 'moved', 'to', 'the', 'garden'],
  ['Daniel', 'went', 'to', 'the', 'office']],
 ['Where', 'is', 'Mary'],
 'garden',
 [2])

In [19]:
def padding_query(sequences, query_maxlen):
    """
    :param sequences: list of tensors
    :return:
    """
    num = len(sequences)
    out_tensor = np.zeros((num, query_maxlen))
    for i, tensor in enumerate(sequences):
        length = len(tensor)
        out_tensor[i, :length] = tensor
    return out_tensor

def padding_story(sequences, max_story_count, max_story_len):
    """
    :param sequences: list of tensors
    :return:
    """
    num = len(sequences)
    out_tensor = np.zeros((num, max_story_count, max_story_len))
    for i, stories in enumerate(sequences):
        for j, story in enumerate(stories):
            length = len(story)
            out_tensor[i, j, :length] = story
    return out_tensor

def padding_supporting(sequences, max_story_count):
    """
    :param sequences: list of tensors
    :return:
    """
    num = len(sequences)
    out_tensor = np.zeros((num, max_story_count))
    for i, s in enumerate(sequences):
        for j, t in enumerate(s):
            if t>0:
                out_tensor[i,t] = 1
    return out_tensor

def vectorize_stories(data, word_idx, query_maxlen, max_story_count, max_story_len):
    # story vector initialization
    X = []
    # query vector initialization
    Xq = []
    # answer vector intialization
    Y = []
    S = []
            
    for story, query, answer, supporting in data:
        # creating list of story word indices
        x = []
        for sen in story: 
            s = [word_idx[w] for w in sen]
            x.append(s)
        # creating list of query word indices
        xq = [word_idx[w] for w in query]
        # let's not forget that index 0 is reserved
        y = np.zeros(len(word_idx))
        # creating label 1 for the answer word index
        y[word_idx[answer]] = 1
        X.append(x)
        Xq.append(xq)
        Y.append(y)
        S.append(supporting)
        
    return (padding_story(X, max_story_count, max_story_len),
            padding_query(Xq, query_maxlen), np.array(Y), padding_supporting(S, max_story_count))

In [47]:
def preprocess_data(task):
    with open(f'./task_{task}.txt') as f:
        all_stories = get_stories(f)

    train_stories, test_stories = train_test_split(all_stories, test_size=0.2)
    max_story_count = max([len(story[0]) for story in all_stories])
    max_story_len = max([max([len(i) for i in story[0]]) for story in all_stories])

    flatten = lambda data: reduce(lambda x, y: x + y, data)
    vocab = set()
    for story, q, answer, s in train_stories + test_stories:
        vocab |= set(flatten(story) + q + [answer])

    vocab = sorted(vocab)
    vocab_size = len(vocab) + 1
    #story_maxlen = max(map(len, (x for x, _, _ in train_stories + test_stories)))
    vocab = list(vocab)
    query_maxlen = max(map(len, (x for _, x, _,_ in train_stories + test_stories)))
    vocab = ['<pad>'] + vocab
    word_idx = dict((c, i) for i, c in enumerate(vocab))
    idx_word = dict((i, c) for i,c in enumerate(vocab))

    train = vectorize_stories(train_stories,
                               word_idx,
                               query_maxlen, max_story_count, max_story_len)
    test = vectorize_stories(test_stories,
                               word_idx,
                               query_maxlen, max_story_count, max_story_len)
    return train, test, vocab, max_story_count, max_story_len, query_maxlen, word_idx

In [66]:
class LSTM_with_attention(nn.Module):
    def __init__(self, vocab_size, story_embed_size, query_embed_size, story_hidden_dim, query_hidden_dim, max_story_count, query_maxlen, max_story_len):
        super().__init__()
        self.vocab_size = vocab_size
        self.story_embed_size = story_embed_size 
        self.story_hidden_dim = story_hidden_dim
        self.query_hidden_dim = query_hidden_dim
        self.story_embeddings = nn.Embedding(vocab_size, story_embed_size)
        self.query_embeddings = nn.Embedding(vocab_size, query_embed_size)
        self.story_lstm = nn.LSTM(story_embed_size*max_story_len, story_hidden_dim)
        self.query_lstm = nn.LSTM(query_embed_size, query_hidden_dim)
        self.story_hidden = self.init_story_hidden(max_story_count)  
        self.query_hidden = self.init_query_hidden(query_maxlen) 
        self.hidden2label = nn.Linear(story_hidden_dim*max_story_count+query_hidden_dim*query_maxlen, vocab_size)
        self.story_relevance = nn.Linear(story_hidden_dim*max_story_count, max_story_count)
        
    def forward(self, input_sequence, query, supported):
        supporting_stories = input_sequence*supported.unsqueeze(2)
        supporting_story_embed = self.story_embeddings(supporting_stories)
        all_supporting_story_embed = self.story_embeddings(input_sequence)
        query_embedding = self.query_embeddings(query)
        supporting_story_embed = supporting_story_embed.view(supporting_story_embed.shape[0], supporting_story_embed.shape[1], -1)
        all_supporting_story_embed = all_supporting_story_embed.view(all_supporting_story_embed.shape[0], all_supporting_story_embed.shape[1], -1)
        lstm_story_out, _ = self.story_lstm(supporting_story_embed, self.story_hidden)
        lstm_story_out_all, _ = self.story_lstm(all_supporting_story_embed, self.story_hidden)
        lstm_query_out, _ = self.query_lstm(query_embedding, self.query_hidden)
        s = lstm_story_out.view(len(lstm_story_out),-1)
        s_all = lstm_story_out_all.view(len(lstm_story_out_all),-1)
        q = lstm_query_out.view(len(lstm_query_out),-1)
        
        c = torch.cat([s,q], dim=1)
        y  = self.hidden2label(c)
        r = self.story_relevance(s_all)
        log_probs = F.log_softmax(y)
        log_probs_r = F.log_softmax(r)
        return log_probs, log_probs_r
        
    
        
    def init_story_hidden(self, max_len):
        # the first is the hidden h
        # the second is the cell  c
        return (autograd.Variable(torch.zeros(1, max_len, self.story_hidden_dim)),
                autograd.Variable(torch.zeros(1, max_len, self.story_hidden_dim)))
    
    def init_query_hidden(self, max_len):
        # the first is the hidden h
        # the second is the cell  c
        return (autograd.Variable(torch.zeros(1, max_len, self.query_hidden_dim)),
                autograd.Variable(torch.zeros(1, max_len, self.query_hidden_dim)))

def mutli_target_nll_loss(actual, predicted):
    m = (actual*predicted).sum(axis=1)
    loss = m.sum()
    return -loss

In [45]:
mem = LSTM_with_attention(len(vocab), 10, 2, 5, 2)
x=mem(torch.Tensor(inputs_train).long(), torch.Tensor(queries_train).long(), torch.Tensor(supporting_train).long())



In [79]:
torch.autograd.set_detect_anomaly(False)
def train_model(model, optimizer, epochs, story, query, supporting, answers, test):
    pad = word_idx['<pad>']
    test_accuracy = []
    for epoch in range(epochs):
        epoch_loss = 0
        optimizer.zero_grad()
        outputs, relevance = model(story, query, supporting)
        loss = F.nll_loss(outputs, answers.view(-1), ignore_index=pad, reduction='sum') +\
                mutli_target_nll_loss(supporting, relevance)

        loss.backward(retain_graph=True)
        optimizer.step()
        if (epoch+1)%10==0:
            test_accuracy.append(eval(model, torch.Tensor(test[0]).long(), torch.Tensor(test[1]).long(), 
      torch.Tensor(test[3]).long(), torch.Tensor(np.argmax(test[2], axis=1)).long()))
#             print(f"Epoch {epoch+1}: {loss.item()/story.shape[0]}")
    return max(test_accuracy)
     
def eval(model, story, query, supporting, answers):
    with torch.no_grad():
        _, relevance = model(story, query, supporting)
        relevance = np.exp(relevance)
        s = np.where(relevance>0.5, 1, 0)
        for i, val in enumerate(s):
            if sum(val)==0:
                third_val = sorted(relevance[i,:], reverse=True)[1]
                s[i,:] = np.where(relevance[i,:]>third_val, 1, 0)
        
        binary_s = torch.Tensor(s).long()
        outputs, _ = model(story, query, binary_s)
        
        predicted_ans = (np.argmax(outputs.detach().numpy(), axis=1))

        return (np.mean(predicted_ans == answers.numpy()))

In [80]:
for task in range(1,21):
    train, test, vocab, max_story_count, max_story_len, query_maxlen, word_idx = preprocess_data(task)

    mem = LSTM_with_attention(len(vocab), 10, 5, 4, 2, max_story_count, query_maxlen, max_story_len)
    optimizer = torch.optim.Adam(mem.parameters(), 0.1)
    test_accuracy = train_model(mem, optimizer, 100, torch.Tensor(train[0]).long(), torch.Tensor(train[1]).long(), 
          torch.Tensor(train[3]).long(), torch.Tensor(np.argmax(train[2], axis=1)).long(), test)
    print(f"Task {task}: {np.round(test_accuracy*100,2)}%")



Task 1: 24.4%
Task 2: 44.7%
Task 3: 44.8%
Task 4: 27.5%
Task 5: 47.1%
Task 6: 51.7%
Task 7: 53.8%
Task 8: 38.9%
Task 9: 63.4%
Task 10: 82.2%
Task 11: 20.1%
Task 12: 16.5%
Task 13: 33.7%
Task 14: 21.9%
Task 15: 34.12%
Task 16: 36.0%
Task 17: 53.44%
Task 18: 88.95%
Task 19: 10.5%
Task 20: 75.42%


###### 