In [1]:
import numpy as np
import pandas as pd
import re
import os
from functools import reduce 
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.autograd as autograd
import torch.nn.functional as F

In [2]:
text_files = os.listdir('./')
text_files = [i for i in text_files if '.txt' in i]


In [3]:
# making sure of all tasks have single word answers
for f in text_files:
    text1 = pd.read_csv('./'+f, sep="\n", header=None)
    text1.columns = ['text']
    ans = []
    for t in text1.text:
        if '?' in t:
            match = re.search(r'[a-zA-z0-9?\ ]*\t([\w \ ]+)', t)
            if match:
                ans.append(match.group(1)) 
                
    ans = [i.split(' ') for i in ans]
    for i in ans:
        if len(i)>1:
            print(i)

In [4]:
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')

def tokenize(sent):
    return tokenizer.tokenize(sent)

def parse_stories(lines):
    data = []
    story = []
    nid_arr = []
    for line in lines:
        #line = line.decode('utf-8').strip()
        nid, line = line.split(' ', 1)
        nid = int(nid)
        if nid == 1:
            # reset story when line ID=1 (start of new story)
            story = []
            nid_arr = []
        if '\t' in line:
            # this line is tab separated Q, A &amp;amp;amp;amp;amp; support fact ID
            q, a, supporting = line.split('\t')
            # tokenize the words of question
            q = tokenize(q)
            # Provide all the sub-stories till this question
            substory = [x for x in story if x]
            # A story ends and is appended to global story data-set
            supporting = supporting.split()
            supporting = [int(i) for i in supporting]
            s = [i for i in range(len(nid_arr)) if nid_arr[i] in supporting]
            data.append((substory, q, a, s))
            story.append('')
        else:
            # this line is a sentence of story
            sent = tokenize(line)
            story.append(sent)
            nid_arr.append(nid)
    return data

def get_stories(f):
    # read the data file and parse 10k stories
    data = parse_stories(f.readlines())
    # lambda func to flatten the list of sentences into one list
    flatten = lambda data: reduce(lambda x, y: x + y, data)
    # creating list of tuples for each story
    data = [((story), q, answer, s) for story, q, answer, s in data]
    #data = [((story), q, answer) for story, q, answer in data]
    return data

In [5]:
with open('./task_2.txt') as f:
    all_stories = get_stories(f)
    
train_stories, test_stories = train_test_split(all_stories, test_size=0.2)

In [242]:
all_stories[1]

([['Daniel', 'and', 'Sandra', 'went', 'to', 'the', 'office'],
  ['Sandra', 'and', 'Daniel', 'moved', 'to', 'the', 'bathroom'],
  ['Mary', 'and', 'John', 'travelled', 'to', 'the', 'hallway'],
  ['Daniel', 'and', 'John', 'journeyed', 'to', 'the', 'office']],
 ['Where', 'is', 'John'],
 'office',
 [3])

In [6]:
max_story_count = max([len(story[0]) for story in all_stories])
max_story_len = max([max([len(i) for i in story[0]]) for story in all_stories])

In [7]:
def padding_query(sequences, query_maxlen):
    """
    :param sequences: list of tensors
    :return:
    """
    num = len(sequences)
    out_tensor = np.zeros((num, query_maxlen))
    for i, tensor in enumerate(sequences):
        length = len(tensor)
        out_tensor[i, :length] = tensor
    return out_tensor

def padding_story(sequences, max_story_count, max_story_len):
    """
    :param sequences: list of tensors
    :return:
    """
    num = len(sequences)
    out_tensor = np.zeros((num, max_story_count, max_story_len))
    for i, stories in enumerate(sequences):
        for j, story in enumerate(stories):
            length = len(story)
            out_tensor[i, j, :length] = story
    return out_tensor

def padding_supporting(sequences, max_story_count):
    """
    :param sequences: list of tensors
    :return:
    """
    num = len(sequences)
    out_tensor = np.zeros((num, max_story_count))
    for i, s in enumerate(sequences):
        for j, t in enumerate(s):
            if t>0:
                out_tensor[i,t] = 1
    return out_tensor

def vectorize_stories(data, word_idx, query_maxlen, max_story_count, max_story_len):
    # story vector initialization
    X = []
    # query vector initialization
    Xq = []
    # answer vector intialization
    Y = []
    S = []
            
    for story, query, answer, supporting in data:
        # creating list of story word indices
        x = []
        for sen in story: 
            s = [word_idx[w] for w in sen]
            x.append(s)
        # creating list of query word indices
        xq = [word_idx[w] for w in query]
        # let's not forget that index 0 is reserved
        y = np.zeros(len(word_idx))
        # creating label 1 for the answer word index
        y[word_idx[answer]] = 1
        X.append(x)
        Xq.append(xq)
        Y.append(y)
        S.append(supporting)
        
    return (padding_story(X, max_story_count, max_story_len),
            padding_query(Xq, query_maxlen), np.array(Y), padding_supporting(S, max_story_count))

In [8]:
flatten = lambda data: reduce(lambda x, y: x + y, data)
vocab = set()
for story, q, answer, s in train_stories + test_stories:
    vocab |= set(flatten(story) + q + [answer])
    
vocab = sorted(vocab)
vocab_size = len(vocab) + 1
#story_maxlen = max(map(len, (x for x, _, _ in train_stories + test_stories)))
vocab = list(vocab)
query_maxlen = max(map(len, (x for _, x, _,_ in train_stories + test_stories)))
vocab = ['<pad>'] + vocab
word_idx = dict((c, i) for i, c in enumerate(vocab))
idx_word = dict((i, c) for i,c in enumerate(vocab))

inputs_train, queries_train, answers_train, supporting_train = vectorize_stories(train_stories,
                                                               word_idx,
                                                               query_maxlen, max_story_count, max_story_len)
inputs_test, queries_test, answers_test, supporting_test = vectorize_stories(test_stories,
                                                               word_idx,
                                                               query_maxlen, max_story_count, max_story_len)

In [9]:
class LSTM_with_attention(nn.Module):
    def __init__(self, vocab_size, story_embed_size, query_embed_size, story_hidden_dim, query_hidden_dim):
        super().__init__()
        self.vocab_size = vocab_size
        self.story_embed_size = story_embed_size 
        self.story_hidden_dim = story_hidden_dim
        self.query_hidden_dim = query_hidden_dim
        self.story_embeddings = nn.Embedding(vocab_size, story_embed_size)
        self.query_embeddings = nn.Embedding(vocab_size, query_embed_size)
        self.story_lstm = nn.LSTM(story_embed_size*6, story_hidden_dim)
        self.query_lstm = nn.LSTM(query_embed_size, query_hidden_dim)
        self.story_hidden = self.init_story_hidden(max_story_count)  
        self.query_hidden = self.init_query_hidden(query_maxlen) 
        self.hidden2label = nn.Linear(story_hidden_dim*max_story_count+query_hidden_dim*query_maxlen, vocab_size)
        self.story_relevance = nn.Linear(story_hidden_dim*max_story_count, max_story_count)
        
    def forward(self, input_sequence, query, supported):
        supporting_stories = input_sequence*supported.unsqueeze(2)
        supporting_story_embed = self.story_embeddings(supporting_stories)
        all_supporting_story_embed = self.story_embeddings(input_sequence)
        query_embedding = self.query_embeddings(query)
        supporting_story_embed = supporting_story_embed.view(supporting_story_embed.shape[0], supporting_story_embed.shape[1], -1)
        all_supporting_story_embed = all_supporting_story_embed.view(all_supporting_story_embed.shape[0], all_supporting_story_embed.shape[1], -1)
        lstm_story_out, _ = self.story_lstm(supporting_story_embed, self.story_hidden)
        lstm_story_out_all, _ = self.story_lstm(all_supporting_story_embed, self.story_hidden)
        lstm_query_out, _ = self.query_lstm(query_embedding, self.query_hidden)
        s = lstm_story_out.view(len(lstm_story_out),-1)
        s_all = lstm_story_out_all.view(len(lstm_story_out_all),-1)
        q = lstm_query_out.view(len(lstm_query_out),-1)
        
        c = torch.cat([s,q], dim=1)
        y  = self.hidden2label(c)
        r = self.story_relevance(s_all)
        log_probs = F.log_softmax(y)
        log_probs_r = F.log_softmax(r)
        return log_probs, log_probs_r
        
    
        
    def init_story_hidden(self, max_len):
        # the first is the hidden h
        # the second is the cell  c
        return (autograd.Variable(torch.zeros(1, max_len, self.story_hidden_dim)),
                autograd.Variable(torch.zeros(1, max_len, self.story_hidden_dim)))
    
    def init_query_hidden(self, max_len):
        # the first is the hidden h
        # the second is the cell  c
        return (autograd.Variable(torch.zeros(1, max_len, self.query_hidden_dim)),
                autograd.Variable(torch.zeros(1, max_len, self.query_hidden_dim)))

def mutli_target_nll_loss(actual, predicted):
    m = (actual*predicted).sum(axis=1)
    loss = m.sum()
    return -loss

In [10]:
mem = LSTM_with_attention(len(vocab), 10, 2, 5, 2)
x=mem(torch.Tensor(inputs_train).long(), torch.Tensor(queries_train).long(), torch.Tensor(supporting_train).long())



In [11]:
torch.autograd.set_detect_anomaly(False)
def train(model, optimizer, epochs, story, query, supporting, answers):
    pad = word_idx['<pad>']
    for epoch in range(epochs):
        #scheduler.step()
        epoch_loss = 0
        optimizer.zero_grad()
        outputs, relevance = model(story, query, supporting)
        loss = F.nll_loss(outputs, answers.view(-1), ignore_index=pad, reduction='sum') +\
                mutli_target_nll_loss(supporting, relevance)
#         print(F.nll_loss(outputs, answers.view(-1), ignore_index=pad, reduction='sum'))
#         print(mutli_target_nll_loss(supporting, relevance))
        loss.backward(retain_graph=True)
        optimizer.step()
        if epoch%5==0:
            eval(model, story, query, supporting, answers)
        print(f"Epoch {epoch+1}: {loss.item()/story.shape[0]}")
     
def eval(model, story, query, supporting, answers):
    with torch.no_grad():
        _, relevance = model(story, query, supporting)
        relevance = np.exp(relevance)
        s = np.where(relevance>0.5, 1, 0)
        for i, val in enumerate(s):
            if sum(val)==0:
                third_val = sorted(relevance[i,:], reverse=True)[3]
                s[i,:] = np.where(relevance[i,:]>third_val, 1, 0)
        
        binary_s = torch.Tensor(s).long()
        outputs, _ = model(story, query, binary_s)
        
#         ans = np.argmax(answers, axis=1)
        predicted_ans = (np.argmax(outputs.detach().numpy(), axis=1))
#         print(predicted_ans)
#         print(answers)
        print(f"test accuracy: {np.mean(predicted_ans == answers.numpy())}")

In [12]:
mem = LSTM_with_attention(len(vocab), 10, 2, 5, 2)
optimizer = torch.optim.Adam(mem.parameters(), 0.1)
train(mem, optimizer, 80, torch.Tensor(inputs_train).long(), torch.Tensor(queries_train).long(), 
      torch.Tensor(supporting_train).long(), torch.Tensor(np.argmax(answers_train, axis=1)).long())





test accuracy: 0.10925
Epoch 1: 12.120236328125
Epoch 2: 13.6616494140625
Epoch 3: 21.87064453125
Epoch 4: 12.9696728515625
Epoch 5: 14.248076171875
test accuracy: 0.29075
Epoch 6: 10.3891533203125
Epoch 7: 9.7696201171875
Epoch 8: 8.95551953125
Epoch 9: 8.9431220703125
Epoch 10: 9.15401171875
test accuracy: 0.2715
Epoch 11: 8.95028125
Epoch 12: 8.74775390625
Epoch 13: 8.8492236328125
Epoch 14: 8.752359375
Epoch 15: 8.5243388671875
test accuracy: 0.29775
Epoch 16: 8.4611298828125
Epoch 17: 8.42096875
Epoch 18: 8.28250390625
Epoch 19: 8.08012158203125
Epoch 20: 7.91369970703125
test accuracy: 0.384
Epoch 21: 7.8010400390625
Epoch 22: 7.643806640625
Epoch 23: 7.47537255859375
Epoch 24: 7.38053857421875
Epoch 25: 7.27842333984375
test accuracy: 0.409
Epoch 26: 7.11930908203125
Epoch 27: 6.9864765625
Epoch 28: 6.92313818359375
Epoch 29: 6.822244140625
Epoch 30: 6.7232275390625
test accuracy: 0.3955
Epoch 31: 6.681439453125
Epoch 32: 6.6542607421875
Epoch 33: 6.61216943359375
Epoch 34: 6.57

###### 