In [7]:
import numpy as np
import pandas as pd
import re
import os
from functools import reduce 
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.autograd as autograd
import torch.nn.functional as F

In [8]:
text_files = os.listdir('./tasks')
text_files = [i for i in text_files if '.txt' in i]


In [9]:
# making sure of all tasks have single word answers
for f in text_files:
    text1 = pd.read_csv('./tasks/'+f, sep="\n", header=None)
    text1.columns = ['text']
    ans = []
    for t in text1.text:
        if '?' in t:
            match = re.search(r'[a-zA-z0-9?\ ]*\t([\w \ ]+)', t)
            if match:
                ans.append(match.group(1)) 
                
    ans = [i.split(' ') for i in ans]
    for i in ans:
        if len(i)>1:
            print(i)

In [13]:
def prepare_data(task):
    with open(f'./tasks/task_{task}.txt') as f:
        all_stories = get_stories(f, flatten=True)

    train_stories, test_stories = train_test_split(all_stories, test_size=0.2)

    vocab = set()
    for story, q, answer in train_stories + test_stories:
        vocab |= set(story + q + [answer])

    vocab = sorted(vocab)
    vocab_size = len(vocab) + 1
    story_maxlen = max(map(len, (x for x, _, _ in train_stories + test_stories)))
    query_maxlen = max(map(len, (x for _, x, _ in train_stories + test_stories)))
    vocab = list(vocab)
    vocab = ['<pad>'] + vocab
    word_idx = dict((c, i) for i, c in enumerate(vocab))
    idx_word = dict((i, c) for i,c in enumerate(vocab))

    train = vectorize_stories(train_stories,
                               word_idx,
                               story_maxlen,
                               query_maxlen)

    test = vectorize_stories(test_stories,
                            word_idx,
                            story_maxlen,
                            query_maxlen)
    
    return train, test, vocab, word_idx, query_maxlen, story_maxlen

In [14]:
class LSTM_no_attention(nn.Module):
    def __init__(self, vocab_size, story_embed_size, query_embed_size, story_hidden_dim, query_hidden_dim, query_maxlen, story_maxlen, dropout=0.2):
        super().__init__()
        self.vocab_size = vocab_size
        self.story_embed_size = story_embed_size
        self.query_embed_size = query_embed_size
        self.query_maxlen = query_maxlen
        self.story_hidden_dim = story_hidden_dim
        self.query_hidden_dim = query_hidden_dim
        self.story_embeddings = nn.Embedding(vocab_size, story_embed_size)
        self.query_embeddings = nn.Embedding(vocab_size, query_embed_size)
        # two lstm modules for story and query
        self.story_lstm = nn.LSTM(story_embed_size, story_hidden_dim)
        self.query_lstm = nn.LSTM(query_embed_size, query_hidden_dim)
        #linear layer to convert to vocab size
        self.hidden2label = nn.Linear(story_hidden_dim*story_maxlen+query_hidden_dim*query_maxlen, vocab_size)
        #hidden layer initilaization for both the lstms
        self.story_hidden = self.init_story_hidden(story_maxlen)  
        self.query_hidden = self.init_query_hidden(query_maxlen)  
        
    def forward(self, input_sequence, question):
        story_embed = self.story_embeddings(input_sequence)
        query_embed = self.query_embeddings(question)
        
        lstm_story_out, _ = self.story_lstm(story_embed, self.story_hidden)
        lstm_query_out, _ = self.query_lstm(query_embed, self.query_hidden)
        s = lstm_story_out.view(len(lstm_story_out),-1)
        q = lstm_query_out.view(len(lstm_query_out),-1)
        
        c = torch.cat([s,q], dim=1)
        y  = self.hidden2label(c)
        log_probs = F.log_softmax(y)
        return log_probs
    
        
    def init_story_hidden(self, max_len):
        # the first is the hidden h
        # the second is the cell  c
        return (autograd.Variable(torch.zeros(1, max_len, self.story_hidden_dim)),
                autograd.Variable(torch.zeros(1, max_len, self.story_hidden_dim)))
    
    def init_query_hidden(self, max_len):
        # the first is the hidden h
        # the second is the cell  c
        return (autograd.Variable(torch.zeros(1, max_len, self.query_hidden_dim)),
                autograd.Variable(torch.zeros(1, max_len, self.query_hidden_dim)))


In [15]:
torch.autograd.set_detect_anomaly(False)
def train_model(model, optimizer, epochs, story, query, answers, test, word_idx):
    test_accuracy = []
    pad = word_idx['<pad>']
    for epoch in range(epochs):
        #scheduler.step()
        epoch_loss = 0
        optimizer.zero_grad()
        outputs = model(story, query)
        loss = F.nll_loss(outputs, answers.view(-1), ignore_index=pad, reduction='sum')
        loss.backward(retain_graph=True)
        optimizer.step()
        if epoch%5==0:
            test_accuracy.append(eval(model, test))
    return max(test_accuracy)
     
def eval(model, test):
    outputs = model(torch.Tensor(test[0]).long(), torch.Tensor(test[1]).long())
    ans = np.argmax(test[2], axis=1)
    predicted_ans = np.argmax(outputs.detach().numpy(), axis=1)
    return np.mean(predicted_ans == ans)

In [16]:
for task in range(20,21):
    train, test, vocab, word_idx, query_maxlen, story_maxlen = prepare_data(task)
    inputs_train = torch.Tensor(train[0]).long()
    queries_train = torch.Tensor(train[1]).long()
    answers_train = torch.Tensor(np.argmax(train[2], axis=1)).long()

    mem = LSTM_no_attention(len(vocab), 20, 10, 5, 2, query_maxlen, story_maxlen)
    optimizer = torch.optim.Adam(mem.parameters(), 0.1)
    test_accuracy = train_model(mem, optimizer, 100, inputs_train, queries_train, answers_train, test, word_idx)
    print(f"Task {task}: {np.round(test_accuracy*100,2)}%")




Task 20: 81.38%
