In [161]:
import numpy as np
import pandas as pd
import re
import os
from functools import reduce 
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.autograd as autograd
import torch.nn.functional as F
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report,accuracy_score
import sklearn.metrics as metrics
import re
import warnings

warnings.filterwarnings("ignore")

In [162]:
!mkdir tasks

A subdirectory or file tasks already exists.


In [163]:
text_files = os.listdir('./tasks')
text_files = [i for i in text_files if '.txt' in i]


In [164]:
task=[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]

### Unigrams

In [165]:
# making sure of all tasks have single word answers
for f in text_files:
    text1 = pd.read_csv('./tasks/'+f, sep="\n", header=None)
    text1.columns = ['text']
    ans = []
    for t in text1.text:
        if '?' in t:
            match = re.search(r'[a-zA-z0-9?\ ]*\t([\w \ ]+)', t)
            if match:
                ans.append(match.group(1)) 
                
    ans = [i.split(' ') for i in ans]
    for i in ans:
        if len(i)>1:
            print(i)

In [176]:
def tokenize(sent):
    return [ x.strip() for x in re.split(r'(\W+)', sent) if x.strip()]

def generate_ngrams(s, n):
    # Convert to lowercases
    s = s.lower()
    
    # Replace all none alphanumeric characters with spaces
    s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s)
    
    # Break sentence in the token, remove empty tokens
    tokens = [token for token in s.split(" ") if token != ""]
    
    # Use the zip function to help us generate n-grams
    # Concatentate the tokens into ngrams and return
    ngrams = zip(*[tokens[i:] for i in range(n)])
    return [" ".join(ngram) for ngram in ngrams]

def parse_stories(lines):
    data = []
    story = []
    for line in lines:
        
        #line = line.decode('utf-8').strip()
        nid, line = line.split(' ', 1)
        nid = int(nid)
        if nid == 1:
            # reset story when line ID=1 (start of new story)
            story = []
        if '\t' in line:
            # this line is tab separated Q, A &amp;amp;amp;amp;amp; support fact ID
            q, a, supporting = line.split('\t')
            # tokenize the words of question
            q = tokenize(q)
            # Provide all the sub-stories till this question
            substory = [x for x in story if x]
            # A story ends and is appended to global story data-set
            data.append((substory, q, a))
            story.append('')
        else:
            # this line is a sentence of story
#             bigram_story = generate_ngrams(line, n)
            sent = tokenize(line)
            story.append(sent)
    return data

def get_stories(f):
    # read the data file and parse 10k stories
    data = parse_stories(f.readlines())
    # lambda func to flatten the list of sentences into one list
    #flatten = lambda data: reduce(lambda x, y: x + y, data)
    # creating list of tuples for each story
    #data = [(flatten(story), q, answer) for story, q, answer in data]
    data = [((story), q, answer) for story, q, answer in data]
    return data

In [167]:
def padding_tensor(sequences,max_len):
    """
    :param sequences: list of tensors
    :return:
    """
    num = len(sequences)
    #max_len = max([len(s) for s in sequences])
    out_dims = (num, max_len)
    out_tensor = np.zeros((num, max_len))
    for i, tensor in enumerate(sequences):
        length = len(tensor)
        out_tensor[i, :length] = tensor
    return out_tensor


def vectorize_stories(data, word_idx, story_maxlen, query_maxlen):
    # story vector initialization
    X = []
    # query vector initialization
    Xq = []
    # answer vector intialization
    Y = []
    for story, query, answer in data:
        # creating list of story word indices
        x = [word_idx[w] for w in story]
        # creating list of query word indices
        xq = [word_idx[w] for w in query]
        # let's not forget that index 0 is reserved
        y = np.zeros(len(word_idx))
        # creating label 1 for the answer word index
        y[word_idx[answer]] = 1
        X.append(x)
        Xq.append(xq)
        Y.append(y)
    return (padding_tensor(X,story_maxlen),
            padding_tensor(Xq,query_maxlen), np.array(Y))

In [180]:
accuracies_unigram=[]
for f_i in range(len(task)):
    file_path='./tasks/task_'+str(task[f_i])+".txt"
    #print(file_path)
    with open(file_path) as f:
        all_stories = get_stories(f)
    train_stories, test_stories = train_test_split(all_stories, test_size=0.2, random_state=42)
    
    # creating the filtered qa sets
    qa_sets_filtered_train=[]
    for q_i in range(len(train_stories)):
        # select only those sentences which have at least one word common with the question
        results=[]
        for i in range(len(train_stories[q_i][0])):

            if(len(list(set(train_stories[q_i][0][i]) & set(train_stories[q_i][1])))>0):
                results.append(train_stories[q_i][0][i])
        qa_set_i= (results,train_stories[q_i][1],train_stories[q_i][2])
        qa_sets_filtered_train.append(qa_set_i)
        
    #lambda func to flatten the list of sentences into one list
    flatten = lambda data: reduce(lambda x, y: x + y, data)
    #creating list of tuples for each story
    data_filtered = [(flatten(story), q, answer) for story, q, answer in qa_sets_filtered_train]
    
    # creating the filtered qa sets
    qa_sets_filtered_test=[]
    for q_i in range(len(test_stories)):
        # select only those sentences which have at least one word common with the question
        results=[]
        for i in range(len(test_stories[q_i][0])):

            if(len(list(set(test_stories[q_i][0][i]) & set(test_stories[q_i][1])))>0):
                results.append(test_stories[q_i][0][i])
        qa_set_i= (results,test_stories[q_i][1],test_stories[q_i][2])
        qa_sets_filtered_test.append(qa_set_i)
        
    #lambda func to flatten the list of sentences into one list
    flatten = lambda data: reduce(lambda x, y: x + y, data)
    #creating list of tuples for each story
    data_filtered_test = [(flatten(story), q, answer) for story, q, answer in qa_sets_filtered_test]
    
    vocab = set()
    for story, q, answer in (data_filtered + data_filtered_test):
        vocab |= set(story + q + [answer])

    vocab = sorted(vocab)
    vocab_size = len(vocab) + 1
    story_maxlen = max(map(len, (x for x, _, _ in data_filtered + data_filtered_test)))
    query_maxlen = max(map(len, (x for _, x, _ in data_filtered + data_filtered_test)))
    vocab = list(vocab)
    vocab = ['<pad>'] + vocab
    word_idx = dict((c, i) for i, c in enumerate(vocab))
    idx_word = dict((i, c) for i,c in enumerate(vocab))

    inputs_train, queries_train, answers_train = vectorize_stories(data_filtered,
                                                                   word_idx,
                                                                   story_maxlen,
                                                                   query_maxlen)

    inputs_test, queries_test, answers_test = vectorize_stories(data_filtered_test,
                                                                word_idx,
                                                                story_maxlen,
                                                                query_maxlen)
    
    features_train=np.column_stack((queries_train,inputs_train))
    target_train=np.where(answers_train == 1)[1].reshape(answers_train.shape[0],1)
    
    features_test=np.column_stack((queries_test,inputs_test))
    target_test=np.where(answers_test == 1)[1].reshape(answers_test.shape[0],1)
    
    model = LinearSVC(random_state=50)
    model.fit(features_train, target_train)
    predictions = model.predict(features_test)
    #print(len(np.where(answers_test == 1)[1]))
    accuracies_unigram.append(round((accuracy_score(np.where(answers_test == 1)[1], predictions))*100,3))
    print("task "+str(task[f_i])+": "+str(round((accuracy_score(np.where(answers_test == 1)[1], predictions))*100,3))+"%")
    

task 1: 41.5%
task 2: 28.7%
task 3: 30.8%
task 4: 20.5%
task 5: 33.9%
task 6: 52.3%
task 7: 65.0%
task 8: 43.5%
task 9: 53.2%
task 10: 69.9%
task 11: 19.8%
task 12: 27.3%
task 13: 17.5%
task 14: 18.7%
task 15: 26.125%
task 16: 25.0%
task 17: 48.125%
task 18: 56.008%
task 19: 8.0%
task 20: 84.417%


### Bigrams

In [169]:
def tokenize(sent):
    return [ x.strip() for x in re.split(r'(\W+)', sent) if x.strip()]

def generate_ngrams(s, n):
    # Convert to lowercases
    #s = s.lower()
    
    # Replace all none alphanumeric characters with spaces
    s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s)
    
    # Break sentence in the token, remove empty tokens
    tokens = [token.strip(' \n ') for token in s.split(" ") if token != ""]
    # Use the zip function to help us generate n-grams
    # Concatentate the tokens into ngrams and return
    ngrams = zip(*[tokens[i:] for i in range(n)])
    return [" ".join(ngram) for ngram in ngrams]

def parse_stories(lines,n):
    data = []
    story = []
    for line in lines:
        
        #line = line.decode('utf-8').strip()
        nid, line = line.split(' ', 1)
        nid = int(nid)
        if nid == 1:
            # reset story when line ID=1 (start of new story)
            story = []
        if '\t' in line:
            # this line is tab separated Q, A &amp;amp;amp;amp;amp; support fact ID
            q, a, supporting = line.split('\t')
            # tokenize the words of question
            q = tokenize(q)
            # Provide all the sub-stories till this question
            substory = [x for x in story if x]
            # A story ends and is appended to global story data-set
            data.append((substory, q, a))
            story.append('')
        else:
            # this line is a sentence of story
            bigram_story = generate_ngrams(line, n)
            #sent = tokenize(line)
            story.append(bigram_story)
    return data

def get_stories(f,n):
    # read the data file and parse 10k stories
    data = parse_stories(f.readlines(),n)
    # lambda func to flatten the list of sentences into one list
    #flatten = lambda data: reduce(lambda x, y: x + y, data)
    # creating list of tuples for each story
    #data = [(flatten(story), q, answer) for story, q, answer in data]
    data = [((story), q, answer) for story, q, answer in data]
    return data

In [170]:
accuracies_bigram=[]
for f_i in range(len(task)):
    file_path='./tasks/task_'+str(task[f_i])+".txt"
    #print(file_path)
    with open(file_path) as f:
        all_stories = get_stories(f,n=2)
    train_stories, test_stories = train_test_split(all_stories, test_size=0.2, random_state=42)

    # creating the filtered qa sets
    qa_sets_filtered_train=[]
    for q_i in range(len(train_stories)):
        # select only those sentences which have at least one word common with the question
        results=[]
        for i in range(len(train_stories[q_i][0])):
            q_array=train_stories[q_i][1]
            check=[]
            for qsi in range(len(q_array)):
                res = [k for k in train_stories[q_i][0][i] if q_array[qsi] in k]
                check.append(len(res))
            #print(check)
            #print(sum(check))
            if(sum(check)>0):
                results.append(train_stories[q_i][0][i])
        qa_set_i= (results,train_stories[q_i][1],train_stories[q_i][2])
        qa_sets_filtered_train.append(qa_set_i)
    
    #print(qa_sets_filtered_train)
    #lambda func to flatten the list of sentences into one list
    flatten = lambda data: reduce(lambda x, y: x + y, data)
    #creating list of tuples for each story
    data_filtered = [(flatten(story), q, answer) for story, q, answer in qa_sets_filtered_train]
        
    # creating the filtered qa sets----test data
    qa_sets_filtered_test=[]
    for q_i in range(len(test_stories)):
        # select only those sentences which have at least one word common with the question
        results=[]
        for i in range(len(test_stories[q_i][0])):
            q_array=test_stories[q_i][1]
            check=[]
            for qsi in range(len(q_array)):
                res = [k for k in test_stories[q_i][0][i] if q_array[qsi] in k]
                check.append(len(res))
            #print(check)
            #print(sum(check))
            if(sum(check)>0):
                results.append(test_stories[q_i][0][i])
        qa_set_i= (results,test_stories[q_i][1],test_stories[q_i][2])
        qa_sets_filtered_test.append(qa_set_i)
        
    #lambda func to flatten the list of sentences into one list
    flatten = lambda data: reduce(lambda x, y: x + y, data)
    #creating list of tuples for each story
    data_filtered_test = [(flatten(story), q, answer) for story, q, answer in qa_sets_filtered_test]
    
    vocab = set()
    for story, q, answer in (data_filtered + data_filtered_test):
        vocab |= set(story + q + [answer])

    vocab = sorted(vocab)
    vocab_size = len(vocab) + 1
    story_maxlen = max(map(len, (x for x, _, _ in data_filtered + data_filtered_test)))
    query_maxlen = max(map(len, (x for _, x, _ in data_filtered + data_filtered_test)))
    vocab = list(vocab)
    vocab = ['<pad>'] + vocab
    word_idx = dict((c, i) for i, c in enumerate(vocab))
    idx_word = dict((i, c) for i,c in enumerate(vocab))

    inputs_train, queries_train, answers_train = vectorize_stories(data_filtered,
                                                                   word_idx,
                                                                   story_maxlen,
                                                                   query_maxlen)

    inputs_test, queries_test, answers_test = vectorize_stories(data_filtered_test,
                                                                word_idx,
                                                                story_maxlen,
                                                                query_maxlen)
    
    features_train=np.column_stack((queries_train,inputs_train))
    target_train=np.where(answers_train == 1)[1].reshape(answers_train.shape[0],1)
    
    features_test=np.column_stack((queries_test,inputs_test))
    target_test=np.where(answers_test == 1)[1].reshape(answers_test.shape[0],1)
    
    model = LinearSVC(random_state=60, tol=1e-5)
    model.fit(features_train, target_train)
    predictions = model.predict(features_test)
    #print(len(np.where(answers_test == 1)[1]))
    accuracies_bigram.append(round((accuracy_score(np.where(answers_test == 1)[1], predictions))*100,3))
    print("task "+str(task[f_i])+" "+str((accuracy_score(np.where(answers_test == 1)[1], predictions))*100)+"%")
    

task 1 40.1%
task 2 19.6%
task 3 33.6%
task 4 17.5%
task 5 29.099999999999998%
task 6 50.0%
task 7 62.8%
task 8 44.9%
task 9 61.0%
task 10 81.8%
task 11 17.299999999999997%
task 12 31.3%
task 13 16.8%
task 14 16.400000000000002%
task 15 27.750000000000004%
task 16 28.000000000000004%
task 17 49.125%
task 18 58.139534883720934%
task 19 8.0%
task 20 75.58333333333334%


In [173]:
resuts_ngram_df = pd.DataFrame()
resuts_ngram_df['Task']= [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]
resuts_ngram_df['Unigram_accuracy'] = accuracies_unigram
resuts_ngram_df['Bigram_accuracy'] = accuracies_bigram

In [181]:
resuts_ngram_df['Final']=np.where(resuts_ngram_df['Unigram_accuracy'] <= resuts_ngram_df['Bigram_accuracy'], resuts_ngram_df['Bigram_accuracy'], resuts_ngram_df['Unigram_accuracy'])

In [182]:
resuts_ngram_df

Unnamed: 0,Task,Unigram_accuracy,Bigram_accuracy,Final
0,1,41.5,40.1,41.5
1,2,28.7,19.6,28.7
2,3,30.8,33.6,33.6
3,4,20.5,17.5,20.5
4,5,33.9,29.1,33.9
5,6,52.3,50.0,52.3
6,7,65.0,62.8,65.0
7,8,43.5,44.9,44.9
8,9,53.2,61.0,61.0
9,10,69.9,81.8,81.8
