<a href="https://colab.research.google.com/github/rezaafra/question-answering/blob/master/CoQA_Baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
from torch.jit import script, trace
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import json
import csv
import random
import re
import os
import unicodedata
import codecs
from io import open
import itertools
import math
import urllib.request

from urllib.parse import urlparse

# Choose the compute device once: CUDA when torch reports it as available,
# otherwise fall back to the CPU.
USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda") if USE_CUDA else torch.device("cpu")

In [2]:

def _get_filename_from_url(url):
    parse = urlparse(url)
    return os.path.basename(parse.path)

def download_file(url, directory, filename=None, extension=None):
    """Download ``url`` into ``directory`` and return the path of the saved file.

    Args:
        url: Address of the resource to fetch.
        directory: Target directory; created (with parents) when missing.
        filename: Name to save the file under. Defaults to the last path
            component of ``url``.
        extension: Accepted for backward compatibility; currently unused.

    Returns:
        The path ``os.path.join(directory, filename)``.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises on failure. In that
        case no file is left behind under the final name.
    """
    if filename is None:
        filename = _get_filename_from_url(url)

    directory = str(directory)
    filepath = os.path.join(directory, filename)

    # exist_ok avoids the check-then-create race of a separate isdir() test.
    os.makedirs(directory, exist_ok=True)

    # Download to a temporary name and move it into place only on success, so
    # an interrupted transfer never leaves a truncated file at `filepath`.
    partial_path = filepath + '.part'
    try:
        urllib.request.urlretrieve(url, filename=partial_path)
        os.replace(partial_path, filepath)
    except BaseException:
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise

    return filepath

In [3]:
 def coqa_dataset(directory='data/',
                  train=False,
                  dev=False,
                  train_filename='coqa-train-v1.0.json',
                  dev_filename='coqa-dev-v1.0.json',
                  url_train='https://nlp.stanford.edu/data/coqa/coqa-train-v1.0.json',
                  url_dev='https://nlp.stanford.edu/data/coqa/coqa-dev-v1.0.json'):
     """Load the requested CoQA split(s), downloading a file only when needed.

     Args:
         directory: Folder the JSON files are stored in.
         train: Whether to load the training split.
         dev: Whether to load the development split.
         train_filename: File name of the training split inside ``directory``.
         dev_filename: File name of the development split inside ``directory``.
         url_train: Download address of the training split.
         url_dev: Download address of the development split.

     Returns:
         The value under the top-level ``'data'`` key of each requested file.
         With exactly one split requested that value is returned directly;
         otherwise a tuple in (train, dev) order is returned, which is empty
         when no split was requested.

     A split is downloaded only if it was requested and its file is not
     already present, so repeated calls do not fetch the data again. Delete
     the file to force a fresh download.
     """
     candidates = [(train, train_filename, url_train),
                   (dev, dev_filename, url_dev)]

     ret = []
     for requested, filename, url in candidates:
         if not requested:
             continue
         full_path = os.path.join(directory, filename)
         if not os.path.isfile(full_path):
             # Save under the same name that is read back below, so a custom
             # *_filename keeps working even if it differs from the URL's name.
             download_file(url=url, directory=directory, filename=filename)
         # Explicit UTF-8 so decoding does not depend on the platform locale.
         with open(full_path, 'r', encoding='utf-8') as handle:
             ret.append(json.load(handle)['data'])

     if len(ret) == 1:
         return ret[0]
     return tuple(ret)

# Load the CoQA dataset

In [4]:
# Request both splits in one call: with both flags set the loader returns a
# (train, dev) tuple, and the download step runs once instead of twice.
train, dev = coqa_dataset(train=True, dev=True)

In [5]:
# Show the "questions" entry of the first training example.
train[0]["questions"]

[{'input_text': 'When was the Vat formally opened?', 'turn_id': 1},
 {'input_text': 'what is the library for?', 'turn_id': 2},
 {'input_text': 'for what subjects?', 'turn_id': 3},
 {'input_text': 'and?', 'turn_id': 4},
 {'input_text': 'what was started in 2014?', 'turn_id': 5},
 {'input_text': 'how do scholars divide the library?', 'turn_id': 6},
 {'input_text': 'how many?', 'turn_id': 7},
 {'input_text': 'what is the official name of the Vat?', 'turn_id': 8},
 {'input_text': 'where is it?', 'turn_id': 9},
 {'input_text': 'how many printed books does it contain?', 'turn_id': 10},
 {'input_text': 'when were the Secret Archives moved from the rest of the library?',
  'turn_id': 11},
 {'input_text': 'how many items are in this secret collection?',
  'turn_id': 12},
 {'input_text': 'Can anyone use this library?', 'turn_id': 13},
 {'input_text': 'what must be requested to view?', 'turn_id': 14},
 {'input_text': 'what must be requested in person or by mail?', 'turn_id': 15},
 {'input_text': 

In [27]:
def extractSentencePairs(data):
    """Collect [question, answer] text pairs from every entry in ``data``.

    Each entry is indexed with "questions" and "answers"; the two lists are
    walked in parallel and the "input_text" of each item is taken. If the
    lists differ in length, pairing stops at the shorter one.
    """
    return [
        [question["input_text"], answer["input_text"]]
        for entry in data
        for question, answer in zip(entry["questions"], entry["answers"])
    ]

In [34]:
# Question/answer text pairs of the training split.
pairs = extractSentencePairs(train)

In [30]:
# Define path to new file
datafile = os.path.join("data", "formatted_corpus.txt")
# Create the output directory if it does not exist yet.
os.makedirs(os.path.dirname(datafile), exist_ok=True)

# Tab-separated output. '\t' is already a literal tab character, so no
# unicode-escape decoding step is needed.
delimiter = '\t'

# Write new csv file
print("\nWriting newly formatted file...")
# newline='' is what the csv module expects for files it writes to: it keeps
# the text layer from translating the '\n' line terminator (e.g. into '\r\n').
with open(datafile, 'w', encoding='utf-8', newline='') as outputfile:
    writer = csv.writer(outputfile, delimiter=delimiter, lineterminator='\n')
    writer.writerows(extractSentencePairs(train))


Writing newly formatted file...


In [31]:
# Default word tokens
PAD_token = 0  # Used for padding short sentences
SOS_token = 1  # Start-of-sentence token
EOS_token = 2  # End-of-sentence token

class Voc:
    """Vocabulary mapping words to integer indices and back, with word counts.

    Indices 0-2 are reserved for the PAD, SOS and EOS tokens; other words are
    numbered from 3 upward in the order they are first added.
    """

    def __init__(self, name):
        """Create an empty vocabulary labelled ``name``."""
        self.name = name
        self.trimmed = False
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3  # Count SOS, EOS, PAD

    def addSentence(self, sentence):
        """Add every token of ``sentence`` to the vocabulary.

        Tokens come from ``sentence.split(' ')``, i.e. a split on single
        spaces only, so consecutive spaces produce empty-string tokens that
        are counted like any other word.
        """
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        """Register ``word``: assign the next index if new, else bump its count."""
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            self.word2count[word] += 1

    # Remove words below a certain count threshold
    def trim(self, min_count):
        """Drop every word that was seen fewer than ``min_count`` times.

        Only the first call has an effect; later calls return immediately.
        Kept words are re-added through ``addWord``, so their indices are
        renumbered from 3 and their counts restart at 1.
        """
        if self.trimmed:
            return
        self.trimmed = True

        keep_words = [word for word, count in self.word2count.items()
                      if count >= min_count]

        total_words = len(self.word2index)
        # Guard the ratio: an empty vocabulary would otherwise divide by zero.
        kept_ratio = len(keep_words) / total_words if total_words else 0.0
        print('keep_words {} / {} = {:.4f}'.format(
            len(keep_words), total_words, kept_ratio
        ))

        # Reinitialize dictionaries
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3 # Count default tokens

        for word in keep_words:
            self.addWord(word)

In [32]:
# NOTE(review): this cell re-defines the token constants and the Voc class that
# the previous cell already defines. The later definition is the one in
# effect; consider deleting one of the two cells.

# Default word tokens
PAD_token = 0  # Used for padding short sentences
SOS_token = 1  # Start-of-sentence token
EOS_token = 2  # End-of-sentence token

class Voc:
    """Vocabulary mapping words to integer indices and back, with word counts.

    Indices 0-2 are reserved for the PAD, SOS and EOS tokens; other words are
    numbered from 3 upward in the order they are first added.
    """

    def __init__(self, name):
        """Create an empty vocabulary labelled ``name``."""
        self.name = name
        self.trimmed = False
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3  # Count SOS, EOS, PAD

    def addSentence(self, sentence):
        """Add every token of ``sentence`` to the vocabulary.

        Tokens come from ``sentence.split(' ')``, i.e. a split on single
        spaces only, so consecutive spaces produce empty-string tokens that
        are counted like any other word.
        """
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        """Register ``word``: assign the next index if new, else bump its count."""
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            self.word2count[word] += 1

    # Remove words below a certain count threshold
    def trim(self, min_count):
        """Drop every word that was seen fewer than ``min_count`` times.

        Only the first call has an effect; later calls return immediately.
        Kept words are re-added through ``addWord``, so their indices are
        renumbered from 3 and their counts restart at 1.
        """
        if self.trimmed:
            return
        self.trimmed = True

        keep_words = [word for word, count in self.word2count.items()
                      if count >= min_count]

        total_words = len(self.word2index)
        # Guard the ratio: an empty vocabulary would otherwise divide by zero.
        kept_ratio = len(keep_words) / total_words if total_words else 0.0
        print('keep_words {} / {} = {:.4f}'.format(
            len(keep_words), total_words, kept_ratio
        ))

        # Reinitialize dictionaries
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3 # Count default tokens

        for word in keep_words:
            self.addWord(word)

In [39]:
# Build the vocabulary from both sides of every question/answer pair.
voc = Voc("formatted_corpus.txt")
for question, answer in pairs:
    voc.addSentence(question)
    voc.addSentence(answer)

In [40]:
# Vocabulary size; the count includes the three reserved PAD/SOS/EOS entries.
print(voc.num_words)

71875
