<a href="https://colab.research.google.com/github/shivammehta007/QuestionGenerator/blob/master/CustomSeq2Seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import os
import numpy as np
import logging
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import urllib.request as req
from tqdm import tqdm
import time
import math
import torchtext.data as data
from torchtext.datasets import TranslationDataset
from torchtext.data import Field, BucketIterator

# Configurations

## data.py

In [0]:
# Directory layout: downloads go to <DATA_FOLDER>/<DATA_FOLDER_RAW>, the
# preprocessed text files to <DATA_FOLDER>/<DATA_FOLDER_PROCESSED>.
DATA_FOLDER = "data"
DATA_FOLDER_RAW = "raw"
DATA_FOLDER_PROCESSED = "processed"

# Key under which the SQuAD dataset is registered in the tables below.
SQUAD_NAME = "SQUAD"

# Local file name used to store each split of every supported dataset.
RAW_FILENAMES = {
    SQUAD_NAME: dict(
        train="squad_train.json",
        test="squad_test.json",
        valid="squad_valid.json",
    )
}

# Download URL of each split; ``None`` means the split has no URL to fetch.
DATASETS = {
    SQUAD_NAME: dict(
        train="https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json",
        test="https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json",
        valid=None,
    )
}

## root.py

In [0]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Root logging configuration shared by every cell of the notebook.
LOGGING_LEVEL = logging.DEBUG
LOGGING_FORMAT = (
    "[%(levelname)s | %(filename)s:%(lineno)s - "
    "%(funcName)20s() ] %(message)s"
)

logging.basicConfig(format=LOGGING_FORMAT, level=LOGGING_LEVEL)
logger = logging.getLogger(__name__)


def seed_all(seed=1234):
    """
    Seed every random number generator used here so runs can be reproduced
    Input:
    seed: int -> Value used to seed Python, NumPy and PyTorch (CPU and CUDA)
    Returns: None
    """
    random.seed(seed)
    # NOTE: PYTHONHASHSEED is read at interpreter start-up, so setting it here
    # only influences child processes, not the hashing of the running process.
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Seed every visible GPU, not only the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run.
    torch.backends.cudnn.benchmark = False

# Seed all random number generators once, using the default seed.
seed_all()
# (id, name) pair naming the available model.
# NOTE(review): not referenced anywhere else in the visible code - confirm it is still needed.
models = (1, "VanillaSeq2Seq")


## hyperparameter.py

In [0]:
# Hyperparameters of the vanilla seq2seq model.  INPUT_DIM / OUTPUT_DIM are
# passed to load_dataset as the maximum source / target vocabulary sizes.
VANILLA_SEQ2SEQ = dict(
    INPUT_DIM=45000,
    OUTPUT_DIM=28000,
    DEC_EMB_DIM=300,
    ENC_EMB_DIM=300,
    HID_DIM=600,
    N_LAYERS=2,
    DROPOUT=0.7,
)

## Utils.py

In [0]:
from spacy.lang.en import English


# English pipeline used by word_tokenizer to split text into tokens.
nlp_word = English()
# Separate English pipeline with the "sentencizer" pipe added; it is used by
# extract_filtered_sentences to iterate over the sentences of a paragraph.
# NOTE(review): create_pipe + add_pipe(component) looks like the spaCy v2 API - confirm the pinned version.
nlp_sentence = English()
nlp_sentence.add_pipe(nlp_sentence.create_pipe("sentencizer"))


def word_tokenizer(sentence):
    """
    Tokenizes a piece of text with the English spaCy pipeline
    Input:
    sentence: string
    Returns:
    list of token strings in their original order
    """
    tokens = []
    for token in nlp_word(sentence):
        tokens.append(token.text)
    return tokens

# Download Dataset

In [0]:
def download_dataset(dataset_name):
    """
    Downloads every split of a dataset that has a URL configured in DATASETS
    and stores it under DATA_FOLDER/DATA_FOLDER_RAW using the file names from
    RAW_FILENAMES. Splits whose URL is empty/None are skipped.
    Input:
    dataset_name: string -> Case-insensitive key of DATASETS (e.g. "squad")
    Returns: None
    Raises:
    KeyError when the dataset is not configured
    """

    logger.info("Downloading {}".format(dataset_name))
    output_path = os.path.join(DATA_FOLDER, DATA_FOLDER_RAW)

    if not os.path.exists(output_path):
        logger.debug("Folders doesn't exists creating it")
    # exist_ok closes the window between the check above and the creation.
    os.makedirs(output_path, exist_ok=True)

    dataset_name = dataset_name.upper()
    urls = DATASETS[dataset_name]

    # (split key, label used in the log message) in download order.
    splits = (("train", "Train set"), ("test", "Test set"), ("valid", "Valid Set"))
    for split, label in splits:
        url = urls[split]
        if not url:
            continue
        saved_path, _ = req.urlretrieve(
            url=url,
            filename=os.path.join(output_path, RAW_FILENAMES[dataset_name][split]),
        )
        logger.debug("Downloaded {} -> {}".format(label, saved_path))

    logger.info("Files Downloaded Successfully!")

In [14]:
# Fetch the SQuAD splits that have a URL configured in DATASETS into data/raw.
download_dataset("SQUAD")

[INFO | <ipython-input-13-37e6b7cd460b>:9 -     download_dataset() ] Downloading SQUAD
[DEBUG | <ipython-input-13-37e6b7cd460b>:22 -     download_dataset() ] Downloaded Train set -> data/raw/squad_train.json
[DEBUG | <ipython-input-13-37e6b7cd460b>:29 -     download_dataset() ] Downloaded Test set -> data/raw/squad_test.json
[INFO | <ipython-input-13-37e6b7cd460b>:38 -     download_dataset() ] Files Downloaded Successfully!


# PreProcess

In [0]:
import json

# Raw downloads are read from INPUT_PATH; the preprocessed text files are
# written beneath OUTPUT_PATH.
INPUT_PATH = os.path.join(DATA_FOLDER, DATA_FOLDER_RAW)
OUTPUT_PATH = os.path.join(DATA_FOLDER, DATA_FOLDER_PROCESSED)


def convert_to_file_without_answers(
    dataset, dataset_type="train", get_impossible=False
):
    """
    Writes <dataset_type>.paragraphs and <dataset_type>.questions into
    OUTPUT_PATH/SQUAD_NAME. Line i of both files forms one pair: the full
    (lower-cased, single-line) paragraph and one of its questions.
    Input:
    dataset : dict -> Parsed SQuAD json, must contain the "data" key
    dataset_type: string -> Split name used as file prefix (train, test, valid)
    get_impossible: boolean -> When True, questions flagged "is_impossible"
                    are written too instead of being skipped
    """
    output_dir = os.path.join(OUTPUT_PATH, SQUAD_NAME)
    os.makedirs(output_dir, exist_ok=True)

    para_path = os.path.join(output_dir, dataset_type + ".paragraphs")
    question_path = os.path.join(output_dir, dataset_type + ".questions")

    written = 0
    # The context manager closes both files even when a record is malformed.
    with open(para_path, "w") as para_output, open(
        question_path, "w"
    ) as question_output:
        for article in tqdm(dataset["data"]):
            for paragraph in article["paragraphs"]:
                # Normalise the paragraph once instead of once per question.
                para = paragraph["context"].replace("\n", " ").strip().lower()
                for questionanswers in paragraph["qas"]:
                    # Data without the flag is treated as answerable.
                    impossible = questionanswers.get("is_impossible", False)
                    if impossible and not get_impossible:
                        continue
                    # A newline inside a question would break the
                    # line-by-line alignment of the two files.
                    question = questionanswers["question"].replace("\n", " ")
                    para_output.write(para + "\n")
                    question_output.write(question.strip().lower() + "\n")
                    written += 1

    logger.info("Size of the {} dataset: {}".format(dataset_type, written))


def split_train_valid(dataset_name, split_ratio=0.9):
    """
    Splits the processed train set into a train and a validation set.
    Reads train.paragraphs / train.questions from OUTPUT_PATH/<dataset_name>,
    then rewrites them in place and creates valid.paragraphs /
    valid.questions. Each pair goes to the train set with probability
    split_ratio, decided with the global `random` generator.
    NOTE: because the train files are overwritten, calling this again splits
    the already reduced train set once more.
    Input:
    dataset_name: string -> Sub folder of OUTPUT_PATH holding the files
    split_ratio: float -> Expected fraction of pairs kept for training
    Raises:
    NotImplementedError when the dataset folder does not exist
    AssertionError when both train files differ in their number of lines
    """
    logger.debug(
        "Splitting the {}'s train set into train and valid".format(dataset_name)
    )
    dataset_dir = os.path.join(OUTPUT_PATH, dataset_name)
    if not os.path.exists(dataset_dir):
        raise NotImplementedError(
            "The Dataset has not been preprocessed yet please call the "
            "processing method before spliting the trainset"
        )

    filename_paragraph = os.path.join(dataset_dir, "train.paragraphs")
    filename_questions = os.path.join(dataset_dir, "train.questions")

    with open(filename_paragraph) as paragraphs_file, open(
        filename_questions
    ) as questions_file:
        data_paragraphs = paragraphs_file.readlines()
        data_questions = questions_file.readlines()

    logger.debug(
        "# of Paragraphs: {} # of Questions: {} ".format(
            len(data_paragraphs), len(data_questions)
        )
    )

    # Raised explicitly so the check also runs under `python -O`.
    if len(data_paragraphs) != len(data_questions):
        raise AssertionError("Number of Paragraphs and Questions mismatch")

    train_count, valid_count = 0, 0

    # Everything was read into memory above, so the train files can now be
    # truncated safely; the with-blocks guarantee all four files are flushed
    # and closed.
    with open(filename_paragraph, "w") as train_paragraphs_file, open(
        os.path.join(dataset_dir, "valid.paragraphs"), "w"
    ) as valid_paragraphs_file:
        with open(filename_questions, "w") as train_questions_file, open(
            os.path.join(dataset_dir, "valid.questions"), "w"
        ) as valid_questions_file:
            pairs = zip(data_paragraphs, data_questions)
            for paragraph, question in tqdm(pairs, total=len(data_paragraphs)):
                if random.random() < split_ratio:
                    train_paragraphs_file.write(paragraph.strip() + "\n")
                    train_questions_file.write(question.strip() + "\n")
                    train_count += 1
                else:
                    valid_paragraphs_file.write(paragraph.strip() + "\n")
                    valid_questions_file.write(question.strip() + "\n")
                    valid_count += 1

    logger.info(
        "Total Trainset: {} | Total ValidSet: {}".format(train_count, valid_count)
    )


def load_json(filelocation):
    """
    Reads a JSON file and returns the parsed content
    Input:
    filelocation: string -> Path of the JSON file
    Returns:
    json_data: json object (dict / list / scalar as stored in the file)
    """
    # JSON is exchanged as UTF-8; relying on the platform default encoding
    # breaks on non-ASCII text where the default is not UTF-8.
    with open(filelocation, encoding="utf-8") as file:
        return json.load(file)


def preprocess_squad(name, mode, filter):
    """
    Runs the whole SQUAD preprocessing: loads the raw train/test json files,
    converts both into paragraph/question text files and finally splits the
    train set into train and valid.
    Input:
    name: string -> Name of the dataset (key of RAW_FILENAMES)
    mode: string -> "QUESTION" (any casing) selects the plain
          paragraph/question conversion, provided `filter` is falsy
    filter: boolean -> When truthy, paragraphs are reduced to the sentences
            that contain an answer, regardless of `mode`
    """
    logger.debug("PreProcessing SQUAD")
    logger.debug("Loading JSON")
    raw_files = RAW_FILENAMES[name]
    train_file = load_json(os.path.join(INPUT_PATH, raw_files["train"]))
    test_file = load_json(os.path.join(INPUT_PATH, raw_files["test"]))

    # Pick the converter once, then apply it to both splits.
    questions_only = mode.upper() == "QUESTION" and not filter
    converter = (
        convert_to_file_without_answers
        if questions_only
        else filter_sentences_on_answer
    )
    converter(train_file, "train")
    converter(test_file, "test")

    logger.debug("Now we will split train set to train and valid set")
    split_train_valid(name)

    logger.info("{} Preprocessed".format(name))


def extract_filtered_sentences(questionanswers, para):
    """
    Returns the sentences of a paragraph that contain the answers of one
    SQUAD question, joined by a single space.
    Every answer is mapped to the sentence covering its "answer_start"
    character offset. Duplicates are dropped and sentences appear in the
    order in which the answers select them, so the result is deterministic.
    Input:
    questionanswers: dict -> One "qas" entry; its "answers" list is read
    para: string -> The paragraph ("context") the offsets refer to
    Returns:
    string -> Selected sentences, newlines replaced by spaces
    Raises:
    Exception when an answer offset lies beyond the last sentence
    """
    tokenized_paragraph = nlp_sentence(para)
    # text_with_ws keeps the trailing whitespace, so the pieces concatenate
    # back to the paragraph and character offsets stay valid.
    sentences = [sent.text_with_ws for sent in tokenized_paragraph.sents]

    filtered_sentences = []
    already_added = set()

    # This iterates over every answer in question
    for answer in questionanswers["answers"]:
        answer_index = answer["answer_start"]
        sentence_start = 0

        # find sentence that has answer and filter them
        for sentence in sentences:
            sentence_end = sentence_start + len(sentence)
            # Strictly smaller: an offset equal to sentence_end is the first
            # character of the NEXT sentence.
            if answer_index < sentence_end:
                cleaned = sentence.replace("\n", " ").strip()
                if cleaned not in already_added:
                    already_added.add(cleaned)
                    filtered_sentences.append(cleaned)
                break
            sentence_start = sentence_end
        else:
            # No sentence covered this answer; checked per answer so later
            # answers are validated as well.
            print("Length : {}".format(sentence_start))
            raise Exception("One of the Answers had no sentence please check the data")

    return " ".join(filtered_sentences)


def filter_sentences_on_answer(dataset, dataset_type="train", get_impossible=False):
    """
    Filter the paragraph with only sentences relevant to answer and generates
    <dataset_type>.paragraphs and <dataset_type>.questions in
    OUTPUT_PATH/SQUAD_NAME, holding sentences and questions instead of
    paragraphs and questions (one pair per line, lower-cased).
    Input:
    dataset: dict -> Parsed SQuAD json, must contain the "data" key
    dataset_type: string -> Split name used as file prefix (train, test, valid)
    get_impossible: boolean -> Currently has no effect: questions flagged
                    "is_impossible" are always skipped
    """
    output_dir = os.path.join(OUTPUT_PATH, SQUAD_NAME)
    os.makedirs(output_dir, exist_ok=True)

    para_path = os.path.join(output_dir, dataset_type + ".paragraphs")
    question_path = os.path.join(output_dir, dataset_type + ".questions")

    written = 0

    logger.debug("Starting to filter sentences on answer")

    # The context manager closes both files even when a record is malformed.
    with open(para_path, "w") as para_output, open(
        question_path, "w"
    ) as question_output:
        # This loop iterates over every article of the dataset
        for article in tqdm(dataset["data"]):
            for paragraph in article["paragraphs"]:
                para = paragraph["context"]
                # This loop iterates over every question in para
                for questionanswers in paragraph["qas"]:
                    # Data without the flag is treated as answerable.
                    if questionanswers.get("is_impossible", False):
                        continue
                    # A newline inside a question would break the
                    # line-by-line alignment of the two files.
                    question = questionanswers["question"].replace("\n", " ")

                    filtered_sentences = extract_filtered_sentences(
                        questionanswers, para
                    )

                    para_output.write(filtered_sentences.strip().lower() + "\n")
                    question_output.write(question.strip().lower() + "\n")
                    written += 1

    logger.info("Size of the {} dataset: {}".format(dataset_type, written))

    logger.debug("Sentences Filtered on Answers")

In [19]:
# filter=True selects the sentence-filtering conversion even though the mode is "QUESTION".
preprocess_squad("SQUAD", "QUESTION", True)

[DEBUG | <ipython-input-18-7fa915c1dd65>:130 -     preprocess_squad() ] PreProcessing SQUAD
[DEBUG | <ipython-input-18-7fa915c1dd65>:131 -     preprocess_squad() ] Loading JSON
[DEBUG | <ipython-input-18-7fa915c1dd65>:197 - filter_sentences_on_answer() ] Starting to filter sentences on answer
100%|██████████| 442/442 [02:37<00:00,  2.81it/s]
[INFO | <ipython-input-18-7fa915c1dd65>:217 - filter_sentences_on_answer() ] Size of the train dataset: 86821
[DEBUG | <ipython-input-18-7fa915c1dd65>:221 - filter_sentences_on_answer() ] Sentences Filtered on Answers
[DEBUG | <ipython-input-18-7fa915c1dd65>:197 - filter_sentences_on_answer() ] Starting to filter sentences on answer
100%|██████████| 35/35 [00:11<00:00,  2.65it/s]
[INFO | <ipython-input-18-7fa915c1dd65>:217 - filter_sentences_on_answer() ] Size of the test dataset: 5928
[DEBUG | <ipython-input-18-7fa915c1dd65>:221 - filter_sentences_on_answer() ] Sentences Filtered on Answers
[DEBUG | <ipython-input-18-7fa915c1dd65>:142 -     prepro

## Data Loader

In [0]:
# Root folder of the preprocessed datasets; load_dataset appends the dataset name to it.
FILE_PATH = os.path.join(DATA_FOLDER, DATA_FOLDER_PROCESSED)


def load_dataset(
    dataset_name="SQUAD",
    tokenizer=word_tokenizer,
    init_token="<sos>",
    eos_token="<eos>",
    lower=True,
    use_glove=True,
    source_vocab=45000,
    target_vocab=28000,
    batch_size=256,
):
    """
    Method Loads the dataset from location and returns three iterators and SRC and TRG fields
    Input:
    dataset_name: string -> Sub folder of FILE_PATH with the train/valid/test
                  ".paragraphs" (source) and ".questions" (target) files
    tokenizer: callable -> Turns a string into a list of tokens
    init_token, eos_token: string -> Markers added around every sequence
    lower: boolean -> Lower-case the text of both fields
    use_glove: boolean -> Attach "glove.6B.300d" vectors to both vocabularies
    source_vocab, target_vocab: int -> max_size of the respective vocabulary
    batch_size: int -> Batch size of all three iterators
    Returns:
    ((train_iterator, valid_iterator, test_iterator), SRC, TRG)
    """
    logger.debug("Loading {} dataset".format(dataset_name))
    # The source field also yields the sequence lengths (include_lengths),
    # so batch.src is a (tensor, lengths) pair.
    SRC = data.Field(
        tokenize=tokenizer,
        init_token=init_token,
        eos_token=eos_token,
        lower=lower,
        include_lengths=True,
    )
    TRG = data.Field(
        tokenize=tokenizer, init_token=init_token, eos_token=eos_token, lower=lower
    )

    location = os.path.join(FILE_PATH, dataset_name)

    logger.debug("Loading from location: {}".format(location))
    start_time = time.time()
    train_dataset, valid_dataset, test_dataset = TranslationDataset.splits(
        exts=(".paragraphs", ".questions"),
        fields=(SRC, TRG),
        path=location,
        train="train",
        validation="valid",
        test="test",
    )

    logger.debug(
        "Number of Samples: Training = {} | Validation = {} | Testing = {}".format(
            len(train_dataset.examples),
            len(valid_dataset.examples),
            len(test_dataset.examples),
        )
    )
    logger.debug("Time Taken: {:.6f}s".format(time.time() - start_time))
    logger.debug("Building Vocab")

    start_time = time.time()
    # Both vocabularies are built from the training split only.
    if use_glove:
        logger.debug("Using Glove vectors")
        SRC.build_vocab(train_dataset, max_size=source_vocab, vectors="glove.6B.300d")
        TRG.build_vocab(train_dataset, max_size=target_vocab, vectors="glove.6B.300d")
    else:
        SRC.build_vocab(train_dataset, max_size=source_vocab)
        TRG.build_vocab(train_dataset, max_size=target_vocab)

    logger.info(
        "Vocabulary Built! Source Tokens = {} | Target Tokens = {}  \nCreating Iterators".format(
            len(SRC.vocab), len(TRG.vocab)
        )
    )
    logger.debug("Time Taken: {:.6f}s".format(time.time() - start_time))

    # Batches are sorted by source length within each batch, which the
    # encoder's pack_padded_sequence call relies on.
    iterators = BucketIterator.splits(
        (train_dataset, valid_dataset, test_dataset),
        batch_size=batch_size,
        sort_within_batch=True,
        sort_key=lambda x: len(x.src),
        device=device,
    )
    return iterators, SRC, TRG

## Defining Vanilla Seq2Seq Model

In [0]:
class Encoder(nn.Module):
    """
    A bidirectional GRU Encoder
    Input:
        input_dim: Vocab length of input
        embedding_dim: Dimension of Embeddings
        hidden_dim: Dimension of Hidden vectors of the GRU (per direction)
        n_layers: Layers of the GRU
        dropout: Dropout applied to the embeddings and between GRU layers
    Returns:
        hidden: final hidden states of the GRU,
                shape [n_layers * 2, batch_size, hidden_dim]
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, n_layers, dropout):
        super(Encoder, self).__init__()
        self.input_dim = input_dim
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.gru = nn.GRU(
            embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=True,
            dropout=dropout,
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, src_len):
        # src = [src_len, batch_size]
        # src_len = [batch_size], sorted in decreasing order
        # (pack_padded_sequence is called with its default enforce_sorted=True)

        embedded = self.dropout(self.embedding(src))

        # pack_padded_sequence needs the lengths on the CPU when they are
        # given as a tensor, even if the batch itself lives on the GPU.
        lengths = src_len.cpu() if torch.is_tensor(src_len) else src_len
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, lengths)

        # Only the final hidden states are used; the per-step outputs are
        # dropped without being unpacked.
        _, hidden = self.gru(packed_embedded)

        return hidden


class Decoder(nn.Module):
    """
    A unidirectional GRU Decoder that predicts one target token per call
    Input:
        output_dim: Vocab length of the output
        embedding_dim: Decoder Embedding Dimension
        hidden_dim: Hidden Dimension of the encoder (per direction); the
                    decoder GRU itself uses 2 * hidden_dim so it can be
                    initialised with both encoder directions concatenated
        n_layers: Number of layers of the GRU
        dropout: Dropout applied to the embeddings and between GRU layers
    Output:
        prediction: Output of the Fully connected layer (unnormalised
                    scores), shape [batch_size, output_dim]
        hidden: updated hidden state, shape [n_layers, batch_size, 2 * hidden_dim]
    """

    def __init__(self, output_dim, embedding_dim, hidden_dim, n_layers, dropout):
        super(Decoder, self).__init__()
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(output_dim, embedding_dim)
        self.gru = nn.GRU(
            embedding_dim, 2 * hidden_dim, num_layers=n_layers, dropout=dropout
        )
        self.fc_out = nn.Linear(2 * hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden):
        # input = [batch_size] -> add the sequence dimension expected by the GRU
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))
        output, hidden = self.gru(embedded, hidden)
        # No dropout on the scores: zeroing random logits (and rescaling the
        # rest) corrupts the distribution the loss is computed on.
        prediction = self.fc_out(output.squeeze(0))
        return prediction, hidden


class VanillaSeq2Seq(nn.Module):
    """
    Final EncoderDecoderModel
    The final hidden states of the bidirectional encoder initialise the
    decoder, which then generates the target sequence token by token.
    Input:
        encoder: module returning hidden states of shape
                 [n_layers * 2, batch_size, hidden_dim]
        decoder: module exposing `output_dim` and mapping (token ids, hidden)
                 to (scores, new hidden)
        device: device on which the output tensor is allocated
    """

    def __init__(self, encoder, decoder, device):
        super(VanillaSeq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, src_len, trg, teacher_forcing=0.5):
        """
        Input:
            src: source token ids, [src_len, batch_size]
            src_len: lengths of the source sequences
            trg: target token ids, [trg_len, batch_size]
            teacher_forcing: probability, drawn anew at every step, of
                             feeding the ground-truth token instead of the
                             model's own prediction
        Returns:
            outputs: scores of shape [trg_len, batch_size, output_dim];
                     position 0 is left as zeros
        """
        encoder_hidden = self.encoder(src, src_len)

        # The encoder returns [n_layers * 2, batch, hid] ordered layer by
        # layer with forward/backward interleaved.  Separate the direction
        # axis first and concatenate along the feature axis; a plain
        # view(n_layers, batch, 2 * hid) would mix different batch elements.
        stacked, hidden_batch, enc_hid_dim = encoder_hidden.shape
        per_direction = encoder_hidden.reshape(
            stacked // 2, 2, hidden_batch, enc_hid_dim
        )
        hidden = torch.cat((per_direction[:, 0], per_direction[:, 1]), dim=2)

        batch_size = src.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim

        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)

        # First decoder input is the first target token (<sos>)
        decoder_input = trg[0, :]

        for t in range(1, trg_len):
            # Feed the hidden state of the previous step back in.
            output, hidden = self.decoder(decoder_input, hidden)

            outputs[t] = output

            # Kept in its own variable so the ratio is not overwritten by
            # the outcome of the first draw.
            use_teacher = random.random() < teacher_forcing

            if use_teacher:
                decoder_input = trg[t]
            else:
                decoder_input = torch.argmax(output, dim=1)

        return outputs

## Training Vanilla Seq2Seq Model

In [0]:
def epoch_time(start_time, end_time):
    """
    Converts the span between two timestamps (in seconds) into whole
    minutes and the remaining whole seconds
    Returns:
    (minutes, seconds) as integers
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - whole_minutes * 60)
    return whole_minutes, leftover_seconds


def train(model, iterator, optimizer, criterion, clip):
    """
    Runs one training epoch
    Input:
    model: module called as model(src, src_len, trg)
    iterator: sized iterable of batches with `src` (tensor, lengths) and `trg`
    optimizer: optimizer stepping the model parameters
    criterion: loss taking (scores [N, vocab], targets [N])
    clip: maximum gradient norm
    Returns:
    mean loss per batch
    """

    model.train()

    running_loss = 0

    for batch in tqdm(iterator, total=len(iterator)):

        source, source_lengths = batch.src
        target = batch.trg

        optimizer.zero_grad()
        output = model(source, source_lengths, target)

        # Position 0 is skipped; the remaining steps are flattened so the
        # criterion sees one row per token.
        vocab_size = output.shape[-1]
        flat_output = output[1:].view(-1, vocab_size)
        flat_target = target[1:].view(-1)
        loss = criterion(flat_output, flat_target)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += loss.detach().item()

        # Drop the references before the next batch is processed.
        del output, flat_output
        del loss

    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return running_loss / len(iterator)


def evaluate(model, iterator, criterion):
    """
    Computes the mean loss per batch without updating the model
    Input:
    model: module called as model(src, src_len, trg, 0), i.e. with a
           teacher forcing ratio of 0
    iterator: sized iterable of batches with `src` (tensor, lengths) and `trg`
    criterion: loss taking (scores [N, vocab], targets [N])
    Returns:
    mean loss per batch
    """

    model.eval()

    total_loss = 0

    with torch.no_grad():
        for batch in iterator:
            source, source_lengths = batch.src
            target = batch.trg

            scores = model(source, source_lengths, target, 0)

            # Position 0 is skipped; the remaining steps are flattened so
            # the criterion sees one row per token.
            vocab_size = scores.shape[-1]
            flat_scores = scores[1:].view(-1, vocab_size)
            flat_target = target[1:].view(-1)

            total_loss += criterion(flat_scores, flat_target).detach().item()

    return total_loss / len(iterator)


def count_parameters(model):
    """
    Returns the number of scalar values in all parameters of the model
    that require a gradient
    """
    trainable = 0
    for parameter in model.parameters():
        if parameter.requires_grad:
            trainable += parameter.numel()
    return trainable


def train_vanilla_seq2seq(
    dataset_name="SQUAD",
    clip=1,
    lr=0.001,
    validation=True,
    epochs=5,
    teacher_forcing=0.0,
):
    """
    Method to train the Vanilla Seq2Seq
    Loads the dataset, builds the model from VANILLA_SEQ2SEQ, trains it and
    stores the weights with the lowest validation loss in "trained_model.pt".
    Input:
    dataset_name: string -> Dataset passed on to load_dataset
    clip: float -> Maximum gradient norm used during training
    lr: float -> Learning rate of the Adam optimizer
    validation: boolean -> Accepted for compatibility; currently unused
    epochs: int -> Number of passes over the training set
    teacher_forcing: float -> Accepted for compatibility; currently unused,
                     train() calls the model with its own default ratio
    Returns: None
    """

    logger.debug("Data Loading")

    (train_iterator, valid_iterator, test_iterator), SRC, TRG = load_dataset(
        dataset_name,
        source_vocab=VANILLA_SEQ2SEQ["INPUT_DIM"],
        target_vocab=VANILLA_SEQ2SEQ["OUTPUT_DIM"],
    )

    # The real vocabulary sizes include the special tokens, so they are
    # taken from the built vocabularies rather than from the configuration.
    INPUT_DIM = len(SRC.vocab)
    OUTPUT_DIM = len(TRG.vocab)

    logger.debug("Initializing Models on {}".format(device))
    enc = Encoder(
        INPUT_DIM,
        VANILLA_SEQ2SEQ["ENC_EMB_DIM"],
        VANILLA_SEQ2SEQ["HID_DIM"],
        VANILLA_SEQ2SEQ["N_LAYERS"],
        VANILLA_SEQ2SEQ["DROPOUT"],
    )
    dec = Decoder(
        OUTPUT_DIM,
        VANILLA_SEQ2SEQ["DEC_EMB_DIM"],
        VANILLA_SEQ2SEQ["HID_DIM"],
        VANILLA_SEQ2SEQ["N_LAYERS"],
        VANILLA_SEQ2SEQ["DROPOUT"],
    )

    model = VanillaSeq2Seq(enc, dec, device).to(device)

    logger.info(
        "The model has {:,} trainable parameters".format(count_parameters(model))
    )

    logger.debug(model)

    # Honour the requested learning rate instead of Adam's built-in default.
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Padding positions must not contribute to the loss.
    TRG_PADDING = TRG.vocab.stoi[TRG.pad_token]

    criterion = nn.CrossEntropyLoss(ignore_index=TRG_PADDING)

    best_valid_loss = float("inf")

    for epoch in range(epochs):
        start_time = time.time()

        train_loss = train(model, train_iterator, optimizer, criterion, clip)
        valid_loss = evaluate(model, valid_iterator, criterion)

        end_time = time.time()
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        # Keep only the checkpoint with the best validation loss so far.
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), "trained_model.pt")

        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        logger.info(
            "Epoch: {:02} | Time: {}m {}s".format(epoch + 1, epoch_mins, epoch_secs)
        )
        logger.info(
            "\tTrain Loss: {:.3f} | Train PPL: {:7.3f}".format(
                train_loss, math.exp(train_loss)
            )
        )
        logger.info(
            "\t Val. Loss: {:.3f} |  Val. PPL: {:7.3f}".format(
                valid_loss, math.exp(valid_loss)
            )
        )

In [9]:
# Train with all defaults; the best checkpoint is written to "trained_model.pt".
train_vanilla_seq2seq()

[DEBUG | <ipython-input-8-d1ec07b35905>:91 - train_vanilla_seq2seq() ] Data Loading
[DEBUG | <ipython-input-6-796282a7e7a5>:18 -         load_dataset() ] Loading SQUAD dataset
[DEBUG | <ipython-input-6-796282a7e7a5>:32 -         load_dataset() ] Loading from location: data/processed/SQUAD
[DEBUG | <ipython-input-6-796282a7e7a5>:47 -         load_dataset() ] Number of Samples: Training = 78120 | Validation = 8701 | Testing = 5928
[DEBUG | <ipython-input-6-796282a7e7a5>:50 -         load_dataset() ] Time Taken: 102.617422s
[DEBUG | <ipython-input-6-796282a7e7a5>:51 -         load_dataset() ] Building Vocab
[DEBUG | <ipython-input-6-796282a7e7a5>:55 -         load_dataset() ] Using Glove vectors
[INFO | vocab.py:386 -                cache() ] Loading vectors from .vector_cache/glove.6B.300d.txt.pt
[INFO | vocab.py:386 -                cache() ] Loading vectors from .vector_cache/glove.6B.300d.txt.pt
[INFO | <ipython-input-6-796282a7e7a5>:64 -         load_dataset() ] Vocabulary Built! Sou

In [11]:
# Diagnostics cell: the lines starting with "!" are notebook shell escapes,
# not Python statements.
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip install gputil
!pip install psutil
!pip install humanize
import psutil
import humanize
import os
import GPUtil as GPU
GPUs = GPU.getGPUs()
# XXX: only one GPU on Colab and it isn't guaranteed; GPUs[0] raises
# IndexError when the list is empty.
gpu = GPUs[0]
# Prints the available system RAM, the resident size of this process and the
# memory figures GPUtil reported for the first GPU.
def printm():
 process = psutil.Process(os.getpid())
 print("Gen RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available ), " | Proc size: " + humanize.naturalsize( process.memory_info().rss))
 print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
printm()

Gen RAM Free: 11.8 GB  | Proc size: 4.4 GB
GPU RAM Free: 9198MB | Used: 2243MB | Util  20% | Total 11441MB


In [0]:
def evaluate_test_set(dataset_name="SQUAD"):
    """
    Evaluates the saved Vanilla Seq2Seq checkpoint on the test set
    Rebuilds the model from VANILLA_SEQ2SEQ, loads "trained_model.pt" and
    logs the test loss and perplexity.
    Input:
    dataset_name: string -> Dataset passed on to load_dataset
    Returns: None
    """
    (train_iterator, valid_iterator, test_iterator), SRC, TRG = load_dataset(
        dataset_name,
        source_vocab=VANILLA_SEQ2SEQ["INPUT_DIM"],
        target_vocab=VANILLA_SEQ2SEQ["OUTPUT_DIM"],
    )

    # The real vocabulary sizes include the special tokens, so they are
    # taken from the built vocabularies rather than from the configuration.
    INPUT_DIM = len(SRC.vocab)
    OUTPUT_DIM = len(TRG.vocab)

    logger.debug("Initializing Models on {}".format(device))
    enc = Encoder(
        INPUT_DIM,
        VANILLA_SEQ2SEQ["ENC_EMB_DIM"],
        VANILLA_SEQ2SEQ["HID_DIM"],
        VANILLA_SEQ2SEQ["N_LAYERS"],
        VANILLA_SEQ2SEQ["DROPOUT"],
    )
    dec = Decoder(
        OUTPUT_DIM,
        VANILLA_SEQ2SEQ["DEC_EMB_DIM"],
        VANILLA_SEQ2SEQ["HID_DIM"],
        VANILLA_SEQ2SEQ["N_LAYERS"],
        VANILLA_SEQ2SEQ["DROPOUT"],
    )
    model = VanillaSeq2Seq(enc, dec, device).to(device)
    # map_location lets a checkpoint saved on the GPU be loaded on a
    # CPU-only machine (and vice versa).
    model.load_state_dict(torch.load('trained_model.pt', map_location=device))

    # Padding positions must not contribute to the loss.
    TRG_PADDING = TRG.vocab.stoi[TRG.pad_token]

    criterion = nn.CrossEntropyLoss(ignore_index=TRG_PADDING)

    test_loss = evaluate(model, test_iterator, criterion)

    logger.info(
            "Test Loss: {:.3f} | Test PPL: {:7.3f}".format(
                test_loss, math.exp(test_loss)
            )
    )


In [22]:
# Score the checkpoint stored in "trained_model.pt" on the SQUAD test split.
evaluate_test_set()

[DEBUG | <ipython-input-6-796282a7e7a5>:18 -         load_dataset() ] Loading SQUAD dataset
[DEBUG | <ipython-input-6-796282a7e7a5>:32 -         load_dataset() ] Loading from location: data/processed/SQUAD
[DEBUG | <ipython-input-6-796282a7e7a5>:47 -         load_dataset() ] Number of Samples: Training = 78120 | Validation = 8701 | Testing = 5928
[DEBUG | <ipython-input-6-796282a7e7a5>:50 -         load_dataset() ] Time Taken: 100.966801s
[DEBUG | <ipython-input-6-796282a7e7a5>:51 -         load_dataset() ] Building Vocab
[DEBUG | <ipython-input-6-796282a7e7a5>:55 -         load_dataset() ] Using Glove vectors
[INFO | vocab.py:386 -                cache() ] Loading vectors from .vector_cache/glove.6B.300d.txt.pt
[INFO | vocab.py:386 -                cache() ] Loading vectors from .vector_cache/glove.6B.300d.txt.pt
[INFO | <ipython-input-6-796282a7e7a5>:64 -         load_dataset() ] Vocabulary Built! Source Tokens = 45004 | Target Tokens = 28004  
Creating Iterators
[DEBUG | <ipython-in