In [24]:
import numpy as np
import torch
import torch.nn as nn
from torch.nn import init
import torch.optim as optim
import math
import random
import os
import time
from tqdm import tqdm
import json
from argparse import ArgumentParser


unk = '<UNK>'
# Consult the PyTorch documentation for information on the functions used below:
# https://pytorch.org/docs/stable/torch.html
class FFNN(nn.Module):
    def __init__(self, input_dim, h):
        super(FFNN, self).__init__()
        self.h = h
        self.W1 = nn.Linear(input_dim, h)
        self.activation = nn.ReLU() # The rectified linear unit; one valid choice of activation function
        self.output_dim = 5
        self.W2 = nn.Linear(h, self.output_dim)

        self.softmax = nn.LogSoftmax() # The softmax function that converts vectors into probability distributions; computes log probabilities for computational benefits
        self.loss = nn.NLLLoss() # The cross-entropy/negative log likelihood loss taught in class

    def compute_Loss(self, predicted_vector, gold_label):
        return self.loss(predicted_vector, gold_label)

    def forward(self, input_vector):
        # [to fill] obtain first hidden layer representation
        hidden_layer = self.activation(self.W1(input_vector))
        # [to fill] obtain output layer representation
        output_layer = self.W2(hidden_layer)
        # [to fill] obtain probability dist.
        predicted_vector = self.softmax(output_layer)
        # return predicted probability distribution
        return predicted_vector


# Returns:
# vocab = A set of strings corresponding to the vocabulary
def make_vocab(data):
    vocab = set()
    for document, _ in data:
        for word in document:
            vocab.add(word)
    return vocab


# Returns:
# vocab = A set of strings corresponding to the vocabulary including <UNK>
# word2index = A dictionary mapping word/token to its index (a number in 0, ..., V - 1)
# index2word = A dictionary inverting the mapping of word2index
def make_indices(vocab):
    vocab_list = sorted(vocab)
    vocab_list.append(unk)
    word2index = {}
    index2word = {}
    for index, word in enumerate(vocab_list):
        word2index[word] = index
        index2word[index] = word
    vocab.add(unk)
    return vocab, word2index, index2word


# Returns:
# vectorized_data = A list of pairs (vector representation of input, y)
def convert_to_vector_representation(data, word2index):
    vectorized_data = []
    for document, y in data:
        vector = torch.zeros(len(word2index))
        for word in document:
            index = word2index.get(word, word2index[unk])
            vector[index] += 1
        vectorized_data.append((vector, y))
    return vectorized_data



def load_data(train_data, val_data):
    with open(train_data) as training_f:
        training = json.load(training_f)
    with open(val_data) as valid_f:
        validation = json.load(valid_f)

    tra = []
    val = []
    for elt in training:
        tra.append((elt["text"].split(),int(elt["stars"]-1)))
    for elt in validation:
        val.append((elt["text"].split(),int(elt["stars"]-1)))

    return tra, val


if __name__ == "__main__":
    parser = ArgumentParser()
    # parser.add_argument("-hd", "--hidden_dim", type=int, default=50, required = True, help = "hidden_dim")
    # parser.add_argument("-e", "--epochs", type=int, default=10, required = True, help = "num of epochs to train")
    # parser.add_argument("--train_data", required = True, default="training.json", help = "path to training data")
    # parser.add_argument("--val_data", required = True, default="validation.json", help = "path to validation data")
    hidden_dim = 50
    epochs = 10
    train_data_file = "training.json"
    val_data_file = "validation.json"
    output_file = f"results_rnn_hidden{hidden_dim}.json"
    parser.add_argument("--test_data", default = "to fill", help = "path to test data")
    parser.add_argument('--do_train', action='store_true')
    args = parser.parse_args(args=[])

    args.hidden_dim = hidden_dim
    args.epochs = epochs
    args.train_data = train_data_file
    args.val_data = val_data_file

    # fix random seeds
    random.seed(42)
    torch.manual_seed(42)

    #Hidden Dimension
    print("Hidden Dimension: {}".format(args.hidden_dim))

    # load data
    print("========== Loading data ==========")
    train_data, valid_data = load_data(args.train_data, args.val_data) # X_data is a list of pairs (document, y); y in {0,1,2,3,4}
    vocab = make_vocab(train_data)
    vocab, word2index, index2word = make_indices(vocab)

    print("========== Vectorizing data ==========")
    train_data = convert_to_vector_representation(train_data, word2index)
    valid_data = convert_to_vector_representation(valid_data, word2index)


    model = FFNN(input_dim = len(vocab), h = args.hidden_dim)
    optimizer = optim.SGD(model.parameters(),lr=0.01, momentum=0.9)
    results = {"train_acc": [], "val_acc": [], "train_loss": [], "val_loss": [], "train_time": [], "val_time": []}
    print("========== Training for {} epochs ==========".format(args.epochs))
    for epoch in range(args.epochs):
        model.train()
        optimizer.zero_grad()
        loss = None
        correct = 0
        total = 0
        start_time = time.time()
        print("Training started for epoch {}".format(epoch + 1))
        random.shuffle(train_data) # Good practice to shuffle order of training data
        minibatch_size = 16
        N = len(train_data)
        for minibatch_index in tqdm(range(N // minibatch_size)):
            optimizer.zero_grad()
            loss = None
            for example_index in range(minibatch_size):
                input_vector, gold_label = train_data[minibatch_index * minibatch_size + example_index]
                predicted_vector = model(input_vector)
                predicted_label = torch.argmax(predicted_vector)
                correct += int(predicted_label == gold_label)
                total += 1
                example_loss = model.compute_Loss(predicted_vector.view(1,-1), torch.tensor([gold_label]))
                if loss is None:
                    loss = example_loss
                else:
                    loss += example_loss
            loss = loss / minibatch_size
            loss.backward()
            optimizer.step()
        print("Training completed for epoch {}".format(epoch + 1))
        print("Training accuracy for epoch {}: {}".format(epoch + 1, correct / total))
        print("Training time for this epoch: {}".format(time.time() - start_time))
        results["train_acc"].append(float(correct) / float(total))
        results["train_loss"].append(float(loss.item()))  # convert tensor to float
        results["train_time"].append(float(time.time() - start_time))


        loss = None
        correct = 0
        total = 0
        start_time = time.time()
        print("Validation started for epoch {}".format(epoch + 1))
        minibatch_size = 16
        N = len(valid_data)
        for minibatch_index in tqdm(range(N // minibatch_size)):
            optimizer.zero_grad()
            loss = None
            for example_index in range(minibatch_size):
                input_vector, gold_label = valid_data[minibatch_index * minibatch_size + example_index]
                predicted_vector = model(input_vector)
                predicted_label = torch.argmax(predicted_vector)
                correct += int(predicted_label == gold_label)
                total += 1
                example_loss = model.compute_Loss(predicted_vector.view(1,-1), torch.tensor([gold_label]))
                if loss is None:
                    loss = example_loss
                else:
                    loss += example_loss
            loss = loss / minibatch_size
        print("Validation completed for epoch {}".format(epoch + 1))
        print("Validation accuracy for epoch {}: {}".format(epoch + 1, correct / total))
        print("Validation time for this epoch: {}".format(time.time() - start_time))
        results["val_acc"].append(float(correct) / float(total))
        results["val_loss"].append(float(loss.item()))  # convert tensor to Python float
        results["val_time"].append(float(time.time() - start_time))


        with open(output_file, "w") as f:
          json.dump(results, f, indent=2)

        print(f"Results saved to {output_file}")

    # write out to results/test.out


Hidden Dimension: 50
Training started for epoch 1


100%|██████████| 500/500 [00:31<00:00, 15.73it/s]


Training completed for epoch 1
Training accuracy for epoch 1: 0.527
Training time for this epoch: 31.78461742401123
Validation started for epoch 1


100%|██████████| 50/50 [00:00<00:00, 73.46it/s]


Validation completed for epoch 1
Validation accuracy for epoch 1: 0.535
Validation time for this epoch: 0.6853542327880859
Results saved to results_rnn_hidden50.json
Training started for epoch 2


100%|██████████| 500/500 [00:33<00:00, 14.74it/s]


Training completed for epoch 2
Training accuracy for epoch 2: 0.585625
Training time for this epoch: 33.916747093200684
Validation started for epoch 2


100%|██████████| 50/50 [00:00<00:00, 73.59it/s]


Validation completed for epoch 2
Validation accuracy for epoch 2: 0.59375
Validation time for this epoch: 0.6837759017944336
Results saved to results_rnn_hidden50.json
Training started for epoch 3


100%|██████████| 500/500 [00:41<00:00, 12.09it/s]


Training completed for epoch 3
Training accuracy for epoch 3: 0.6225
Training time for this epoch: 41.35323095321655
Validation started for epoch 3


100%|██████████| 50/50 [00:00<00:00, 71.26it/s]


Validation completed for epoch 3
Validation accuracy for epoch 3: 0.59625
Validation time for this epoch: 0.7080729007720947
Results saved to results_rnn_hidden50.json
Training started for epoch 4


100%|██████████| 500/500 [00:41<00:00, 12.02it/s]


Training completed for epoch 4
Training accuracy for epoch 4: 0.64525
Training time for this epoch: 41.61332058906555
Validation started for epoch 4


100%|██████████| 50/50 [00:00<00:00, 60.41it/s]


Validation completed for epoch 4
Validation accuracy for epoch 4: 0.59625
Validation time for this epoch: 0.8300416469573975
Results saved to results_rnn_hidden50.json
Training started for epoch 5


100%|██████████| 500/500 [00:44<00:00, 11.31it/s]


Training completed for epoch 5
Training accuracy for epoch 5: 0.651
Training time for this epoch: 44.21337389945984
Validation started for epoch 5


100%|██████████| 50/50 [00:00<00:00, 69.77it/s]


Validation completed for epoch 5
Validation accuracy for epoch 5: 0.5975
Validation time for this epoch: 0.7204208374023438
Results saved to results_rnn_hidden50.json
Training started for epoch 6


100%|██████████| 500/500 [00:43<00:00, 11.49it/s]


Training completed for epoch 6
Training accuracy for epoch 6: 0.684375
Training time for this epoch: 43.51960206031799
Validation started for epoch 6


100%|██████████| 50/50 [00:00<00:00, 71.16it/s]


Validation completed for epoch 6
Validation accuracy for epoch 6: 0.61
Validation time for this epoch: 0.7069120407104492
Results saved to results_rnn_hidden50.json
Training started for epoch 7


100%|██████████| 500/500 [00:43<00:00, 11.49it/s]


Training completed for epoch 7
Training accuracy for epoch 7: 0.718375
Training time for this epoch: 43.516746520996094
Validation started for epoch 7


100%|██████████| 50/50 [00:00<00:00, 70.51it/s]


Validation completed for epoch 7
Validation accuracy for epoch 7: 0.52625
Validation time for this epoch: 0.7121109962463379
Results saved to results_rnn_hidden50.json
Training started for epoch 8


100%|██████████| 500/500 [00:44<00:00, 11.22it/s]


Training completed for epoch 8
Training accuracy for epoch 8: 0.726125
Training time for this epoch: 44.588974714279175
Validation started for epoch 8


100%|██████████| 50/50 [00:00<00:00, 71.73it/s]


Validation completed for epoch 8
Validation accuracy for epoch 8: 0.5925
Validation time for this epoch: 0.7013015747070312
Results saved to results_rnn_hidden50.json
Training started for epoch 9


100%|██████████| 500/500 [00:44<00:00, 11.26it/s]


Training completed for epoch 9
Training accuracy for epoch 9: 0.752
Training time for this epoch: 44.41434288024902
Validation started for epoch 9


100%|██████████| 50/50 [00:00<00:00, 70.72it/s]


Validation completed for epoch 9
Validation accuracy for epoch 9: 0.6025
Validation time for this epoch: 0.7104687690734863
Results saved to results_rnn_hidden50.json
Training started for epoch 10


100%|██████████| 500/500 [00:44<00:00, 11.34it/s]


Training completed for epoch 10
Training accuracy for epoch 10: 0.78025
Training time for this epoch: 44.09694576263428
Validation started for epoch 10


100%|██████████| 50/50 [00:00<00:00, 65.05it/s]

Validation completed for epoch 10
Validation accuracy for epoch 10: 0.59
Validation time for this epoch: 0.7714321613311768
Results saved to results_rnn_hidden50.json





In [16]:
import os
print(os.getcwd())


/content


In [27]:
import numpy as np
import torch
import torch.nn as nn
from torch.nn import init
import torch.optim as optim
import math
import random
import os
import time
from tqdm import tqdm
import json
import string
from argparse import ArgumentParser
import pickle

unk = '<UNK>'
# Consult the PyTorch documentation for information on the functions used below:
# https://pytorch.org/docs/stable/torch.html
class RNN(nn.Module):
    def __init__(self, input_dim, h):  # Add relevant parameters
        super(RNN, self).__init__()
        self.h = h
        self.numOfLayer = 1
        self.rnn = nn.RNN(input_dim, h, self.numOfLayer, nonlinearity='tanh')
        self.W = nn.Linear(h, 5)
        self.softmax = nn.LogSoftmax(dim=1)
        self.loss = nn.NLLLoss()

    def compute_Loss(self, predicted_vector, gold_label):
        return self.loss(predicted_vector, gold_label)

    def forward(self, inputs):
         # [to fill] obtain hidden layer representation (https://pytorch.org/docs/stable/generated/torch.nn.RNN.html)
        _, hidden = self.rnn(inputs)
        # [to fill] obtain output layer representations
        predicted_vector = self.W(hidden)
        # [to fill] sum over output
        predicted_vector = torch.sum(predicted_vector, dim=0)
        # [to fill] obtain probability dist.
        predicted_vector = self.softmax(predicted_vector)
        return predicted_vector


def load_data(train_data, val_data):
    with open(train_data) as training_f:
        training = json.load(training_f)
    with open(val_data) as valid_f:
        validation = json.load(valid_f)

    tra = []
    val = []
    for elt in training:
        tra.append((elt["text"].split(),int(elt["stars"]-1)))
    for elt in validation:
        val.append((elt["text"].split(),int(elt["stars"]-1)))
    return tra, val


if __name__ == "__main__":
    parser = ArgumentParser()
    # parser.add_argument("-hd", "--hidden_dim", type=int, default=50, required = True, help = "hidden_dim")
    # parser.add_argument("-e", "--epochs", type=int, default=10, required = True, help = "num of epochs to train")
    # parser.add_argument("--train_data", required = True, default="training.json", help = "path to training data")
    # parser.add_argument("--val_data", required = True, default="validation.json", help = "path to validation data")
    hidden_dim = 100
    epochs = 10
    train_data_file = "training.json"
    val_data_file = "validation.json"
    output_file = f"results_rnn_hidden{hidden_dim}.json"
    parser.add_argument("--test_data", default = "to fill", help = "path to test data")
    parser.add_argument('--do_train', action='store_true')
    args = parser.parse_args(args=[])

    args.hidden_dim = hidden_dim
    args.epochs = epochs
    args.train_data = train_data_file
    args.val_data = val_data_file
    print("========== Loading data ==========")
    train_data, valid_data = load_data(args.train_data, args.val_data) # X_data is a list of pairs (document, y); y in {0,1,2,3,4}

    # Think about the type of function that an RNN describes. To apply it, you will need to convert the text data into vector representations.
    # Further, think about where the vectors will come from. There are 3 reasonable choices:
    # 1) Randomly assign the input to vectors and learn better embeddings during training; see the PyTorch documentation for guidance
    # 2) Assign the input to vectors using pretrained word embeddings. We recommend any of {Word2Vec, GloVe, FastText}. Then, you do not train/update these embeddings.
    # 3) You do the same as 2) but you train (this is called fine-tuning) the pretrained embeddings further.
    # Option 3 will be the most time consuming, so we do not recommend starting with this

    print("========== Vectorizing data ==========")
    model = RNN(50, args.hidden_dim)  # Fill in parameters
    # optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
    optimizer = optim.Adam(model.parameters(), lr=0.01)
    word_embedding = pickle.load(open('./word_embedding.pkl', 'rb'))

    stopping_condition = False
    epoch = 0

    last_train_accuracy = 0
    last_validation_accuracy = 0

    results = {"train_acc": [], "val_acc": [], "train_loss": [], "val_loss": [], "train_time": [], "val_time": []}
    print("========== Training ==========")
    while not stopping_condition:
        random.shuffle(train_data)
        model.train()
        # You will need further code to operationalize training, ffnn.py may be helpful
        print("Training started for epoch {}".format(epoch + 1))
        train_data = train_data
        correct = 0
        total = 0
        minibatch_size = 16
        N = len(train_data)

        loss_total = 0
        loss_count = 0
        for minibatch_index in tqdm(range(N // minibatch_size)):
            optimizer.zero_grad()
            loss = None
            for example_index in range(minibatch_size):
                input_words, gold_label = train_data[minibatch_index * minibatch_size + example_index]
                input_words = " ".join(input_words)

                # Remove punctuation
                input_words = input_words.translate(input_words.maketrans("", "", string.punctuation)).split()

                # Look up word embedding dictionary
                vectors = [word_embedding[i.lower()] if i.lower() in word_embedding.keys() else word_embedding['unk'] for i in input_words ]

                # Transform the input into required shape
                vectors = torch.tensor(vectors).view(len(vectors), 1, -1)
                output = model(vectors)

                # Get loss
                example_loss = model.compute_Loss(output.view(1,-1), torch.tensor([gold_label]))

                # Get predicted label
                predicted_label = torch.argmax(output)

                correct += int(predicted_label == gold_label)
                # print(predicted_label, gold_label)
                total += 1
                if loss is None:
                    loss = example_loss
                else:
                    loss += example_loss

            loss = loss / minibatch_size
            loss_total += loss.data
            loss_count += 1
            loss.backward()
            optimizer.step()
        print(loss_total/loss_count)
        print("Training completed for epoch {}".format(epoch + 1))
        print("Training accuracy for epoch {}: {}".format(epoch + 1, correct / total))
        trainning_accuracy = correct/total
        results["train_acc"].append(float(correct) / float(total))
        results["train_loss"].append(float(loss.item()))  # convert tensor to float
        results["train_time"].append(float(time.time() - start_time))


        model.eval()
        correct = 0
        total = 0
        random.shuffle(valid_data)
        print("Validation started for epoch {}".format(epoch + 1))
        valid_data = valid_data

        for input_words, gold_label in tqdm(valid_data):
            input_words = " ".join(input_words)
            input_words = input_words.translate(input_words.maketrans("", "", string.punctuation)).split()
            vectors = [word_embedding[i.lower()] if i.lower() in word_embedding.keys() else word_embedding['unk'] for i
                       in input_words]

            vectors = torch.tensor(vectors).view(len(vectors), 1, -1)
            output = model(vectors)
            predicted_label = torch.argmax(output)
            correct += int(predicted_label == gold_label)
            total += 1
            # print(predicted_label, gold_label)
        print("Validation completed for epoch {}".format(epoch + 1))
        print("Validation accuracy for epoch {}: {}".format(epoch + 1, correct / total))
        validation_accuracy = correct/total

        if validation_accuracy < last_validation_accuracy and trainning_accuracy < last_train_accuracy:
            stopping_condition=True
            print("Training done to avoid overfitting!")
            print("Best validation accuracy is:", last_validation_accuracy)
            results["val_acc"].append(float(correct) / float(total))
            results["val_loss"].append(float(loss.item()))  # convert tensor to Python float
            results["val_time"].append(float(time.time() - start_time))
        else:
            last_validation_accuracy = validation_accuracy
            last_train_accuracy = trainning_accuracy
            results["val_acc"].append(float(correct) / float(total))
            results["val_loss"].append(float(loss.item()))  # convert tensor to Python float
            results["val_time"].append(float(time.time() - start_time))
        with open(output_file, "w") as f:
          json.dump(results, f, indent=2)

        epoch += 1



    # You may find it beneficial to keep track of training accuracy or training loss;

    # Think about how to update the model and what this entails. Consider ffnn.py and the PyTorch documentation for guidance


Training started for epoch 1


100%|██████████| 500/500 [02:06<00:00,  3.95it/s]


tensor(1.1247)
Training completed for epoch 1
Training accuracy for epoch 1: 0.40125
Validation started for epoch 1


100%|██████████| 800/800 [00:04<00:00, 179.17it/s]


Validation completed for epoch 1
Validation accuracy for epoch 1: 0.4
Training started for epoch 2


100%|██████████| 500/500 [02:05<00:00,  4.00it/s]


tensor(1.1235)
Training completed for epoch 2
Training accuracy for epoch 2: 0.4045
Validation started for epoch 2


100%|██████████| 800/800 [00:03<00:00, 209.88it/s]


Validation completed for epoch 2
Validation accuracy for epoch 2: 0.4225
Training started for epoch 3


100%|██████████| 500/500 [02:04<00:00,  4.02it/s]


tensor(1.1122)
Training completed for epoch 3
Training accuracy for epoch 3: 0.41
Validation started for epoch 3


100%|██████████| 800/800 [00:04<00:00, 169.67it/s]


Validation completed for epoch 3
Validation accuracy for epoch 3: 0.4075
Training started for epoch 4


100%|██████████| 500/500 [01:58<00:00,  4.23it/s]


tensor(1.0988)
Training completed for epoch 4
Training accuracy for epoch 4: 0.412125
Validation started for epoch 4


100%|██████████| 800/800 [00:04<00:00, 170.79it/s]


Validation completed for epoch 4
Validation accuracy for epoch 4: 0.43
Training started for epoch 5


100%|██████████| 500/500 [01:55<00:00,  4.32it/s]


tensor(1.1095)
Training completed for epoch 5
Training accuracy for epoch 5: 0.4125
Validation started for epoch 5


100%|██████████| 800/800 [00:04<00:00, 171.27it/s]


Validation completed for epoch 5
Validation accuracy for epoch 5: 0.39875
Training started for epoch 6


100%|██████████| 500/500 [01:56<00:00,  4.28it/s]


tensor(1.1025)
Training completed for epoch 6
Training accuracy for epoch 6: 0.4175
Validation started for epoch 6


100%|██████████| 800/800 [00:03<00:00, 204.36it/s]


Validation completed for epoch 6
Validation accuracy for epoch 6: 0.415
Training started for epoch 7


100%|██████████| 500/500 [01:57<00:00,  4.26it/s]


tensor(1.1158)
Training completed for epoch 7
Training accuracy for epoch 7: 0.408125
Validation started for epoch 7


100%|██████████| 800/800 [00:04<00:00, 196.11it/s]

Validation completed for epoch 7
Validation accuracy for epoch 7: 0.41
Training done to avoid overfitting!
Best validation accuracy is: 0.415



