In [1]:
import json
from lstm_baseline import LSTMBaseline
import torch
import string

*Parse the sentence pairs from the jsonl file, then split them into a training set and a test set*

In [2]:
# Load every example as (tokens of sentence1, tokens of sentence2, gold label).
sentence_data = []
# Translation table that deletes every ASCII punctuation character
# (string.punctuation includes "-", so no separate filter for "-" tokens is needed).
translator = str.maketrans('', '', string.punctuation)
# Read explicitly as UTF-8 so the result does not depend on the platform's default encoding.
with open('multinli_1.0_train.jsonl', 'r', encoding='utf-8') as jsonl:
    for line in jsonl:
        # Tolerate blank lines (e.g. a trailing newline) instead of crashing in json.loads.
        if not line.strip():
            continue
        json_line = json.loads(line)
        # Strip punctuation, split on whitespace, lowercase each token.
        words1 = [word.lower() for word in json_line['sentence1'].translate(translator).split()]
        words2 = [word.lower() for word in json_line['sentence2'].translate(translator).split()]
        sentence_data.append((words1, words2, json_line['gold_label']))

In [3]:
# Fixed, non-overlapping split: the first 500 pairs are used for training,
# the following 1500 (indices 500..1999) are held out for evaluation.
training_data, test_data = sentence_data[:500], sentence_data[500:2000]

In [4]:
# Assign each gold label a dense index in order of first appearance, and
# collect the vocabulary over both sentences of every pair in the dataset.
tag_to_ix = {}
words = set()
for first_tokens, second_tokens, label in sentence_data:
    # setdefault only inserts when the label is new, so indices stay 0, 1, 2, ...
    tag_to_ix.setdefault(label, len(tag_to_ix))
    words.update(first_tokens, second_tokens)
print("Number of unique words in the dataset:", len(words))

Number of unique words in the dataset: 88022


*Test function that reports the precision, recall and F1 score of the classification for each class*

In [5]:
def test(model, data=None, label_to_ix=None):
    """Print the F1 score, precision and recall of ``model`` for every class.

    Args:
        model: Object exposing ``prepare_sequence(tokens)`` and callable as
            ``model(first, second)``. The call is assumed to return a 1-D
            tensor of class scores (TODO confirm against LSTMBaseline); the
            index of the highest score is taken as the predicted class.
        data: Iterable of ``(tokens1, tokens2, label)`` triples. Defaults to
            the notebook-level ``test_data``.
        label_to_ix: Mapping from label to class index, with indices
            ``0 .. n-1``. Defaults to the notebook-level ``tag_to_ix``.

    Returns:
        None. One line per class is printed.
    """
    if data is None:
        data = test_data
    if label_to_ix is None:
        label_to_ix = tag_to_ix

    # Class names ordered by their index, so classes[i] is the label of index i.
    classes = sorted(label_to_ix, key=label_to_ix.get)
    n_classes = len(classes)
    class_tp = [0] * n_classes
    class_fp = [0] * n_classes
    class_fn = [0] * n_classes

    # The two token lists are forwarded to the model in their stored order.
    for first_tokens, second_tokens, tag in data:
        first_seq = model.prepare_sequence(first_tokens)
        second_seq = model.prepare_sequence(second_tokens)
        gold = label_to_ix[tag]
        # detach() is the supported replacement for the discouraged .data attribute.
        output = model(first_seq, second_seq).detach()
        # argmax returns the first index of the maximum, like the previous nonzero()[0] idiom.
        predicted = int(torch.argmax(output))
        if predicted == gold:
            class_tp[gold] += 1
        else:
            class_fn[gold] += 1
            class_fp[predicted] += 1

    # Iterate over however many classes exist rather than a hard-coded 3.
    for i in range(n_classes):
        predicted_total = class_tp[i] + class_fp[i]
        actual_total = class_tp[i] + class_fn[i]
        # A class that was never predicted / never present scores 0.0 instead of
        # raising ZeroDivisionError.
        prec = class_tp[i] / predicted_total if predicted_total else 0.0
        rec = class_tp[i] / actual_total if actual_total else 0.0
        # Harmonic mean of precision and recall; defined as 0 when either is 0.
        f1 = 2 / ((1/prec) + (1/rec)) if prec != 0 and rec != 0 else 0
        print('F1 score of {0} : {1}, precision: {2}, recall: {3}'.format(classes[i], f1, prec, rec))

*Create an LSTMBaseline network*

In [6]:
# Build the model from the vocabulary size and the number of gold labels.
# The middle arguments (300, 100) are presumably the embedding and hidden sizes — TODO confirm against lstm_baseline.
# NOTE(review): no random seed is set in the visible cells, so the scores of the untrained model will vary between runs.
lstm = LSTMBaseline(len(words), 300, 100, len(tag_to_ix))

In [7]:
# Baseline: evaluate the freshly constructed model on test_data before any training.
print("Scores before training")
test(lstm)

Scores before training


F1 score of contradiction : 0.450130548302872, precision: 0.2908232118758435, recall: 0.9953810623556582
F1 score of entailment : 0, precision: 0.0, recall: 0.0
F1 score of neutral : 0.0070052539404553416, precision: 0.13333333333333333, recall: 0.0035971223021582736


In [8]:
# Train on training_data; the leading 1 is presumably the number of epochs — TODO confirm against LSTMBaseline.back_propagation.
lstm.back_propagation(1, training_data, tag_to_ix)

In [None]:
# Re-evaluate on the same test_data after training so the scores are comparable with the pre-training baseline.
# NOTE(review): this cell has no execution count and its saved output lists the classes in a different
# order than the pre-training output, so the saved output looks stale — re-run from a fresh kernel.
print("Scores after training")
test(lstm)

Scores after training


F1 score of entailment : 0.0375, precision: 0.20689655172413793, recall: 0.020618556701030927
F1 score of neutral : 0.07633587786259542, precision: 0.3191489361702128, recall: 0.04335260115606936
F1 score of contradiction : 0.5236985236985237, precision: 0.3647186147186147, recall: 0.928374655647383
