In [2]:
import pandas as pd
import torch
import json
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk

In [5]:
cfg = {
    'DIR': '../dataset/',
    'train_path': 'train.json',
    'test_path': 'test.json',
    'dev_path': 'dev.json',
}

In [6]:
import os

def load_data(path: str) -> json:
    with open(os.path.join(cfg['DIR'], path), 'r') as f:
        data = json.load(f)
    return data

In [7]:
import re

def clean_str(str: str) -> str:
    # remove '\n' character
    str = str.replace('\n', ' ')
    # remove '\t' character
    str = re.sub(r'\\t', ' ', str)
    # remove '\r' character
    str = re.sub(r'\\r', ' ', str)
    # remove more than 2 consecutive occcurance of a character
    str = re.sub(r'(.)\1{2,}', r'\1', str)
    return str.strip().lower()

In [8]:
def clean_data(data: dict) -> None:
    for i in range(len(data['documents'])):
        data['documents'][i]['text'] = clean_str(data['documents'][i]['text'])

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
def get_Ypred_Ytrue(data: dict, tfidf: TfidfVectorizer, hypothesis: dict, labels: dict) -> (list, list):
    
    X = []
    Y = []

    Y_pred = []
    Y_true = []

    hypothesis_vecs = {}
    for key, val in hypothesis.items():
        hypothesis_vecs[key] = tfidf.transform([val])

    for i in tqdm(range(len(data["documents"]))):
        doc_text = data["documents"][i]["text"]
        tokenized_doc = nltk.word_tokenize(doc_text)

        for i, span in enumerate(data["documents"][i]["spans"]):
            start_idx = span[0]
            end_idx = span[1]

            # get the span text
            span_text = tokenized_doc[start_idx:end_idx]
            span_text = " ".join(span_text)

            # get the span vector
            span_vector = tfidf.transform([span_text])

            for key, val in hypothesis.items():
                cosine_sim = cosine_similarity(span_vector, hypothesis_vecs[key])[0][0]
                
                if cosine_sim > 0.3:
                    Y_pred.append(1)
                else: 
                    Y_pred.append(0)

                if i in data["documents"][i]["annotation_sets"][0]["annotations"][key]["spans"]:
                    Y_true.append(1)
                else:
                    Y_true.append(0)
                
    return Y_pred, Y_true
        

In [22]:
def get_hypothesis(data: dict) -> list:
    hypothesis = {}
    for key, value in data['labels'].items():
        hypothesis[key] = clean_str(value['hypothesis'])
    return hypothesis

In [23]:
from sklearn.preprocessing import LabelEncoder

def get_labels() -> dict:
    return {
        'NotMentioned': 0,
        'Entailment': 1,
        'Contradiction': 2,
    }

In [24]:
train = load_data(cfg['train_path'])
clean_data(train)
hypothesis = get_hypothesis(train)
labels = get_labels()

In [25]:
all_text = ""

for i in range(len(train["documents"])):
    all_text += train["documents"][i]["text"] + " "

tfidf = TfidfVectorizer()
tfidf.fit([all_text])

In [26]:
Y_pred, Y_true = get_Ypred_Ytrue(train, tfidf, hypothesis, labels)

  0%|          | 0/423 [00:00<?, ?it/s]

100%|██████████| 423/423 [05:37<00:00,  1.25it/s]


In [27]:
from sklearn.metrics import precision_recall_curve

def precision_at_80_recall(ypred, ytrue):
    precision, recall, thresholds = precision_recall_curve(ytrue, ypred)
    idx = (abs(recall - 0.8)).argmin()
    return precision[idx]

In [28]:
print(precision_at_80_recall(Y_pred, Y_true))

0.016254928784099138
