In [1]:
import pandas as pd
import json
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
cfg = {
    'DIR': '../dataset/',
    'train_path': 'train.json',
    'test_path': 'test.json',
    'dev_path': 'dev.json',
}

In [3]:
from nltk import word_tokenize

def tokenize(str: str) -> str:
    return ' '.join(word_tokenize(str))

In [4]:
import os

def load_data(path: str) -> json:
    with open(os.path.join(cfg['DIR'], path), 'r') as f:
        data = json.load(f)
    return data

In [5]:
import re

def clean_str(str: str) -> str:
    # remove '\n' character
    str = str.replace('\n', ' ')
    # remove '\t' character
    str = re.sub(r'\\t', ' ', str)
    # remove '\r' character
    str = re.sub(r'\\r', ' ', str)
    # remove more than 2 consecutive occcurance of a character
    str = re.sub(r'(.)\1{2,}', r'\1', str)
    return str.strip().lower()

In [6]:
def clean_data(data: dict) -> None:
    for i in range(len(data['documents'])):
        data['documents'][i]['text'] = clean_str(data['documents'][i]['text'])
        data['documents'][i]['text'] = tokenize(data['documents'][i]['text'])

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
import scipy.sparse as sp
import nltk
def get_XY(data: dict, tfidf: TfidfVectorizer, hypothesis: dict, labels: dict, n_docs : int, threshold : float = 0.1) -> (list, list):

    X = []
    Y = []

    hypothesis_vecs = {}
    for key, val in hypothesis.items():
        hypothesis_vecs[key] = tfidf.transform([val])

    for i in tqdm(range(min(n_docs, len(data["documents"])))):
        doc_text = data["documents"][i]["text"]
        tokenized_doc = nltk.word_tokenize(doc_text)

        for j, span in enumerate(data["documents"][i]["spans"]):
            start_idx = span[0]
            end_idx = span[1]

            # get the span text
            span_text = tokenized_doc[start_idx:end_idx]
            span_text = " ".join(span_text)

            # get the span vector
            span_vector = tfidf.transform([span_text])

            for key, val in hypothesis.items():
                
                spans_for_key = data["documents"][i]["annotation_sets"][0]["annotations"][key]["spans"]
                
                if(len(spans_for_key) == 0):
                    continue
                
                # print(sp.hstack([span_vector, hypothesis_vecs[key]]))
                # convert to 1d vector
                input_vec = sp.hstack([span_vector, hypothesis_vecs[key]])
                # return X, Y
                X += [input_vec]
                Y += [1 if i in spans_for_key else 0]
        
    return X, Y
        

In [8]:
def get_hypothesis(data: dict) -> list:
    hypothesis = {}
    for key, value in data['labels'].items():
        hypothesis[key] = clean_str(value['hypothesis'])
    return hypothesis

In [9]:
from sklearn.preprocessing import LabelEncoder

def get_labels() -> dict:
    return {
        'NotMentioned': 0,
        'Entailment': 1,
        'Contradiction': 2,
    }

In [10]:
train = load_data(cfg['train_path'])
clean_data(train)
hypothesis = get_hypothesis(train)
labels = get_labels()

In [11]:
all_text = ""

for i in range(len(train["documents"])):
    all_text += train["documents"][i]["text"] + " "

tfidf = TfidfVectorizer()
tfidf.fit([all_text])

In [29]:
X_train, Y_train = get_XY(train, tfidf, hypothesis, labels=labels, n_docs=10)

  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:02<00:00,  3.84it/s]


In [33]:
print(type(X_train[0]))

<class 'scipy.sparse._csr.csr_matrix'>


In [31]:
print(len(X_train), len(Y_train))

9941 9941


In [34]:
X_train_sparse = sp.vstack(X_train)

In [35]:
from sklearn.svm import SVC

model = SVC(kernel='linear', probability=True)
model.fit(X_train_sparse, Y_train)

In [36]:
dev = load_data(cfg['dev_path'])
clean_data(dev)
X_dev, Y_dev = get_XY(dev, tfidf, hypothesis, labels=labels, n_docs=10)

100%|██████████| 10/10 [00:02<00:00,  4.11it/s]


In [37]:
X_dev_sparse = sp.vstack(X_dev)

In [39]:
Y_pred = model.predict(X_dev_sparse)

In [40]:
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import accuracy_score

def precision_at_80_recall(ypred, ytrue):
    precision, recall, thresholds = precision_recall_curve(ytrue, ypred)
    idx = (abs(recall - 0.8)).argmin()
    return precision[idx]

In [41]:
print(precision_at_80_recall(Y_pred, Y_dev))

0.0


