In [5]:
import pandas as pd
import json
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
cfg = {
    'DIR': '../dataset/',
    'train_path': 'train.json',
    'test_path': 'test.json',
    'dev_path': 'dev.json',
}

In [7]:
from nltk import word_tokenize

def tokenize(str: str) -> str:
    return ' '.join(word_tokenize(str))

In [8]:
import os

def load_data(path: str) -> json:
    with open(os.path.join(cfg['DIR'], path), 'r') as f:
        data = json.load(f)
    return data

In [9]:
import re

def clean_str(str: str) -> str:
    # remove '\n' character
    str = str.replace('\n', ' ')
    # remove '\t' character
    str = re.sub(r'\\t', ' ', str)
    # remove '\r' character
    str = re.sub(r'\\r', ' ', str)
    # remove more than 2 consecutive occcurance of a character
    str = re.sub(r'(.)\1{2,}', r'\1', str)
    return str.strip().lower()

In [10]:
def clean_data(data: dict) -> None:
    for i in range(len(data['documents'])):
        data['documents'][i]['text'] = clean_str(data['documents'][i]['text'])
        data['documents'][i]['text'] = tokenize(data['documents'][i]['text'])

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
import nltk
def get_XY(data: dict, tfidf: TfidfVectorizer, hypothesis: dict, labels: dict, threshold : float = 0.1) -> (list, list):

    X = []
    Y = []

    hypothesis_vecs = {}
    for key, val in hypothesis.items():
        hypothesis_vecs[key] = tfidf.transform([val])

    for i in tqdm(range(len(data["documents"]))):
        doc_text = data["documents"][i]["text"]
        tokenized_doc = nltk.word_tokenize(doc_text)

        for i, span in enumerate(data["documents"][i]["spans"]):
            start_idx = span[0]
            end_idx = span[1]

            # get the span text
            span_text = tokenized_doc[start_idx:end_idx]
            span_text = " ".join(span_text)

            # get the span vector
            span_vector = tfidf.transform([span_text])

            for key, val in hypothesis.items():
                
                spans_for_key = data["documents"][i]["annotation_sets"][0]["annotations"][key]["spans"]
                
                if(len(spans_for_key) == 0):
                    continue
                
                X += [span_vector.toarray().tolist()[0] + hypothesis_vecs[key].toarray().tolist()[0]]
                Y += [1 if i in spans_for_key else 0]
        
    return X, Y
        

In [20]:
def get_hypothesis(data: dict) -> list:
    hypothesis = {}
    for key, value in data['labels'].items():
        hypothesis[key] = clean_str(value['hypothesis'])
    return hypothesis

In [21]:
from sklearn.preprocessing import LabelEncoder

def get_labels() -> dict:
    return {
        'NotMentioned': 0,
        'Entailment': 1,
        'Contradiction': 2,
    }

In [22]:
train = load_data(cfg['train_path'])
clean_data(train)
hypothesis = get_hypothesis(train)
labels = get_labels()

In [23]:
all_text = ""

for i in range(len(train["documents"])):
    all_text += train["documents"][i]["text"] + " "

tfidf = TfidfVectorizer()
tfidf.fit([all_text])

In [24]:
X_train, Y_train = get_XY(train, tfidf, hypothesis, labels=labels)

  0%|          | 0/423 [00:00<?, ?it/s]

  1%|          | 4/423 [00:06<12:45,  1.83s/it]

In [17]:
print(len(X_train), len(Y_train))

340686 340686


In [18]:
from sklearn.svm import SVC

model = SVC(kernel='linear', probability=True)
model.fit(X_train, Y_train)

ValueError: setting an array element with a sequence.