In [1]:
import pandas as pd
import torch
import json
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk

In [2]:
# Sample sentence, normalized to lower case before tokenization.
test_string = "I am a boy. I live in Hyderabad.".lower()

# Split the normalized sentence into word-level tokens and show the result.
tokens = nltk.word_tokenize(test_string)
print(tokens)

['i', 'am', 'a', 'boy', '.', 'i', 'live', 'in', 'hyderabad', '.']


In [3]:
# Sample token positions; presumably example arguments for get_span (they share
# its parameter names) — confirm against the cells that use them.
start_token_idx, end_token_idx = 1, 2

def get_span(text, start_token_idx, end_token_idx):
    """Return the character span covered by a range of tokens.

    The text is processed with the module-level ``nlp`` callable, which is not
    defined in this cell (presumably a stanza-style pipeline whose result has
    ``.sentences[i].tokens[j]`` with ``start_char`` / ``end_char`` — TODO
    confirm). Only the tokens of the FIRST sentence are consulted, so both
    indices are positions within that sentence.

    Args:
        text: Raw text handed to ``nlp``.
        start_token_idx: Index of the first token of the span.
        end_token_idx: Index of the last token of the span.

    Returns:
        ``(start_char, end_char)``: ``start_char`` of the start token and
        ``end_char`` of the end token.
    """
    first_sentence_tokens = nlp(text).sentences[0].tokens
    span_begin = first_sentence_tokens[start_token_idx].start_char
    span_end = first_sentence_tokens[end_token_idx].end_char
    return span_begin, span_end

In [1]:
# Notebook configuration: the dataset directory plus the file name of each
# split; the file names are joined onto DIR when a split is loaded.
cfg = dict(
    DIR='../dataset/',
    train_path='train.json',
    test_path='test.json',
    dev_path='dev.json',
)

In [None]:
import os

def load_data(path: str) -> dict:
    """Load one JSON dataset file from the configured dataset directory.

    Args:
        path: File name (or relative path) that is joined onto ``cfg['DIR']``.

    Returns:
        The object decoded by ``json.load``. Annotated as ``dict`` because the
        other helpers in this notebook index the result by key — TODO confirm
        that every dataset file has a JSON object at its top level.

    Raises:
        OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # JSON text is UTF-8 by specification; naming the encoding keeps decoding
    # independent of the platform's default locale encoding.
    with open(os.path.join(cfg['DIR'], path), 'r', encoding='utf-8') as f:
        return json.load(f)

In [None]:
import re

def clean_str(str: str) -> str:
    """Normalize whitespace and repeated characters, then lower-case the text.

    Steps, in order:
      1. Newline, tab and carriage-return characters become a single space.
         The literal two-character sequences ``\\t`` and ``\\r`` (a backslash
         followed by a letter) are replaced as well.
      2. Any run of three or more identical characters is collapsed to one
         occurrence of that character.
      3. Leading/trailing whitespace is stripped and the text is lower-cased.

    Args:
        str: Text to clean. The name shadows the builtin; it is kept so that
            existing keyword callers continue to work.

    Returns:
        The cleaned, lower-cased text.
    """
    text = str.replace('\n', ' ')
    # Match the real control character as well as its escaped spelling: the
    # pattern r'\\t' on its own only matches a backslash followed by "t" and
    # would leave genuine tab characters in the text.
    text = re.sub(r'\\t|\t', ' ', text)
    text = re.sub(r'\\r|\r', ' ', text)
    # Collapse runs of three or more of the same character to a single one.
    text = re.sub(r'(.)\1{2,}', r'\1', text)
    return text.strip().lower()

In [None]:
def clean_data(data: dict) -> None:
    """Clean the text of every document in ``data`` in place.

    Each entry of ``data['documents']`` has its ``'text'`` value replaced by
    the result of passing it through ``clean_str``. Nothing is returned; the
    passed-in structure itself is modified.
    """
    for document in data['documents']:
        document['text'] = clean_str(document['text'])

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
def get_XY(data: dict, tfidf: TfidfVectorizer, hypothesis: dict, labels: dict) -> tuple:
    """Build TF-IDF feature rows and labels for every annotated span.

    One sample is produced per (document, hypothesis, span) combination. Its
    feature row is the TF-IDF vector of the span text followed by the TF-IDF
    vector of the hypothesis text; its label is ``labels[choice]`` for that
    document/hypothesis annotation.

    Args:
        data: Dataset dict. For each entry of ``data["documents"]`` this reads
            ``["text"]`` and, per hypothesis key,
            ``["annotation_sets"][0]["annotations"][key]`` with its ``"spans"``
            and ``"choice"`` entries.
        tfidf: Vectorizer whose ``transform`` is applied to span and hypothesis
            text. Presumably already fitted by the caller — ``transform`` is
            also called on the hypotheses before any document is visited.
        hypothesis: Maps annotation key to hypothesis text.
        labels: Maps an annotation ``"choice"`` value to the label stored in Y.

    Returns:
        ``(X, Y)``: ``X`` is a list of dense feature rows (lists of numbers)
        and ``Y`` the list of corresponding labels, in document, hypothesis,
        span order.
    """
    # A hypothesis vector does not depend on the document, so vectorize each
    # hypothesis once here instead of once per document.
    hypothesis_vectors = {
        key: tfidf.transform([text]).toarray().tolist()[0]
        for key, text in hypothesis.items()
    }

    X = []
    Y = []
    for document in data["documents"]:
        tokenized_doc = nltk.word_tokenize(document["text"])

        for key, hypothesis_vector in hypothesis_vectors.items():
            annotation = document["annotation_sets"][0]["annotations"][key]

            for span in annotation["spans"]:
                # span[0]:span[1] is used as a slice of the nltk tokens
                # (end-exclusive). Assumes the stored offsets are token indices
                # under this same tokenization — TODO confirm.
                span_text = " ".join(tokenized_doc[span[0]:span[1]])
                premise_vector = tfidf.transform([span_text]).toarray().tolist()[0]

                # Concatenation creates a fresh list, so the shared hypothesis
                # vector is never mutated.
                X.append(premise_vector + hypothesis_vector)
                # Looked up per span on purpose: an annotation without spans
                # never needs its choice to be present in `labels`.
                Y.append(labels[annotation["choice"]])

    return X, Y
        