In [1]:
import pandas as pd
import json
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
cfg = {
    'DIR': '../dataset/',
    'train_path': 'train.json',
    'test_path': 'test.json',
    'dev_path': 'dev.json',
}

In [3]:
from nltk import word_tokenize

def tokenize(str: str) -> str:
    return ' '.join(word_tokenize(str))

In [4]:
import os

def load_data(path: str) -> json:
    with open(os.path.join(cfg['DIR'], path), 'r') as f:
        data = json.load(f)
    return data

In [5]:
import re

def clean_str(str: str) -> str:
    # remove '\n' character
    str = str.replace('\n', ' ')
    # remove '\t' character
    str = re.sub(r'\\t', ' ', str)
    # remove '\r' character
    str = re.sub(r'\\r', ' ', str)
    # remove more than 2 consecutive occcurance of a character
    str = re.sub(r'(.)\1{2,}', r'\1', str)
    return str.strip().lower()

In [6]:
def clean_data(data: dict) -> None:
    for i in range(len(data['documents'])):
        data['documents'][i]['text'] = clean_str(data['documents'][i]['text'])
        data['documents'][i]['text'] = tokenize(data['documents'][i]['text'])

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
def get_XY(data: dict, tfidf: TfidfVectorizer, hypothesis: dict, labels: dict) -> (list, list):
    
    X = []
    Y = []
    for i in range(len(data["documents"])):
        premise = data["documents"][i]["text"]
        # convert premise to tf-idf
        premise_vector = tfidf.transform([premise]).toarray().tolist()[0]
        for key, val in hypothesis.items():
            # convert hypothesis to tf-idf
            hypothesis_vector = tfidf.transform([val]).toarray().tolist()[0]
            # concatenate premise and hypothesis
            X += [premise_vector + hypothesis_vector]
            Y += [labels[data["documents"][i]["annotation_sets"][0]["annotations"][key]["choice"]]]
    return X, Y

In [14]:
def get_hypothesis(data: dict) -> list:
    hypothesis = {}
    for key, value in data['labels'].items():
        hypothesis[key] = clean_str(value['hypothesis'])
    return hypothesis

In [15]:
from sklearn.preprocessing import LabelEncoder

def get_labels() -> dict:
    return {
        'NotMentioned': 0,
        'Entailment': 1,
        'Contradiction': 2,
    }

In [16]:
train = load_data(cfg['train_path'])
clean_data(train)
hypothesis = get_hypothesis(train)
labels = get_labels()

In [17]:
all_text = ""

for i in range(len(train["documents"])):
    all_text += train["documents"][i]["text"] + " "

tfidf = TfidfVectorizer()
tfidf.fit([all_text])

In [18]:
X_train, Y_train = get_XY(train, tfidf, hypothesis, labels=labels)

In [19]:
from sklearn.svm import SVC

model = SVC(kernel='linear')
model.fit(X_train[:1000], Y_train[:1000])

In [22]:
dev = load_data(cfg['dev_path'])
clean_data(dev)
X_dev, Y_dev = get_XY(dev, tfidf, hypothesis, labels=labels)
model.score(X_dev, Y_dev)

0.6808100289296046