In [1]:
import pandas as pd
import json
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
cfg = {
    'DIR': '../dataset/',
    'train_path': 'train.json',
    'test_path': 'test.json',
    'dev_path': 'dev.json',
}

In [6]:
import os
def load_data(path: str) -> json:
    with open(os.path.join(cfg['DIR'], path), 'r') as f:
        data = json.load(f)
    return data

In [24]:
import re

def clean_str(str: str) -> str:
    # remove '\n' character
    str = str.replace('\n', ' ')
    # remove '\t' character
    str = re.sub(r'\\t', ' ', str)
    # remove '\r' character
    str = re.sub(r'\\r', ' ', str)
    # remove more than 2 consecutive occcurance of a character
    str = re.sub(r'(.)\1{2,}', r'\1', str)
    return str.strip().lower()

In [25]:
def clean_data(data: dict) -> None:
    for i in range(len(data['documents'])):
        data['documents'][i]['text'] = clean_str(data['documents'][i]['text'])

In [67]:
from sklearn.feature_extraction.text import TfidfVectorizer
def get_XY(data: dict, tfidf: TfidfVectorizer, hypothesis: dict, labels: dict) -> (list, list):
    
    
    X = []
    Y = []
    for i in range(len(data["documents"])):
        premise = data["documents"][i]["text"]
        # convert premise to tf-idf
        premise_vector = tfidfvectorizer.transform([premise]).toarray().tolist()[0]
        for key, val in hypothesis.items():
            # convert hypothesis to tf-idf
            hypothesis_vector = tfidfvectorizer.transform([val]).toarray().tolist()[0]
            # concatenate premise and hypothesis
            X += [premise_vector + hypothesis_vector]
            Y += [labels[data["documents"][i]["annotation_sets"][0]["annotations"][key]["choice"]]]
    return X, Y
        

In [68]:
def get_hypothesis(data: dict) -> list:
    hypothesis = {}
    for key, value in data['labels'].items():
        hypothesis[key] = clean_str(value['hypothesis'])
    return hypothesis

In [69]:
from sklearn.preprocessing import LabelEncoder

def get_labels() -> dict:
    return {
        'NotMentioned': 0,
        'Entailment': 1,
        'Contradiction': 2,
    }

In [65]:
train = load_data(cfg['train_path'])
clean_data(train)
hypothesis = get_hypothesis(train)
labels = get_labels()

In [49]:
hypothesis

{'nda-11': "receiving party shall not reverse engineer any objects which embody disclosing party's confidential information.",
 'nda-16': 'receiving party shall destroy or return some confidential information upon the termination of agreement.',
 'nda-15': 'agreement shall not grant receiving party any right to confidential information.',
 'nda-10': 'receiving party shall not disclose the fact that agreement was agreed or negotiated.',
 'nda-2': 'confidential information shall only include technical information.',
 'nda-1': 'all confidential information shall be expressly identified by the disclosing party.',
 'nda-19': 'some obligations of agreement may survive termination of agreement.',
 'nda-12': 'receiving party may independently develop information similar to confidential information.',
 'nda-20': 'receiving party may retain some confidential information even after the return or destruction of confidential information.',
 'nda-3': 'confidential information may include verbally co

In [71]:
X_train, Y_train = get_XY(train, hypothesis, labels=labels)

In [80]:
from sklearn.svm import SVC

model = SVC(kernel='linear')
model.fit(X_train[:1000], Y_train[:1000])

In [81]:
dev = load_data(cfg['dev_path'])
clean_data(dev)
X_dev, Y_dev = get_XY(dev, hypothesis, labels=labels)
model.score(X_dev, Y_dev)

ValueError: X has 8920 features, but SVC is expecting 21056 features as input.