In [1]:
%load_ext autoreload

In [2]:
%autoreload 1

In [3]:
%aimport helpers

In [4]:
import warnings
warnings.filterwarnings("ignore")

In [5]:
import numpy as np
from pathlib import Path
import spacy
from spacy.tokens import Doc
from zipfile import ZipFile
import pandas as pd
import json
import os
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
import jellyfish
import itertools as it
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import dill
from Levenshtein import distance
import time
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic

In [6]:
from helpers import calc_metrics, importance

In [7]:
# Path of the SNLI archive; prepare_data opens it relative to the working directory.
CORPUS = "snli_1.0.zip"
# Integer code for each gold label.
# NOTE(review): MAPPING is not referenced by any cell visible in this notebook — confirm it is still needed.
MAPPING = {"contradiction": -1,
           "neutral": 0, 
           "entailment": 1}

In [8]:
# Shared spaCy pipeline used by the preprocessing and (de)serialization helpers below;
# the "textcat" component is disabled at load time.
nlp = spacy.load("en", disable=["textcat"])

#### Data Reading / Converting

In [8]:
def get_data(data):
    """Yield ``(premise, hypothesis, label)`` for every labelled record.

    Each item of ``data`` is a mapping with the keys ``sentence1``,
    ``sentence2`` and ``gold_label``. Records whose ``gold_label`` is
    ``"-"`` are skipped.
    """
    for entry in data:
        premise, hypothesis, gold = entry["sentence1"], entry["sentence2"], entry["gold_label"]
        if gold == "-":
            continue
        yield premise, hypothesis, gold

In [9]:
def prepare_data(prefix="test", folder="snli_1.0"):
    """Convert one SNLI split from the zipped JSONL corpus to a TSV file.

    Reads ``<folder>/<folder>_<prefix>.jsonl`` from the archive named by
    ``CORPUS`` and writes ``<prefix>.txt`` in the working directory: a header
    line followed by one tab-separated ``sentence1 / sentence2 / gold_label``
    row per record yielded by ``get_data``.

    Parameters
    ----------
    prefix : str
        Split name used in both the archive member and the output file name.
    folder : str
        Top-level directory inside the archive (also the member-name prefix).
    """
    # ZIP member names always use "/" as separator, whatever the host OS is,
    # so the name must not be built with os.path.join.
    member = f"{folder}/{folder}_{prefix}.jsonl"
    with ZipFile(CORPUS) as archive:
        # Write with the same encoding the input is decoded with, instead of
        # relying on the platform default.
        with archive.open(member) as f_in, open(prefix + ".txt", "w", encoding="utf-8") as f_out:
            f_out.write("\t".join(["sentence1", "sentence2", "gold_label"]) + "\n")
            records = (json.loads(line.decode("utf-8")) for line in f_in)
            for record in get_data(records):
                f_out.write("\t".join(record) + "\n")

In [10]:
def read_prepared(prefix, sep="\t"):
    """Read the delimited file ``<prefix>.txt`` into a pandas DataFrame.

    ``sep`` is the column separator handed to ``pandas.read_csv``.
    """
    path = prefix + ".txt"
    return pd.read_csv(path, sep=sep)

In [19]:
# Convert every SNLI split from the archive into its own tab-separated <prefix>.txt file.
for prefix in ["test", "dev", "train"]:
    prepare_data(prefix)

In [11]:
# Load the converted splits (the <prefix>.txt files written by prepare_data).
df_dev = read_prepared("dev")
df_test = read_prepared("test")
df_train = read_prepared("train")

In [12]:
# Drop training rows with a missing value in any column; dev and test are left as loaded.
# NOTE(review): this rebinds df_train, so the unfiltered frame is only recoverable by re-reading.
df_train = df_train.dropna()

In [13]:
# (rows, columns) of each split, in the order train, test, dev.
df_train.shape
df_test.shape
df_dev.shape

(549361, 3)

(9824, 3)

(9842, 3)

In [14]:
# Relative frequency of each gold label per split, in the order train, test, dev.
df_train.gold_label.value_counts(normalize=True)
df_test.gold_label.value_counts(normalize=True)
df_dev.gold_label.value_counts(normalize=True)

entailment       0.333868
contradiction    0.333451
neutral          0.332681
Name: gold_label, dtype: float64

entailment       0.342834
contradiction    0.329499
neutral          0.327667
Name: gold_label, dtype: float64

entailment       0.338244
contradiction    0.333062
neutral          0.328693
Name: gold_label, dtype: float64

#### Preprocessing

In [9]:
def apply_spacy(df, return_df=False, disable=("textcat",), n_threads=4, size=5000):
    """Run the shared spaCy pipeline over the premise and hypothesis columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``sentence1``, ``sentence2`` and ``gold_label``.
    return_df : bool
        When true, materialize the result as a DataFrame with those same
        three column names; otherwise return the lazy ``zip`` itself.
    disable : iterable of str
        Pipeline component names forwarded to ``nlp.pipe`` as a list.
    n_threads, size
        Forwarded to ``nlp.pipe`` as ``n_threads`` and ``batch_size``.

    Returns
    -------
    zip or pandas.DataFrame
        ``(premise_doc, hypothesis_doc, label)`` triples. The ``zip`` form
        can be iterated only once.
    """
    # An immutable default avoids sharing one list object between calls;
    # nlp.pipe still receives a list, exactly as before.
    pipe_kwargs = dict(disable=list(disable), n_threads=n_threads, batch_size=size)
    doc_prems = nlp.pipe(df["sentence1"].values, **pipe_kwargs)
    doc_hyps = nlp.pipe(df["sentence2"].values, **pipe_kwargs)
    output = zip(doc_prems, doc_hyps, df["gold_label"].values)
    if return_df:
        return pd.DataFrame.from_records(output, columns=["sentence1", "sentence2", "gold_label"])
    return output

In [10]:
def dump_spacy_docs(docs, name="train"):
    """Serialize parsed document pairs plus the pipeline vocab to ``<name>_spacy.bin``.

    Parameters
    ----------
    docs : iterable of (premise_doc, hypothesis_doc, label)
        May be a lazy iterator: ``apply_dump`` passes the ``zip`` returned by
        ``apply_spacy``, whose documents are only parsed while it is consumed.
    name : str
        Prefix of the output file.

    The file holds a dill-pickled ``(doc_bytes, vocab_bytes)`` tuple, where
    ``doc_bytes`` is a list of ``(premise_bytes, hypothesis_bytes, label)``.
    """
    # Serialize the documents FIRST. With a lazy `docs`, parsing happens here,
    # and the vocab snapshot must be taken afterwards so it contains every
    # string the pipeline added while processing these texts. Snapshotting the
    # vocab before consuming `docs` would save it as it was before parsing.
    doc_bytes = [
        (
            prem.to_bytes(tensor=False, user_data=False),
            hyp.to_bytes(tensor=False, user_data=False),
            label,
        )
        for prem, hyp, label in docs
    ]
    vocab_bytes = nlp.vocab.to_bytes()
    with open(name + "_spacy.bin", "wb+") as handle:
        dill.dump((doc_bytes, vocab_bytes), handle)

In [11]:
def load_spacy_docs(name="train"):
    """Restore the document pairs stored in ``<name>_spacy.bin``.

    The stored vocab bytes are loaded into the shared ``nlp.vocab`` first;
    every premise and hypothesis is then rebuilt as a ``Doc`` bound to that
    vocab. Returns a list of ``(premise_doc, hypothesis_doc, label)`` tuples.
    """
    # CAUTION: dill.load unpickles arbitrary objects — only use this on files
    # this notebook wrote itself.
    with open(name + "_spacy.bin", "rb") as handle:
        payload = dill.load(handle)
    doc_bytes, vocab_bytes = payload

    nlp.vocab.from_bytes(vocab_bytes)
    docs = []
    for prem_bytes, hyp_bytes, label in doc_bytes:
        prem_doc = Doc(nlp.vocab).from_bytes(prem_bytes)
        hyp_doc = Doc(nlp.vocab).from_bytes(hyp_bytes)
        docs.append((prem_doc, hyp_doc, label))
    return docs

In [12]:
def apply_dump(df, name="train", n_threads=4, size=5000):
    """Parse ``df`` with spaCy and write the result to ``<name>_spacy.bin``.

    ``n_threads`` and ``size`` are forwarded to ``apply_spacy``; the parsed
    triples are handed straight to ``dump_spacy_docs``.
    """
    dump_spacy_docs(
        apply_spacy(df, return_df=False, n_threads=n_threads, size=size),
        name,
    )

In [13]:
def dump_all(n=5000):
    """Parse and dump the dev and test splits plus the first ``n`` training rows.

    Reads the notebook-level frames ``df_dev``, ``df_test`` and ``df_train``
    and writes the dumps named ``dev``, ``test`` and ``train``.
    """
    apply_dump(df_dev, name="dev")
    apply_dump(df_test, name="test")
    train_head = df_train.iloc[:n]
    apply_dump(train_head, name="train", size=5000)

In [14]:
def load_all():
    """Load the ``test``, ``train`` and ``dev`` dumps (in that order).

    Returns the tuple ``(train_docs, test_docs, dev_docs)``.
    """
    loaded = {}
    for split in ("test", "train", "dev"):
        loaded[split] = load_spacy_docs(split)
    return loaded["train"], loaded["test"], loaded["dev"]

In [21]:
# Timing cell for the full spaCy preprocessing pass. The dump_all call is commented
# out, so the elapsed time printed here measures no work; `n` is still defined here
# because the later train_sample dump cell reads it.
t0 = time.time()
n = 100000
#dump_all(n)
print(time.time()-t0)

0.00010991096496582031


In [25]:
# Parse the first n training pairs and write them to train_sample_spacy.bin.
# NOTE(review): load_all reads train_spacy.bin, not this file — confirm which dump is meant to feed the model.
apply_dump(df_train.iloc[:n], "train_sample")

#### Feature Building

In [15]:
# Restore the serialized spaCy docs; requires the train/test/dev *_spacy.bin files to exist already.
train_docs, test_docs, dev_docs = load_all()

In [16]:
# Soundex encoder; compare_ents uses it to compare entity strings by their phonetic code.
SDX = jellyfish.soundex
# Information-content data passed to lin_similarity in find_max_lin_sim.
semcor_ic = wordnet_ic.ic('ic-semcor.dat')

In [62]:
def pos_spacy_to_nltk(tok):
    """Translate a token's coarse spaCy POS tag into a WordNet POS constant.

    Nouns, proper nouns and pronouns all map to ``wn.NOUN``; any tag without
    a WordNet counterpart yields ``None``.
    """
    tag = tok.pos_
    if tag in ("NOUN", "PROPN", "PRON"):
        return wn.NOUN
    direct = {"VERB": wn.VERB, "ADV": wn.ADV, "ADJ": wn.ADJ}
    return direct.get(tag)

In [63]:
def filter_stop(doc):
    """Lazily yield the tokens of ``doc`` that are not flagged as stop words."""
    for token in doc:
        if token.is_stop:
            continue
        yield token

In [64]:
def filter_pos(doc, POS=("NOUN",)):
    """Lazily yield the tokens of ``doc`` whose coarse POS tag is in ``POS``.

    Parameters
    ----------
    doc : iterable of tokens
        Each token must expose ``pos_``.
    POS : container of str
        Accepted tags. The default is an immutable tuple rather than a list,
        so no mutable object is shared between calls.
    """
    return (token for token in doc if token.pos_ in POS)

In [65]:
def doc_to_toks(doc, how="lower"):
    """Turn the tokens of ``doc`` into a list of strings.

    ``how`` selects the string form:

    * ``"lower"``   - lower-cased text
    * ``"lemma"``   - lemma, falling back to the lower-cased text when the
      lemma is the ``"-PRON-"`` placeholder
    * ``"text"``    - verbatim text
    * ``"text_ws"`` - verbatim text including trailing whitespace

    Any other value returns ``None`` without iterating ``doc``.
    """
    def _lemma_or_lower(tok):
        lemma = tok.lemma_
        return tok.lower_ if lemma == "-PRON-" else lemma

    if how == "lower":
        extract = lambda tok: tok.lower_
    elif how == "lemma":
        extract = _lemma_or_lower
    elif how == "text":
        extract = lambda tok: tok.text
    elif how == "text_ws":
        extract = lambda tok: tok.text_with_ws
    else:
        return None
    return list(map(extract, doc))

In [66]:
def compare_ents(prem, hyp, threshold=-2, binary=True):
    """Count hypothesis entities that have no similar-sounding premise entity.

    Each entity is reduced to the concatenation of its non-stop-word tokens
    (with trailing whitespace) and encoded with ``SDX``. Two entities match
    when the negated edit distance between their codes is at least
    ``threshold``.

    Parameters
    ----------
    prem, hyp
        Parsed documents exposing ``.ents``.
    threshold : int
        Lower bound on ``-distance(code_prem, code_hyp)`` for a match.
    binary : bool
        When true, return ``1`` as soon as one unmatched hypothesis entity is
        found instead of counting all of them.

    Returns
    -------
    int
        Number of unmatched hypothesis entities (at most 1 when ``binary``).
    """
    def _code(ent):
        text = "".join(token.text_with_ws for token in filter_stop(ent))
        return SDX(text)

    hyp_ents = hyp.ents
    if not hyp_ents:
        return 0

    # Premise codes do not depend on the hypothesis entity, so encode each
    # premise entity once instead of once per hypothesis entity.
    prem_codes = [_code(ent) for ent in prem.ents]

    score = 0
    for hyp_ent in hyp_ents:
        hyp_code = _code(hyp_ent)
        matched = any(-1 * distance(prem_code, hyp_code) >= threshold for prem_code in prem_codes)
        if not matched:
            score += 1
            if binary:
                return score
    return score

In [67]:
def len_diff(prem, hyp):
    """Return the hypothesis text length minus the premise text length, in characters."""
    hyp_chars = len(hyp.text)
    prem_chars = len(prem.text)
    return hyp_chars - prem_chars

In [68]:
def BLEU(prem_tokens, hyp_tokens, weights=(0.25,) * 4):
    """Sentence-level BLEU of the hypothesis, using the premise as the single reference.

    Parameters
    ----------
    prem_tokens, hyp_tokens : list of str
        Token sequences of the premise (reference) and the hypothesis.
    weights : sequence of float
        n-gram weights forwarded to ``sentence_bleu``. The default is an
        immutable tuple rather than a list, so no mutable object is shared
        between calls.
    """
    return sentence_bleu([prem_tokens], hyp_tokens, weights=weights)

In [121]:
# Minimum Lin similarity at which jaccard_similarity_with_sim counts two different words as overlapping.
THRESHOLD = 0.8

In [233]:
def find_max_lin_sim(a, b, pos_a, pos_b):
    """Return the highest Lin similarity between any same-POS synsets of two words.

    Parameters
    ----------
    a, b : str
        Words to look up in WordNet.
    pos_a, pos_b
        WordNet POS constants restricting the lookup (``None`` = any POS).

    Returns
    -------
    float
        The maximum similarity, or ``0.0`` when no same-POS synset pair exists
        or when the similarity computation raises an error.
    """
    synsets_a = wn.synsets(a, pos=pos_a)
    synsets_b = wn.synsets(b, pos=pos_b)
    try:
        sims = [
            syn_a.lin_similarity(syn_b, semcor_ic)
            for syn_a, syn_b in it.product(synsets_a, synsets_b)
            if syn_a.pos() == syn_b.pos()
        ]
        if not sims:
            return 0.0
        return np.max(sims)
    except Exception:
        # Best effort: a failed similarity computation counts as "not
        # similar". `Exception` (not a bare except) keeps KeyboardInterrupt
        # and SystemExit propagating during long feature-building runs.
        return 0.0

In [235]:
def jaccard_similarity_with_sim(prem, hyp):
    """Jaccard-style overlap between two token collections, tolerant of near-synonyms.

    Each side is reduced to a set of ``(lower-cased text, WordNet POS)``
    pairs. A hypothesis item counts as overlapping when its text equals the
    text of some premise item, or when ``find_max_lin_sim`` between the two
    words reaches ``THRESHOLD``.

    Parameters
    ----------
    prem, hyp : iterable of tokens
        Tokens exposing ``lower_`` (and whatever ``pos_spacy_to_nltk`` reads).
        One-shot generators are fine; each side is consumed exactly once.

    Returns
    -------
    (score, overlap)
        ``overlap`` is the number of matched hypothesis items and ``score``
        is ``overlap / (|prem| + |hyp| - overlap)``. ``(0, 0)`` is returned
        when both sides are empty. Several hypothesis items may match the
        same premise item, so ``score`` is not guaranteed to stay <= 1.
    """
    prem_items = {(tok.lower_, pos_spacy_to_nltk(tok)) for tok in prem}
    hyp_items = {(tok.lower_, pos_spacy_to_nltk(tok)) for tok in hyp}
    if not prem_items and not hyp_items:
        return 0, 0

    prem_words = {word for word, _ in prem_items}
    overlap = 0
    for hyp_word, hyp_pos in hyp_items:
        # Cheap exact-text test first; the WordNet lookups only run when the
        # word itself does not occur in the premise.
        if hyp_word in prem_words or any(
            find_max_lin_sim(prem_word, hyp_word, prem_pos, hyp_pos) >= THRESHOLD
            for prem_word, prem_pos in prem_items
        ):
            overlap += 1

    # The denominator cannot be zero: overlap <= |hyp|, and overlap is 0
    # whenever the premise side is empty (in which case |hyp| > 0 here).
    return overlap / (len(prem_items) + len(hyp_items) - overlap), overlap

In [None]:
def jaccard_similarity(prem_tokens, hyp_tokens):
    """Plain Jaccard index of two token collections.

    Returns ``(|A & B| / |A | B|, |A & B|)`` computed on the sets of distinct
    tokens, or ``(0, 0)`` when both collections are empty.
    """
    prem_vocab, hyp_vocab = set(prem_tokens), set(hyp_tokens)
    union_size = len(prem_vocab | hyp_vocab)
    if union_size == 0:
        return 0, 0
    shared = len(prem_vocab & hyp_vocab)
    return shared / union_size, shared

In [71]:
def zipngram(doc, n=2):
    """Return an iterator over the consecutive ``n``-grams of a sliceable sequence.

    The sequence is sliced at offsets ``0 .. n-1`` and the slices are zipped,
    so each yielded tuple holds ``n`` adjacent items.
    """
    shifted = []
    for offset in range(n):
        shifted.append(doc[offset:])
    return zip(*shifted)

In [72]:
def get_synonyms(word):
    """Collect the lemma names of every WordNet synset of ``word`` into a set."""
    names = set()
    for synset in wn.synsets(word):
        names.update(lemma.name() for lemma in synset.lemmas())
    return names

In [73]:
# Verb lemmas that verb_matching drops from both sentences before comparing.
EXCLUDE_LEMMAS = ["be", "have"]

In [74]:
def verb_matching(prem, hyp, binary=True):
    """Count hypothesis verbs that have no counterpart among the premise verbs.

    Verbs are compared by lemma, after dropping ``EXCLUDE_LEMMAS``. A
    hypothesis verb is matched when it equals a premise verb or appears among
    that premise verb's WordNet synonyms (``get_synonyms``).

    Parameters
    ----------
    prem, hyp
        Parsed documents (iterables of tokens).
    binary : bool
        When true, return ``1`` at the first unmatched hypothesis verb
        instead of counting all of them.

    Returns
    -------
    int
        Number of unmatched hypothesis verbs (at most 1 when ``binary``).
    """
    prem_verbs = [tok for tok in doc_to_toks(filter_pos(prem, ["VERB"]), "lemma") if tok not in EXCLUDE_LEMMAS]
    hyp_verbs = [tok for tok in doc_to_toks(filter_pos(hyp, ["VERB"]), "lemma") if tok not in EXCLUDE_LEMMAS]

    # A premise verb's synonym set does not depend on the hypothesis verb, so
    # look it up at most once (and only if an exact comparison fails).
    synonym_cache = {}

    def _synonyms(verb):
        if verb not in synonym_cache:
            synonym_cache[verb] = get_synonyms(verb)
        return synonym_cache[verb]

    score = 0
    for hyp_verb in hyp_verbs:
        matched = any(
            hyp_verb == prem_verb or hyp_verb in _synonyms(prem_verb)
            for prem_verb in prem_verbs
        )
        if not matched:
            score += 1
            if binary:
                return score
    return score

In [239]:
# Suffix of the per-POS feature names -> spaCy POS tags grouped under it.
_POS_GROUPS = (
    ("NOUN", ["NOUN", "PRON", "PROPN"]),
    ("VERB", ["VERB"]),
    ("ADJ", ["ADJ"]),
    ("ADV", ["ADV"]),
)


def _pair_features(prem, hyp):
    """Build the feature dict for one premise/hypothesis pair of parsed docs."""
    record = {}
    prem_tokens = doc_to_toks(prem, how="lower")
    hyp_tokens = doc_to_toks(hyp, how="lower")

    # Unlexicalized features
    record["NE_Score"] = compare_ents(prem, hyp)
    record["len_diff"] = len_diff(prem, hyp)
    record["BLEU"] = BLEU(prem_tokens, hyp_tokens)

    # Similarity-aware overlap on all tokens, then per POS group.
    record["jaccard_total"], record["overlap_total"] = jaccard_similarity_with_sim(prem, hyp)
    for suffix, tags in _POS_GROUPS:
        jaccard, overlap = jaccard_similarity_with_sim(filter_pos(prem, tags), filter_pos(hyp, tags))
        record[f"jaccard_{suffix}"] = jaccard
        record[f"overlap_{suffix}"] = overlap

    # Unigrams & bigrams of the hypothesis
    for tok in hyp_tokens:
        record[f"w={tok}"] = 1

    for w1, w2 in zipngram(hyp_tokens, 2):
        record[f"w1={w1},w2={w2}"] = 1

    # Cross-unigrams: premise/hypothesis token pairs sharing the same POS
    for w1, w2 in it.product(prem, hyp):
        if w1.pos_ == w2.pos_:
            record[f"cross-w1={w1.lemma_},w2={w2.lemma_}"] = 1

    # Cross-bigrams: bigram pairs whose second tokens share the same POS
    for b1, b2 in it.product(zipngram(prem, 2), zipngram(hyp, 2)):
        if b1[-1].pos_ == b2[-1].pos_:
            record[f"cross-w11={b1[0].lemma_},w12={b1[1].lemma_},w21={b2[0].lemma_},w22={b2[1].lemma_}"] = 1

    # Verb matching using WordNet
    record["verb_match"] = verb_matching(prem, hyp)
    return record


def build_features(docs):
    """Turn ``(premise_doc, hypothesis_doc, label)`` triples into feature dicts.

    The label is ignored. Each returned dict holds the numeric similarity
    features (``NE_Score``, ``len_diff``, ``BLEU``, ``jaccard_*``,
    ``overlap_*``, ``verb_match``) plus binary indicator features for
    hypothesis unigrams/bigrams and premise-hypothesis cross n-grams.

    Returns
    -------
    list of dict
        One feature dict per input pair, in input order.
    """
    return [_pair_features(prem, hyp) for prem, hyp, _ in docs]

In [236]:
# Turns the feature dicts from build_features into a sparse matrix; fitted on the training features only.
vectorizer = DictVectorizer(sparse=True)

In [None]:
# Training split: build the feature dicts, pull out the labels (third element of each
# triple) and fit the vectorizer's feature space on them.
train_features = build_features(train_docs)
_, _, train_labels = zip(*train_docs)
v_train = vectorizer.fit_transform(train_features)

In [None]:
# Test split: same steps, but only transform() so the feature space stays the one fitted on train.
test_features = build_features(test_docs)
_, _, test_labels = zip(*test_docs)
v_test = vectorizer.transform(test_features)

In [None]:
# Dev split: same steps, but only transform() so the feature space stays the one fitted on train.
dev_features = build_features(dev_docs)
_, _, dev_labels = zip(*dev_docs)
v_dev = vectorizer.transform(dev_features)

#### Model Training

In [244]:
# Logistic-regression classifier over the sparse features; random_state is fixed for repeatable fits.
clf = LogisticRegression(C=1, random_state=25)

In [246]:
# Fit on the vectorized training features; the targets are the gold-label strings.
clf.fit(v_train, train_labels)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=25, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [None]:
# Inspect feature weights per class via helpers.importance (defined in helpers, not visible here);
# the trailing ";" suppresses the cell's return-value display.
importance(clf.classes_, clf.coef_, vectorizer.feature_names_, n=10);

In [37]:
# Evaluate on the test split. calc_metrics comes from helpers (not visible here).
# NOTE(review): `pred` is reused by the dev evaluation cell, so its content depends on which cell ran last.
pred = clf.predict(v_test)
output, report, conf_matrix = calc_metrics(test_labels, pred, proba=None, labels=clf.classes_, 
                                           print_=True, mode="weighted")

Recall: 0.787
Precision: 0.787
F1: 0.786
Accuracy: 0.787

Confusion matrix:
               pred_contradiction  pred_entailment  pred_neutral
contradiction                2548              304           385
entailment                    161             2857           350
neutral                       399              496          2324

Report:
               precision    recall  f1-score   support

contradiction       0.82      0.79      0.80      3237
   entailment       0.78      0.85      0.81      3368
      neutral       0.76      0.72      0.74      3219

  avg / total       0.79      0.79      0.79      9824



In [38]:
# Evaluate on the dev split. calc_metrics comes from helpers (not visible here).
# NOTE(review): this overwrites `pred`, `output`, `report` and `conf_matrix` from the test evaluation cell.
pred = clf.predict(v_dev)
output, report, conf_matrix = calc_metrics(dev_labels, pred, proba=None, labels=clf.classes_, 
                                           print_=True, mode="weighted")

Recall: 0.784
Precision: 0.784
F1: 0.784
Accuracy: 0.784

Confusion matrix:
               pred_contradiction  pred_entailment  pred_neutral
contradiction                2599              291           388
entailment                    179             2829           321
neutral                       435              507          2293

Report:
               precision    recall  f1-score   support

contradiction       0.81      0.79      0.80      3278
   entailment       0.78      0.85      0.81      3329
      neutral       0.76      0.71      0.74      3235

  avg / total       0.78      0.78      0.78      9842

