## Логічне слідування

In [1]:
import sys
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import spacy
nlp = spacy.load('en_core_web_md')
from nltk.stem.snowball import SnowballStemmer
snowball = SnowballStemmer('english')

In [2]:
import random
from tqdm import tqdm
import nltk
from nltk.corpus import wordnet as wn
from nltk.translate.bleu_score import sentence_bleu
from sklearn.metrics import classification_report, accuracy_score
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression



In [3]:
PATH = '/mnt/hdd/Data/NLP/snli_1.0/'
snli_train = pd.read_csv(PATH+'snli_1.0_train.txt', sep='\t')
snli_dev = pd.read_csv(PATH+'snli_1.0_dev.txt', sep='\t')
snli_test = pd.read_csv(PATH+'snli_1.0_test.txt', sep='\t')

Швидкий огляд даних.

In [4]:
snli_train.head(2)

Unnamed: 0,gold_label,sentence1_binary_parse,sentence2_binary_parse,sentence1_parse,sentence2_parse,sentence1,sentence2,captionID,pairID,label1,label2,label3,label4,label5
0,neutral,( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...,( ( A person ) ( ( is ( ( training ( his horse...,(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o...,(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...,A person on a horse jumps over a broken down a...,A person is training his horse for a competition.,3416050480.jpg#4,3416050480.jpg#4r1n,neutral,,,,
1,contradiction,( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...,( ( A person ) ( ( ( ( is ( at ( a diner ) ) )...,(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o...,(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette.",3416050480.jpg#4,3416050480.jpg#4r1c,contradiction,,,,


In [5]:
snli_train['gold_label'].value_counts()

entailment       183416
contradiction    183187
neutral          182764
-                   785
Name: gold_label, dtype: int64

Чому деякі речення марковані через "-"?

In [6]:
print(snli_train[snli_train['gold_label']=='-'].iloc[145]['sentence1'])
print(snli_train[snli_train['gold_label']=='-'].iloc[145]['sentence2'])

A fat white boy in a tank top swings at a checkered black and white punching bag at a carnival while people look on and walk by in the background.
The boy is playing a game.


Зрозуміло не стало. Краще їх просто ігнорувати.

In [7]:
snli_train = snli_train[snli_train['gold_label'] != '-'].dropna(subset=['sentence2'])
snli_dev = snli_dev[snli_dev['gold_label'] != '-']
snli_test = snli_test[snli_test['gold_label'] != '-']

In [8]:
snli_train.dropna(subset=['sentence2']).groupby('gold_label')['sentence2'].agg([lambda x: x.apply(len).mean()])

Unnamed: 0_level_0,<lambda>
gold_label,Unnamed: 1_level_1
contradiction,36.83223
entailment,33.510801
neutral,42.095704


"Бейзлайн" із вибором класу залежно від довжини гіпотези.

In [9]:
def len_baseline(sentence):
    if len(sentence) < 33:
        return 'entailment'
    elif len(sentence) < 41:
        return 'contradiction'
    else:
        return 'neutral'
    
print(classification_report(snli_test['gold_label'], snli_test['sentence2'].apply(len_baseline)))

               precision    recall  f1-score   support

contradiction       0.36      0.23      0.28      3237
   entailment       0.42      0.55      0.48      3368
      neutral       0.46      0.48      0.47      3219

  avg / total       0.42      0.42      0.41      9824



Препроцесинг: negation scope (додати NOT_), стемінг, викинути службові слова (як рекомендують у https://pdfs.semanticscholar.org/2d7d/f0b5ac15cdaa50928031f5bb2fc63a0a1f68.pdf)

In [10]:
def negation_scope(sent):
    doc = nlp(sent, disable=['tagger', 'parser', 'ner'])
    prepend = False
    new_sent = ''
    for token in doc:
        if prepend and not token.is_punct:
            new_sent += 'NOT_'+token.text_with_ws
        elif token.is_punct:
            prepend = False
            new_sent += token.text_with_ws
        elif token.lower in ('not', "n't", 'never', 'no', 'nobody', 'none', 'no'):
            prepend = True
            new_sent += token.text_with_ws
        else:
            new_sent += token.text_with_ws
    return new_sent

In [11]:
def get_lemmas(sent):
    doc = nlp(sent, disable=['parser', 'ner'])
    lemmas = [(w.lemma_, w.pos_) for w in doc if (not w.pos_ in ('PART', 'CONJ', 'DET', 'ADP', 'INTJ') and not w.is_punct)]
    return lemmas

def preprocess_and_stem(sent):
    new_sent = negation_scope(sent)
    doc = nlp(new_sent, disable=['parser', 'ner'])
    words = [w.text.lower() for w in doc if (not w.pos_ in ('PART', 'CONJ', 'DET', 'ADP', 'INTJ') and not w.is_punct)]
    stems = [snowball.stem(w) for w in words]
    return stems

def dep_relations(sent):
    doc = nlp(sent, disable=['ner'])
    triples = [(t.dep_, t.lemma_, t.head.lemma_) for t in doc]
    duples = [(t.lemma_, t.head.lemma_) for t in doc]
    return triples, duples

In [12]:
def semantic_overlap(s, h):
    """
    Here s and h are lemmatized and tokenized lists of (word, tag) tuples.
    """
    pos_map = {'NOUN': wn.NOUN, 'VERB': wn.VERB,
               'ADJ': wn.ADJ, 'ADV': wn.ADV}
    cross_u = cross_unigrams_pos(s, h)
    overlap = 0
    for pair in cross_u:
        synsets = wn.synsets(pair[0][0], pos = pos_map.get(pair[0][1], None))
        if not synsets:
            continue
        synonyms = []
        for sset in synsets:
            synonyms.extend(sset.lemma_names())
        if pair[1][0] in synonyms:
            overlap += 1
    return overlap, overlap/len(s)

def cross_unigrams_pos(stokens, htokens):
    res = []
    for sw in stokens:
        for hw in htokens:
            if sw[1] == hw[1]:
                res.append((sw, hw))
    return list(set(res))

Для фіч класифікатора ми візьмемо збіг коренів слів між двома реченнями, збіг смислів (функція semantic_overlap), збіг між залежностями, а також довжину різниці між реченнями.

In [13]:
def get_features(s, h):
    """
    """
    slemmas = get_lemmas(s)
    hlemmas = get_lemmas(h)
    # "semantic overlap" using WordNet
    sem_overlap_count, sem_overlap_perc = semantic_overlap(slemmas, hlemmas)
    # usual overlap using Snowball stemming and accounting for negation
    sstems = preprocess_and_stem(s)
    hstems = preprocess_and_stem(h)
    overlap_count = len(set(sstems).intersection(set(hstems)))
    overlap_perc = overlap_count/len(set(sstems))
    # length features
    len_diff = len(s) - len(h)
    len_token_diff = len(sstems) - len(hstems)
    len_diff_perc = len_diff/len(s)
    # dependency features
    sdep3, sdep2 = dep_relations(s)
    hdep3, hdep2 = dep_relations(h)
    dep3overlap = set(sdep3).intersection(set(hdep3))
    dep2overlap = set(sdep2).intersection(set(hdep2))
    dep3count = len(dep3overlap)
    dep2count = len(dep2overlap)
    dep3perc = dep3count/len(sdep3)
    dep2perc = dep2count/len(sdep2)
    # initialize features dict
    features = {
        'sem_overlap_count': sem_overlap_count,
        'sem_overlap_perc': sem_overlap_perc,
        'overlap_count': overlap_count,
        'overlap_perc': overlap_perc,
        'len_diff': len_diff,
        'len_token_diff': len_token_diff,
        'len_diff_perc': len_diff_perc,
        'dep3count': dep3count,
        'dep2count': dep2count,
        'dep3perc': dep3perc,
        'dep2perc': dep2perc
    }
    return features

Натренуємо на невеликій вибірці із тренувального датасету.

In [14]:
train_sample = snli_train.sample(30000, random_state=505)
#dev_sample = snli_dev.sample(2000, random_state=505)

train_features, train_labels = [], []
j = 1
for i, row in train_sample.iterrows():
    sys.stdout.write('\rProcessing train row {}'.format(j))
    j += 1
    train_features.append(get_features(row['sentence1'], row['sentence2']))
    train_labels.append(row['gold_label'])    

dev_features, dev_labels = [], []
j = 1
for i, row in snli_dev.iterrows():
    sys.stdout.write('\rProcessing dev row {}'.format(j))
    j += 1
    dev_features.append(get_features(row['sentence1'], row['sentence2']))
    dev_labels.append(row['gold_label'])

Processing dev row 9842

In [15]:
vec = DictVectorizer()
train_vec = vec.fit_transform(train_features)
dev_vec = vec.transform(dev_features)
clf = LogisticRegression(penalty='l1', random_state=505)
clf.fit(train_vec, train_labels)
pred_labels = clf.predict(dev_vec)

In [16]:
print(classification_report(dev_labels, pred_labels))
print('\nAccuracy is', round(accuracy_score(dev_labels, pred_labels), 3))

               precision    recall  f1-score   support

contradiction       0.48      0.52      0.50      3278
   entailment       0.53      0.70      0.61      3329
      neutral       0.53      0.32      0.40      3235

  avg / total       0.51      0.51      0.50      9842


Accuracy is 0.514


Для покращеного класифікатора використаємо ознаки з https://nlp.stanford.edu/pubs/snli_paper.pdf, плюс семантичний збіг (за Ворднетом) та збіг за коренями слів із попереднього класифікатора. Збіг за залежностями не використано - він дуже уповільнює тренування, але дає невелике покращення (принаймні в теперішньому вигляді). Зокрема, серед фіч: метрика BLEU для виявлення подібності речень; різниця в довжині; збіг між реченнями, зокрема окремо збіг між словами зі спільними частинами мови; уніграми та біграми слів; "крос-уніграми" для слів зі спільною частиною мови, та "крос-біграми" для пар слів зі спільною частиною мови у другого слова.

In [17]:
def choose_by_pos(tokens, pos):
    """
    Take only words of a specified POS.
    """
    if pos == 'NOUN':
        return [t[0] for t in tokens if t[1] in ('PROPN', 'NOUN')]
    else:
        return [t[0] for t in tokens if t[1] == pos]
    
def cross_unigrams(stokens, htokens):
    res = []
    for sw in stokens:
        for hw in htokens:
            if sw[1] == hw[1]:
                res.append((sw[0], hw[0]))
    return list(set(res))

def cross_bigrams(stokens, htokens):
    res = []
    sbigrams = nltk.bigrams(stokens)
    hbigrams = nltk.bigrams(htokens)
    for sb in sbigrams:
        for hb in hbigrams:
            if sb[1][1] == hb[1][1]:
                res.append(([s[0] for s in sb], [h[0] for h in hb]))
    return res

def extract_features(s, h):
    """
    s, h are sentence1 (premise) and hypothesis (sentence2).
    Right now using word lemmas, not stems.
    """
    sdoc = nlp(s, disable=['parser', 'ner'])
    hdoc = nlp(h, disable=['parser', 'ner'])
    s_tokens = [(t.lemma_, t.pos_) for t in sdoc if not t.is_punct]
    h_tokens = [(t.lemma_, t.pos_) for t in hdoc if not t.is_punct]
    swords = [t[0] for t in s_tokens]
    hwords = [t[0] for t in h_tokens]
    # initialize a dictionary
    features = {}
    # using WordNet
    sem_overlap_count, sem_overlap_perc = semantic_overlap(s_tokens, h_tokens)
    # BLEU scores for 1, 2, 3, 4-grams and then cumulative BLEU for all n
    bleu1 = sentence_bleu([swords], hwords, weights=(1, 0, 0, 0))
    bleu2 = sentence_bleu([swords], hwords, weights=(0, 1, 0, 0))
    bleu3 = sentence_bleu([swords], hwords, weights=(0, 0, 1, 0))
    bleu4 = sentence_bleu([swords], hwords, weights=(0, 0, 0, 1))
    bleu_cum = sentence_bleu([swords], hwords, weights=(0.25, 0.25, 0.25, 0.25))
    features.update({
        'bleu1': bleu1,
        'bleu2': bleu2,
        'bleu3': bleu3,
        'bleu4': bleu4,
        'sem_overlap': sem_overlap_count,
        'sem_overlap_perc': sem_overlap_perc,
        'bleu_cum': bleu_cum
    })
    # length difference
    lendiff = len(swords) - len(hwords)
    features.update({'lendiff': lendiff})
    # overlaps
    sstems = preprocess_and_stem(s)
    hstems = preprocess_and_stem(h)
    stem_overlap_count = len(set(sstems).intersection(set(hstems)))
    stem_overlap_perc = stem_overlap_count/len(set(sstems))
    overlap = set(swords).intersection(set(hwords))
    overlap_count = len(overlap)
    overlap_perc = overlap_count/len(set(swords))
    snouns = set(choose_by_pos(s_tokens, 'NOUN'))
    sverbs = set(choose_by_pos(s_tokens, 'VERB'))
    sadjs = set(choose_by_pos(s_tokens, 'ADJ'))
    sadvs = set(choose_by_pos(s_tokens, 'ADV'))
    noun_overlap = snouns.intersection(set(choose_by_pos(h_tokens, 'NOUN')))
    verb_overlap = sverbs.intersection(set(choose_by_pos(h_tokens, 'VERB')))
    adj_overlap = sadjs.intersection(set(choose_by_pos(h_tokens, 'ADJ')))
    adv_overlap = sadvs.intersection(set(choose_by_pos(h_tokens, 'ADV')))
    noun_count = len(noun_overlap)
    verb_count = len(verb_overlap)
    adj_count = len(adj_overlap)
    adv_count = len(adv_overlap)
    noun_perc = noun_count/len(snouns) if len(snouns) > 0 else 0
    verb_perc = verb_count/len(sverbs) if len(sverbs) > 0 else 0
    adj_perc = adj_count/len(sadjs) if len(sadjs) > 0 else 0
    adv_perc = adv_count/len(sadvs) if len(sadvs) > 0 else 0
    features.update({
        'overlap_count': overlap_count,
        'overlap_perc': overlap_perc,
        'stem_overlap_count': stem_overlap_count,
        'stem_overlap_perc': stem_overlap_perc,
        'noun_count': noun_count,
        'verb_count': verb_count,
        'adj_count': adj_count,
        'adv_count': adv_count,
        'noun_perc': noun_perc,
        'verb_perc': verb_perc,
        'adj_perc': adj_perc,
        'adv_perc': adv_perc
    })
    # unigrams and bigrams for hypothesis
    hbigrams = nltk.bigrams(hwords)
    for w in hwords:
        features['word={}'.format(w)] = 1
    for b in hbigrams:
        features['bigram={w1}_{w2}'.format(w1=b[0], w2=b[1])] = 1
    # cross-unigrams
    cross_u = cross_unigrams(s_tokens, h_tokens)
    for u in cross_u:
        features['cu={w1}_{w2}'.format(w1=u[0], w2=u[1])] = 1
    # cross-bigrams
    cross_b = cross_bigrams(s_tokens, h_tokens)
    for b in cross_b:
        w1, w2 = b[0]
        w3, w4 = b[1]
        features['cb={w1}_{w2}_{w3}_{w4}'.format(w1=w1, w2=w2, w3=w3, w4=w4)] = 1
    return features

Натренуємо, так само, на вибірці з 30000 рядків.

In [18]:
train_features, train_labels = [], []
j = 1
for i, row in train_sample.iterrows():
    sys.stdout.write('\rProcessing train row {}'.format(j))
    j += 1
    train_features.append(extract_features(row['sentence1'], row['sentence2']))
    train_labels.append(row['gold_label'])    

dev_features, dev_labels = [], []
j = 1
for i, row in snli_dev.iterrows():
    sys.stdout.write('\rProcessing dev row {}'.format(j))
    j += 1
    dev_features.append(extract_features(row['sentence1'], row['sentence2']))
    dev_labels.append(row['gold_label'])

Processing train row 15

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


Processing dev row 9842

In [19]:
vec = DictVectorizer()
train_vec = vec.fit_transform(train_features)
dev_vec = vec.transform(dev_features)
clf = LogisticRegression(penalty='l1', random_state=505)
clf.fit(train_vec, train_labels)
pred_labels = clf.predict(dev_vec)

In [20]:
print(classification_report(dev_labels, pred_labels))
print('\nAccuracy is ', round(accuracy_score(dev_labels, pred_labels), 3))

               precision    recall  f1-score   support

contradiction       0.74      0.71      0.73      3278
   entailment       0.70      0.77      0.74      3329
      neutral       0.67      0.62      0.64      3235

  avg / total       0.70      0.70      0.70      9842


Accuracy is  0.702


Класифікатор не дуже добре працює з нейтральними гіпотезами, але загалом looks good. Можна спробувати на повній вибірці і подивитись, наскільки покращиться якість завдяки більшій кількості тренувальних даних.

In [21]:
train_features, train_labels = [], []
for i, row in snli_train.iterrows():
    sys.stdout.write("\rProcessing row %i" % i)
    train_features.append(extract_features(row['sentence1'], row['sentence2']))
    train_labels.append(row['gold_label'])

Processing row 19

Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


Processing row 550151

In [22]:
vec = DictVectorizer()
train_vectorized = vec.fit_transform(train_features)
clf = LogisticRegression(penalty='l1', random_state=505)
clf.fit(train_vectorized, train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=505, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [23]:
dev_features, dev_labels = [], []
for i, row in snli_dev.iterrows():
    dev_features.append(extract_features(row['sentence1'], row['sentence2']))
    dev_labels.append(row['gold_label'])

Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


In [24]:
dev_vectorized = vec.transform(dev_features)
dev_pred_labels = clf.predict(dev_vectorized)

In [25]:
print(classification_report(dev_labels, dev_pred_labels, digits=3))

               precision    recall  f1-score   support

contradiction      0.804     0.806     0.805      3278
   entailment      0.797     0.848     0.822      3329
      neutral      0.761     0.709     0.734      3235

  avg / total      0.787     0.788     0.787      9842



In [29]:
print(round(accuracy_score(dev_labels, dev_pred_labels),3))

0.788


Тепер на власне тестовій вибірці.

In [27]:
test_features, test_labels = [], []
for i, row in snli_test.iterrows():
    test_features.append(extract_features(row['sentence1'], row['sentence2']))
    test_labels.append(row['gold_label'])

test_vectorized = vec.transform(test_features)
test_pred_labels = clf.predict(test_vectorized)
print(classification_report(test_labels, test_pred_labels, digits=3))

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


               precision    recall  f1-score   support

contradiction      0.811     0.806     0.808      3237
   entailment      0.792     0.847     0.818      3368
      neutral      0.766     0.715     0.739      3219

  avg / total      0.790     0.790     0.789      9824



In [30]:
print(round(accuracy_score(test_labels, test_pred_labels),3))

0.79


У авторів оригінальної статті було 78.2, у мене вийшло 79 - тобто вдалось повторити і навіть трохи покращити результат, використовуючи тільки методи класичного машинного навчання.