In [1]:
filename = 'data/nyt2000-sents.jsonl'
train_fn = filename.replace('.jsonl', '.train.jsonl')
dev_fn = filename.replace('.jsonl', '.dev.jsonl')

test_fn = 'data/run-on-test.json'

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.feature_extraction import DictVectorizer
from sklearn import svm
from sklearn.feature_selection import SelectPercentile

from itertools import chain

import nltk
import spacy
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

from jsonlines import jsonlines
import json
from tqdm.auto import tqdm

In [3]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [4]:
print("Load spacy...")
nlp = spacy.load("en_core_web_lg")
#nlp.remove_pipe('ner')
#nlp.remove_pipe('tagger')
print("...Done!")

Load spacy...
...Done!


In [5]:
def read_x_y_jsonl(filename, max_num=1000000):
    buffer = []
    with jsonlines.open(filename) as reader:
        for sentence in reader:
            buffer.append(sentence)
            if len(buffer)>max_num:
                break
    x, y = [], []
    for sent in buffer:
        x.append([x for x, y in sent])
        y.append([y for x, y in sent])
    return x, y

Prepared train and dev datasets are too long for my notebook, so I use reduced ones (the third parameter in read_x_y_jsonl function).

In [6]:
x_train_tokens, y_train = read_x_y_jsonl(train_fn, 50000)
print(x_train_tokens[:2])
print(y_train[:2])

[['I', 'was', 'told', 'that', 'John', 'mixed', 'his', 'last', 'drink', 'last', 'year', '.'], ['With', 'the', 'theme', 'of', '"', 'Leave', 'No', 'Child', 'Behind', ',', '"', 'and', 'people', 'of', 'all', 'races', 'turning', 'the', 'convention', 'stage', 'into', 'Mr.', 'Bush', "'s", 'Neighborhood', ',', 'and', 'lines', 'like', '"', 'We', 'Shall', 'Overcome', '"', 'in', 'his', 'speech', ',', 'Mr.', 'Bush', 'left', 'moderates', 'feeling', 'as', 'if', 'he', 'had', 'unleashed', 'the', 'party', "'s", 'inner', 'Connecticut', 'Yankee', '.']]
[[False, False, False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, Fa

In [7]:
x_dev_tokens, y_dev = read_x_y_jsonl(dev_fn, 50000)

In [8]:
def read_x_y_json(filename):
    with open(filename, 'rt', encoding='utf-8') as f:
        js = json.loads(f.read())
    tokens, flags = [], []
    for sent in js:
        ts, fs = [], []
        for t, f in sent:
            ts.append(t)
            fs.append(f)
        tokens.append(ts)
        flags.append(fs)
    return tokens, flags

In [9]:
x_test_tokens, y_test = read_x_y_json(test_fn)

In [10]:
print(x_test_tokens[:2])
print(y_test[:2])

[['I', 'think', 'the', 'magnitude', 'of', 'a', 'benefit', 'and', 'error', 'rates', 'that', 'were', 'chosen', 'were', 'reasonable', 'They', 'were', 'standard', 'from', 'our', 'learning', '.'], ['Economists', 'on', 'both', 'the', 'left', 'and', 'right', 'broadly', 'agree', 'that', 'the', 'need', 'for', 'stimulative', 'government', 'spending', 'is', 'necessary', 'to', 'prevent', 'a', 'further', 'collapse', 'of', 'the', 'global', 'economic', 'system', '-', 'just', 'as', 'the', 'New', 'Deal', 'and', 'the', 'deficit', 'spending', 'of', 'World', 'War', 'II', 'restored', 'the', 'health', 'of', 'the', 'global', 'economy', 'in', 'the', 'last', 'century', '.']]
[[False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, Fa

In [11]:
def load_colloc_bigrams():
    bi_name = 'data/bigrams.jsonl'
    with jsonlines.open(bi_name, 'r') as r:
        res = {}
        for line in r:
            res[line[0]]=line[1]
    return res

In [12]:
bigrams = load_colloc_bigrams()

## Named entities as features
It was not useful (difference was in the third digit after decimal point), but interesting.

In [13]:
def retrieve_entity_features(i: int, sentence: spacy.tokens.Doc, feature_prefix=''):
    d = {}
    for e in sentence.ents:
        d.update({t.i:t for t in e})
    t = d.get(i, None)
    if t:
        return {feature_prefix+'ent_type':t.ent_type_, feature_prefix+'ent_iob':t.ent_iob_}
    return {}
    print(d)
    
_doc = nlp("San Francisco considers banning sidewalk delivery robots")
print(retrieve_entity_features(0, _doc))
print(retrieve_entity_features(1, _doc, '+1:'))
print(retrieve_entity_features(2, _doc))
del(_doc)

{'ent_type': 'GPE', 'ent_iob': 'B'}
{'+1:ent_type': 'GPE', '+1:ent_iob': 'I'}
{}


## Features construction:

In [14]:
def word2features(tokens, i, sentence: spacy.tokens.Doc):
    word = tokens[i]    
    # print(word)
    features = {
        #'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word[-1:]': word[-1:],
        'word[:3]': word[:3],
        'word[:2]': word[:2],
        'word[:1]': word[:1],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'i': i,
        '~i': len(tokens)-i,
        'word.lemma': sentence[i].lemma_,
        'pos': sentence[i].pos_,
        'dep': sentence[i].dep_,
        'n_lefts': sentence[i].n_lefts,
        'n_rights': sentence[i].n_rights,
    }
    #features.update(retrieve_entity_features(i, sentence))
    
    if i > 0:
        word1 = tokens[i-1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:word.lemma': sentence[i-1].lemma_,
            '-1:pos': sentence[i-1].pos_,
            '-1:dep': sentence[i-1].dep_,
        })
        #features.update(retrieve_entity_features(i-1, sentence, '-1'))
    else:
        features['BOS'] = True

    if i > 1:
        word2 = tokens[i-2]
        features.update({
            '-2:word.lower()': word2.lower(),
            '-2:word.istitle()': word2.istitle(),
            '-2:word.isupper()': word2.isupper(),
            '-2:word.lemma': sentence[i-2].lemma_,
            '-2:pos': sentence[i-2].pos_,
            '-2:dep': sentence[i-2].dep_,
        })
        #features.update(retrieve_entity_features(i-2, sentence, '-2'))
    else:
        features['BOS2'] = True

        
    if i < len(tokens)-1:
        word1 = tokens[i+1]
        bigram_key = word.lower()+'_'+word1.lower()
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:word.lemma': sentence[i+1].lemma_,
            '+1:pos': sentence[i+1].pos_,
            '+1:dep': sentence[i+1].dep_,
            'has_bigram20': bigrams.get(bigram_key, 0)>20,
            'has_bigram5': bigrams.get(bigram_key, 0)>5,
            'has_bigram1': bigrams.get(bigram_key, 0)>1,
            'bigram_counter': bigrams.get(bigram_key, 0),
        })
        #features.update(retrieve_entity_features(i+1, sentence, '+1'))
    else:
        features['EOS'] = True

    if i < len(tokens)-2:
        word2 = tokens[i+2]
        features.update({
            '+2:word.lower()': word2.lower(),
            '+2:word.istitle()': word2.istitle(),
            '+2:word.isupper()': word2.isupper(),
            '+2:word.lemma': sentence[i+2].lemma_,
            '+2:pos': sentence[i+2].pos_,
            '+2:dep': sentence[i+2].dep_,
        })
        #features.update(retrieve_entity_features(i+2, sentence, '+2'))
    else:
        features['EOS2'] = True
    
    return features

In [15]:
def tokens2features(sentences):
    res = []
    for sent in tqdm(sentences, total=len(sentences)):
        doc = nlp(' '.join(sent))
        res.append([word2features(sent, i, doc) for i in range(len(sent))])
    return res

def flat_list(list_of_lists):
    return [item for sublist in list_of_lists for item in sublist]

In [16]:
tokens2features(x_train_tokens[:10])

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




[[{'word.lower()': 'i',
   'word[-3:]': 'I',
   'word[-2:]': 'I',
   'word[-1:]': 'I',
   'word[:3]': 'I',
   'word[:2]': 'I',
   'word[:1]': 'I',
   'word.isupper()': True,
   'word.istitle()': True,
   'word.isdigit()': False,
   'i': 0,
   '~i': 12,
   'word.lemma': '-PRON-',
   'pos': 'PRON',
   'dep': 'nsubjpass',
   'n_lefts': 0,
   'n_rights': 0,
   'BOS': True,
   'BOS2': True,
   '+1:word.lower()': 'was',
   '+1:word.istitle()': False,
   '+1:word.isupper()': False,
   '+1:word.lemma': 'be',
   '+1:pos': 'VERB',
   '+1:dep': 'auxpass',
   'has_bigram20': True,
   'has_bigram5': True,
   'has_bigram1': True,
   'bigram_counter': 11665,
   '+2:word.lower()': 'told',
   '+2:word.istitle()': False,
   '+2:word.isupper()': False,
   '+2:word.lemma': 'tell',
   '+2:pos': 'VERB',
   '+2:dep': 'ROOT'},
  {'word.lower()': 'was',
   'word[-3:]': 'was',
   'word[-2:]': 'as',
   'word[-1:]': 's',
   'word[:3]': 'was',
   'word[:2]': 'wa',
   'word[:1]': 'w',
   'word.isupper()': False,
  

In [17]:
x_train = tokens2features(x_train_tokens)
x_dev = tokens2features(x_dev_tokens)
x_test = tokens2features(x_test_tokens)

HBox(children=(IntProgress(value=0, max=50001), HTML(value='')))




HBox(children=(IntProgress(value=0, max=50001), HTML(value='')))




HBox(children=(IntProgress(value=0, max=200), HTML(value='')))




In [18]:
print(len(x_train[3]))
print(len(y_train[3]))

74
74


In [19]:
vectorizer = DictVectorizer()

x_train_features = vectorizer.fit_transform(flat_list(x_train))
x_dev_features = vectorizer.transform(flat_list(x_dev))
x_test_features = vectorizer.transform(flat_list(x_test))



y_train_flat = flat_list(y_train)
y_dev_flat = flat_list(y_dev)
y_test_flat = flat_list(y_test)



Strip to rare features. It adds more stability to our results.

In [20]:
selector = SelectPercentile(percentile=50)
selector.fit(x_train_features, y_train_flat)

SelectPercentile(percentile=50,
         score_func=<function f_classif at 0x000001E650E31048>)

In [21]:
print(len(flat_list(x_train)))
print(x_train_features.shape)
print(selector.transform(x_train_features).shape)
print(len(flat_list(x_dev)))
print(x_dev_features.shape)
print(len(flat_list(x_test)))
print(x_test_features.shape)

2675168
(2675168, 818812)
(2675168, 409406)
2704413
(2704413, 818812)
4697
(4697, 818812)


In [22]:
# normalizer = StandardScaler(with_mean=False)
# x_train_norm = normalizer.fit_transform(x_train_features)
# x_dev_norm = normalizer.transform(x_dev_features)
# x_test_norm = normalizer.transform(x_test_features)

In [23]:
# classifier = RandomForestClassifier()

classifier = LogisticRegression(C=1.5, verbose=1, max_iter=200,
                                class_weight={True:1.5, False:1},
                                #class_weight='balanced',
                                solver='liblinear',
                                #solver='newton-cg',
                                #penalty='l2',
                                penalty='l1',
                                n_jobs=None
                               )

# classifier = svm.SVC(verbose=1, max_iter=1000)

classifier.fit(selector.transform(x_train_features), y_train_flat)
#classifier.fit(x_train_norm, y_train_flat)

[LibLinear]

LogisticRegression(C=1.5, class_weight={True: 1.5, False: 1}, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=200,
          multi_class='warn', n_jobs=None, penalty='l1', random_state=None,
          solver='liblinear', tol=0.0001, verbose=1, warm_start=False)

In [24]:
y_dev_pred = classifier.predict(selector.transform(x_dev_features))
# y_dev_pred = classifier.predict(x_dev_norm)

## Dev set metrics

In [25]:
from sklearn.metrics import classification_report
print(classification_report(
    y_dev_flat, y_dev_pred, digits=3
))

              precision    recall  f1-score   support

       False      0.992     0.996     0.994   2639384
        True      0.795     0.693     0.740     65029

   micro avg      0.988     0.988     0.988   2704413
   macro avg      0.894     0.844     0.867   2704413
weighted avg      0.988     0.988     0.988   2704413



In [26]:
y_test_pred = classifier.predict(selector.transform(x_test_features))

## Test set metrics

In [27]:
print(classification_report(
    y_test_flat, y_test_pred, digits=3
))

              precision    recall  f1-score   support

       False      0.992     0.988     0.990      4542
        True      0.676     0.755     0.713       155

   micro avg      0.980     0.980     0.980      4697
   macro avg      0.834     0.871     0.852      4697
weighted avg      0.981     0.980     0.981      4697



In [28]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
tp, fp, fn = 0, 0, 0
for y, p in zip(y_test_flat, y_test_pred):
    if y and p:
        tp +=1
    if y and not p:
        fn +=1
    if not y and p:
        fp +=1
print(f"TP={tp}, FP={fp}, FN={fn}")

TP=117, FP=56, FN=38


## Top 10 positive and negative features

In [29]:
# vectorizer.feature_names_
# sorted(list(zip(classifier.feature_importances_, vectorizer.feature_names_)), reverse=True)
imports = sorted(list(zip(classifier.coef_[0], vectorizer.feature_names_)), reverse=True)
print(imports[:10])
print(imports[-10:])
#classifier.coef_

[(14.075966362203173, '+1:word.lemma=Mezzanote'), (11.129290382924376, '+1:word.lower()=balducci'), (10.67547239984724, '+1:word.lemma=Mask'), (10.657706983101491, '-1:word.lower()=.3:09:34'), (10.466405182680885, '+1:word.lemma=Milane'), (10.18788869898681, '+1:word.lemma=Other'), (10.040133230527486, '+1:word.lemma=MicroTherm'), (9.996631152650615, '+1:word.lemma=Nieto'), (9.749083975312871, '+1:word.lemma=Rahlves'), (9.578070265724287, '+2:word.lower()=gosto')]
[(-5.463148928322413, '+2:word.lower()=congested'), (-5.4783758334239785, '+2:word.lemma=unborn'), (-5.683289053495804, '-1:word.lemma=subzero'), (-5.729434588738841, '+1:word.lemma=deathbed'), (-6.959003813785296, '-1:word.lemma=Sts'), (-7.436974673630687, '-1:word.lemma=voyage'), (-7.617304668787699, '+1:word.lower()=modification'), (-7.758571901777191, '-1:word.lemma=ganache'), (-8.984993063711366, '-1:word.lemma=Voice'), (-13.696299500682752, '-1:word.lemma=Bindery')]
