In [1]:
filename = 'data/nyt2000-sents.jsonl'
train_fn = filename.replace('.jsonl', '.train.jsonl')
dev_fn = filename.replace('.jsonl', '.dev.jsonl')

test_fn = 'data/run-on-test.json'

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.feature_extraction import DictVectorizer
from sklearn import svm

from itertools import chain

import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

from jsonlines import jsonlines
import json
from tqdm.auto import tqdm

In [3]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [4]:
def read_x_y_jsonl(filename, max_num=1000000):
    buffer = []
    with jsonlines.open(filename) as reader:
        for sentence in reader:
            buffer.append(sentence)
            if len(buffer)>max_num:
                break
    x, y = [], []
    for sent in buffer:
        x.append([x for x, y in sent])
        y.append([y for x, y in sent])
    return x, y

In [5]:
x_train_tokens, y_train = read_x_y_jsonl(train_fn, 10000)
print(x_train_tokens[:2])
print(y_train[:2])

[['I', 'was', 'told', 'that', 'John', 'mixed', 'his', 'last', 'drink', 'last', 'year', '.'], ['With', 'the', 'theme', 'of', '"', 'Leave', 'No', 'Child', 'Behind', ',', '"', 'and', 'people', 'of', 'all', 'races', 'turning', 'the', 'convention', 'stage', 'into', 'Mr.', 'Bush', "'s", 'Neighborhood', ',', 'and', 'lines', 'like', '"', 'We', 'Shall', 'Overcome', '"', 'in', 'his', 'speech', ',', 'Mr.', 'Bush', 'left', 'moderates', 'feeling', 'as', 'if', 'he', 'had', 'unleashed', 'the', 'party', "'s", 'inner', 'Connecticut', 'Yankee', '.']]
[[False, False, False, False, False, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, Fa

In [6]:
x_dev_tokens, y_dev = read_x_y_jsonl(dev_fn, 10000)

In [7]:
def read_x_y_json(filename):
    with open(filename, 'rt', encoding='utf-8') as f:
        js = json.loads(f.read())
    tokens, flags = [], []
    for sent in js:
        ts, fs = [], []
        for t, f in sent:
            ts.append(t)
            fs.append(f)
        tokens.append(ts)
        flags.append(fs)
    return tokens, flags

In [8]:
x_test_tokens, y_test = read_x_y_json(test_fn)

In [9]:
print(x_test_tokens[:2])
print(y_test[:2])

[['I', 'think', 'the', 'magnitude', 'of', 'a', 'benefit', 'and', 'error', 'rates', 'that', 'were', 'chosen', 'were', 'reasonable', 'They', 'were', 'standard', 'from', 'our', 'learning', '.'], ['Economists', 'on', 'both', 'the', 'left', 'and', 'right', 'broadly', 'agree', 'that', 'the', 'need', 'for', 'stimulative', 'government', 'spending', 'is', 'necessary', 'to', 'prevent', 'a', 'further', 'collapse', 'of', 'the', 'global', 'economic', 'system', '-', 'just', 'as', 'the', 'New', 'Deal', 'and', 'the', 'deficit', 'spending', 'of', 'World', 'War', 'II', 'restored', 'the', 'health', 'of', 'the', 'global', 'economy', 'in', 'the', 'last', 'century', '.']]
[[False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False], [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, Fa

In [10]:
def word2features(tokens, i):
    word = tokens[i]
    # print(word)
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word[-1:]': word[-1:],
        'word[:-3]': word[:-3],
        'word[:-2]': word[:-2],
        'word[:-1]': word[:-1],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'i': i,
        '~i': len(tokens)-i
    }
    if i > 0:
        word1 = tokens[i-1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
        })
    else:
        features['BOS'] = True

    if i > 1:
        word2 = tokens[i-2]
        features.update({
            '-2:word.lower()': word2.lower(),
            '-2:word.istitle()': word2.istitle(),
            '-2:word.isupper()': word2.isupper(),
        })
    else:
        features['BOS2'] = True

        
    if i < len(tokens)-1:
        word1 = tokens[i+1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
        })
    else:
        features['EOS'] = True

    if i < len(tokens)-2:
        word2 = tokens[i+2]
        features.update({
            '+2:word.lower()': word2.lower(),
            '+2:word.istitle()': word2.istitle(),
            '+2:word.isupper()': word2.isupper(),
        })
    else:
        features['EOS2'] = True
    
        
    return features

In [11]:
def tokens2features(sentences):
    return [[word2features(tokens, i) for i in range(len(tokens))] for tokens in sentences]

def flat_list(list_of_lists):
    return [item for sublist in list_of_lists for item in sublist]

In [12]:
tokens2features(x_train_tokens)[0]

[{'bias': 1.0,
  'word.lower()': 'i',
  'word[-3:]': 'I',
  'word[-2:]': 'I',
  'word[-1:]': 'I',
  'word[:-3]': '',
  'word[:-2]': '',
  'word[:-1]': '',
  'word.isupper()': True,
  'word.istitle()': True,
  'word.isdigit()': False,
  'i': 0,
  '~i': 12,
  'BOS': True,
  'BOS2': True,
  '+1:word.lower()': 'was',
  '+1:word.istitle()': False,
  '+1:word.isupper()': False,
  '+2:word.lower()': 'told',
  '+2:word.istitle()': False,
  '+2:word.isupper()': False},
 {'bias': 1.0,
  'word.lower()': 'was',
  'word[-3:]': 'was',
  'word[-2:]': 'as',
  'word[-1:]': 's',
  'word[:-3]': '',
  'word[:-2]': 'w',
  'word[:-1]': 'wa',
  'word.isupper()': False,
  'word.istitle()': False,
  'word.isdigit()': False,
  'i': 1,
  '~i': 11,
  '-1:word.lower()': 'i',
  '-1:word.istitle()': True,
  '-1:word.isupper()': True,
  'BOS2': True,
  '+1:word.lower()': 'told',
  '+1:word.istitle()': False,
  '+1:word.isupper()': False,
  '+2:word.lower()': 'that',
  '+2:word.istitle()': False,
  '+2:word.isupper()'

In [13]:
x_train = tokens2features(x_train_tokens)
x_dev = tokens2features(x_dev_tokens)
x_test = tokens2features(x_test_tokens)

In [14]:
print(len(x_train[3]))
print(len(y_train[3]))

74
74


In [15]:
vectorizer = DictVectorizer()
x_train_features = vectorizer.fit_transform(flat_list(x_train))
x_dev_features = vectorizer.transform(flat_list(x_dev))
x_test_features = vectorizer.transform(flat_list(x_test))

y_train_flat = flat_list(y_train)
y_dev_flat = flat_list(y_dev)
y_test_flat = flat_list(y_test)

In [16]:
print(len(flat_list(x_train)))
print(x_train_features.shape)
print(len(flat_list(x_dev)))
print(x_dev_features.shape)
print(len(flat_list(x_test)))
print(x_test_features.shape)

536086
(536086, 275590)
529812
(529812, 275590)
4697
(4697, 275590)


In [60]:
normalizer = StandardScaler(with_mean=False)
x_train_norm = normalizer.fit_transform(x_train_features)
x_dev_norm = normalizer.transform(x_dev_features)
x_test_norm = normalizer.transform(x_test_features)

In [67]:
#classifier = RandomForestClassifier()

classifier = LogisticRegression(C=1.5, verbose=1, max_iter=200,
                               # class_weight={True:100, False:1},
                                solver='liblinear',
                                penalty='l1',
                                n_jobs=-1
                               )

# classifier = svm.SVC(verbose=1, max_iter=1000)

classifier.fit(x_train_features, y_train_flat)
#classifier.fit(x_train_norm, y_train_flat)

[LibLinear]

LogisticRegression(C=1.5, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=200, multi_class='warn', n_jobs=-1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=1, warm_start=False)

In [68]:
y_dev_pred = classifier.predict(x_dev_features)
# y_dev_pred = classifier.predict(x_dev_norm)

In [69]:
from sklearn.metrics import classification_report
print(classification_report(
    y_dev_flat, y_dev_pred, digits=3
))

              precision    recall  f1-score   support

       False      0.987     0.996     0.992    517290
        True      0.750     0.451     0.563     12522

   micro avg      0.983     0.983     0.983    529812
   macro avg      0.869     0.724     0.777    529812
weighted avg      0.981     0.983     0.981    529812



In [94]:
y_test_pred = classifier.predict(x_test_features)

In [95]:
print(classification_report(
    y_test_flat, y_test_pred, digits=3
))

              precision    recall  f1-score   support

       False      0.986     0.993     0.989      4542
        True      0.740     0.587     0.655       155

   micro avg      0.980     0.980     0.980      4697
   macro avg      0.863     0.790     0.822      4697
weighted avg      0.978     0.980     0.978      4697



In [96]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
tp, fp, fn = 0, 0, 0
for y, p in zip(y_test_flat, y_test_pred):
    if y and p:
        tp +=1
    if y and not p:
        fn +=1
    if not y and p:
        fp +=1
print(f"TP={tp}, FP={fp}, FN={fn}")

TP=91, FP=32, FN=64


In [82]:
# vectorizer.feature_names_
# sorted(list(zip(classifier.feature_importances_, vectorizer.feature_names_)), reverse=True)
imports = sorted(list(zip(classifier.coef_[0], vectorizer.feature_names_)), reverse=True)
print(imports[:10])
print(imports[-10:])
#classifier.coef_

[(4.509891708826173, '+1:word.lower()=although'), (4.29019407877335, '+1:word.lower()=but'), (4.250329338176813, '+1:word.lower()=loving'), (4.113101450493355, '+1:word.lower()=we'), (4.088086155449225, '+1:word.lower()=*'), (4.080044633787801, '+1:word.lower()=according'), (4.072726233482204, '+1:word.lower()=though'), (4.018440836789848, '+1:word.lower()=beloved'), (3.928450894016356, 'word[-1:]=F'), (3.713092189473688, '+1:word.lower()=however')]
[(-2.6202645033374563, '+1:word.lower()=street'), (-2.780793883638997, 'BOS'), (-2.796113985900529, '+1:word.lower()=-'), (-3.070500516760489, '-1:word.lower()="'), (-3.1432129540977556, "+1:word.lower()='s"), (-3.488242706292845, 'word[-1:]=.'), (-3.6188531030648043, '+1:word.lower()=,'), (-4.502066400156557, 'word.lower()=and'), (-4.663049237477228, '+1:word.lower()=.'), (-4.762047274404499, 'EOS')]


In [83]:
from collections import defaultdict

dev_start_tokens = defaultdict(list)
for i, s_dev in enumerate(x_dev_tokens):
    dev_start_tokens[s_dev[0]].append(i)

for s_train in x_train_tokens[:2]:
    susp_indices = dev_start_tokens[s_train[0]]
    if susp_indices:
        for i in susp_indices:
            if [i for i, j in zip(s_train, x_dev_tokens[i]) if i!=j]:
                print(s_train)

['I', 'was', 'told', 'that', 'John', 'mixed', 'his', 'last', 'drink', 'last', 'year', '.']
['I', 'was', 'told', 'that', 'John', 'mixed', 'his', 'last', 'drink', 'last', 'year', '.']
['I', 'was', 'told', 'that', 'John', 'mixed', 'his', 'last', 'drink', 'last', 'year', '.']
['I', 'was', 'told', 'that', 'John', 'mixed', 'his', 'last', 'drink', 'last', 'year', '.']
['I', 'was', 'told', 'that', 'John', 'mixed', 'his', 'last', 'drink', 'last', 'year', '.']
['I', 'was', 'told', 'that', 'John', 'mixed', 'his', 'last', 'drink', 'last', 'year', '.']
['I', 'was', 'told', 'that', 'John', 'mixed', 'his', 'last', 'drink', 'last', 'year', '.']
['I', 'was', 'told', 'that', 'John', 'mixed', 'his', 'last', 'drink', 'last', 'year', '.']
['I', 'was', 'told', 'that', 'John', 'mixed', 'his', 'last', 'drink', 'last', 'year', '.']
['I', 'was', 'told', 'that', 'John', 'mixed', 'his', 'last', 'drink', 'last', 'year', '.']
['I', 'was', 'told', 'that', 'John', 'mixed', 'his', 'last', 'drink', 'last', 'year', '.']

['I', 'was', 'told', 'that', 'John', 'mixed', 'his', 'last', 'drink', 'last', 'year', '.']
['I', 'was', 'told', 'that', 'John', 'mixed', 'his', 'last', 'drink', 'last', 'year', '.']
['I', 'was', 'told', 'that', 'John', 'mixed', 'his', 'last', 'drink', 'last', 'year', '.']
['I', 'was', 'told', 'that', 'John', 'mixed', 'his', 'last', 'drink', 'last', 'year', '.']
['I', 'was', 'told', 'that', 'John', 'mixed', 'his', 'last', 'drink', 'last', 'year', '.']
['I', 'was', 'told', 'that', 'John', 'mixed', 'his', 'last', 'drink', 'last', 'year', '.']
['I', 'was', 'told', 'that', 'John', 'mixed', 'his', 'last', 'drink', 'last', 'year', '.']
['I', 'was', 'told', 'that', 'John', 'mixed', 'his', 'last', 'drink', 'last', 'year', '.']
['I', 'was', 'told', 'that', 'John', 'mixed', 'his', 'last', 'drink', 'last', 'year', '.']
['I', 'was', 'told', 'that', 'John', 'mixed', 'his', 'last', 'drink', 'last', 'year', '.']
['I', 'was', 'told', 'that', 'John', 'mixed', 'his', 'last', 'drink', 'last', 'year', '.']

['I', 'was', 'told', 'that', 'John', 'mixed', 'his', 'last', 'drink', 'last', 'year', '.']
['I', 'was', 'told', 'that', 'John', 'mixed', 'his', 'last', 'drink', 'last', 'year', '.']
['I', 'was', 'told', 'that', 'John', 'mixed', 'his', 'last', 'drink', 'last', 'year', '.']
['I', 'was', 'told', 'that', 'John', 'mixed', 'his', 'last', 'drink', 'last', 'year', '.']
['I', 'was', 'told', 'that', 'John', 'mixed', 'his', 'last', 'drink', 'last', 'year', '.']
['I', 'was', 'told', 'that', 'John', 'mixed', 'his', 'last', 'drink', 'last', 'year', '.']
['I', 'was', 'told', 'that', 'John', 'mixed', 'his', 'last', 'drink', 'last', 'year', '.']
['I', 'was', 'told', 'that', 'John', 'mixed', 'his', 'last', 'drink', 'last', 'year', '.']
['I', 'was', 'told', 'that', 'John', 'mixed', 'his', 'last', 'drink', 'last', 'year', '.']
['I', 'was', 'told', 'that', 'John', 'mixed', 'his', 'last', 'drink', 'last', 'year', '.']
['I', 'was', 'told', 'that', 'John', 'mixed', 'his', 'last', 'drink', 'last', 'year', '.']

['I', 'was', 'told', 'that', 'John', 'mixed', 'his', 'last', 'drink', 'last', 'year', '.']
['I', 'was', 'told', 'that', 'John', 'mixed', 'his', 'last', 'drink', 'last', 'year', '.']
['I', 'was', 'told', 'that', 'John', 'mixed', 'his', 'last', 'drink', 'last', 'year', '.']
['I', 'was', 'told', 'that', 'John', 'mixed', 'his', 'last', 'drink', 'last', 'year', '.']
['I', 'was', 'told', 'that', 'John', 'mixed', 'his', 'last', 'drink', 'last', 'year', '.']
['I', 'was', 'told', 'that', 'John', 'mixed', 'his', 'last', 'drink', 'last', 'year', '.']
['I', 'was', 'told', 'that', 'John', 'mixed', 'his', 'last', 'drink', 'last', 'year', '.']
['I', 'was', 'told', 'that', 'John', 'mixed', 'his', 'last', 'drink', 'last', 'year', '.']
['I', 'was', 'told', 'that', 'John', 'mixed', 'his', 'last', 'drink', 'last', 'year', '.']
['I', 'was', 'told', 'that', 'John', 'mixed', 'his', 'last', 'drink', 'last', 'year', '.']
['I', 'was', 'told', 'that', 'John', 'mixed', 'his', 'last', 'drink', 'last', 'year', '.']

['With', 'the', 'theme', 'of', '"', 'Leave', 'No', 'Child', 'Behind', ',', '"', 'and', 'people', 'of', 'all', 'races', 'turning', 'the', 'convention', 'stage', 'into', 'Mr.', 'Bush', "'s", 'Neighborhood', ',', 'and', 'lines', 'like', '"', 'We', 'Shall', 'Overcome', '"', 'in', 'his', 'speech', ',', 'Mr.', 'Bush', 'left', 'moderates', 'feeling', 'as', 'if', 'he', 'had', 'unleashed', 'the', 'party', "'s", 'inner', 'Connecticut', 'Yankee', '.']
['With', 'the', 'theme', 'of', '"', 'Leave', 'No', 'Child', 'Behind', ',', '"', 'and', 'people', 'of', 'all', 'races', 'turning', 'the', 'convention', 'stage', 'into', 'Mr.', 'Bush', "'s", 'Neighborhood', ',', 'and', 'lines', 'like', '"', 'We', 'Shall', 'Overcome', '"', 'in', 'his', 'speech', ',', 'Mr.', 'Bush', 'left', 'moderates', 'feeling', 'as', 'if', 'he', 'had', 'unleashed', 'the', 'party', "'s", 'inner', 'Connecticut', 'Yankee', '.']
['With', 'the', 'theme', 'of', '"', 'Leave', 'No', 'Child', 'Behind', ',', '"', 'and', 'people', 'of', 'all', 

['With', 'the', 'theme', 'of', '"', 'Leave', 'No', 'Child', 'Behind', ',', '"', 'and', 'people', 'of', 'all', 'races', 'turning', 'the', 'convention', 'stage', 'into', 'Mr.', 'Bush', "'s", 'Neighborhood', ',', 'and', 'lines', 'like', '"', 'We', 'Shall', 'Overcome', '"', 'in', 'his', 'speech', ',', 'Mr.', 'Bush', 'left', 'moderates', 'feeling', 'as', 'if', 'he', 'had', 'unleashed', 'the', 'party', "'s", 'inner', 'Connecticut', 'Yankee', '.']
['With', 'the', 'theme', 'of', '"', 'Leave', 'No', 'Child', 'Behind', ',', '"', 'and', 'people', 'of', 'all', 'races', 'turning', 'the', 'convention', 'stage', 'into', 'Mr.', 'Bush', "'s", 'Neighborhood', ',', 'and', 'lines', 'like', '"', 'We', 'Shall', 'Overcome', '"', 'in', 'his', 'speech', ',', 'Mr.', 'Bush', 'left', 'moderates', 'feeling', 'as', 'if', 'he', 'had', 'unleashed', 'the', 'party', "'s", 'inner', 'Connecticut', 'Yankee', '.']
['With', 'the', 'theme', 'of', '"', 'Leave', 'No', 'Child', 'Behind', ',', '"', 'and', 'people', 'of', 'all', 

['With', 'the', 'theme', 'of', '"', 'Leave', 'No', 'Child', 'Behind', ',', '"', 'and', 'people', 'of', 'all', 'races', 'turning', 'the', 'convention', 'stage', 'into', 'Mr.', 'Bush', "'s", 'Neighborhood', ',', 'and', 'lines', 'like', '"', 'We', 'Shall', 'Overcome', '"', 'in', 'his', 'speech', ',', 'Mr.', 'Bush', 'left', 'moderates', 'feeling', 'as', 'if', 'he', 'had', 'unleashed', 'the', 'party', "'s", 'inner', 'Connecticut', 'Yankee', '.']
['With', 'the', 'theme', 'of', '"', 'Leave', 'No', 'Child', 'Behind', ',', '"', 'and', 'people', 'of', 'all', 'races', 'turning', 'the', 'convention', 'stage', 'into', 'Mr.', 'Bush', "'s", 'Neighborhood', ',', 'and', 'lines', 'like', '"', 'We', 'Shall', 'Overcome', '"', 'in', 'his', 'speech', ',', 'Mr.', 'Bush', 'left', 'moderates', 'feeling', 'as', 'if', 'he', 'had', 'unleashed', 'the', 'party', "'s", 'inner', 'Connecticut', 'Yankee', '.']
['With', 'the', 'theme', 'of', '"', 'Leave', 'No', 'Child', 'Behind', ',', '"', 'and', 'people', 'of', 'all', 