In [3]:
import re
import math
import json
import random

import sentence_utils as ut

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from tqdm import tqdm
import spacy
# from spacy.tokenizer import Tokenizer

In [4]:
flatten = lambda l: [item for sublist in l for item in sublist]

In [5]:
nlp = spacy.load('en_core_web_md', disable=['tagger'])

In [6]:
test_path = "../../../../tasks/07-language-as-sequence/run-on-test.json"
test_X = []
test_y = []
with open(test_path) as input_f:
    results = json.loads(input_f.read())
    for result in results:
        X = [s[0] for s in result]
        y = [s[1] for s in result]
        test_X.append(X)
        test_y.append(y)

In [7]:
input_file = zipfile.ZipFile('./data.zip')

def read_lines(file):
    with input_file.open(file) as inp:
        return inp.readlines()

In [8]:
def normalize_trailing_fullstops(sentence):
    sentence = sentence.strip()
    if re.search("[?!\.]\s?$", sentence, re.MULTILINE):
        return sentence
    return re.sub("\.?\s?$", ".", sentence, re.MULTILINE)

In [9]:
enrn_sentences = read_lines('enron.txt')

In [10]:
enrn_sentences_normalized = [normalize_trailing_fullstops(sent) for sent in enrn_sentences]

In [11]:
news_sentences = read_lines('newsgroup.txt')

In [12]:
news_sentences_normalized = [normalize_trailing_fullstops(sent) for sent in news_sentences]

In [13]:
wiki_sentences = read_lines('wikipedia.txt')

In [14]:
wiki_sentences_normalized = [normalize_trailing_fullstops(sent) for sent in wiki_sentences]

In [15]:
def is_finishing_punctutation(char):
    return char in ['.', '..', '...', '?', '!']
def remove_trailing_punctutation(pair):
    tok_sequence, label_sequence = pair
    if is_finishing_punctutation(tok_sequence[-1]):
        remove_count = 0
        for i in range(-1, -len(tok_sequence), -1):
            if is_finishing_punctutation(tok_sequence[i]):
                remove_count += 1
            else:
                break
        for i in range(remove_count):
            tok_sequence.pop()
            label_sequence.pop()
    return (tok_sequence, label_sequence)

## Let's try to build simple baseline, but first need to add some positive cases

In [16]:
def add_positive_cases(Xs, ys, ratio=0.4):
    """Merging randomly sampled sentences and adding positive cases"""
    newXs = []
    newYs = []
    for i in range(math.floor(len(Xs) * ratio)):
        first_id = random.randint(0, len(Xs) - 1)
        second_id = random.randint(0, len(Xs) - 1)
        while first_id == second_id:
            second_id = random.randint(0, len(Xs) - 1)
        first_X, first_y = Xs[first_id].copy(), ys[first_id].copy()
        second_X, second_y = Xs[second_id].copy(), ys[second_id].copy()
        first_X, first_y = remove_trailing_punctutation((first_X, first_y))
        first_y[-1] = True
        newXs.append(first_X + second_X)
        newYs.append(first_y + second_y)
    return Xs + newXs, ys + newYs

def augment_partially(Xs, ys):
    """Idea here is to do some augmentations on Xs like changing names / exclamations / farewells"""
    newXs = []
    newYs = []
    return Xs + newXs, ys + newYs

def create_dataset(dataset_sentences, positive_cases_ration):
    raw_pairs = [ut.sentence_to_sequences(s) for s in dataset_sentences]
    [Xs, ys] = zip(*raw_pairs)
    Xs = list(Xs)
    ys = list(ys)
    Xs, ys = add_positive_cases(Xs, ys, positive_cases_ration)
    Xs, ys = augment_partially(Xs, ys)
    return Xs, ys

Xs, ys = create_dataset(enrn_sentences_normalized, 0.4)
len(Xs)

489370

In [15]:
Xs[0:3]

[['Here', 'is', 'our', 'forecast', '.'],
 ['test', 'successful', '.'],
 ['way', 'to', 'go', '!', '!']]

In [16]:
X_train, X_test, y_train, y_test = train_test_split(Xs, ys, test_size=0.4, random_state=42)

In [17]:
class BaselinePredictor:
    def _predict_single(self, x):
        res = [False]
        for (prev_tok, tok) in zip(x, x[1:]):
            if prev_tok and tok[0].isupper() and not is_finishing_punctutation(prev_tok):
                res.append(True)
            else:
                res.append(False)
        return res
    
    def predict(self, Xs):
        return [self._predict_single(x) for x in Xs]
        
baseline = BaselinePredictor()

In [18]:
y_pred = baseline.predict(Xs)

In [19]:
print(classification_report(flatten(ys), flatten(y_pred)))

             precision    recall  f1-score   support

      False       0.98      0.82      0.90   8050555
       True       0.02      0.19      0.03    139820

avg / total       0.97      0.81      0.88   8190375



```
 precision    recall  f1-score   support

      False       0.98      0.82      0.90   8050279
       True       0.02      0.19      0.03    139820

avg / total       0.97      0.81      0.88   8190099
```

# well, given we are thinking about using different datasets, might as well try running baseline on test set

In [20]:
y_pred = baseline.predict(test_X)
print(classification_report(flatten(test_y), flatten(y_pred)))

             precision    recall  f1-score   support

      False       0.97      0.93      0.95      4542
       True       0.03      0.06      0.04       155

avg / total       0.94      0.91      0.92      4697



```
             precision    recall  f1-score   support

      False       0.97      0.93      0.95      4542
       True       0.03      0.06      0.04       155

avg / total       0.94      0.91      0.92      4697
```

it has slightly more precision but multiple times lower recall for minor class. 
Let's now actually build some predictor, a simple one using subset of data

In [23]:
def is_quote(word):
    return re.match("[\"\'«»”“]", word) != None

In [22]:
doc = ut.toks_to_spacy(nlp, ut.sentence_to_tokens("Bill is really nice grello"))
for (i, tok) in enumerate(doc):
    print(tok.like_email)

False
False
False
False
False


In [24]:
def single_tok_to_feature(label, tok):
    if not tok:
        return {
            label + '_shape': 'NONE',
            label + '_is_quote': False,
            label + '_is_punct': False,
            label + '_is_start': False,
            label + '_in_vocab': False,
            label + '_like_num': False,
            label + '_like_url': False,
            label + '_like_email': False,
            label + '_ent_type': 'NONE',
        }
    return {
        label + '_shape': tok.shape_[0:4],
        label + '_is_quote': is_quote(str(tok)),
        label + '_is_punct': tok.is_punct,
        label + '_is_start': tok.is_sent_start,
        label + '_in_vocab': str(tok) in nlp.vocab,
        label + '_like_num': tok.like_num,
        label + '_like_url': tok.like_url,
        label + '_like_email': tok.like_email,
        label + '_ent_type': tok.ent_type_
        
    }
def to_features(word, prev_1, next_1): # prev_2, prev_3, next_2, next_3
    return {
        **single_tok_to_feature('word', word),
        **single_tok_to_feature('prev_1', prev_1),
        **single_tok_to_feature('next_1', next_1),
    }
def single_doc_to_features(sentence):
    sent = [tok for tok in sentence if tok]
    doc = ut.toks_to_spacy(nlp, sent)
    res = []
    for (i, token) in enumerate(doc):
        next_w = doc[i+1] if i + 1 < len(doc) else None
        prev_w = doc[i-1] if i - 1 >= 0 else None
        res.append(to_features(token, prev_w, next_w))
    return res
def vectorize(dataset):
    res = []
    for sent in tqdm(dataset):
        res += single_doc_to_features(sent)
    return res

In [25]:
def fill_nans(sparse_matrix):
    sparse_matrix.data = np.nan_to_num(sparse_matrix.data)
    return sparse_matrix

In [154]:
# dict_vectorizer = DictVectorizer()
vec_train_X = vectorize(X_train)
vec_test_X = vectorize(X_test)

100%|██████████| 293622/293622 [39:58<00:00, 123.97it/s]
100%|██████████| 195748/195748 [26:35<00:00, 122.65it/s]


In [156]:
dict_vectorizer = DictVectorizer()
train_features_X = fill_nans(dict_vectorizer.fit_transform(vec_train_X))
test_features_X = fill_nans(dict_vectorizer.transform(vec_test_X))

In [157]:
predictor = LogisticRegression().fit(X=train_features_X, y=flatten(y_train))

In [158]:
print(classification_report(predictor.predict(train_features_X), flatten(y_train)))

             precision    recall  f1-score   support

      False       1.00      0.99      0.99   4894548
       True       0.29      0.76      0.42     31865

avg / total       0.99      0.99      0.99   4926413



In [160]:
print(classification_report(predictor.predict(test_features_X), flatten(y_test)))

             precision    recall  f1-score   support

      False       1.00      0.99      0.99   3242663
       True       0.29      0.76      0.42     21299

avg / total       0.99      0.99      0.99   3263962



Let's see how it helps with gold test, even though it's kinda breaking the rules, but might be we overfitted on what we have in train/test here

In [161]:
y_test_pred = predictor.predict(fill_nans(dict_vectorizer.transform(vectorize(test_X))))

100%|██████████| 200/200 [00:02<00:00, 70.53it/s]


In [164]:
print(classification_report(y_test_pred, flatten(test_y)))

             precision    recall  f1-score   support

      False       1.00      0.98      0.99      4643
       True       0.32      0.93      0.48        54

avg / total       0.99      0.98      0.98      4697



Results seems way better. Now I'm gonna to change code in some of the functions used for creating / augmenting dataset in order to have more variation and actually less data. I assume it works that well because of spacy's `is_sent_start` attribute provided for tokens

In [106]:
def add_positive_cases(Xs, ys, ratio=0.4, mergeThreeRatio=0.3, keep_ratio=0.05):
    """Merging randomly sampled sentences and adding positive cases"""
    newXs = []
    newYs = []
    for i in range(math.floor(len(Xs) * ratio)):
        first_id = random.randint(0, len(Xs) - 1)
        second_id = random.randint(0, len(Xs) - 1)
        while first_id == second_id:
            second_id = random.randint(0, len(Xs) - 1)
        first_X, first_y = Xs[first_id].copy(), ys[first_id].copy()
        second_X, second_y = Xs[second_id].copy(), ys[second_id].copy()
        first_X, first_y = remove_trailing_punctutation((first_X, first_y))
        first_y[-1] = True
        newXs.append(first_X + second_X)
        newYs.append(first_y + second_y)
    for i in range(math.floor(len(Xs) * mergeThreeRatio)):
        first_id = random.randint(0, len(Xs) - 1)
        second_id = random.randint(0, len(Xs) - 1)
        while first_id == second_id:
            second_id = random.randint(0, len(Xs) - 1)
        third_id = random.randint(0, len(Xs) - 1)
        while third_id == first_id or third_id == second_id:
            third_id = random.randint(0, len(Xs) - 1)
            
        first_X, first_y = Xs[first_id].copy(), ys[first_id].copy()
        second_X, second_y = Xs[second_id].copy(), ys[second_id].copy()
        third_X, third_y = Xs[third_id].copy(), ys[third_id].copy()
        first_X, first_y = remove_trailing_punctutation((first_X, first_y))
        first_y[-1] = True
        second_X, second_y = remove_trailing_punctutation((first_X, first_y))
        second_y[-1] = True
        newXs.append(first_X + second_X + third_X)
        newYs.append(first_y + second_y + third_y)
    for i in range(math.floor(len(Xs) * keep_ratio)):
        index = random.randint(0, len(Xs) - 1)
        newXs.append(Xs[index].copy())
        newYs.append(ys[index].copy())
    return newXs, newYs

def replace_punct_tok(token):
    if not is_finishing_punctutation(token):
        return token
    return random.choice(set(['.', '..', '...', '?', '!']) - set(token))

def augment_partially(Xs, ys, ratio = 0.2):
    """Idea here was to do some augmentations on Xs like changing ending punctutation / names / farewells"""
    newXs = []
    newYs = []
    subset_ratio = 0.4
    subset_with_puncutation = []
    k = 0
    while len(subset_with_puncutation) < math.floor(len(Xs) * subset_ratio) and k < len(Xs):
        if any([is_finishing_punctutation(tok) for tok in Xs[k]]):
            subset_with_puncutation.append(k)
        k += 1
    for i in range(math.floor(len(Xs) * ratio)):
        random_index = random.choice(subset_with_puncutation)
        newX = [replace_punct_tok(x) for x in Xs[random_index].copy()]
        newXs.append(newX)
        newYs.append(ys[random_index].copy())
    return Xs + newXs, ys + newYs

In [20]:
Xs, ys = create_dataset(enrn_sentences_normalized + wiki_sentences_normalized + news_sentences, 0.4)
len(Xs)

413622

Well, my kernel has died on me, when I tried running vectorize on whole dataset after crunching through data for an hour, so I actually will skip experimenting with whole dataset and go to step 3 I planned for – having smaller subset but trying more agressively to add positive cases / other features

In [51]:
X_train, X_test, y_train, y_test = train_test_split(Xs[0:50000], ys[0:50000], test_size=0.3, random_state=42)

In [52]:
vec_train_X = vectorize(X_train)
vec_test_X = vectorize(X_test)
dict_vectorizer = DictVectorizer()
train_features_X = fill_nans(dict_vectorizer.fit_transform(vec_train_X))
test_features_X = fill_nans(dict_vectorizer.transform(vec_test_X))
predictor = LogisticRegression().fit(X=train_features_X, y=flatten(y_train))

100%|██████████| 35000/35000 [06:52<00:00, 84.78it/s] 
100%|██████████| 15000/15000 [02:28<00:00, 100.72it/s]


In [53]:
print(classification_report(predictor.predict(train_features_X), flatten(y_train)))

             precision    recall  f1-score   support

      False       0.99      0.98      0.99   1139204
       True       0.43      0.71      0.53     21157

avg / total       0.98      0.98      0.98   1160361



In [54]:
print(classification_report(predictor.predict(test_features_X), flatten(y_test)))

             precision    recall  f1-score   support

      False       0.99      0.98      0.99    492100
       True       0.43      0.71      0.53      9051

avg / total       0.98      0.98      0.98    501151



50000 samples on test will give:
```
             precision    recall  f1-score   support

      False       0.99      0.98      0.99    492100
       True       0.43      0.71      0.53      9051

avg / total       0.98      0.98      0.98    501151
```
10000 samples on test will give:
```
             precision    recall  f1-score   support

      False       0.99      0.98      0.99     98400
       True       0.41      0.71      0.52      1749

avg / total       0.98      0.98      0.98    100149
```
1000 samples on test will give:
```
             precision    recall  f1-score   support

      False       1.00      0.98      0.99     10123
       True       0.30      0.76      0.43       118

avg / total       0.99      0.98      0.98     10241
```
100 samples on test will give:
```
             precision    recall  f1-score   support

      False       1.00      0.98      0.99       955
       True       0.30      0.69      0.42        13

avg / total       0.99      0.97      0.98       968
```

## Yet another model

Few hypothesis:
1. should probably split train/test before augmenting because otherwise lots of similar sentences might end up in both sets
2. my input sentences might be not too meaningful for this task - should remove those consisting from one word and those that are over 25 tokens
3. let's add to some subset of sentences words end of bye/ok thanks/etc
4. let's try some additional features

In [107]:
def create_dataset(dataset_sentences, positive_cases_ratio, test_size=0.4, max_size=100):
    size_train = math.floor(max_size*(1 - test_size))
    size_test = math.floor(max_size*test_size)
    raw_pairs = [ut.sentence_to_sequences(s) for s in dataset_sentences]
    [Xs, ys] = zip(*raw_pairs)
    X_train, X_test, y_train, y_test = train_test_split(Xs, ys, test_size=test_size, random_state=42)
    
    X_train = list(X_train)
    y_train = list(y_train)
    X_train, y_train = add_positive_cases(X_train, y_train, positive_cases_ratio)
    X_train, y_train = augment_partially(X_train, y_train)
    X_train, y_train = list(zip(*random.sample(list(zip(X_train, y_train)), size_train)))
    
    X_test = list(X_test)
    y_test = list(y_test)
    X_test, y_test = add_positive_cases(X_test, y_test, positive_cases_ratio)
    X_test, y_test = augment_partially(X_test, y_test)
    X_test, y_test = list(zip(*random.sample(list(zip(X_test, y_test)), size_test)))
    
    
    return X_train, X_test, y_train, y_test

In [108]:
X_train, X_test, y_train, y_test = create_dataset((enrn_sentences_normalized + wiki_sentences_normalized + news_sentences), 0.4, test_size=0.4, max_size=10000)

In [109]:
len(X_train), len(X_test)

(6000, 4000)

In [110]:
len(vec_train_X)

222175

In [111]:
vec_train_X = vectorize(X_train)
vec_test_X = vectorize(X_test)

100%|██████████| 6000/6000 [01:01<00:00, 98.12it/s] 
100%|██████████| 4000/4000 [00:40<00:00, 97.67it/s] 


In [112]:
dict_vectorizer = DictVectorizer()
train_features_X = fill_nans(dict_vectorizer.fit_transform(vec_train_X))
test_features_X = fill_nans(dict_vectorizer.transform(vec_test_X))

In [113]:
train_features_X.shape, len(flatten(y_train))

((227227, 1014), 227227)

In [114]:
predictor = LogisticRegression().fit(X=train_features_X, y=flatten(y_train))

In [115]:
print(classification_report(predictor.predict(train_features_X), flatten(y_train)))

             precision    recall  f1-score   support

      False       0.99      0.98      0.99    222458
       True       0.47      0.75      0.58      4769

avg / total       0.98      0.98      0.98    227227



In [116]:
print(classification_report(predictor.predict(test_features_X), flatten(y_test)))

             precision    recall  f1-score   support

      False       0.99      0.98      0.99    148030
       True       0.48      0.75      0.58      3241

avg / total       0.98      0.98      0.98    151271



test result:
```
             precision    recall  f1-score   support

      False       0.99      0.98      0.99    148030
       True       0.48      0.75      0.58      3241

avg / total       0.98      0.98      0.98    151271
```
which is higher than previously. I don't have a good explanation to why that happen, but several ideas:
1) variance in data/predictor
2) having smaller subset used both in test and train to create positive samples somehow causing this 🤔

In [133]:
def is_good_sentence(sent):
    return ' ' in sent and len(re.findall(' ', sent)) < 26

In [150]:
all_sents = enrn_sentences_normalized + wiki_sentences_normalized + news_sentences
filtered_sents = [sent for sent in all_sents if is_good_sentence(sent)]
X_train, X_test, y_train, y_test = create_dataset(filtered_sents, 0.4, test_size=0.4, max_size=10000)

In [151]:
vec_train_X = vectorize(X_train)
vec_test_X = vectorize(X_test)
dict_vectorizer = DictVectorizer()
train_features_X = fill_nans(dict_vectorizer.fit_transform(vec_train_X))
test_features_X = fill_nans(dict_vectorizer.transform(vec_test_X))
train_features_X.shape, len(flatten(y_train))
predictor = LogisticRegression().fit(X=train_features_X, y=flatten(y_train))

100%|██████████| 6000/6000 [00:59<00:00, 101.50it/s]
100%|██████████| 4000/4000 [00:39<00:00, 101.13it/s]


In [152]:
print(classification_report(predictor.predict(train_features_X), flatten(y_train)))

             precision    recall  f1-score   support

      False       0.99      0.98      0.99    198246
       True       0.52      0.77      0.62      5768

avg / total       0.98      0.97      0.98    204014



In [153]:
print(classification_report(predictor.predict(test_features_X), flatten(y_test)))

             precision    recall  f1-score   support

      False       0.99      0.98      0.99    132828
       True       0.51      0.75      0.61      3922

avg / total       0.98      0.97      0.97    136750



Seems to slightly improve results

In [138]:
random.random()

0.8256987460278477

In [154]:
def random_end():
    return random.choice(['bye', 'farewell', 'thanks'])
def random_start():
    return random.choice(['hello', 'hi', 'hey', 'greetings'])
def augment_partially(Xs, ys, ratio = 0.2):
    """Idea here was to do some augmentations on Xs like changing ending punctutation / names / farewells"""
    newXs = []
    newYs = []
    subset_ratio = 0.4
    subset_with_puncutation = []
    k = 0
    while len(subset_with_puncutation) < math.floor(len(Xs) * subset_ratio) and k < len(Xs):
        if any([is_finishing_punctutation(tok) for tok in Xs[k]]):
            subset_with_puncutation.append(k)
        k += 1
    for i in range(math.floor(len(Xs) * ratio)):
        random_index = random.choice(subset_with_puncutation)
        newX = [replace_punct_tok(x) for x in Xs[random_index].copy()]
        newXs.append(newX)
        newYs.append(ys[random_index].copy())
    for i in range(math.floor(len(Xs) * ratio)):
        random_index = random.randint(0, len(Xs) - 1)
        if random.random() > 0.5:
            newX = Xs[random_index].copy() + [random_end()]
            newy = ys[random_index].copy() + [False]
            newy[-2] = True
        else:
            newX = [random_start()] + Xs[random_index].copy()
            newy = [True] + ys[random_index].copy()
        newXs.append(newX)
        newYs.append(newy)
    return Xs + newXs, ys + newYs

Using same params, give yet another few percents of improvement. I'm rerunning this. Given we know test is all about run-on sentences and we don't actually care for regular setences, let's have all of those in dataset be run-on for sure. Not something I would actually do if this was a task for production, but let's see, what this will get.

In [159]:
X_train, X_test, y_train, y_test = create_dataset(filtered_sents, 1, test_size=0.4, max_size=10000)

In [160]:
vec_train_X = vectorize(X_train)
vec_test_X = vectorize(X_test)
dict_vectorizer = DictVectorizer()
train_features_X = fill_nans(dict_vectorizer.fit_transform(vec_train_X))
test_features_X = fill_nans(dict_vectorizer.transform(vec_test_X))
train_features_X.shape, len(flatten(y_train))
predictor = LogisticRegression().fit(X=train_features_X, y=flatten(y_train))

100%|██████████| 6000/6000 [00:58<00:00, 102.53it/s]
100%|██████████| 4000/4000 [00:38<00:00, 103.49it/s]


In [161]:
print(classification_report(predictor.predict(train_features_X), flatten(y_train)))

             precision    recall  f1-score   support

      False       0.99      0.98      0.99    189300
       True       0.47      0.76      0.58      4806

avg / total       0.98      0.97      0.98    194106



In [162]:
print(classification_report(predictor.predict(test_features_X), flatten(y_test)))

             precision    recall  f1-score   support

      False       0.99      0.98      0.99    126467
       True       0.45      0.75      0.56      3099

avg / total       0.98      0.97      0.98    129566



This actually made results worse. rolling back to 0.4 ratio for next one

In [163]:
X_train, X_test, y_train, y_test = create_dataset(filtered_sents, 0.4, test_size=0.4, max_size=10000)

For features, let's check
1. adding another word prior and afterwards
2. trying to infer distance for supposedly start/end of sentence for current word
3. check whether head of token is left or right of token

In [187]:
def single_tok_to_feature(label, tok):
    if not tok:
        return {
            label + '_shape': 'NONE',
            label + '_is_quote': False,
            label + '_is_punct': False,
            label + '_is_start': False,
            label + '_in_vocab': False,
            label + '_like_num': False,
            label + '_like_url': False,
            label + '_like_email': False,
            label + '_ent_type': 'NONE',
        }
    return {
        label + '_shape': tok.shape_[0:4],
        label + '_is_quote': is_quote(str(tok)),
        label + '_is_punct': tok.is_punct,
        label + '_is_start': tok.is_sent_start,
        label + '_in_vocab': str(tok) in nlp.vocab,
        label + '_like_num': tok.like_num,
        label + '_like_url': tok.like_url,
        label + '_like_email': tok.like_email,
        label + '_ent_type': tok.ent_type_
        
    }
def to_features(word, prev_1, next_1, prev_2, next_2, after_sent_start, till_send_end): # prev_2, prev_3, next_2, next_3
    return {
        'after_sent_start': after_sent_start,
        'till_send_end': till_send_end,
        'head_right': word.i < word.head.i,
        'head_left': word.i > word.head.i,
        **single_tok_to_feature('word', word),
        **single_tok_to_feature('prev_1', prev_1),
        **single_tok_to_feature('prev_2', prev_2),
        **single_tok_to_feature('next_1', next_1),
        **single_tok_to_feature('next_2', next_2)
    }
def single_doc_to_features(sentence):
    sent = [tok for tok in sentence if tok]
    doc = ut.toks_to_spacy(nlp, sent)
    res = []
    after_sent_start = 0
    till_send_end = len(doc)
    sent_ends = [tok.is_sent_start for tok in doc]
    for (i, token) in enumerate(doc):
        after_sent_start += 1
        if token.is_sent_start:
            after_sent_start = 0
        try:
            till_send_end = sent_ends[i:].index(True)
        except ValueError:
            till_send_end = len(sent_ends[i:])
        next_w = doc[i+1] if i + 1 < len(doc) else None
        next_w2 = doc[i+2] if i + 2 < len(doc) else None
        prev_w = doc[i-1] if i - 1 >= 0 else None
        prev_w2 = doc[i-2] if i - 2 >= 0 else None
        res.append(to_features(token, prev_1=prev_w, prev_2=prev_w2, next_1=next_w, next_2=next_w2, after_sent_start=after_sent_start, till_send_end=till_send_end))
    return res

In [188]:
vec_train_X = vectorize(X_train)
vec_test_X = vectorize(X_test)
dict_vectorizer = DictVectorizer()
train_features_X = fill_nans(dict_vectorizer.fit_transform(vec_train_X))
test_features_X = fill_nans(dict_vectorizer.transform(vec_test_X))
train_features_X.shape, len(flatten(y_train))
predictor = LogisticRegression().fit(X=train_features_X, y=flatten(y_train))

100%|██████████| 6000/6000 [01:06<00:00, 94.89it/s]
100%|██████████| 4000/4000 [00:43<00:00, 92.94it/s] 


In [189]:
print(classification_report(predictor.predict(train_features_X), flatten(y_train)))

             precision    recall  f1-score   support

      False       0.99      0.99      0.99    196929
       True       0.68      0.85      0.76      6921

avg / total       0.98      0.98      0.98    203850



In [190]:
print(classification_report(predictor.predict(test_features_X), flatten(y_test)))

             precision    recall  f1-score   support

      False       0.99      0.99      0.99    133053
       True       0.67      0.84      0.75      4568

avg / total       0.98      0.98      0.98    137621



This seems improve to results. I think I'm done here, so just for fun, gonna take slightly more data in, train classifier and predict on gold test.

In [191]:
X_train, X_test, y_train, y_test = create_dataset(filtered_sents, 0.4, test_size=0.4, max_size=100000)
vec_train_X = vectorize(X_train)
vec_test_X = vectorize(X_test)
dict_vectorizer = DictVectorizer()
train_features_X = fill_nans(dict_vectorizer.fit_transform(vec_train_X))
test_features_X = fill_nans(dict_vectorizer.transform(vec_test_X))
predictor = LogisticRegression().fit(X=train_features_X, y=flatten(y_train))
print(classification_report(predictor.predict(train_features_X), flatten(y_train)))
print(classification_report(predictor.predict(test_features_X), flatten(y_test)))

100%|██████████| 60000/60000 [10:51<00:00, 92.04it/s] 
100%|██████████| 40000/40000 [07:34<00:00, 87.95it/s]


             precision    recall  f1-score   support

      False       0.99      0.99      0.99   1982251
       True       0.69      0.84      0.76     70487

avg / total       0.98      0.98      0.98   2052738

             precision    recall  f1-score   support

      False       0.99      0.99      0.99   1327044
       True       0.69      0.84      0.76     47494

avg / total       0.98      0.98      0.98   1374538



In [192]:
# Okay, let's try evaluating on test
y_test_pred = predictor.predict(fill_nans(dict_vectorizer.transform(vectorize(test_X))))

100%|██████████| 200/200 [00:03<00:00, 50.02it/s]


In [193]:
print(classification_report(y_test_pred, flatten(test_y)))

             precision    recall  f1-score   support

      False       1.00      0.98      0.99      4616
       True       0.42      0.80      0.55        81

avg / total       0.99      0.98      0.98      4697



seems to be 0.39 improvement in precision compared to baseline and 0.74 in recall  
in terms of second base model trained on sentences from enron we only get 0.1 improvement in precision and 0.13 dive in recall