In [1]:
%load_ext autoreload
%autoreload 1

In [2]:
import json
import pandas as pd
import numpy as np
from collections import Counter, defaultdict
from pathlib import Path
import bz2
import re
import string
import spacy
import random
from itertools import groupby, chain, tee
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import nltk.corpus.reader.bnc
from spacy.tokens import Doc
from time import time
import dill

In [3]:
%aimport config
from config import REPO

#### Read the data

In [4]:
file_val = "run-on-test.json"
with open(file_val) as f:
    val_data = json.load(f)

In [5]:
def sum_true(x):
    return sum([el[-1] for el in x])

In [6]:
def print_counter(counter, name):
    for key, value in counter.most_common():
        print(f"{name}: {key}\ncount: {value}, percentage: {value*100/sum(counter.values()):.1f}%")

In [7]:
frq = Counter([sum_true(el) for el in val_data])
print_counter(frq, "Run-ons")

Run-ons: 1
count: 145, percentage: 72.5%
Run-ons: 0
count: 50, percentage: 25.0%
Run-ons: 2
count: 5, percentage: 2.5%


In [7]:
def build_df(data):
    output = []
    for k,record in enumerate(data):
        temp = [{"id": k, "word": word, "label": label} for word, label in record]
        output.extend(temp)
    return pd.DataFrame(output)

In [8]:
val_df = build_df(val_data)
val_df.loc[val_df.id==120]

Unnamed: 0,id,label,word
2853,120,False,But
2854,120,False,then
2855,120,False,it
2856,120,True,started
2857,120,False,there
2858,120,False,were
2859,120,False,wails
2860,120,False,and
2861,120,False,cries
2862,120,False,from


In [9]:
# Check class imbalance
val_df.label.value_counts(normalize=True) * 100

False    96.700021
True      3.299979
Name: label, dtype: float64

#### Generate training data. Use Reddit posts

##### Clean and prepare data

In [11]:
folder = REPO / "cmv" / "all"
filename = "heldout_period_data.jsonlist.bz2"
with bz2.open(folder / filename, mode="rt") as f:
    lines = f.readlines()
    data = [json.loads(line) for line in lines]

In [12]:
def check_comment(comment):
    text = comment.get("body", "")
    patt = r"Confirmed:.*awarded.*|This delta is currently disallowed.*|You cannot award OP a delta as.*"
    match = re.search(patt, text)
    if not text or text == "[deleted]" or match:
        return False
    return True

In [13]:
def custom_replacement(m):
    if not m.group(1):
        return ". "
    else:
        return m.group(1)+" "

In [None]:
GRUBER_URLINTEXT_PAT = re.compile(r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?\xab\xbb\u201c\u201d\u2018\u2019]))')

In [14]:
WEB_URL_REGEX = r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))"""

In [15]:
def process_comment(comment, patt_code=r"&amp;#(\d+);"):
    text = comment.get("body", "")
    text = re.sub(r"\[(.*?)\](\s*)\((http(s?):/)?/.*?\)", r"\1", text)
    text = re.sub(r"([.!?:])?\s*[\n|\r]+\s*", custom_replacement, text)
    if re.search(patt_code, text):
        text = re.sub(patt_code, lambda x: chr(int(x.group(1), 10)), text)
    text = re.sub(r"&gt;|#(\d+);(\.)?|&lt;|&amp;nbsp;(\.)?", "", text)
    text = re.sub(r"&amp;", "and", text)
    text = re.sub(WEB_URL_REGEX, "", text)
    text = text.strip()
    text = re.sub(r"(\s)+", r"\1", text)
    return text

In [16]:
comments = []
k = 0
N = 10000
for el in data:
    for comment in el["comments"]:
        if k > N:
            break
        if check_comment(comment):
            comments.append(process_comment(comment)+"\n")
            k += 1

In [None]:
with open("data.txt", "w+") as f:
    f.writelines(comments)

##### Estimate N grams based on Reddit comments (accross sentence boundaries)

In [17]:
nlp = spacy.load("en", disable=["ner", "textcat"])

In [17]:
# Data to estimate N grams
ngr = []
k = 0
for el in data:
    for comment in el["comments"]:
        if check_comment(comment):
            ngr.append(process_comment(comment)+"\n")
            k += 1

In [18]:
def zipngram(doc, n=2):
    return zip(*[doc[i:] for i in range(n)])

In [21]:
t0 = time()
NGRAMS = defaultdict(Counter)
docs = nlp.pipe(ngr[:50000], disable=["ner", "textcat"], batch_size=10 ** 3, n_threads=4)
for doc in docs:
    text = [tok.lemma_ if tok.lemma_ != '-PRON-' else tok.lower_ for tok in doc]
    NGRAMS[1].update(text)
    NGRAMS[2].update(zipngram(text)) 
    NGRAMS[3].update(zipngram(text, 3)) 
print(time() - t0)

891.1918799877167


In [35]:
with (REPO / "ngrams.dill").open("wb+") as f:
    dill.dump(NGRAMS, f)

In [10]:
with (REPO / "ngrams.dill").open("rb") as f:
    NGRAMS = dill.load(f)

##### Generate Run-on sentences

In [44]:
with open("data.txt", "r") as f:
    comments = f.read().splitlines()

In [51]:
LIMIT = 3
SENT_LIMIT = 5

In [46]:
def generate_indices(sentences):
    idx = []
    k = 0
    counter = 0
    for i in range(len(sentences)):
        if random.random() <= 0.19 and not counter:
            k += 1
            idx.append(k)
        else:
            if not counter:
                k += 1
            idx.append(k)
            counter += 1
            if random.random() <= 0.04 and counter < LIMIT:
                continue
            elif counter > 1:
                counter = 0
                k += 1
    return idx

In [47]:
def process_group(group):
    output = []
    temp = []
    for k,sent in enumerate(group):
        if k < len(group)-1:
            if sent[-1].pos_ == "PUNCT":
                sent = sent[:-1]           
            tokens = [tok.text_with_ws for tok in sent]
            if len(sent[-1].text) == len(sent[-1].text_with_ws):
                tokens[-1] = tokens[-1] + " "
            if temp and random.random() <= 0.5:
                tokens[0] = tokens[0].lower()
            labels = np.zeros(len(tokens), dtype=bool)
            labels[-1] = True
            temp.extend(zip(tokens, labels))
        else:
            tokens = [tok.text_with_ws for tok in sent]
            if temp and random.random() <= 0.5:
                tokens[0] = tokens[0].lower()
            labels = np.zeros(len(tokens), dtype=bool)
            temp.extend(zip(tokens, labels))
            output.append(temp)
            temp = []
    return output

In [49]:
def get_data_from_comment(comment, limit=SENT_LIMIT):
    doc = nlp(comment)
    sentences = [sent for sent in doc.sents if len(sent)>=limit]
    examples = []
    if len(sentences) < 3:
        idx = [0] * len(sentences)
    else:
        idx = generate_indices(sentences)
    for key, g in groupby(zip(idx, sentences), key=lambda x: x[0]):
        _, group = map(list, zip(*g))
        try:
            temp = process_group(group)
        except:
            continue
        examples.extend(temp)
    return examples

In [None]:
train_data = []
for i, c in enumerate(comments):
    temp = get_data_from_comment(c)
    train_data.extend(temp)
    if i % 1000 == 0:
        print(f"Finished {i+1} out of {len(comments)}")

In [811]:
frq = Counter([sum_true(el) for el in train_data])
print_counter(frq, "Run-ons")

Run-ons: 1
frq: 22656, percentage: 66.5%
Run-ons: 0
frq: 10604, percentage: 31.1%
Run-ons: 2
frq: 789, percentage: 2.3%


In [812]:
train_df = build_df(train_data)

In [671]:
train_df.loc[train_df.id==53];

In [813]:
# Check class imbalance
train_df.label.value_counts(normalize=True) * 100

False    97.845464
True      2.154536
Name: label, dtype: float64

##### Subsample train data (remove non run-on sentences, there are too much of them)

In [814]:
s = train_df.groupby("id")["label"].sum()

In [815]:
exclude = s[s==0].sample(frac=0.25).index

In [816]:
sample_df = train_df.loc[~train_df.id.isin(exclude)]

In [817]:
# Check class imbalance
sample_df.label.value_counts(normalize=True) * 100

False    97.739146
True      2.260854
Name: label, dtype: float64

In [818]:
len(sample_df), len(train_df)

(1071896, 1124790)

In [819]:
sample_df.groupby("id")["label"].sum().value_counts(normalize=True).map(lambda x: round(x, 3) * 100)

1.0    72.2
0.0    25.3
2.0     2.5
Name: label, dtype: float64

In [820]:
sample_df.to_csv(REPO / "train_data.csv", index=False)

#### Model training

In [11]:
def calc_metrics(y_test, pred, proba=None, labels=None, print_=True, mode="weighted"):
    output = {}
    if proba is not None:
        roc_auc = metrics.roc_auc_score(y_test, proba)
        output["AUC"] = roc_auc
    output["Recall"] = metrics.recall_score(y_test, pred, average=mode)
    output["Precision"] = metrics.precision_score(y_test, pred, average=mode)
    output["F1"] = metrics.f1_score(y_test, pred, average=mode)
    output["accuracy"] = metrics.accuracy_score(y_test, pred)
    if labels is not None:
        index = labels
        columns = ["pred_" + str(el) for el in index]
    else:
        columns = None
        index = None
    output["conf_matrix"] = pd.DataFrame(metrics.confusion_matrix(y_test, pred, labels=labels), 
                                         columns=columns, index=index)
    report = metrics.classification_report(y_true=y_test, y_pred=pred, labels=labels)
    if print_:
        for key, value in output.items():
            if "matrix" in key:
                print(value)
            else:
                print(f"{key}: {value:0.3f}")
        print(report)
    return output, report

In [12]:
sample_df = pd.read_csv(REPO / "train_data.csv")

In [13]:
sample_df["word"] = sample_df["word"].astype(str)

##### Cross - validation / Train - test split

In [14]:
id_num = sample_df.groupby("id")["label"].sum()

In [721]:
# skf = StratifiedKFold(n_splits=5, shuffle=False)
# for train_index, test_index in skf.split(id_num.index, id_num.values):
#     train_groups = id_num.index[train_index]
#     test_groups = id_num.index[test_index]
#     train_df = sample_df.loc[sample_df.id.isin(train_groups)]
#     test_df = sample_df.loc[sample_df.id.isin(test_groups)]
#     print(train_df.groupby("id")["label"].sum().value_counts(normalize=True))
#     print(test_df.groupby("id")["label"].sum().value_counts(normalize=True))

In [15]:
train_sentences, test_sentences = train_test_split(id_num.index, train_size=0.7, test_size=0.3, 
                                                   stratify=id_num.values, random_state=25)
train = sample_df.loc[sample_df.id.isin(train_sentences)]
test = sample_df.loc[sample_df.id.isin(test_sentences)]
print(train.groupby("id")["label"].sum().value_counts(normalize=1))
print(test.groupby("id")["label"].sum().value_counts(normalize=1))

1.0    0.721585
0.0    0.253299
2.0    0.025116
Name: label, dtype: float64
1.0    0.721550
0.0    0.253291
2.0    0.025159
Name: label, dtype: float64


##### Feature building

In [18]:
class WordTokenizer(object):
    """
    Custom Tokenizer
    """
    def __init__(self, vocab=nlp.vocab, tokenizer=None, return_doc=True):
        self.vocab = vocab
        self._word_tokenizer = tokenizer
        self.return_doc = return_doc

    def __call__(self, text):
        if self._word_tokenizer:
            words = self._word_tokenizer.tokenize(text)
        else:
            words = text.split(' ')
        if self.return_doc:
            spaces = [True] * len(words)
            return Doc(self.vocab, words=words, spaces=spaces)
        else:
            return words

In [19]:
def word2features(sent, i, n=2):
    word = sent[i]
    lemma = word.lemma_
    features = {
        'word.lemma': lemma,
        #'word_frq': word_frq.get(lemma, 1),
        'word.pos': word.pos_,
        'word.shape': word.shape_,
        'word.isupper': word.is_upper,
        'word.istitle': word.is_title,
        'word.isdigit': word.is_digit,
        'word.isalpha': word.is_alpha,
        'word.isbracket': word.is_bracket,
        'word.isleftpunct': word.is_left_punct,
        'word.ispunct': word.is_punct,
        'word.isquote': word.is_quote,
        'word.isspace': word.is_space,
        'word.isstop': word.is_stop
    }
    #features["start_word"] = start_gram[1].get(lemma, 0) / word_frq.get(lemma, 1)
    features["word_dot"] = NGRAMS[2].get((lemma, "."), 0) / NGRAMS[1].get(lemma, 1)
    if i > 0:
        prefix = "word-1"
        word1 = sent[i-1]
        lemma1 = word1.lemma_
        #features[f"{prefix}_frq"] = word_frq.get(lemma1, 1)
        #features[f"l_{n}_gram"] = ngrams.get((lemma1, lemma), 0)
        features[f"prob_l_{n}_gram"] = NGRAMS[2].get((lemma1, lemma), 0) / NGRAMS[1].get(lemma1, 1)#features[f"{prefix}_frq"]
        features[f"prob_l_{n}_gram_dot"] = NGRAMS[3].get((lemma1, lemma, "."), 0) / NGRAMS[2].get((lemma1, lemma), 1)
        #features[f"start_l_{n}_gram"] = start_gram[2].get((lemma1, lemma), 0) / ngrams.get((lemma1, lemma), 1)
        #features[f"end_l_{n}_gram"] = end_gram[2].get((lemma1, lemma), 0) / ngrams.get((lemma1, lemma), 1)
        features.update({
            f"{prefix}.lemma": lemma1,
            f"{prefix}.pos": word1.pos_,
            f"{prefix}.shape": word1.shape_,
            f"{prefix}.istitle": word1.is_title,
            f"{prefix}.isdigit": word1.is_digit,
            f"{prefix}.isalpha": word.is_alpha,
            f"{prefix}.isbracket": word1.is_bracket,
            f"{prefix}.isleftpunct": word1.is_left_punct,
            f"{prefix}.ispunct": word1.is_punct,
            f"{prefix}.isquote": word1.is_quote,
            f"{prefix}.isspace": word1.is_space,
            f"{prefix}.isstop": word1.is_stop
        })
    else:
        features["BOS"] = True
    
    if i < len(sent) - 1:
        prefix = "word+1"
        word1 = sent[i+1]
        lemma1 = word1.lemma_
        #features[f"{prefix}_frq"] = word_frq.get(lemma1, 1)
        #features[f"r_{n}_gram"] = ngrams.get((lemma, lemma1), 0)
        features[f"prob_r_{n}_gram"] = NGRAMS[2].get((lemma, lemma1), 0) / NGRAMS[1].get(lemma, 1)#features[f"word_frq"]
        #features[f"start_r_{n}_gram"] = start_gram[2].get((lemma, lemma1), 0) / ngrams.get((lemma, lemma1), 1)
        #features[f"end_r_{n}_gram"] = end_gram[2].get((lemma, lemma1), 0) / ngrams.get((lemma, lemma1), 1)
        features.update({
            f"{prefix}.lemma": lemma1,
            f"{prefix}.pos": word1.pos_,
            f"{prefix}.shape": word1.shape_,
            f"{prefix}.istitle": word1.is_title,
            f"{prefix}.isdigit": word1.is_digit,
            f"{prefix}.isalpha": word.is_alpha,
            f"{prefix}.isbracket": word1.is_bracket,
            f"{prefix}.isleftpunct": word1.is_left_punct,
            f"{prefix}.ispunct": word1.is_punct,
            f"{prefix}.isquote": word1.is_quote,
            f"{prefix}.isspace": word1.is_space,
            f"{prefix}.isstop": word1.is_stop
        })
    else:
        features["EOS"] = True
    
    return features

In [43]:
w = "".join(test.loc[test.id==13, "word"].values)
doc = nlp(w)
t = doc[0]
t.dep_

'prep'

In [52]:
word2features(doc, 22);

In [20]:
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

In [22]:
def build_features(df, n_threads=4, batch_size=1000, is_val=False):
    sent_labels = ((key, gr["word"].values, gr["label"].values) for key, gr in df.groupby("id"))
    gen1, gen2, gen3 = tee(sent_labels, 3)
    ids = (key for (key, _, _) in gen1)
    sep = " " if is_val else ""
    sents = (sep.join(words) for (_, words, _) in gen2)
    lengths = (len(words) for (_, words, _) in gen3)
    docs = nlp.pipe(sents, disable=["ner", "textcat"], batch_size=batch_size, n_threads=n_threads)
    output = []
    exclude_ids = []
    for id_, doc, length in zip(ids, docs, lengths):
        if len(doc) != length:
                exclude_ids.append(id_)
                continue
        features = sent2features(doc)
        output.append(features) 
    return output, exclude_ids

In [23]:
train_features, exclude_id_train = build_features(train)

In [24]:
test_features, exclude_id_test = build_features(test)

In [25]:
nlp.tokenizer = WordTokenizer(nlp.vocab)
val_features, exclude_id_val = build_features(val_df, is_val=True)
nlp.tokenizer = nlp.Defaults.create_tokenizer(nlp)

In [26]:
len(exclude_id_test), len(exclude_id_train), len(exclude_id_val)

(9, 37, 0)

In [31]:
v = DictVectorizer(sparse=True)
train_v = v.fit_transform(chain(*train_features))

In [32]:
test_v = v.transform(chain(*test_features))
val_v = v.transform(chain(*val_features))

In [33]:
y_train = train.loc[~train.id.isin(exclude_id_train), "label"]
y_test = test.loc[~test.id.isin(exclude_id_test), "label"]
y_val = val_df.loc[~val_df.id.isin(exclude_id_val), "label"]

In [34]:
clf = LogisticRegression(class_weight="balanced", n_jobs=-1)

In [35]:
clf.fit(train_v, y_train)

  " = {}.".format(self.n_jobs))


LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=-1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [36]:
y_pred = clf.predict(test_v)
labels = clf.classes_
output, report = calc_metrics(pred=y_pred, y_test=y_test, labels=labels)

Recall: 0.945
Precision: 0.980
F1: 0.958
accuracy: 0.945
       pred_False  pred_True
False      296619      16755
True          960       6303
             precision    recall  f1-score   support

      False       1.00      0.95      0.97    313374
       True       0.27      0.87      0.42      7263

avg / total       0.98      0.94      0.96    320637



In [37]:
y_pred_val = clf.predict(val_v)
labels = clf.classes_
output, report = calc_metrics(pred=y_pred_val, y_test=y_val, labels=labels)

Recall: 0.947
Precision: 0.971
F1: 0.956
accuracy: 0.947
       pred_False  pred_True
False        4327        215
True           35        120
             precision    recall  f1-score   support

      False       0.99      0.95      0.97      4542
       True       0.36      0.77      0.49       155

avg / total       0.97      0.95      0.96      4697

