In [1]:
%load_ext autoreload
%autoreload 1

In [808]:
import json
import pandas as pd
import numpy as np
from collections import Counter
from pathlib import Path
import bz2
import re
import string
import spacy
import random
from itertools import groupby
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [3]:
%aimport config
from config import REPO

#### Read the data

In [4]:
file_test = "run-on-test.json"
with open(file_test) as f:
    test_data = json.load(f)

In [5]:
def sum_true(x):
    return sum([el[-1] for el in x])

In [6]:
frq = Counter([sum_true(el) for el in test_data])
for key, value in frq.most_common():
    print(f"Run-ons: {key}\nfrq: {value}, percentage: {value*100/len(test_data):.1f}%")

Run-ons: 1
frq: 145, percentage: 72.5%
Run-ons: 0
frq: 50, percentage: 25.0%
Run-ons: 2
frq: 5, percentage: 2.5%


In [7]:
def build_df(data):
    output = []
    for k,record in enumerate(data):
        temp = [{"id": k, "word": word, "label": label} for word, label in record]
        output.extend(temp)
    return pd.DataFrame(output)

In [8]:
test_df = build_df(test_data)
test_df.loc[test_df.id==120]

Unnamed: 0,id,label,word
2853,120,False,But
2854,120,False,then
2855,120,False,it
2856,120,True,started
2857,120,False,there
2858,120,False,were
2859,120,False,wails
2860,120,False,and
2861,120,False,cries
2862,120,False,from


In [9]:
# Check class imbalance
test_df.label.value_counts(normalize=True) * 100

False    96.700021
True      3.299979
Name: label, dtype: float64

#### Generate training data. Use Reddit posts

##### Clean and prepare data

In [None]:
folder = REPO / "cmv" / "all"
filename = "heldout_period_data.jsonlist.bz2"
with bz2.open(folder / filename, mode="rt") as f:
    lines = f.readlines()
    data = [json.loads(line) for line in lines]

In [104]:
def check_comment(comment):
    text = comment.get("body", "")
    patt = r"Confirmed:.*awarded.*|This delta is currently disallowed.*|You cannot award OP a delta as.*"
    match = re.search(patt, text)
    if not text or text == "[deleted]" or match:
        return False
    return True

In [101]:
def custom_replacement(m):
    if not m.group(1):
        return ". "
    else:
        return m.group(1)+" "

In [140]:
GRUBER_URLINTEXT_PAT = re.compile(r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?\xab\xbb\u201c\u201d\u2018\u2019]))')

In [270]:
def process_comment(comment, patt_code=r"&amp;#(\d+);"):
    text = comment.get("body", "")
    text = re.sub(r"\[(.*?)\](\s*)\((http(s?):/)?/.*?\)", r"\1", text)
    text = re.sub(r"([.!?:])?\s*[\n|\r]+\s*", custom_replacement, text)
    if re.search(patt_code, text):
        text = re.sub(patt_code, lambda x: chr(int(x.group(1), 10)), text)
    text = re.sub(r"&gt;|#(\d+);(\.)?|&lt;|&amp;nbsp;(\.)?", "", text)
    text = re.sub(r"&amp;", "and", text)
    text = GRUBER_URLINTEXT_PAT.sub("", text)
    text = text.strip()
    text = re.sub(r"(\s)+", r"\1", text)
    return text

In [272]:
comments = []
k = 0
N = 10000
for el in data:
    for comment in el["comments"]:
        if k > N:
            break
        if check_comment(comment):
            comments.append(process_comment(comment)+"\n")
            k += 1

In [273]:
with open("data.txt", "w+") as f:
    f.writelines(comments)

##### Generate Run-on sentences

In [11]:
with open("data.txt", "r") as f:
    comments = f.read().splitlines()

In [12]:
nlp = spacy.load("en", disable=["ner", "textcat"])

In [542]:
LIMIT = 3

In [613]:
def generate_indices(sentences):
    idx = []
    k = 0
    counter = 0
    for i in range(len(sentences)):
        if random.random() <= 0.19 and not counter:
            k += 1
            idx.append(k)
        else:
            if not counter:
                k += 1
            idx.append(k)
            counter += 1
            if random.random() <= 0.04 and counter < LIMIT:
                continue
            elif counter > 1:
                counter = 0
                k += 1
    return idx

In [746]:
t = doc[-1]
t.text_with_ws

'.'

In [747]:
def process_group(group):
    output = []
    temp = []
    for k,sent in enumerate(group):
        if k < len(group)-1:
            if sent[-1].pos_ == "PUNCT":
                sent = sent[:-1]
            tokens = [tok.text_with_ws for tok in sent]
            if temp and random.random() <= 0.5:
                tokens[0] = tokens[0].lower()
            labels = np.zeros(len(tokens), dtype=bool)
            labels[-1] = True
            temp.extend(zip(tokens, labels))
        else:
            tokens = [tok.text_with_ws for tok in sent]
            if temp and random.random() <= 0.5:
                tokens[0] = tokens[0].lower()
            labels = np.zeros(len(tokens), dtype=bool)
            temp.extend(zip(tokens, labels))
            output.append(temp)
            temp = []
    return output

In [16]:
SENT_LIMIT = 5

In [748]:
def get_data_from_comment(comment, limit=SENT_LIMIT):
    doc = nlp(comment)
    sentences = [sent for sent in doc.sents if len(sent)>=limit]
    examples = []
    if len(sentences) < 3:
        idx = [0] * len(sentences)
    else:
        idx = generate_indices(sentences)
    for key, g in groupby(zip(idx, sentences), key=lambda x: x[0]):
        _, group = map(list, zip(*g))
        try:
            temp = process_group(group)
        except:
            continue
        examples.extend(temp)
    return examples

In [749]:
train_data = []
for i, c in enumerate(comments):
    temp = get_data_from_comment(c)
    train_data.extend(temp)
    if i % 1000 == 0:
        print(f"Finished {i+1} out of {len(comments)}")

Finished 1 out of 10001
Finished 1001 out of 10001
Finished 2001 out of 10001
Finished 3001 out of 10001
Finished 4001 out of 10001
Finished 5001 out of 10001
Finished 6001 out of 10001
Finished 7001 out of 10001
Finished 8001 out of 10001
Finished 9001 out of 10001
Finished 10001 out of 10001


In [751]:
frq = Counter([sum_true(el) for el in train_data])
for key, value in frq.most_common():
    print(f"Run-ons: {key}\nfrq: {value}, percentage: {value*100/len(train_data):.1f}%")

Run-ons: 1
frq: 22784, percentage: 67.0%
Run-ons: 0
frq: 10468, percentage: 30.8%
Run-ons: 2
frq: 749, percentage: 2.2%


In [752]:
train_df = build_df(train_data)

In [671]:
train_df.loc[train_df.id==53];

In [753]:
# Check class imbalance
train_df.label.value_counts(normalize=True) * 100

False    97.841145
True      2.158855
Name: label, dtype: float64

##### Subsample train data (remove non run-on sentences, there are too much of them)

In [754]:
s = train_df.groupby("id")["label"].sum()

In [755]:
exclude = s[s==0].sample(frac=0.25).index

In [756]:
sample_df = train_df.loc[~train_df.id.isin(exclude)]

In [757]:
# Check class imbalance
sample_df.label.value_counts(normalize=True) * 100

False    97.737634
True      2.262366
Name: label, dtype: float64

In [758]:
len(sample_df), len(train_df)

(1073301, 1124763)

In [759]:
sample_df.groupby("id")["label"].sum().value_counts(normalize=True).map(lambda x: round(x, 3) * 100)

1.0    72.6
0.0    25.0
2.0     2.4
Name: label, dtype: float64

In [760]:
sample_df.to_csv(REPO / "train_data.csv", index=False)

#### Model training

In [684]:
sample_df = pd.read_csv(REPO / "train_data.csv")

##### Cross - validation / Train - test split

In [761]:
id_num = sample_df.groupby("id")["label"].sum()

In [721]:
# skf = StratifiedKFold(n_splits=5, shuffle=False)
# for train_index, test_index in skf.split(id_num.index, id_num.values):
#     train_groups = id_num.index[train_index]
#     test_groups = id_num.index[test_index]
#     train_df = sample_df.loc[sample_df.id.isin(train_groups)]
#     test_df = sample_df.loc[sample_df.id.isin(test_groups)]
#     print(train_df.groupby("id")["label"].sum().value_counts(normalize=True))
#     print(test_df.groupby("id")["label"].sum().value_counts(normalize=True))

In [762]:
train_sentences, test_sentences = train_test_split(id_num.index, train_size=0.7, test_size=0.3, stratify=id_num.values)
train = sample_df.loc[sample_df.id.isin(train_sentences)]
test = sample_df.loc[sample_df.id.isin(test_sentences)]
print(train.groupby("id")["label"].sum().value_counts(normalize=1))
print(test.groupby("id")["label"].sum().value_counts(normalize=1))

1.0    0.725965
0.0    0.250182
2.0    0.023853
Name: label, dtype: float64
1.0    0.725998
0.0    0.250106
2.0    0.023895
Name: label, dtype: float64


##### Feature building

In [820]:
def word2features(sent, i):
    word = sent[i]
    features = {
        'word.lower': word.lower_,
        'word.pos': word.pos_,
        'word.shape': word.shape_,
        'word.isupper': word.is_upper,
        'word.istitle': word.is_title,
        'word.isdigit': word.is_digit,
        'word.isalpha': word.is_alpha,
        'word.isbracket': word.is_bracket,
        'word.isleftpunct': word.is_left_punct,
        'word.ispunct': word.is_punct,
        'word.isquote': word.is_quote,
        'word.isspace': word.is_space,
        'word.isstop': word.is_stop
    }              
    return features

In [823]:
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

In [830]:
def build_features(df):
    output = []
    for key, gr in df.groupby("id"):
        sent = "".join(gr["word"].values)
        doc = nlp(sent)
        features = sent2features(doc)
        output.append(features)
    return output

In [834]:
tt = test.loc[test.id.isin([4,6])]
build_features(tt)

[[{'word.lower': 'my',
   'word.pos': 'ADJ',
   'word.shape': 'Xx',
   'word.isupper': False,
   'word.istitle': True,
   'word.isdigit': False,
   'word.isalpha': True,
   'word.isbracket': False,
   'word.isleftpunct': False,
   'word.ispunct': False,
   'word.isquote': False,
   'word.isspace': False,
   'word.isstop': False},
  {'word.lower': 'counterpoint',
   'word.pos': 'NOUN',
   'word.shape': 'xxxx',
   'word.isupper': False,
   'word.istitle': False,
   'word.isdigit': False,
   'word.isalpha': True,
   'word.isbracket': False,
   'word.isleftpunct': False,
   'word.ispunct': False,
   'word.isquote': False,
   'word.isspace': False,
   'word.isstop': False},
  {'word.lower': 'is',
   'word.pos': 'VERB',
   'word.shape': 'xx',
   'word.isupper': False,
   'word.istitle': False,
   'word.isdigit': False,
   'word.isalpha': True,
   'word.isbracket': False,
   'word.isleftpunct': False,
   'word.ispunct': False,
   'word.isquote': False,
   'word.isspace': False,
   'word.issto

In [768]:
w = "".join(test.loc[test.id==4, "word"].values)
doc = nlp(w)

In [821]:
word2features(doc, 0)

{'word.lower': 'my',
 'word.pos': 'ADJ',
 'word.shape': 'Xx',
 'word.isupper': False,
 'word.istitle': True,
 'word.isdigit': False,
 'word.isalpha': True,
 'word.isbracket': False,
 'word.isleftpunct': False,
 'word.ispunct': False,
 'word.isquote': False,
 'word.isspace': False,
 'word.isstop': False}