In [1]:
%load_ext autoreload
%autoreload 1

In [673]:
import json
import pandas as pd
import numpy as np
from collections import Counter
from pathlib import Path
import bz2
import re
import string
import spacy
import random

In [3]:
%aimport config
from config import REPO

#### Read the data

In [27]:
# Read the labelled run-on test set from disk.
# Downstream cells treat `test_data` as a list of records, each record being a
# sequence of (word, label) pairs.
file_test = "run-on-test.json"
with open(file_test) as f:
    test_data = json.loads(f.read())

In [28]:
def sum_true(x):
    """Add up the last item of every element in ``x``.

    With records made of (word, label) pairs and boolean labels, the result is
    the number of pairs whose label is True.  Returns 0 for an empty ``x``.
    """
    total = 0
    for el in x:
        total = total + el[-1]
    return total

In [29]:
# Distribution of the number of True labels per test record,
# printed from the most to the least frequent count.
frq = Counter(map(sum_true, test_data))
for key, value in frq.most_common():
    print(f"Run-ons: {key}\nfrq: {value}, percentage: {value*100/len(test_data):.1f}%")

Run-ons: 1
frq: 145, percentage: 72.5%
Run-ons: 0
frq: 50, percentage: 25.0%
Run-ons: 2
frq: 5, percentage: 2.5%


In [5]:
def build_df(data):
    """Flatten records of (word, label) pairs into a long DataFrame.

    Parameters
    ----------
    data : iterable
        Each element is a record: an iterable of 2-item (word, label) pairs.

    Returns
    -------
    pandas.DataFrame
        One row per word with columns ``id`` (position of the record in
        ``data``), ``word`` and ``label``.  The columns are always present,
        even when ``data`` is empty or contains only empty records.
    """
    rows = [
        {"id": k, "word": word, "label": label}
        for k, record in enumerate(data)
        for word, label in record
    ]
    # Naming the columns explicitly keeps the schema (and column order) stable;
    # without it an empty input yields a frame with no columns at all and any
    # later `df.id` / `df.label` access fails.
    return pd.DataFrame(rows, columns=["id", "word", "label"])

In [6]:
# Flatten the test set to one row per word, then show the rows that belong
# to the record with id 120.
test_df = build_df(test_data)
test_df.loc[test_df.id==120]

Unnamed: 0,id,label,word
2853,120,False,But
2854,120,False,then
2855,120,False,it
2856,120,True,started
2857,120,False,there
2858,120,False,were
2859,120,False,wails
2860,120,False,and
2861,120,False,cries
2862,120,False,from


In [7]:
# Check class imbalance: share of each label value over all word rows, in percent
test_df.label.value_counts(normalize=True) * 100

False    96.700021
True      3.299979
Name: label, dtype: float64

#### Generate training data from Reddit posts

##### Clean and prepare data

In [152]:
# Read the bz2-compressed JSON-lines dump: every line of the file is parsed
# as one JSON document and the results are kept, in file order, in `data`.
filename = "heldout_period_data.jsonlist.bz2"
folder = REPO / "cmv" / "all"
with bz2.open(folder / filename, mode="rt") as f:
    lines = f.readlines()
data = list(map(json.loads, lines))

In [104]:
def check_comment(comment):
    """Decide whether a comment is worth keeping as training text.

    Parameters
    ----------
    comment : dict
        Comment record; only its ``"body"`` entry is read.

    Returns
    -------
    bool
        False when the body is missing, not a string (e.g. a JSON null),
        empty, exactly ``"[deleted]"``, or contains one of the delta-related
        notices matched by ``patt`` (presumably automated bot messages);
        True otherwise.
    """
    patt = r"Confirmed:.*awarded.*|This delta is currently disallowed.*|You cannot award OP a delta as.*"
    text = comment.get("body", "")
    # Reject non-string bodies up front: passing None to re.search would raise
    # TypeError and abort the whole collection loop over one bad record.
    if not isinstance(text, str) or not text or text == "[deleted]":
        return False
    return re.search(patt, text) is None

In [101]:
def custom_replacement(m):
    """Replacement callback for ``re.sub`` built from a match's first group.

    If group 1 captured something, that text is returned followed by a single
    space; if group 1 is empty or did not participate in the match, the result
    is a period followed by a space.
    """
    captured = m.group(1)
    return captured + " " if captured else ". "

In [140]:
# Case-insensitive pattern for URLs embedded in running text.  A match starts
# with "http://" / "https://", a "www" prefix (optionally with up to three
# digits), or a bare domain followed by "/"; it may contain balanced
# parentheses and must not end on common trailing punctuation or quote marks.
# NOTE(review): going by the name, presumably John Gruber's URL regex — confirm the source.
GRUBER_URLINTEXT_PAT = re.compile(r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?\xab\xbb\u201c\u201d\u2018\u2019]))')

In [141]:
def process_comment(comment, patt_code=r"&amp;#(\d+);"):
    """Turn a raw comment body into one line of plain text.

    Steps, in order:

    1. Markdown links ``[label](url)`` whose URL starts with ``/`` or
       ``http(s)://`` are replaced by their label.
    2. Every run of line breaks (plus surrounding whitespace) becomes a single
       space; ``custom_replacement`` inserts a period first when the text
       before the break does not already end in one of ``. ! ? :``.
    3. Numeric entities matched by ``patt_code`` are decoded to the character
       with that decimal code point.
    4. ``&gt;``, ``&lt;``, ``&amp;nbsp;`` and leftover ``#NNN;`` fragments are
       deleted.
    5. ``&amp;`` becomes the word ``and``.
    6. Anything matched by ``GRUBER_URLINTEXT_PAT`` is deleted.
    7. Leading and trailing whitespace is stripped.

    Parameters
    ----------
    comment : dict
        Comment record; only its ``"body"`` entry is read.  A missing or
        ``None`` body is treated as an empty string.
    patt_code : str
        Regex whose first group captures the decimal code point of a numeric
        entity.

    Returns
    -------
    str
        The cleaned text (possibly empty).
    """
    def _decode_entity(m):
        # chr() raises for values above 0x10FFFF, and a lone surrogate cannot
        # be encoded when the text is written out, so such entities are
        # dropped instead of aborting the run.
        code = int(m.group(1), 10)
        if code > 0x10FFFF or 0xD800 <= code <= 0xDFFF:
            return ""
        return chr(code)

    text = comment.get("body") or ""
    text = re.sub(r"\[(.*?)\](\s*)\((http(s?):/)?/.*?\)", r"\1", text)
    # Inside a character class "|" is a literal pipe, not alternation; the
    # class therefore lists only the two line-break characters so that pipes
    # in the text are no longer rewritten as sentence breaks.
    text = re.sub(r"([.!?:])?\s*[\n\r]+\s*", custom_replacement, text)
    # NOTE(review): a decoded entity can itself be a line break (e.g. code 10)
    # and is not normalised by the step above.
    text = re.sub(patt_code, _decode_entity, text)
    text = re.sub(r"&gt;|#(\d+);(\.)?|&lt;|&amp;nbsp;(\.)?", "", text)
    text = re.sub(r"&amp;", "and", text)
    text = GRUBER_URLINTEXT_PAT.sub("", text)
    text = text.strip()
    return text

In [154]:
# Collect the first N comments that pass `check_comment`, cleaned by
# `process_comment` and newline-terminated so each occupies one line on disk.
# The limit is checked before appending (exactly N comments are kept, not
# N + 1) and at both loop levels, so iteration over `data` stops as soon as
# the limit is reached instead of visiting every remaining thread.
N = 10000
comments = []
for el in data:
    if len(comments) >= N:
        break
    for comment in el["comments"]:
        if len(comments) >= N:
            break
        if check_comment(comment):
            comments.append(process_comment(comment)+"\n")
# Number of comments kept (retained as a variable for any later cell that reads it).
k = len(comments)

In [155]:
# Persist the cleaned comments; each entry already ends with "\n",
# so the file holds one comment per line.
with open("data.txt", "w+") as f:
    f.write("".join(comments))

##### Generate Run-on sentences

In [8]:
# Reload the cleaned comments, one per line of the file.
# Iterating the file splits on newline characters only.  str.splitlines()
# additionally splits on \v, \f, \x1c-\x1e, \x85, \u2028 and \u2029, which can
# occur inside a comment and would cut it into several entries.
with open("data.txt", "r") as f:
    comments = [line.rstrip("\n") for line in f]

In [9]:
# Load the spaCy pipeline registered under the name "en", with the "ner" and
# "textcat" components disabled.
# NOTE(review): the "en" shortcut name is presumably a spaCy 2.x model link —
# confirm it resolves with the installed spaCy version.
nlp = spacy.load("en", disable=["ner", "textcat"])

In [13]:
# Parse one sample comment (index 5) and materialise its sentence spans.
doc = nlp(comments[5])
sentences = list(doc.sents)

In [25]:
# Part-of-speech tag (`pos_`) of the last token of the first sentence
sentences[0][-1].pos_

'PUNCT'

In [94]:
# Display the current contents of `sentences`
sentences

[In the event that the President is either killed or resigns, the Vice President is a horrible choice to take over office.,
 Seriously, stop with the hyperbole.,
 Say this with a straight face "Teddy Roosevelt was a horrible President.",
 Or "Harry Truman was a horrible President.".  ,
 I'm willing to bet that John Boehner would have an easier time dealing with Congress as President than Joe Biden would due to his constant interaction with it.,
 Do you think that could have *anything* to do with the fact that Boehner is a Republican, and Congress is controlled by Republicans?  ,
 That argument has much less to do with the individuals than it does with the current party in control.]

In [432]:
# Rebind `sentences` to seven one-letter placeholders ("a".."g"); any value
# previously held by `sentences` is discarded.
# NOTE(review): presumably a toy input for the index-assignment loop in the next cell.
sentences = list("abcdefg")

In [804]:
# Assign an integer group index to every sentence, in order; consecutive
# sentences may receive the same index.
# NOTE(review): presumably sentences sharing an index are later joined into one
# run-on sentence — that step is not shown here.
# `random` is not seeded, so every run produces a different `idx`.
idx = []
# Current group index.
k = 0
# Number of consecutive else-branch passes since the last reset.
counter = 0
# The else branch keeps the current index only while `counter` is below this.
LIMIT = 2
for i in range(len(sentences)):
    # With probability ~0.2 (and only when counter == 0): advance k first, record
    # it, and leave k untouched afterwards, so the next sentence can get the same
    # index.  Because the else branch has usually advanced k already, this can
    # skip an index value.
    if random.random() <= 0.2 and not counter:
        k += 1
        idx.append(k)
    else:
        idx.append(k)
        counter += 1
        # With probability ~0.025 (while counter < LIMIT) keep k unchanged so the
        # next sentence shares this index.
        if random.random() <= 0.025 and counter < LIMIT:
            continue
        else:
            # Otherwise reset the streak and move on to a fresh index.
            counter = 0
            k += 1

In [805]:
# Display the group index assigned to each sentence
idx

[0, 1, 3, 3, 5, 5, 7]