In [207]:
import json
import pandas as pd
import numpy as np
from collections import Counter
from nltk.corpus import nps_chat, reuters

#### Read the data

In [39]:
# Path of the JSON file holding the run-on test set.
file_test = "run-on-test.json"
# JSON text is UTF-8 by specification; pass the encoding explicitly so the
# result does not depend on the platform's default locale encoding.
with open(file_test, encoding="utf-8") as f:
    test_data = json.load(f)

In [40]:
def sum_true(x):
    """Return the sum of the last element of every item in ``x``.

    In this notebook each item is a (word, label) pair, so with boolean
    labels the result is the number of items labelled ``True``.
    An empty ``x`` yields 0.
    """
    total = 0
    for item in x:
        # Booleans add as 0/1, so this counts the True labels.
        total += item[-1]
    return total

In [55]:
# Distribution of the per-record run-on count (sum of True labels per record).
frq = Counter(map(sum_true, test_data))
# most_common() yields (count-of-run-ons, frequency) pairs, most frequent first.
for key, value in frq.most_common():
    print(
        f"Run-ons: {key}\nfrq: {value}, percentage: {value*100/len(test_data):.1f}%"
    )

Run-ons: 1
frq: 145, percentage: 72.5%
Run-ons: 0
frq: 50, percentage: 25.0%
Run-ons: 2
frq: 5, percentage: 2.5%


In [60]:
def build_df(data):
    """Flatten a list of records into a long-format DataFrame.

    Parameters
    ----------
    data : iterable of records
        Each record is an iterable of ``(word, label)`` pairs.

    Returns
    -------
    pandas.DataFrame
        One row per pair with columns ``id`` (position of the record in
        ``data``), ``word`` and ``label``.

    The columns are declared explicitly so that an empty ``data`` (or data
    made only of empty records) still yields a frame with the expected
    columns instead of a column-less frame on which ``df.id`` / ``df.label``
    would fail.
    """
    rows = [
        {"id": k, "word": word, "label": label}
        for k, record in enumerate(data)
        for word, label in record
    ]
    return pd.DataFrame(rows, columns=["id", "word", "label"])

In [221]:
# Long-format view of the test set: one row per (record id, word, label).
test_df = build_df(test_data)
# Inspect a single record (id 79) to eyeball its words and labels.
test_df.loc[test_df["id"] == 79]

Unnamed: 0,id,label,word
1856,79,False,This
1857,79,False,is
1858,79,False,evidenced
1859,79,False,by
1860,79,False,results
1861,79,False,of
1862,79,False,a
1863,79,False,public
1864,79,False,polling
1865,79,True,exercise


In [77]:
# Class imbalance check: share of each label value, expressed in percent.
test_df["label"].value_counts(normalize=True).mul(100)

False    96.700021
True      3.299979
Name: label, dtype: float64

#### Generate training data from the Reuters corpus

In [222]:
# Take the first four Reuters file ids and print every tokenised sentence
# of the last one as a sample of the corpus.
ids = reuters.fileids()[:4]
sample_sents = reuters.sents(ids[-1])
for s in sample_sents:
    print(s)

['THAI', 'TRADE', 'DEFICIT', 'WIDENS', 'IN', 'FIRST', 'QUARTER', 'Thailand', "'", 's', 'trade', 'deficit', 'widened', 'to', '4', '.', '5', 'billion', 'baht', 'in', 'the', 'first', 'quarter', 'of', '1987', 'from', '2', '.', '1', 'billion', 'a', 'year', 'ago', ',', 'the', 'Business', 'Economics', 'Department', 'said', '.']
['It', 'said', 'Janunary', '/', 'March', 'imports', 'rose', 'to', '65', '.', '1', 'billion', 'baht', 'from', '58', '.', '7', 'billion', '.']
['Thailand', "'", 's', 'improved', 'business', 'climate', 'this', 'year', 'resulted', 'in', 'a', '27', 'pct', 'increase', 'in', 'imports', 'of', 'raw', 'materials', 'and', 'semi', '-', 'finished', 'products', '.']
['The', 'country', "'", 's', 'oil', 'import', 'bill', ',', 'however', ',', 'fell', '23', 'pct', 'in', 'the', 'first', 'quarter', 'due', 'to', 'lower', 'oil', 'prices', '.']
['The', 'department', 'said', 'first', 'quarter', 'exports', 'expanded', 'to', '60', '.', '6', 'billion', 'baht', 'from', '56', '.', '6', 'billion', 