In [43]:
from os import listdir
from os.path import isfile, join
from typing import List, Dict
import string
import random

In [19]:
def load_texts():
    f_names = [join('corpus', f) for f in listdir('corpus') if isfile(join('corpus', f))]
    f_names = [f for f in f_names if f.lower().endswith('.txt')]
    print(f_names)
    for fn in f_names:
        with open(fn, 'rt', encoding='utf-8', errors='replace') as f:
            text = f.read()
            if not text[0] in {'+', '*'}:
                print(f"File '{fn}' is not annotated, skipped.")
                continue
            print(f"File '{fn}' is annotated, add to collection.")
            yield text
            
raw_corpus = list(load_texts())
print(len(raw_corpus))

['corpus\\1005058.txt', 'corpus\\1005395.txt', 'corpus\\104888.txt', 'corpus\\105529.txt', 'corpus\\200850.txt', 'corpus\\200851.txt', 'corpus\\300125.txt', 'corpus\\300138.txt', 'corpus\\500150.txt', 'corpus\\500486.txt', 'corpus\\601777.txt', 'corpus\\601779.txt']
File 'corpus\1005058.txt' is annotated, add to collection.
File 'corpus\1005395.txt' is annotated, add to collection.
File 'corpus\104888.txt' is not annotated, skipped.
File 'corpus\105529.txt' is not annotated, skipped.
File 'corpus\200850.txt' is annotated, add to collection.
File 'corpus\200851.txt' is not annotated, skipped.
File 'corpus\300125.txt' is not annotated, skipped.
File 'corpus\300138.txt' is not annotated, skipped.
File 'corpus\500150.txt' is not annotated, skipped.
File 'corpus\500486.txt' is not annotated, skipped.
File 'corpus\601777.txt' is not annotated, skipped.
File 'corpus\601779.txt' is not annotated, skipped.
3


In [36]:
def mean_in_window(lines, i)->float:
    start = max(i-5, 0)
    finish = min(i+5, len(lines)-1)
    sm, count = 0, 0
    for n in range(start, finish):
        sm += len(lines[n])-1  # minus one-char prefix
        count += 1
    return sm / max(count, 1)

def last_char(line: str)->str:    
    return ' ' if len(line)<1 else line[-1]
    
def last_char_features(l_char: str)->Dict[str, object]:
    res = {
        'isalpha': l_char.isalpha(),
        'isdigit': l_char.isdigit(),
        'islower': l_char.islower(),
        'punct': l_char if l_char in string.punctuation else ' ',
    }
    return res


def first_chars(line: str)->str:    
    if len(line)<1:
        chars = ' '
    elif len(line)<2:
        chars = line[0]
    else:
        chars = line[:2]
    res = []
    for c in chars:
        if c.isdigit():
            res.append('0')
        elif c.isalpha():
            res.append('a' if c.islower() else 'A')
        else:
            res.append(c)
    return ''.join(res)


def featurize_text(text: str)->(List[object], List[bool]):
    lines = text.strip().splitlines()
    total_lines = len(lines)
    x, y = [], []
    for i, line in enumerate(lines):        
        features = {}
        y.append(line[0]=='+')  # True, if line should be glued with previous
        line = line[1:]
        # print(y[-1], line)
        this_len = len(line)
        mean_len = mean_in_window(lines, i)
        if i>1:
            prev_len = len(lines[-1])-1
            l_char = last_char(lines[-1])
        else:
            prev_len = 0
            l_char = ' '
        prev_glued = 0  # How many lines before was glued
        for p in range(i-1, max(0, i-10), -1):  # Calc only up to ten items in the sequence
            if y[p]:
                prev_glued += 1
            else: 
                break
        features.update(
            {
                'this_len': this_len,
                'mean_len': mean_len,
                'prev_len': prev_len,
                'prev_glued': prev_glued,
                'first_chars': first_chars(line),
            })
        features.update(last_char_features(l_char))
        x.append([features])
    return x, y

In [37]:
x, y = featurize_text(raw_corpus[0])

In [41]:
print(x[:10])

[[{'this_len': 12, 'mean_len': 75.0, 'prev_len': 0, 'prev_glued': 0, 'first_chars': 'Aa', 'isalpha': False, 'isdigit': False, 'islower': False, 'punct': ' '}], [{'this_len': 97, 'mean_len': 79.33333333333333, 'prev_len': 0, 'prev_glued': 0, 'first_chars': 'Aa', 'isalpha': False, 'isdigit': False, 'islower': False, 'punct': ' '}], [{'this_len': 104, 'mean_len': 82.71428571428571, 'prev_len': 11, 'prev_glued': 0, 'first_chars': 'aa', 'isalpha': False, 'isdigit': False, 'islower': False, 'punct': '.'}], [{'this_len': 62, 'mean_len': 79.875, 'prev_len': 11, 'prev_glued': 1, 'first_chars': 'a-', 'isalpha': False, 'isdigit': False, 'islower': False, 'punct': '.'}], [{'this_len': 100, 'mean_len': 81.88888888888889, 'prev_len': 11, 'prev_glued': 2, 'first_chars': 'Aa', 'isalpha': False, 'isdigit': False, 'islower': False, 'punct': '.'}], [{'this_len': 101, 'mean_len': 84.2, 'prev_len': 11, 'prev_glued': 0, 'first_chars': 'aa', 'isalpha': False, 'isdigit': False, 'islower': False, 'punct': '.'}

In [42]:
xx, yy = [], []
for raw_text in raw_corpus:
    x, y = featurize_text(raw_text)
    xx+=x
    yy+=y
print(f"Total samples: {len(yy)}")
print(f"Positive samples: {sum(y for y in yy if y)}")

Total samples: 2300
Positive samples: 1611


In [53]:
random.seed(1974)
combined = list(zip(xx, yy))
random.shuffle(combined)
xx[:], yy[:] = zip(*combined)