In [1]:
from os import listdir
from os.path import isfile, join
from typing import List, Dict
import string
import random

In [2]:
def load_texts():
    f_names = [join('corpus', f) for f in listdir('corpus') if isfile(join('corpus', f))]
    f_names = [f for f in f_names if f.lower().endswith('.txt')]
    print(f_names)
    for fn in f_names:
        with open(fn, 'rt', encoding='utf-8', errors='replace') as f:
            text = f.read()
            if not text[0] in {'+', '*'}:
                print(f"File '{fn}' is not annotated, skipped.")
                continue
            print(f"File '{fn}' is annotated, add to collection.")
            yield text
            
raw_corpus = list(load_texts())
print(len(raw_corpus))

['corpus\\1005058.txt', 'corpus\\1005395.txt', 'corpus\\104888.txt', 'corpus\\105529.txt', 'corpus\\200850.txt', 'corpus\\200851.txt', 'corpus\\300125.txt', 'corpus\\300138.txt', 'corpus\\500150.txt', 'corpus\\500486.txt', 'corpus\\601777.txt', 'corpus\\601779.txt']
File 'corpus\1005058.txt' is annotated, add to collection.
File 'corpus\1005395.txt' is annotated, add to collection.
File 'corpus\104888.txt' is not annotated, skipped.
File 'corpus\105529.txt' is not annotated, skipped.
File 'corpus\200850.txt' is annotated, add to collection.
File 'corpus\200851.txt' is not annotated, skipped.
File 'corpus\300125.txt' is not annotated, skipped.
File 'corpus\300138.txt' is not annotated, skipped.
File 'corpus\500150.txt' is not annotated, skipped.
File 'corpus\500486.txt' is not annotated, skipped.
File 'corpus\601777.txt' is not annotated, skipped.
File 'corpus\601779.txt' is not annotated, skipped.
3


In [10]:
def mean_in_window(lines, i)->float:
    start = max(i-5, 0)
    finish = min(i+5, len(lines)-1)
    sm, count = 0, 0
    for n in range(start, finish):
        sm += len(lines[n])-1  # minus one-char prefix
        count += 1
    return sm / max(count, 1)

def last_char(line: str)->str:    
    return ' ' if len(line)<1 else line[-1]
    
def last_char_features(l_char: str)->Dict[str, object]:
    res = {
        'isalpha': l_char.isalpha(),
        'isdigit': l_char.isdigit(),
        'islower': l_char.islower(),
        'punct': l_char if l_char in string.punctuation else ' ',
    }
    return res


def first_chars(line: str)->str:    
    if len(line)<1:
        chars = ' '
    elif len(line)<2:
        chars = line[0]
    else:
        chars = line[:2]
    res = []
    for c in chars:
        if c.isdigit():
            res.append('0')
        elif c.isalpha():
            res.append('a' if c.islower() else 'A')
        else:
            res.append(c)
    return ''.join(res)

def line_to_features(line: str)->Dict[str, object]:
    features = {}
    this_len = len(line)
    mean_len = mean_in_window(lines, i)
    if i>1:
        prev_len = len(lines[-1])-1
        l_char = last_char(lines[-1])
    else:
        prev_len = 0
        l_char = ' '
    prev_glued = 0  # How many lines before was glued
    for p in range(i-1, max(0, i-10), -1):  # Calc only up to ten items in the sequence
        if y[p]:
            prev_glued += 1
        else: 
            break
    features.update(
        {
            'this_len': this_len,
            'mean_len': mean_len,
            'prev_len': prev_len,
            'prev_glued': prev_glued,
            'first_chars': first_chars(line),
        })
    features.update(last_char_features(l_char))
    x.append(features)

def featurize_text_with_annotation(text: str)->(List[object], List[bool]):
    lines = text.strip().splitlines()
    total_lines = len(lines)
    x, y = [], []
    for i, line in enumerate(lines):        
        features = {}
        y.append(line[0]=='+')  # True, if line should be glued with previous
        line = line[1:]
        # print(y[-1], line)
        this_len = len(line)
        mean_len = mean_in_window(lines, i)
        if i>1:
            prev_len = len(lines[-1])-1
            l_char = last_char(lines[-1])
        else:
            prev_len = 0
            l_char = ' '
        prev_glued = 0  # How many lines before was glued
        for p in range(i-1, max(0, i-10), -1):  # Calc only up to ten items in the sequence
            if y[p]:
                prev_glued += 1
            else: 
                break
        features.update(
            {
                'this_len': this_len,
                'mean_len': mean_len,
                'prev_len': prev_len,
                'prev_glued': prev_glued,
                'first_chars': first_chars(line),
            })
        features.update(last_char_features(l_char))
        x.append(features)
    return x, y

In [11]:
x, y = featurize_text_with_annotation(raw_corpus[0])

In [12]:
print(x[:10])

[{'this_len': 12, 'mean_len': 75.0, 'prev_len': 0, 'prev_glued': 0, 'first_chars': 'Aa', 'isalpha': False, 'isdigit': False, 'islower': False, 'punct': ' '}, {'this_len': 97, 'mean_len': 79.33333333333333, 'prev_len': 0, 'prev_glued': 0, 'first_chars': 'Aa', 'isalpha': False, 'isdigit': False, 'islower': False, 'punct': ' '}, {'this_len': 104, 'mean_len': 82.71428571428571, 'prev_len': 11, 'prev_glued': 0, 'first_chars': 'aa', 'isalpha': False, 'isdigit': False, 'islower': False, 'punct': '.'}, {'this_len': 62, 'mean_len': 79.875, 'prev_len': 11, 'prev_glued': 1, 'first_chars': 'a-', 'isalpha': False, 'isdigit': False, 'islower': False, 'punct': '.'}, {'this_len': 100, 'mean_len': 81.88888888888889, 'prev_len': 11, 'prev_glued': 2, 'first_chars': 'Aa', 'isalpha': False, 'isdigit': False, 'islower': False, 'punct': '.'}, {'this_len': 101, 'mean_len': 84.2, 'prev_len': 11, 'prev_glued': 0, 'first_chars': 'aa', 'isalpha': False, 'isdigit': False, 'islower': False, 'punct': '.'}, {'this_le

In [13]:
xx, yy = [], []
for raw_text in raw_corpus:
    x, y = featurize_text(raw_text)
    xx+=x
    yy+=y
print(f"Total samples: {len(yy)}")
print(f"Positive samples: {sum(y for y in yy if y)}")

Total samples: 2300
Positive samples: 1611


In [14]:
random.seed(1974)
combined = list(zip(xx, yy))
random.shuffle(combined)
xx[:], yy[:] = zip(*combined)

In [23]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split

In [21]:
v = DictVectorizer(sparse=False)
v.fit(xx)

DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
        sparse=False)

In [22]:
xx_features = v.transform(xx)
print(xx_features[:1])

[[ 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   1.   0.   0.   0.
  39.1  1.  30.   0.   1.  36. ]]


In [28]:
x_train, x_test, y_train, y_test = train_test_split(xx_features, yy, test_size=0.3, random_state=1974)

In [34]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [37]:
#clf = RandomForestClassifier(random_state=1974)
clf = LogisticRegression(random_state=1974)
clf.fit(x_train, y_train)

y_pred = clf.predict(x_test)

print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

       False       0.79      0.86      0.82       207
        True       0.94      0.90      0.92       483

   micro avg       0.89      0.89      0.89       690
   macro avg       0.86      0.88      0.87       690
weighted avg       0.89      0.89      0.89       690





## Checkin

In [33]:
text = """The rapid expansion of wireless services such as cellular voice, PCS
(Personal Communications Services), mobile data and wireless LANs
in recent years is an indication that signicant value is placed on accessibility
and portability as key features of telecommunication (Salkintzis and Mathiopoulos (Guest Ed.), 2000).
devices have maximum utility when they can be used \any-
where at anytime". One of the greatest limitations to that goal, how-
ever, is nite power supplies. Since batteries provide limited power, a
general constraint of wireless communication is the short continuous
operation time of mobile terminals. Therefore, power management is
y Corresponding Author: Dr. Krishna Sivalingam. Part of the research was
supported by Air Force Oce of Scientic Research grants F-49620-97-1-
0471 and F-49620-99-1-0125; by Telcordia Technologies and by Intel. Part of
the work was done while the rst author was at Washington State Univer-
sity. The authors' can be reached at cej@bbn.com, krishna@eecs.wsu.edu,
pagrawal@research.telcordia.com, jcchen@research.telcordia.com
c
2001 Kluwer Academic Publishers. Printed in the Netherlands.
Jones, Sivalingam, Agrawal and Chen
one of the most challenging problems in wireless communication, and
recent research has addressed this topic (Bambos, 1998). Examples include
a collection of papers available in (Zorzi (Guest Ed.), 1998) and
a recent conference tutorial (Srivastava, 2000), both devoted to energy
ecient design of wireless networks.
Studies show that the signicant consumers of power in a typical
laptop are the microprocessor (CPU), liquid crystal display (LCD),
hard disk, system memory (DRAM), keyboard/mouse, CDROM drive,
oppy drive, I/O subsystem, and the wireless network interface card
(Udani and Smith, 1996, Stemm and Katz, 1997). A typical example
from a Toshiba 410 CDT mobile computer demonstrates that nearly
36% of power consumed is by the display, 21% by the CPU/memory,
18% by the wireless interface, and 18% by the hard drive. Consequently,
energy conservation has been largely considered in the hardware design
of the mobile terminal (Chandrakasan and Brodersen, 1995) and in
components such as CPU, disks, displays, etc. Signicant additional
power savings may result by incorporating low-power strategies into
the design of network protocols used for data communication. This
paper addresses the incorporation of energy conservation at all layers
of the protocol stack for wireless networks.
The remainder of this paper is organized as follows. Section 2 introduces
the network architectures and wireless protocol stack considered
in this paper. Low-power design within the physical layer is brie
y
discussed in Section 2.3. Sources of power consumption within mobile
terminals and general guidelines for reducing the power consumed are
presented in Section 3. Section 4 describes work dealing with energy
ecient protocols within the MAC layer of wireless networks, and
power conserving protocols within the LLC layer are addressed in Section
5. Section 6 discusses power aware protocols within the network
layer. Opportunities for saving battery power within the transport
layer are discussed in Section 7. Section 8 presents techniques at the
OS/middleware and application layers for energy ecient operation.
Finally, Section 9 summarizes and concludes the paper.
2. Background
This section describes the wireless network architectures considered in
this paper. Also, a discussion of the wireless protocol stack is included
along with a brief description of each individual protocol layer. The
physical layer is further discussed. """

In [43]:
x_sample, lines = featurize_text(text)
x_sample_features = v.transform(x_sample)
y_sample = clf.predict(x_sample_features)

In [45]:
# for test we give first 4 characters from https://en.wikipedia.org/wiki/Hyphen#Unicode
hyphen_chars = {
                '\u002D', # HYPHEN-MINUS
                '\u00AD', # SOFT HYPHEN 
                '\u2010', # HYPHEN 
                '\u2011', # NON-BREAKING HYPHEN
               }

corrected_acc = []
for i, line in enumerate(lines):
    if i==0 or not y_sample[i]:
        corrected_acc.append(line)
    else:
        prev_line = lines[i-1]
        if prev_line in hyphen_chars:
            lines[i-1]=prev_line[:-1]
        lines[i-1] += line

corrected = ''.join(corrected_acc)

TypeError: sequence item 0: expected str instance, bool found