In [3]:
import re
from tqdm import tqdm

import spacy
from spacy.tokens import Doc, Token
from spacy import displacy

In [13]:
import warnings
# Blanket filter: with no category or module given, this silences every
# warning raised for the rest of the session.
warnings.filterwarnings('ignore')

In [49]:
# One or more ASCII-letter runs joined by hyphens, e.g. "self-reflection".
HYPHEN_PATTERN = re.compile(r'[a-zA-Z]+(-[a-zA-Z]+)+')

def custom_is_alpha(t):
    """Return True when token ``t`` should be treated as a word.

    A token qualifies when its built-in ``is_alpha`` flag is set, or when its
    text starts with a hyphenated word as described by ``HYPHEN_PATTERN``.
    The pattern is anchored at the start of the text only, so trailing
    characters after the hyphenated part do not prevent a match.

    Always returns a real ``bool`` (never a match object or ``None``), so the
    value is safe to compare, store or serialise.
    """
    return bool(t.is_alpha or HYPHEN_PATTERN.match(t.text))

def find_first_word_index(doc):
    """Return the index of the first word-like token in ``doc``.

    A token counts as a word when its custom ``is_alpha`` extension
    (``token._.is_alpha``) is truthy.

    Returns:
        The zero-based position of the first such token, or ``None`` when
        ``doc`` is empty or contains no word-like token (e.g. a headline made
        only of punctuation). Returning ``None`` instead of running off the
        end of the doc means no token is ever flagged as the first word in
        that case.
    """
    for index, token in enumerate(doc):
        if token._.is_alpha:
            return index
    return None

def find_last_word_index(doc):
    """Return the index of the last word-like token in ``doc``.

    A token counts as a word when its custom ``is_alpha`` extension
    (``token._.is_alpha``) is truthy.

    Returns:
        The zero-based position of the last such token, or ``None`` when
        ``doc`` is empty or contains no word-like token. The scan stops at
        index 0, so it never walks into negative indices.
    """
    for index in range(len(doc) - 1, -1, -1):
        if doc[index]._.is_alpha:
            return index
    return None

def is_first_word(t):
    """Tell whether token ``t`` sits at its doc's ``first_word_ind`` position."""
    first_index = t.doc._.first_word_ind
    return t.i == first_index

def is_last_word(t):
    """Tell whether token ``t`` sits at its doc's ``last_word_ind`` position."""
    last_index = t.doc._.last_word_ind
    return t.i == last_index
        

# Register the getter-backed custom attributes read through `token._` and
# `doc._`. Each entry is (owner class, attribute name, getter function);
# every registration passes force=True, exactly as before.
for _owner, _attr_name, _getter in (
    (Token, 'is_alpha', custom_is_alpha),
    (Token, 'is_first_word', is_first_word),
    (Token, 'is_last_word', is_last_word),
    (Doc, 'first_word_ind', find_first_word_index),
    (Doc, 'last_word_ind', find_last_word_index),
):
    _owner.set_extension(_attr_name, getter=_getter, force=True)

In [6]:
def get_line(file_name):
    """Lazily yield every line of ``file_name`` with surrounding whitespace removed.

    The file is opened in text read mode and stays open only while the
    generator is being consumed.
    """
    with open(file_name, 'r') as handle:
        yield from (raw.strip() for raw in handle)

In [33]:
# Input file, read one headline per line via get_line.
headlines_file = '../../../tasks/02-structural-linguistics/examiner-headlines.txt'
# Count the lines up front; the value is passed to tqdm as `total` later on.
n_lines = sum(1 for _ in get_line(headlines_file))

In [51]:
# Load the `en_core_web_md` model with the `ner` pipeline component disabled.
nlp = spacy.load('en_core_web_md', disable=['ner'])

In [52]:
# Register merge_hyphen_tokens as a pipeline component.
# NOTE: merge_hyphen_tokens is defined in a later cell of this notebook, so on a
# fresh kernel that cell has to be run before this one.
nlp.add_pipe(merge_hyphen_tokens)

In [53]:
# Format every headline and write the results to `out_file`, counting how many
# headlines were actually changed.
# NOTE: format_headline and write_to_file are defined in later cells of this
# notebook; on a fresh kernel those cells have to be run first.
headlines = get_line(headlines_file)
out_file = 'formatted_headlines.txt'
n_formatted = 0

# write_to_file opens the file in append mode, so start from an empty file;
# otherwise re-running this cell would duplicate every headline in the output.
with open(out_file, 'w'):
    pass

for doc in tqdm(nlp.pipe(headlines), total=n_lines):
    headline = format_headline(doc)
    write_to_file(out_file, headline)

    # Only headlines whose text differs after formatting are counted.
    if doc.text != headline:
        n_formatted += 1

100%|██████████| 5000/5000 [00:14<00:00, 357.13it/s]


In [50]:
def merge_hyphen_tokens(doc):
    """Merge each hyphenated word in ``doc`` into a single token.

    Every match of ``HYPHEN_PATTERN`` in ``doc.text`` is handed to
    ``doc.merge`` as a character span. The same ``doc`` object is returned,
    which is the shape a function-style pipeline component needs.
    """
    for match in HYPHEN_PATTERN.finditer(doc.text):
        doc.merge(start_idx=match.start(), end_idx=match.end())
    return doc

In [28]:
def _capitalize_segments(text):
    """Upper-case the first character of each hyphen-separated part of ``text``.

    ``text`` is expected to be lower-cased already. Unlike ``str.title()``,
    letters that follow an apostrophe or a digit are left alone, so
    "o'clock" becomes "O'clock" (not "O'Clock") and "'s" stays "'s".
    """
    return '-'.join(part[:1].upper() + part[1:] for part in text.split('-'))


def format_headline(doc):
    """Return the text of ``doc`` re-cased token by token as a headline.

    Each token contributes its ``text_with_ws`` so the original spacing is
    preserved. A token is capitalised when ``upper_rules`` accepts it and
    lower-cased otherwise.

    A token that directly follows a word with no whitespace in between
    (for example the "n't" of "don't") is treated as a continuation of that
    word and always stays lower-case, so contractions come out as "Don't"
    rather than "DoN'T".
    """
    pieces = []
    previous = None
    for token in doc:
        text = token.text_with_ws.lower()
        # "Glued" = the previous token has no trailing whitespace and ends in
        # a letter, i.e. this token continues the same written word.
        glued = (
            previous is not None
            and previous.text_with_ws == previous.text
            and previous.text[-1:].isalpha()
        )
        if not glued and upper_rules(token):
            text = _capitalize_segments(text)
        pieces.append(text)
        previous = token
    return ''.join(pieces)


def upper_rules(token):
    """Decide whether ``token`` should be capitalised in a headline.

    A token is capitalised when any of these holds:
      * it is flagged as the first or the last word (custom extensions);
      * its part of speech is ADP and its dependency label is ``mark``;
      * its part of speech is NOUN, PROPN, ADV, VERB, ADJ or PRON.
    """
    edge_word = token._.is_first_word or token._.is_last_word
    if edge_word:
        return edge_word

    part_of_speech = token.pos_
    if part_of_speech == 'ADP' and token.dep_ == 'mark':
        return True
    return part_of_speech in ['NOUN', 'PROPN', 'ADV', 'VERB', 'ADJ', 'PRON']
    

In [10]:
def write_to_file(file_name, text):
    """Append ``text`` followed by a newline to ``file_name``.

    The file is opened in append mode on every call, so existing content is
    kept and the file is created if it does not exist yet.
    """
    with open(file_name, 'a') as sink:
        line = '{}\n'.format(text)
        sink.write(line)

In [18]:
# Smoke tests for the casing rules. The expected strings depend on the
# part-of-speech and dependency labels the loaded model assigns.
assert format_headline(nlp('Do as you want')) == 'Do As You Want'
assert format_headline(nlp('How to use a Macbook as a table')) == 'How to Use a Macbook as a Table'
# TODO: fill in the expected output for this hyphenated example and enable it.
# assert format_headline(nlp('headline about self-reflection boy')) == ''
print('Tests passed')

Tests passed


In [19]:
# NOTE(review): `regex_merge_token` and `pattern` are not defined in any cell
# visible in this notebook — presumably left over from an earlier revision;
# `merge_hyphen_tokens(doc)` looks like the replacement. Confirm before relying
# on this cell in a fresh kernel.
doc = nlp('\'This\' headline is about self-reflection boy, but not you!')
regex_merge_token(doc, pattern)
format_headline(doc)

"'This' Headline Is about Self-Reflection Boy, but Not You!"

In [20]:
# Format a headline that consists of a single hyphenated word.
format_headline(nlp('Orange-tastic!'))

'Orange-Tastic!'

In [22]:
# NOTE(review): `regex_merge_token` and `pattern` are not defined in any cell
# visible in this notebook — presumably left over from an earlier revision.
# Confirm before relying on this cell in a fresh kernel.
doc = nlp('Orange-tastic!')
regex_merge_token(doc, pattern)
format_headline(doc)

'Orange-Tastic!'

In [48]:
# Number of headlines whose text differed from the original after formatting.
n_formatted

4504