In [1]:
import re
from tqdm import tqdm
from utils import line_reader, write_line_to_file

import spacy
from spacy.tokens import Doc, Token

In [2]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation notices raised by the
# libraries used below — consider narrowing it to a specific category.
warnings.filterwarnings('ignore')

In [3]:
# Load the 'en_core_web_md' spaCy model with its 'ner' component disabled.
nlp = spacy.load('en_core_web_md', disable=['ner'])

In [4]:
# Two or more runs of ASCII letters joined by single hyphens, e.g. "self-reflection".
HYPHEN_PATTERN = re.compile(r'[a-zA-Z]+(-[a-zA-Z]+)+')


def merge_hyphen_tokens(doc):
    """Pipeline component that fuses every hyphenated word into a single token.

    The document text is scanned with ``HYPHEN_PATTERN`` and each match is
    merged in place. When the document offers ``retokenize()`` all merges are
    applied in one bulk operation; otherwise the deprecated ``Doc.merge`` is
    used so that older spaCy releases keep working.

    Args:
        doc: The spaCy ``Doc`` being processed.

    Returns:
        The same ``doc`` object, modified in place.
    """
    char_spans = [m.span() for m in HYPHEN_PATTERN.finditer(doc.text)]
    if not char_spans:
        return doc

    if hasattr(doc, 'retokenize'):
        with doc.retokenize() as retokenizer:
            for start, end in char_spans:
                span = doc.char_span(start, end)
                # char_span gives None when the character offsets do not line
                # up with token boundaries; a one-token span needs no merge.
                if span is not None and len(span) > 1:
                    retokenizer.merge(span)
    else:
        # Legacy path for spaCy versions without the retokenizer.
        for start, end in char_spans:
            doc.merge(start_idx=start, end_idx=end)
    return doc

# Register the hyphen merger as a pipeline component of `nlp`.
nlp.add_pipe(merge_hyphen_tokens)

In [5]:
def custom_is_alpha(t):
    """Return True when token ``t`` is a word.

    A token counts as a word if it is purely alphabetic or if its whole text
    is a hyphenated compound as described by ``HYPHEN_PATTERN``.

    Args:
        t: A token exposing ``is_alpha`` and ``text``.

    Returns:
        bool: True for alphabetic or fully hyphenated tokens, False otherwise.
    """
    # fullmatch (not match) so that a hyphenated prefix followed by other
    # characters is not mistaken for a word; bool() keeps the result a plain
    # True/False instead of leaking a Match object.
    return bool(t.is_alpha or HYPHEN_PATTERN.fullmatch(t.text))

def find_first_word_index(doc):
    """Return the index of the first word token in ``doc``.

    A token is a word when its custom ``._.is_alpha`` attribute is truthy.

    Args:
        doc: A sequence of tokens exposing ``._.is_alpha``.

    Returns:
        The zero-based index of the first word token, or ``None`` when the
        document is empty or contains no word token (previously this case
        raised ``IndexError``).
    """
    for i, token in enumerate(doc):
        if token._.is_alpha:
            return i
    return None

def find_last_word_index(doc):
    """Return the index of the last word token in ``doc``.

    A token is a word when its custom ``._.is_alpha`` attribute is truthy.

    Args:
        doc: An indexable, sized sequence of tokens exposing ``._.is_alpha``.

    Returns:
        The zero-based index of the last word token, or ``None`` when the
        document is empty or contains no word token (previously the scan ran
        past the start of the document and raised ``IndexError``).
    """
    # Walk backwards with explicit non-negative indices so the scan can never
    # wrap around through negative indexing.
    for i in range(len(doc) - 1, -1, -1):
        if doc[i]._.is_alpha:
            return i
    return None

def is_first_word(t):
    """Tell whether token ``t`` sits at its document's first-word index."""
    first_idx = t.doc._.first_word_ind
    return t.i == first_idx

def is_last_word(t):
    """Tell whether token ``t`` sits at its document's last-word index."""
    last_idx = t.doc._.last_word_ind
    return t.i == last_idx
        

# Expose the helpers above as custom attributes under the `._.` namespace:
# three per-token flags and two per-document indices.
# NOTE(review): all five are getter-based, so presumably each access re-runs the
# getter (is_first_word / is_last_word rescan the document every time) — confirm
# whether caching is needed for long documents.
# NOTE(review): force=True presumably overwrites an earlier registration, which
# lets this cell be re-run — confirm against the spaCy docs.
Token.set_extension('is_alpha', getter=custom_is_alpha, force=True)
Token.set_extension('is_first_word', getter=is_first_word, force=True)
Token.set_extension('is_last_word', getter=is_last_word, force=True)
Doc.set_extension('first_word_ind', getter=find_first_word_index, force=True)
Doc.set_extension('last_word_ind', getter=find_last_word_index, force=True)

In [6]:
def format_headline(doc):
    """Rebuild the headline text with per-token casing.

    Each token's text (including its trailing whitespace) is title-cased when
    ``upper_rules`` accepts the token and lower-cased otherwise; the pieces
    are then concatenated in document order.

    Args:
        doc: An iterable of tokens exposing ``text_with_ws``.

    Returns:
        str: The re-cased headline.
    """
    pieces = []
    for token in doc:
        chunk = token.text_with_ws
        if upper_rules(token):
            pieces.append(chunk.title())
        else:
            pieces.append(chunk.lower())
    return ''.join(pieces)


# Coarse part-of-speech tags whose tokens are always capitalised.
_UPPER_POS = frozenset(['NOUN', 'PROPN', 'ADV', 'VERB', 'ADJ', 'PRON'])

# Contraction pieces such as the "n't" of "don't" or the "'ll" of "you'll".
# NOTE(review): assumes the tokenizer emits these pieces as separate tokens —
# confirm for the model in use.
_CONTRACTION_SUFFIXES = frozenset(["n't", "'s", "'re", "'ve", "'ll", "'d", "'m"])


def upper_rules(token):
    """Decide whether ``token`` should be capitalised in a headline.

    A token is capitalised when it is the first or last word of the headline,
    when it is a subordinating conjunction (``pos_ == 'ADP'`` with
    ``dep_ == 'mark'``), or when its part of speech is one of ``_UPPER_POS``.
    Contraction pieces are never capitalised, whatever their tag: title-casing
    them would yield forms like "N'T" or "'Ll" because ``str.title`` treats
    the apostrophe as a word boundary.

    Args:
        token: A token exposing ``text``, ``pos_``, ``dep_`` and the custom
            ``._.is_first_word`` / ``._.is_last_word`` attributes.

    Returns:
        bool: True if the token should be title-cased, False if lower-cased.
    """
    # Normalise the typographic apostrophe so that "’ll" is handled like "'ll".
    normalised = token.text.lower().replace('\u2019', "'")
    if normalised in _CONTRACTION_SUFFIXES:
        return False

    first_last_word = token._.is_first_word or token._.is_last_word
    # NOTE(review): some models may tag subordinating conjunctions as 'SCONJ'
    # rather than 'ADP' — confirm for the model in use.
    subordinate_conj = (token.pos_ == 'ADP') and (token.dep_ == 'mark')
    upper_pos = token.pos_ in _UPPER_POS
    return first_last_word or subordinate_conj or upper_pos

In [7]:
# Input file with the raw headlines and output file for the re-cased ones.
HEADLINES_FILE = '../../../tasks/02-structural-linguistics/examiner-headlines.txt'
OUT_FILE = 'formatted_headlines.txt'

# Count the input lines up front; the value is passed to tqdm as the progress total.
n_lines = sum(1 for _ in line_reader(HEADLINES_FILE))

In [8]:
# Stream the headlines through the spaCy pipeline, write each re-cased headline
# to OUT_FILE and count how many differ from their original text.
# NOTE(review): write_line_to_file is called once per headline; presumably it
# appends, so re-running this cell may add duplicate lines to OUT_FILE — confirm.
headlines = line_reader(HEADLINES_FILE)
n_formatted = 0

for doc in tqdm(nlp.pipe(headlines), total=n_lines):
    headline = format_headline(doc)
    write_line_to_file(OUT_FILE, headline)
    
    # A headline counts as "formatted" only when the re-casing changed it.
    if doc.text != headline:
        n_formatted += 1

100%|██████████| 5000/5000 [00:15<00:00, 317.99it/s]


In [9]:
# Spot checks of the expected casing: "as" is capitalised in the first headline
# but stays lower-case in the second.
assert format_headline(nlp('Do as you want')) == 'Do As You Want'
assert format_headline(nlp('How to use a Macbook as a table')) == 'How to Use a Macbook as a Table'
print('Tests passed')

Tests passed


In [10]:
# Demo on a headline containing quotes, a hyphenated word and trailing punctuation.
doc = nlp('\'This\' headline is about self-reflection boy!')
format_headline(doc)

"'This' Headline Is about Self-Reflection Boy!"

In [11]:
# Report how many headlines were changed by the formatting loop.
'Number of headlines that were formatted: {}'.format(n_formatted)

'Number of headlines that were formatted: 4504'