In [13]:
#!python -m spacy download en_core_web_md

In [1]:
from nltk.corpus import sentiwordnet as swn
import spacy
import re

# Load the `en_core_web_md` spaCy model once; the cells below reuse `nlp`
# to parse the headlines and the small inputs used in the inline checks.
nlp = spacy.load('en_core_web_md')

Read, preprocess and tokenize headlines

In [3]:
NON_BREAKING_HYPHEN = '\u2011'

# A hyphen with an ASCII letter directly on both sides. Lookarounds keep the
# letters out of the match itself, so a letter shared by two hyphens (the "b"
# in "a-b-c") can anchor both of them; a consuming pattern such as
# `[a-zA-Z]-[a-zA-Z]` would skip the second hyphen.
_INTRA_WORD_HYPHEN = re.compile(r'(?<=[a-zA-Z])-(?=[a-zA-Z])')


def replace_hyphens(headline_str):
    """Replace every letter-to-letter hyphen with a non-breaking hyphen (U+2011).

    Hyphens that are not surrounded by ASCII letters on both sides (for
    example in "10-20" or " - ") are left untouched.
    NOTE(review): presumably done so the tokenizer keeps hyphenated compounds
    as single tokens -- confirm.

    Args:
        headline_str: headline text to process.

    Returns:
        A new string of the same length with the qualifying hyphens replaced.
    """
    return _INTRA_WORD_HYPHEN.sub(NON_BREAKING_HYPHEN, headline_str)

In [4]:
# Read one headline per line. The encoding is stated explicitly so the result
# does not depend on the platform's default locale.
# NOTE(review): assumes the file is UTF-8 -- confirm against the data source.
with open('examiner-headlines.txt', encoding='utf-8') as f:
    # readlines() keeps each line's trailing '\n'. It is left in place on
    # purpose: the cells below write the corrected headlines back without
    # adding line breaks of their own.
    raw_headline_strs = f.readlines()

headlines_strs = [replace_hyphens(headline_str) for headline_str in raw_headline_strs]
# nlp.pipe streams the texts through the pipeline in batches instead of
# invoking the whole pipeline separately for every headline.
tokenized_headlines = list(nlp.pipe(headlines_strs))

In [44]:
def to_upper(text, pos = 0):
    """Return `text` with the character at index `pos` upper-cased.

    Args:
        text: string to modify.
        pos: index of the character to change (negative indices count from
            the end, as with list indexing). Defaults to the first character.

    Returns:
        A new string; all other characters are unchanged. An empty `text`
        is returned as is, since there is nothing to capitalize.

    Raises:
        IndexError: if `text` is non-empty and `pos` is out of range.
    """
    if not text:
        return text
    items = list(text)
    items[pos] = items[pos].upper()
    return ''.join(items)
def to_lower(text, pos = 0):
    """Return `text` with the character at index `pos` lower-cased.

    Args:
        text: string to modify.
        pos: index of the character to change (negative indices count from
            the end, as with list indexing). Defaults to the first character.

    Returns:
        A new string; all other characters are unchanged. An empty `text`
        is returned as is, since there is nothing to lowercase.

    Raises:
        IndexError: if `text` is non-empty and `pos` is out of range.
    """
    if not text:
        return text
    items = list(text)
    items[pos] = items[pos].lower()
    return ''.join(items)

# Spot checks for the single-character case helpers: first the default
# position, then an explicit index.
assert [to_upper('kyiv'), to_upper('kyiv', 2)] == ['Kyiv', 'kyIv']
assert [to_lower('Kyiv'), to_lower('KyIv', 2)] == ['kyiv', 'Kyiv']

In [58]:
def hyphen_upper(text):
    """Capitalize the first letter of every non-breaking-hyphen-separated part.

    Args:
        text: string that may contain NON_BREAKING_HYPHEN characters.

    Returns:
        `text` with the first character of each segment upper-cased and the
        hyphens kept in place. Empty segments (produced by a leading,
        trailing or doubled hyphen) are passed through unchanged instead of
        being handed to `to_upper`.
    """
    return NON_BREAKING_HYPHEN.join(
        to_upper(part) if part else part
        for part in text.split(NON_BREAKING_HYPHEN)
    )

# Both a two-part and a three-part compound get a capital on every segment.
assert [hyphen_upper(u'a\u2011a'), hyphen_upper(u'ab\u2011ab\u2011ab')] == ['A‑A', 'Ab‑Ab‑Ab']

In [31]:
def should_ignore(token):
    """Return True when `token.text` is not a plain, regularly cased word.

    The text is flagged when any of the following holds:

    * it contains an apostrophe;
    * the part after the first character is not all lowercase (note that a
      one-character text is flagged too, because ``''.islower()`` is False);
    * it contains a non-alphabetic character.

    Only the `text` attribute is read, so any object exposing it works
    (the checks in this notebook pass whole parsed documents).
    """
    word = token.text
    has_apostrophe = '\'' in word
    tail_is_lowercase = word[1:].islower()
    return has_apostrophe or not tail_is_lowercase or not word.isalpha()

# Acronyms, camel case, digits and contractions are flagged; plain words are not.
assert all(should_ignore(nlp(sample)) for sample in ('CQG', 'SimCorp', '2go', 'n\'t'))
assert not any(should_ignore(nlp(sample)) for sample in ('word', 'Word'))

In [91]:
def correct_headline(sentence_doc):
    """Return the headline in `sentence_doc` with title-style capitalization.

    Rules are applied to the first letter of each token in this order, so a
    later rule overrides an earlier one:

    1. NOUN, PRON, VERB, ADJ, ADV and ADP tokens are capitalized.
    2. DET, CCONJ/CONJ, PART and INTJ tokens are lowercased.
    3. ADP tokens whose dependency label is ``prep`` are lowercased.
    4. The first token and the last word are capitalized.
    5. Tokens containing a non-breaking hyphen are capitalized part by part.

    Args:
        sentence_doc: parsed headline (a spaCy ``Doc`` in this notebook).
            Its tokens must expose ``text``, ``pos_``, ``dep_``, ``i``,
            ``whitespace_`` and ``is_space``.

    Returns:
        The corrected headline as a string. Each token keeps the whitespace
        that followed it in the input, so punctuation and contractions are
        not pushed apart and no trailing space is appended. An empty
        document yields ``''``.
    """
    if len(sentence_doc) == 0:
        return ''

    # spaCy's Universal POS scheme labels coordinating conjunctions CCONJ;
    # the older CONJ label is kept so other tag sets still match.
    upper_pos = ('NOUN', 'PRON', 'VERB', 'ADJ', 'ADV', 'ADP')
    lower_pos = ('DET', 'CONJ', 'CCONJ', 'PART', 'INTJ')

    # Find the last real word: walk back over every trailing punctuation or
    # whitespace token (e.g. "?!" or the '\n' kept by readlines()).
    last_check_pos = len(sentence_doc) - 1
    while last_check_pos > 0 and (
        sentence_doc[last_check_pos].is_space
        or sentence_doc[last_check_pos].pos_ == 'PUNCT'
    ):
        last_check_pos -= 1

    pieces = []
    for token in sentence_doc:
        token_text = token.text
        if token.pos_ in upper_pos:
            token_text = to_upper(token_text)
        if token.pos_ in lower_pos:
            token_text = to_lower(token_text)
        # Prepositions are in `upper_pos` as ADP; undo that for true "prep" uses.
        if token.pos_ == 'ADP' and token.dep_ == 'prep':
            token_text = to_lower(token_text)

        # Headline boundaries win over the part-of-speech rules.
        if token.i == 0 or token.i == last_check_pos:
            token_text = to_upper(token_text)

        if NON_BREAKING_HYPHEN in token_text:
            token_text = hyphen_upper(token_text)

        # Re-attach the token's own trailing whitespace rather than forcing
        # a single space after every token.
        pieces.append(token_text + token.whitespace_)
    return ''.join(pieces)


# Show the corrector on a few hand-picked headlines, one result per line.
print('\n'.join(
    correct_headline(nlp(sample))
    for sample in (
        'Do as you want',
        'How to use a Macbook as a table',
        'as ww',
        'as ww?',
        u'How to u\u2011s Macbook',
    )
))

Do As You Want 
How to Use a Macbook as a Table 
As Ww 
As Ww ? 
How To U‑S Macbook 


In [88]:
def has_change(text, to_upper):
    """Tell whether changing the case of the first character would alter `text`.

    Args:
        text: string to inspect.
        to_upper: truthy to ask about upper-casing the first character,
            falsy to ask about lower-casing it. (Inside this function the
            parameter shadows the module-level `to_upper` helper; the name
            is kept so keyword callers keep working.)

    Returns:
        True if the first character is lowercase (when `to_upper` is truthy)
        or uppercase (when it is falsy). False otherwise, including for an
        empty string, where there is nothing to change.
    """
    # Slicing instead of indexing yields '' for empty input rather than raising.
    first_char = text[:1]
    return first_char.islower() if to_upper else first_char.isupper()

# A lowercase start can be upper-cased, and an uppercase start can be lower-cased.
assert all(
    has_change(sample, want_upper)
    for sample, want_upper in (('text', True), ('Text', False))
)

In [95]:
# Corrected text for every parsed headline, in input order.
res = [correct_headline(headline) for headline in tokenized_headlines]

In [97]:
# The corrected headlines can contain the non-breaking hyphen (U+2011), which
# ASCII and cp1252 cannot encode, so the output encoding is explicit rather
# than left to the platform default.
with open('corrected_headlines.txt', 'w', encoding='utf-8') as f:
    # No separator is added here: every entry is written exactly as produced.
    f.writelines(res)