### 2.1. Форматування

In [1]:
import spacy
nlp = spacy.load('en')

In [2]:
def tokenize(title: str) -> (list, int):
    "Returns a list of tokens and a total no. of words"
    
    start_pos: int = 0
    words_count: int = 0
    title_tokens = []
    
    doc = nlp(title)
    for token in doc:
        # count separate words
        if token.pos_ != 'PUNCT' and "'" not in token.text:
            words_count += 1
        title_tokens.append((token.text, token.pos_, token.dep_, words_count, token.whitespace_))
    
    return title_tokens, words_count

In [3]:
def capitalize(title: str) -> str:
    "Capitalizes a string according to the AP guidelines"
    
    title_tokens, total_words = tokenize(title)
    
    capitalized_first_word: bool = False
    capitalized_last_word: bool = False
    s: str = ''
    
    for (text, pos, dep, word_no, space_after) in title_tokens:
        to_be_capitilized: bool = False
        
        if not capitalized_first_word and word_no == 1:
            to_be_capitilized = True
            capitalized_first_word = True
        
        if not capitalized_last_word and word_no == total_words:
            to_be_capitilized = True
            capitalized_last_word = True
        
        if pos not in ['DET', 'CCONJ', 'ADP', 'PART', 'PUNCT', 'SYM', 'X'] \
        and "'" not in text:
            to_be_capitilized = True
        
        if pos == 'ADP' and dep != 'prep':
            to_be_capitilized = True
            
        # spaCy tags these words as determiners in all cases
        if text in ['this', 'these']:
            to_be_capitilized = True
        
        if to_be_capitilized:
            text = text[0].upper() + text[1:]
        
        s += text + space_after
    return s

In [4]:
with open('examiner-headlines.txt') as fo:
    titles = fo.readlines()

In [5]:
no_of_corrected: int = 0
with open('examiner-headlines_FORMATTED.txt', 'w+') as fw:
    for title in titles: 
        capitalized_title = capitalize(title) 
        fw.write(capitalized_title)
        if capitalized_title != title:
            no_of_corrected += 1

In [6]:
print(f'{no_of_corrected} out of {len(titles)} titles have been corrected.')

4213 out of 5000 titles have been corrected.
