In [2]:
# Load the raw headlines (one per line) and trim surrounding whitespace,
# including the trailing newline left by file iteration.
with open('examiner-headlines.txt', encoding='utf8') as f:
    content = list(f)
headlines = list(map(str.strip, content))

In [4]:
# Sanity check: number of headlines read from the input file.
len(headlines)

5000

In [69]:
import spacy

class HeaderMaker:
    """Check and repair headline capitalization with spaCy POS tags.

    Each token of a headline is matched against an ordered list of rules
    (hyphenated word, first/last word, "important" part of speech). A token
    matching any rule must be capitalized; every other token must be
    lower-case.
    """

    # Coarse POS tags whose words are capitalized in a title: nouns, pronouns,
    # proper nouns, adjectives, verbs, adverbs and subordinate conjunctions.
    _CAPITALIZED_POS = frozenset(
        ['NOUN', 'PRON', 'PROPN', 'ADJ', 'VERB', 'ADV', 'SCONJ'])

    def __init__(self, model='en'):
        """Load the spaCy pipeline used to tokenize and tag headlines.

        Args:
            model: Name passed to ``spacy.load``. Defaults to ``'en'`` to keep
                the original behavior. NOTE: newer spaCy releases no longer
                accept shortcut names such as ``'en'``; pass a full package
                name (e.g. ``'en_core_web_sm'``) there.
        """
        self.nlp = spacy.load(model)

    def capitalize(self, word):
        """Upper-case the first character of every hyphen-separated part.

        All other characters are left untouched, so existing upper-case
        letters (e.g. acronyms) are preserved.

        Args:
            word: Token text to capitalize; may be empty.

        Returns:
            The capitalized text, e.g. ``'self-reflection'`` ->
            ``'Self-Reflection'``.
        """
        if not word:
            return word
        chars = []
        for i, char in enumerate(word):
            # A part starts at the beginning of the word or right after '-'.
            if i == 0 or word[i - 1] == '-':
                char = char.upper()
            chars.append(char)
        return ''.join(chars)

    # Capitalize nouns, pronouns, adjectives, verbs, adverbs, and subordinate conjunctions.
    def part_of_speach_rule(self, token, sent):
        """Return True when the token's coarse POS tag requires capitalization."""
        return token.pos_ in self._CAPITALIZED_POS

    # If a word is hyphenated, every part of the word should be capitalized
    # (e.g., "Self-Reflection" not "Self-reflection").
    def hyphen_rule(self, token, sent):
        """Return True when the token text contains a hyphen."""
        return '-' in token.text

    # Capitalize the first and the last word.
    def first_last_word_rule(self, token, sent):
        """Return True when the token starts at offset 0 or ends the headline.

        Trailing whitespace in ``sent`` is ignored when locating the end, so
        the last word is still detected for unstripped input.
        """
        end_of_text = len(sent.rstrip())
        return token.idx == 0 or (token.idx + len(token.text)) == end_of_text

    def rule_list(self, token, sent):
        """Return the name of the first rule that applies to ``token``.

        Rules are tried in order: hyphen, first/last word, part of speech.

        Returns:
            The matching rule method's ``__name__``, or ``'lower_case'`` when
            no rule applies (the token must then be lower-case).
        """
        rules = [self.hyphen_rule, self.first_last_word_rule, self.part_of_speach_rule]
        for rule in rules:
            if rule(token, sent):
                return rule.__name__
        return 'lower_case'

    def process(self, sent):
        """Validate a headline and build its correctly capitalized version.

        Args:
            sent: Headline text.

        Returns:
            A tuple ``(new_title, validation_errors, is_valid)`` where
            ``new_title`` is the corrected headline, ``validation_errors``
            holds one rule name per incorrectly cased token (in token order),
            and ``is_valid`` is True when no token needed correction.
        """
        parsed_sentence = self.nlp(sent)
        corrected_parts = []
        validation_errors = []
        for token in parsed_sentence:
            rule = self.rule_list(token, sent)
            if rule == 'lower_case':
                correct_text = token.text.lower()
            else:
                # Must go through self: there is no module-level capitalize().
                correct_text = self.capitalize(token.text)

            if token.text != correct_text:
                validation_errors.append(rule)

            # Swap the token text inside text_with_ws so the original
            # trailing whitespace is preserved in the rebuilt title.
            corrected_parts.append(
                token.text_with_ws.replace(token.text, correct_text))
        return ''.join(corrected_parts), validation_errors, not validation_errors

In [75]:
# Smoke-test the checker on one mostly lower-case headline.
maker = HeaderMaker()
sent = "Talladega turmoil could spell trouble for NASCAR's Chase field"
print(sent)
# The bare last expression displays the tuple returned by process().
maker.process(sent)


Talladega turmoil could spell trouble for NASCAR's Chase field


("Talladega Turmoil Could Spell Trouble for NASCAR's Chase Field",
 ['part_of_speach_rule',
  'part_of_speach_rule',
  'part_of_speach_rule',
  'part_of_speach_rule',
  'first_last_word_rule'],
 False)

In [76]:
# Run the checker over every headline, collecting the corrected titles,
# the violated rule names, and the count of already-correct headlines.
maker = HeaderMaker()
new_headlines = []
all_errors = []
correct_titles = 0
for sent in headlines:
    correct_title, errors, is_correct = maker.process(sent)
    new_headlines.append(correct_title)
    all_errors.extend(errors)
    correct_titles += int(is_correct)

In [72]:
# Number of headlines that needed no capitalization fix.
correct_titles

628

In [74]:
import collections
# Tally how often each rule name appears among the collected errors.
collections.Counter(all_errors)

Counter({'first_last_word_rule': 2294,
         'hyphen_rule': 43,
         'lower_case': 533,
         'part_of_speach_rule': 13848})

In [78]:
# Write the corrected headlines, one per line. The with-block closes the
# file on exit, so buffered data is flushed and the handle is not leaked.
with open('correct-headlines.txt', 'w', encoding='utf8') as thefile:
    for item in new_headlines:
        thefile.write("%s\n" % item)