In [2]:
task_dir = '../../../../tasks/02-structural-linguistics/'

from IPython.display import display, Markdown

with open(task_dir + '2-headlines.md', 'r') as fh:
    content = fh.read()

display(Markdown(content))

## News Headlines

### 1. Formating

[The Associated Press Stylebook](https://www.amazon.com/Associated-Press-Stylebook-2017-Briefing/dp/0465093043/) is a style guide widely used among American journalists. It enforces the following rules for capitalization of news headlines:

1. Capitalize nouns, pronouns, adjectives, verbs, adverbs, and subordinate conjunctions. If a word is hyphenated, every part of the word should be capitalized (e.g., "Self-Reflection" not "Self-reflection").
2. Capitalize the first and the last word.
3. Lowercase all other parts of speech: articles, coordinating conjunctions, prepositions, particles, interjections.

Write a program that formats a headline according to the rules above. Use any programming language and any NLP toolkit.

When done, run your program on [the corpus of headlines from The Examiner](examiner-headlines.txt) and submit your program and a file with corrected headlines to your directory. Output statistics: how many titles were properly formatted?

### 2. Catch catchy headlines

The paper on [Automatic Extraction of News Values from Headline Text](http://www.aclweb.org/anthology/E17-4007) defines that a catchy headline has the following features:
1. Prominence
2. Sentiment
3. Superlativeness
4. Proximity
5. Surprise
6. Uniqueness

Write a program that analyzes a headline for prominence (a.k.a, named entities), sentiment, and superlativeness. For sentiment, check if the average sentiment for the top 5 meanings of word+POS in [SentiWordNet](http://sentiwordnet.isti.cnr.it/) is above 0.5.

When done, run your program on [the corpus of headlines](examiner-headlines.txt), extract the headlines that have at least one of the described features, and submit your program and a file with catchy headlines to your directory.

### Additional notes

The data set was borrowed from https://www.kaggle.com/therohk/examine-the-examiner.


In [3]:
with open(task_dir + 'examiner-headlines.txt', 'r') as f:
    headers = f.readlines()
    headers = [h.strip() for h in headers]

# I. Formatting

In [3]:
import spacy
import regex as re
from spacy.tokenizer import Tokenizer

infix_re = re.compile(r'''[~]''')
# Overriding splitting on hyphen
def custom_tokenizer(nlp):
    return Tokenizer(nlp.vocab, infix_finditer=infix_re.finditer)

nlp = spacy.load('en_core_web_md')
nlp.tokenizer = custom_tokenizer(nlp)

In [4]:
print(headers[0:3])

['Halep enters Rogers Cup final in straight sets win over Errani', "The phantoms of St. Mary's", "Talladega turmoil could spell trouble for NASCAR's Chase field"]


In [5]:
def format(headline):
    doc = nlp(headline);
    tks = [token for token in doc]
    needs_capitalization = [False for tk in tks]
    # Capitalize nouns, pronouns, adjectives, verbs, adverbs, and subordinate conjunctions.
    to_capitalize = ["PROPN", "NOUN", "PRON", "ADJ", "VERB", "ADV", "SCONJ"]

    for index, token in enumerate(doc):
        if (token.pos_ in to_capitalize):
            needs_capitalization[index] = True
        # Capitalize the first and the last word.
        needs_capitalization[0] = True
        needs_capitalization[-1] = True
    res = ""
    for index, token in enumerate(doc):
        if (needs_capitalization[index]):
            res += token.text.title() + ' ' 
        else:
            res += token.text.lower() + ' ' 
    return res.strip()

In [6]:
fixed_headlines = [format(h) for h in headers]

In [7]:
f = open('format_output.txt','w')
f.write("\n".join(fixed_headlines))
f.close()
print("Output saved to format_output.txt")

Output saved to format_output.txt


In [8]:
total_count = 0
fixed_count = 0
for original, fixed in zip(headers, fixed_headlines):
    total_count += 1
    if not original == fixed:
        fixed_count += 1
print("Fixed %s of total %s (%s %%)" % (fixed_count, total_count, fixed_count / total_count * 100))

Fixed 4541 of total 5000 (90.82000000000001 %)


# II Catch catchy headlines

In [4]:
import spacy
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn

In [5]:
nlp = spacy.load('en_core_web_md')

In [6]:
def senti_pos(pos):
    if (pos == 'VERB'):
        return wn.VERB
    if (pos == 'NOUN'):
        return wn.NOUN
    if (pos == 'ADJ'):
        return wn.ADJ
    if (pos == 'ADV'):
        return wn.ADV
    return None
def get_avg_sent(word, pos = None):
    synset = list(swn.senti_synsets(word, senti_pos(pos)))[0:5]
    count = len(synset)
    if (count == 0):
        return 0
    total_pos = 0
    total_neg = 0
    for syn in synset:
        total_pos += syn.pos_score()
        total_neg += syn.neg_score()
    return (total_pos - total_neg) / count

In [13]:
def is_prominent(headline):
    doc = nlp(headline);
    if (len(doc.ents) > 0):
        return True
    tks = [token for token in doc]
    sent = 0
    for token in tks:
        if token.tag_ in ['JJS', 'RBS']:
            return True
        sent += get_avg_sent(str(token), token.pos_)
    if (sent / len(tks)) > 0.5:
        return True
    return False

In [14]:
with open('catchy_output.txt', 'w') as out:
    for  header in headers:
        if (is_prominent(header)):
            out.write(header)
            out.write("\n")
print('Done writing in catchy_output.txt')

Done writing in catchy_output.txt
