In [1]:
from nltk.corpus import sentiwordnet as swn
import spacy
import re

# Load the 'en_core_web_md' spaCy pipeline once; the cells below call `nlp(...)`
# on it to turn raw text into parsed documents.
nlp = spacy.load('en_core_web_md')

In [63]:
def has_virus_entities(headline_doc):
    """Tell whether a parsed headline mentions an entity of a tracked type.

    Args:
        headline_doc: an object exposing ``.ents``, an iterable of entities
            that each carry a string ``label_`` (e.g. a spaCy ``Doc``).

    Returns:
        True as soon as one entity label is among the tracked labels listed
        below, False when none is (including when there are no entities).
    """
    # NOTE(review): "virus" presumably stands for "viral" (virality features);
    # the public name is kept so existing callers keep working.
    tracked_labels = {
        'PERSON', 'NORP', 'FAC', 'ORG', 'EVENT',
        'WORK_OF_ART', 'PERCENT', 'MONEY', 'QUANTITY', 'ORDINAL',
    }
    return any(entity.label_ in tracked_labels for entity in headline_doc.ents)

# Sanity checks against the loaded pipeline: a phrase with no tracked entity
# must be rejected, and each sample below must be accepted.
assert not has_virus_entities(nlp('by train'))
for sample_text in ('Bill Gates', 'European commission', 'by Google', '$1 billion'):
    assert has_virus_entities(nlp(sample_text))

In [59]:
from nltk.corpus import sentiwordnet as swn

# Fine-grained part-of-speech tags (as read from ``token.tag_``) mapped to the
# single-letter POS code passed to ``swn.senti_synsets``. Tags that are not
# listed here have no mapping and are never looked up.
_SWN_POS_BY_TAG = {
    'NN': 'n', 'NNP': 'n', 'NNPS': 'n', 'NNS': 'n',
    'VB': 'v', 'VBD': 'v', 'VBG': 'v', 'VBP': 'v', 'VBN': 'v', 'VBZ': 'v',
    'JJ': 'a', 'JJR': 'a', 'JJS': 'a',
    'RB': 'r', 'RBR': 'r', 'RBS': 'r',
}


def has_positive_synset(token, threshold=0.6):
    """Tell whether any SentiWordNet sense of a token is strongly positive.

    Args:
        token: an object with a ``tag_`` (fine-grained POS tag) and a
            ``lemma_`` attribute, e.g. a spaCy token.
        threshold: positivity cutoff. A sense counts only when its
            ``pos_score()`` is strictly greater than this value. Defaults to
            0.6, the cutoff this function previously hard-coded.

    Returns:
        True if at least one sense returned by ``swn.senti_synsets`` for the
        token's lemma and mapped POS scores above ``threshold``. False if the
        tag has no mapping, if no senses are returned, or if none scores
        above the cutoff.
    """
    swn_pos = _SWN_POS_BY_TAG.get(token.tag_)
    if swn_pos is None:
        # Tag is not a noun/verb/adjective/adverb tag from the table above.
        return False

    senses = swn.senti_synsets(token.lemma_, swn_pos)
    # "max(scores) > threshold" is the same as "some score > threshold", but
    # any() stops at the first hit and needs no special case for no senses.
    return any(sense.pos_score() > threshold for sense in senses)

def is_positive(headline_str):
    """Tell whether a headline contains at least one positive token.

    Args:
        headline_str: despite the name, this is iterated element by element
            and each element is handed to ``has_positive_synset``; the calls
            in this notebook pass the result of ``nlp(...)``, not a plain
            string.

    Returns:
        True at the first element for which ``has_positive_synset`` is truthy,
        otherwise False (also for an empty input).
    """
    return any(has_positive_synset(token) for token in headline_str)

# Sanity checks against the loaded pipeline: (sample text, expected verdict).
for sample_text, expected in (('I am happy', True), ('nothing happened', False)):
    assert bool(is_positive(nlp(sample_text))) == expected

In [60]:
def has_comparison(headline_doc):
    """Tell whether a parsed headline contains a comparative or superlative.

    Args:
        headline_doc: an iterable of tokens that each carry a fine-grained
            POS tag in ``tag_`` (e.g. a spaCy ``Doc``).

    Returns:
        True if any token is tagged JJR, JJS, RBR or RBS, otherwise False.
    """
    comparison_tags = ('JJR', 'JJS', 'RBR', 'RBS')
    return any(tok.tag_ in comparison_tags for tok in headline_doc)

# Sanity checks against the loaded pipeline: (sample text, expected verdict).
for sample_text, expected in (('by train', False), ('most beautifull', True)):
    assert bool(has_comparison(nlp(sample_text))) == expected

In [2]:
NON_BREAKING_HYPHEN = '\u2011'

# A hyphen sitting directly between two ASCII letters. The neighbouring letters
# are checked with lookbehind/lookahead so they are not consumed by the match:
# consuming them made a letter shared by two hyphens ("a-b-c", "e-x-ray")
# unavailable for the second match, leaving that hyphen unreplaced.
_INTRA_WORD_HYPHEN = re.compile(r'(?<=[a-zA-Z])-(?=[a-zA-Z])')


def replace_hyphens(headline_str):
    """Swap every intra-word hyphen for a non-breaking hyphen (U+2011).

    Only a ``-`` with an ASCII letter immediately on both sides is replaced.
    Hyphens next to digits, spaces or the string boundaries are left alone.

    Args:
        headline_str: the headline text.

    Returns:
        A new string of the same length with the qualifying hyphens replaced.
    """
    return _INTRA_WORD_HYPHEN.sub(NON_BREAKING_HYPHEN, headline_str)

In [3]:
# Read one headline per line. The encoding is spelled out so the result does
# not depend on the platform default, and the line terminator is dropped so
# that it is not passed on to the spaCy pipeline as part of the headline.
with open('examiner-headlines.txt', encoding='utf-8') as f:
    headlines_strs = [replace_hyphens(line.rstrip('\n')) for line in f]

# nlp.pipe processes the texts as a stream and yields the documents in input
# order; it replaces one nlp(...) call per headline.
tokenized_headlines = list(nlp.pipe(headlines_strs))

In [54]:
# Tally, in a single pass over the parsed headlines, how many of them trigger
# each detector. bool(...) adds 1 for a hit and 0 for a miss.
with_virus_entities = positives = with_comparisons = 0

for headline_doc in tokenized_headlines:
    with_virus_entities += bool(has_virus_entities(headline_doc))
    positives += bool(is_positive(headline_doc))
    with_comparisons += bool(has_comparison(headline_doc))


In [56]:
# Report each tally as a fraction of all parsed headlines.
total_headlines = len(tokenized_headlines)
for label, hits in (
    ('With virus entities: ', with_virus_entities),
    ('Positive: ', positives),
    ('With comparisons: ', with_comparisons),
):
    print(label, hits / total_headlines)

With virus entities:  0.6254
Positive:  0.265
With comparisons:  0.046
