In [86]:
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn
import numpy as np
import nltk
import spacy
import spacy.symbols as ss
from spacy import displacy
import en_core_web_lg
import pandas as pd
pd.set_option('display.max_colwidth', 300)

In [34]:
# Fetch the SentiWordNet and WordNet corpora used by the `swn` / `wn` readers imported above.
nltk.download('sentiwordnet')
nltk.download('wordnet')

[nltk_data] Downloading package sentiwordnet to
[nltk_data]     /usr/local/lib/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/local/lib/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [6]:
# Load the en_core_web_lg spaCy model once; `nlp` is reused to parse every headline below.
nlp = en_core_web_lg.load()

In [40]:
# Read the headlines file into a single 'title' column and keep the first 50 rows.
df = pd.read_table(
    "../../../../../tasks/02-structural-linguistics/examiner-headlines.txt",
    names=['title'],
).head(50)

In [41]:
# Parse every headline with the spaCy pipeline and store the resulting object in 'doc'.
df = df.assign(doc=lambda frame: frame['title'].apply(nlp))

In [42]:
def has_proper_name(doc):
    """Tell whether `doc` contains at least one proper-noun token.

    Args:
        doc: An iterable of tokens exposing a ``pos`` attribute
            (here, the parsed headline produced by ``nlp``).

    Returns:
        True if any token's ``pos`` equals ``ss.PROPN``; False otherwise,
        including for an empty `doc`.
    """
    return any(tok.pos == ss.PROPN for tok in doc)

In [43]:
# Flag each headline that contains a proper noun.
df = df.assign(has_propn=df['doc'].apply(has_proper_name))

In [68]:
def spacy_pos2wordnet_pos(pos):
    """Translate a spaCy coarse POS symbol into the matching WordNet POS constant.

    Args:
        pos: A spaCy POS symbol, compared against ``ss.NOUN``, ``ss.VERB``,
            ``ss.ADV`` and ``ss.ADJ``.

    Returns:
        ``wn.NOUN``, ``wn.VERB``, ``wn.ADV`` or ``wn.ADJ`` for the four
        supported categories; ``None`` for anything else.
    """
    # (spaCy symbol, name of the WordNet constant) pairs, checked in order.
    candidates = (
        (ss.NOUN, 'NOUN'),
        (ss.VERB, 'VERB'),
        (ss.ADV, 'ADV'),
        (ss.ADJ, 'ADJ'),
    )
    for spacy_symbol, wordnet_attr in candidates:
        if pos == spacy_symbol:
            # Looked up only on a match, so unmapped tags never touch `wn`.
            return getattr(wn, wordnet_attr)
    return None

In [62]:
def is_content_word(tok):
    """Decide whether a token counts as a content word.

    A token qualifies when its coarse POS is noun, adjective or adverb, or
    when its fine-grained tag starts with 'V'. Verbs are detected through
    ``tag_``, the other categories through ``pos``.

    Args:
        tok: A token exposing ``pos`` and ``tag_``.

    Returns:
        True for a content word, False otherwise.
    """
    if tok.pos == ss.NOUN:
        return True
    if tok.tag_.startswith('V'):
        return True
    return tok.pos in (ss.ADJ, ss.ADV)

In [108]:
def get_mean_sentiment(tok, topn):
    """Average the SentiWordNet positive score over a token's leading synsets.

    The token's lemma is looked up in SentiWordNet under the WordNet POS that
    corresponds to ``tok.pos``; only the first `topn` synsets returned are
    used, and only their ``pos_score()`` contributes (negative scores are
    not consulted).

    Args:
        tok: A token exposing ``pos`` and ``lemma_``.
        topn: Maximum number of synsets to average over.

    Returns:
        The mean positive score, or ``np.nan`` when the token's POS has no
        WordNet counterpart or no synsets are found.
    """
    wordnet_tag = spacy_pos2wordnet_pos(tok.pos)
    if wordnet_tag:
        senses = list(swn.senti_synsets(tok.lemma_, wordnet_tag))[:topn]
        if senses:
            return np.mean([sense.pos_score() for sense in senses])
    return np.nan

In [124]:
def has_positive_sentiment(doc, topn=5, threshold=0.5):
    """Tell whether any content word in `doc` is strongly positive.

    Each content word (per ``is_content_word``) is scored with
    ``get_mean_sentiment``; the headline is positive when the highest score
    reaches `threshold`.

    Args:
        doc: An iterable of tokens (a parsed headline).
        topn: Number of leading synsets averaged per token. Defaults to 5.
        threshold: Minimum best score for a positive verdict. Defaults to 0.5.

    Returns:
        A plain ``bool``. False when `doc` has no content words or none of
        them could be scored.
    """
    scores = [
        get_mean_sentiment(tok, topn=topn)
        for tok in doc
        if is_content_word(tok)
    ]
    # Drop unscored tokens (NaN) up front: np.nanmax on an all-NaN list emits
    # a RuntimeWarning and returns NaN, which only compared as False by accident.
    scored = [score for score in scores if not np.isnan(score)]
    if not scored:
        return False
    # bool() keeps the return type uniform (the comparison yields np.bool_).
    return bool(max(scored) >= threshold)

In [125]:
# Preview the frame with a per-headline positive-sentiment flag (result is displayed, not stored).
df.assign(sentiment=df['doc'].apply(has_positive_sentiment))

Unnamed: 0,title,doc,has_propn,sentiment
0,Halep enters Rogers Cup final in straight sets win over Errani,"(Halep, enters, Rogers, Cup, final, in, straight, sets, win, over, Errani)",True,False
1,The phantoms of St. Mary's,"(The, phantoms, of, St., Mary, 's)",True,False
2,Talladega turmoil could spell trouble for NASCAR's Chase field,"(Talladega, turmoil, could, spell, trouble, for, NASCAR, 's, Chase, field)",True,False
3,Burn those calories! Try the Very Steep Trail.,"(Burn, those, calories, !, Try, the, Very, Steep, Trail, .)",True,False
4,It's the end of the world... and I feel fine,"(It, 's, the, end, of, the, world, ..., and, I, feel, fine)",False,False
5,2011-2012 NHL team preview: Detroit Red Wings,"(2011, -, 2012, NHL, team, preview, :, Detroit, Red, Wings)",True,False
6,Cal coach Jeff Tedford taking a different approach in 2010 -- Part 1,"(Cal, coach, Jeff, Tedford, taking, a, different, approach, in, 2010, --, Part, 1)",True,False
7,Google science fair to encourage STEM learning,"(Google, science, fair, to, encourage, STEM, learning)",True,False
8,SF Beer Week 2013: what's for dinner (part 2),"(SF, Beer, Week, 2013, :, what, 's, for, dinner, (, part, 2, ))",True,False
9,Jersey Shore Season 6 cast's salaries revealed; More than President Obama!,"(Jersey, Shore, Season, 6, cast, 's, salaries, revealed, ;, More, than, President, Obama, !)",True,False


In [79]:
# List all SentiWordNet synsets for the verb 'enter'.
list(swn.senti_synsets('enter', 'v'))

[SentiSynset('enter.v.01'),
 SentiSynset('enter.v.02'),
 SentiSynset('enroll.v.01'),
 SentiSynset('figure.v.02'),
 SentiSynset('record.v.01'),
 SentiSynset('enter.v.06'),
 SentiSynset('accede.v.02'),
 SentiSynset('insert.v.01'),
 SentiSynset('embark.v.02')]

In [66]:
# Show the fine-grained tag spaCy assigns to each token of one headline.
[(tok.text, tok.tag_) for tok in nlp("It's the end of the world... and I feel fine")]

[('It', 'PRP'),
 ("'s", 'VBZ'),
 ('the', 'DT'),
 ('end', 'NN'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('world', 'NN'),
 ('...', 'NFP'),
 ('and', 'CC'),
 ('I', 'PRP'),
 ('feel', 'VBP'),
 ('fine', 'JJ')]