In [1]:
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn
import numpy as np
import nltk
import spacy
import spacy.symbols as ss
from spacy import displacy
import en_core_web_lg
import pandas as pd
pd.set_option('display.max_colwidth', 300)

In [2]:
# Fetch the corpora behind the `swn` and `wn` readers imported in the first cell.
nltk.download('sentiwordnet')
nltk.download('wordnet')

[nltk_data] Downloading package sentiwordnet to
[nltk_data]     /Users/serhii/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/serhii/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Load the spaCy pipeline from the en_core_web_lg package; `nlp` is applied to every headline below.
nlp = en_core_web_lg.load()

In [130]:
# Read the headlines file into a one-column frame named 'title'. The path is
# relative to the notebook's working directory.
# NOTE(review): read_table splits on tabs and honours double-quote quoting by
# default, so a headline containing either could be split or merged — confirm
# against the data file.
df = pd.read_table("../../../../../tasks/02-structural-linguistics/examiner-headlines.txt", names=['title'])

In [131]:
# Run the spaCy pipeline over every headline and keep the parsed result in a 'doc' column.
df = df.assign(doc=df['title'].apply(nlp))

In [132]:
def has_proper_name(doc):
    """Return True when at least one token of ``doc`` is a proper noun.

    Each token's coarse ``pos`` is compared with ``ss.PROPN``; an empty
    ``doc`` yields False.
    """
    return any(token.pos == ss.PROPN for token in doc)

In [133]:
# Flag each headline that contains a proper noun.
df = df.assign(has_propn=df['doc'].apply(has_proper_name))

In [134]:
def spacy_pos2wordnet_pos(pos):
    """Translate a spaCy coarse POS symbol into the matching WordNet POS constant.

    Nouns, verbs, adverbs and adjectives are mapped; any other value
    yields ``None``.
    """
    spacy_to_wordnet = {
        ss.NOUN: wn.NOUN,
        ss.VERB: wn.VERB,
        ss.ADV: wn.ADV,
        ss.ADJ: wn.ADJ,
    }
    return spacy_to_wordnet.get(pos)

In [135]:
def is_content_word(tok):
    """Tell whether ``tok`` should be scored for sentiment.

    A token qualifies when its coarse POS is noun, adjective or adverb,
    or when its fine-grained tag starts with ``'V'``.
    """
    if tok.pos == ss.NOUN:
        return True
    if tok.tag_.startswith('V'):
        return True
    return tok.pos in (ss.ADJ, ss.ADV)

In [136]:
def get_mean_sentiment(tok, topn):
    """Average the positive SentiWordNet score over the first ``topn`` senses of a token.

    The token's lemma is looked up under the WordNet POS derived from
    ``tok.pos``. Returns ``np.nan`` when that POS has no WordNet
    counterpart or when no senses are found.
    """
    wordnet_pos = spacy_pos2wordnet_pos(tok.pos)
    if not wordnet_pos:
        return np.nan

    senses = list(swn.senti_synsets(tok.lemma_, wordnet_pos))[:topn]
    if not senses:
        return np.nan

    return np.mean([sense.pos_score() for sense in senses])

In [137]:
def has_positive_sentiment(doc, topn=5, threshold=0.5):
    """Tell whether any content word in ``doc`` carries a strongly positive score.

    Parameters
    ----------
    doc : iterable of tokens
        Tokens are filtered with ``is_content_word`` and scored with
        ``get_mean_sentiment``.
    topn : int, default 5
        Number of leading senses averaged per token.
    threshold : float, default 0.5
        Minimum mean positive score for a token to count as positive.

    Returns
    -------
    bool
        True when the best available score reaches ``threshold``; False
        when there are no content words or none of them could be scored.
    """
    scores = [get_mean_sentiment(tok, topn=topn) for tok in doc if is_content_word(tok)]
    # Tokens without a usable score come back as NaN. Drop them up front so an
    # all-NaN headline short-circuits to False instead of triggering numpy's
    # "All-NaN" RuntimeWarning in nanmax.
    known_scores = [score for score in scores if not np.isnan(score)]
    if not known_scores:
        return False
    return bool(max(known_scores) >= threshold)

In [138]:
# Flag each headline that contains a strongly positive content word.
df = df.assign(pos_sentiment=df['doc'].apply(has_positive_sentiment))

  import sys


In [139]:
def has_superlatives(doc):
    """Return True when ``doc`` contains a token tagged as a superlative.

    A token matches when its fine-grained ``tag_`` is ``'JJS'``
    (superlative adjective) or ``'RBS'`` (superlative adverb).
    """
    superlative_tags = ('JJS', 'RBS')
    return any(token.tag_ in superlative_tags for token in doc)

In [140]:
# Flag each headline that contains a superlative adjective or adverb.
df = df.assign(has_superlatives=df['doc'].apply(has_superlatives))

In [141]:
# Headlines that satisfy at least one of the three flags computed above.
df[df['has_propn'] | df['pos_sentiment'] | df['has_superlatives']]

Unnamed: 0,title,doc,has_propn,pos_sentiment,has_superlatives
0,Halep enters Rogers Cup final in straight sets win over Errani,"(Halep, enters, Rogers, Cup, final, in, straight, sets, win, over, Errani)",True,False,False
1,The phantoms of St. Mary's,"(The, phantoms, of, St., Mary, 's)",True,False,False
2,Talladega turmoil could spell trouble for NASCAR's Chase field,"(Talladega, turmoil, could, spell, trouble, for, NASCAR, 's, Chase, field)",True,False,False
3,Burn those calories! Try the Very Steep Trail.,"(Burn, those, calories, !, Try, the, Very, Steep, Trail, .)",True,False,False
5,2011-2012 NHL team preview: Detroit Red Wings,"(2011, -, 2012, NHL, team, preview, :, Detroit, Red, Wings)",True,False,False
6,Cal coach Jeff Tedford taking a different approach in 2010 -- Part 1,"(Cal, coach, Jeff, Tedford, taking, a, different, approach, in, 2010, --, Part, 1)",True,False,False
7,Google science fair to encourage STEM learning,"(Google, science, fair, to, encourage, STEM, learning)",True,False,False
8,SF Beer Week 2013: what's for dinner (part 2),"(SF, Beer, Week, 2013, :, what, 's, for, dinner, (, part, 2, ))",True,False,False
9,Jersey Shore Season 6 cast's salaries revealed; More than President Obama!,"(Jersey, Shore, Season, 6, cast, 's, salaries, revealed, ;, More, than, President, Obama, !)",True,False,False
12,Diy repurposed sewing table project,"(Diy, repurposed, sewing, table, project)",True,False,False


In [142]:
# Only the headlines in which a superlative was detected.
df[df['has_superlatives']]

Unnamed: 0,title,doc,has_propn,pos_sentiment,has_superlatives
63,"Ooh la la! What a most sensational woman Josephine Baker was, and is!","(Ooh, la, la, !, What, a, most, sensational, woman, Josephine, Baker, was, ,, and, is, !)",True,False,True
135,"Best 2014 Black Friday TV deals online: Amazon, Best Buy, Walmart, Target","(Best, 2014, Black, Friday, TV, deals, online, :, Amazon, ,, Best, Buy, ,, Walmart, ,, Target)",True,True,True
224,Live video and latest updates on the BP oil spill capping procedure,"(Live, video, and, latest, updates, on, the, BP, oil, spill, capping, procedure)",True,False,True
318,"Enrique Iglesias, Vergara among People En Espanols 'Most Beautiful' 2011","(Enrique, Iglesias, ,, Vergara, among, People, En, Espanols, ', Most, Beautiful, ', 2011)",True,True,True
344,Latest Product Recalls of Children's Medicine,"(Latest, Product, Recalls, of, Children, 's, Medicine)",True,False,True
348,"'The Good, The Best & The Most Disappointing Movie Soundtracks of 2013'","(', The, Good, ,, The, Best, &, The, Most, Disappointing, Movie, Soundtracks, of, 2013, ')",True,True,True
449,Albuquerque NM is one of the leanest cities in the USA! Amazing.,"(Albuquerque, NM, is, one, of, the, leanest, cities, in, the, USA, !, Amazing, .)",True,True,True
473,'The Biggest Loser' 2012 video: face-off week brings the competition,"(', The, Biggest, Loser, ', 2012, video, :, face, -, off, week, brings, the, competition)",True,False,True
563,"Robert Pattinson, Taylor Lautner, Kellan Lutz top People's Sexiest Men list","(Robert, Pattinson, ,, Taylor, Lautner, ,, Kellan, Lutz, top, People, 's, Sexiest, Men, list)",True,True,True
702,World's oldest albatross revealed,"(World, 's, oldest, albatross, revealed)",True,False,True
