In [1]:
import sys
# Show the interpreter's module search path (environment sanity check).
print(sys.path)

In [2]:
import spacy
from spacy import displacy
# One-time setup: uncomment the next line to download the large English model.
#!python -m spacy download en_core_web_lg

# Load the large English pipeline once; the cells below reuse it as `nlp_en`.
nlp_en = spacy.load('en_core_web_lg')

# 1. Форматування

In [3]:
# Parse two sample headlines, then draw the dependency tree of each one inline.
doc = nlp_en('Back to school, gluten-free style!')
doc2 = nlp_en("'Breaking Bad' prepares for victory lap with tonight's final season premiere")

# `style` is passed only for the first headline, exactly as before; the second
# call relies on displacy's default style.
displacy.render(doc, style='dep', jupyter=True)
displacy.render(doc2, jupyter=True)

In [4]:
# List the coarse part-of-speech tag spaCy assigned to every token of `doc`.
for token in doc:
    tagged = f"'{token}': {token.pos_}"
    print(tagged)

'Back': ADV
'to': ADP
'school': NOUN
',': PUNCT
'gluten': NOUN
'-': PUNCT
'free': ADJ
'style': NOUN
'!': PUNCT


На жаль, за допомогою spaCy не вдається отримати правильну розмітку слова «as», як це зазначено у завданні:
```
Зверніть увагу, що ваша програма повинна правильно розрізняти прийменники та підрядні сполучники. Наприклад, Do as you want => Do As You Want (бо "as" тут є сполучником), but How to use a Macbook as a table => How to Use a Macbook as a Table (бо "as" тут є прийменником).
```

In [5]:
# Compare how "as" is tagged when it is a conjunction vs. a preposition.
as_examples = ('Do as you want', 'How to use a Macbook as a table')
for position, sentence in enumerate(as_examples):
    if position > 0:
        # Visual separator between the two token listings.
        print('--------------')
    for token in nlp_en(sentence):
        print(f"'{token}': {token.pos_}")

'Do': VERB
'as': ADP
'you': PRON
'want': VERB
--------------
'How': ADV
'to': PART
'use': VERB
'a': DET
'Macbook': PROPN
'as': ADP
'a': DET
'table': NOUN


In [6]:
from enum import Enum

class CaseModifier(Enum):
    """Case operation that the headline styler applies to a single token."""

    # CAPITALIZE (0): upper-case the first letter of the token.
    # LOWERCASE  (1): lower-case the whole token.
    # ASIS       (2): leave the token untouched.
    CAPITALIZE, LOWERCASE, ASIS = range(3)

def APS_styler(head: str)->str:
    """
    Return ``head`` re-capitalized as a headline.

    The headline is parsed with the module-level spaCy pipeline ``nlp_en`` and
    each token is handled as follows:

    * punctuation and whitespace tokens are left untouched;
    * tokens that are neither title-case nor all lower-case (e.g. "NASA",
      "iPhone") are treated as abbreviations/brand names and left untouched;
    * the first and the last word of the headline are capitalized;
    * nouns, proper nouns, adjectives, verbs, adverbs, pronouns and
      subordinating conjunctions are capitalized;
    * every other token is lower-cased.

    :param head: the raw headline text.
    :return: the headline with the same characters and spacing but corrected
        letter case.
    """
    hdoc = nlp_en(head)

    # Coarse POS tags that are always capitalized. SCONJ is included because
    # the task requires subordinating conjunctions to be capitalized
    # ("Do As You Want"); whether "as" actually receives that tag depends on
    # the tagger.
    pos_upper = {"NOUN", "PROPN", "ADJ", "VERB", "ADV", "PRON", "SCONJ"}
    # Tokens that are not words: they never change case and must not count as
    # the first/last word of the headline.
    pos_not_word = {"PUNCT", "SPACE"}

    word_positions = [i for i, t in enumerate(hdoc) if t.pos_ not in pos_not_word]
    # An empty set when the headline contains no words at all.
    edge_positions = {word_positions[0], word_positions[-1]} if word_positions else set()

    pieces = []
    for i, t in enumerate(hdoc):
        t_str: str = t.text
        if t.pos_ in pos_not_word or not (t_str.istitle() or t_str.islower()):
            # Non-words and abbreviations keep their original spelling.
            operation = CaseModifier.ASIS
        elif i in edge_positions or t.pos_ in pos_upper:
            operation = CaseModifier.CAPITALIZE
        else:
            operation = CaseModifier.LOWERCASE

        if operation == CaseModifier.CAPITALIZE:
            t_str = t_str.capitalize()
        elif operation == CaseModifier.LOWERCASE:
            t_str = t_str.lower()

        # Re-attach the token's trailing whitespace so the original spacing is
        # reproduced exactly. Joining tokens (instead of patching `head` by
        # character offsets) stays correct even if a case conversion changes
        # the length of a token.
        pieces.append(t_str + t.whitespace_)

    return "".join(pieces)
    
    

In [7]:
# Smoke-test the styler on a few hand-picked headlines.
styler_samples = [
    'Back to school, gluten-free style!',
    'my Self-reflection',
    "--..- . .. -!",
    "Do as you want",
    "How to use a Macbook as a table",
]
for sample in styler_samples:
    print(APS_styler(sample))

# Most hyphenated tokens are processed properly, but "as" still cannot be
# handled correctly, even with the large English spaCy model.

Back to School, Gluten-Free Style!
My Self-Reflection
--..- . .. -!
Do as You Want
How to Use a Macbook as a Table


In [8]:
from tqdm import tqdm_notebook as tqdm

# Re-style every headline of the input file, write the result line by line to
# the output file and count how many headlines actually changed.
with open('examiner-headlines.txt', 'r') as inp, \
        open('examiner-headlines-corrected.txt', 'w') as outp:
    total = 0
    corrected_num = 0
    # total=5000 is only a hint for the progress bar.
    for head in tqdm(inp, total=5000):
        head = head.strip()
        corrected = APS_styler(head)
        total += 1
        if corrected != head:
            corrected_num += 1
        print(corrected, file=outp)

# Report after both files are closed; an empty input file must not raise
# ZeroDivisionError.
corrected_pct = corrected_num / total * 100 if total else 0.0
print(f"Total headlines: {total}, corrected headlines: {corrected_num} ({corrected_pct}%)")
            

HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))


Total headlines: 5000, corrected headlines: 4366 (87.32%)


# 2. Вірусні новини

In [9]:
import nltk
from nltk.corpus import sentiwordnet as swn
# Fetch every NLTK resource used in this notebook. 'punkt' is needed by
# nltk.word_tokenize, which is_headline_viral calls; without it a fresh
# environment fails with LookupError.
# NOTE(review): recent NLTK releases may ask for 'punkt_tab' instead - verify
# against the installed version.
for nltk_resource in ('sentiwordnet', 'wordnet', 'averaged_perceptron_tagger', 'punkt'):
    nltk.download(nltk_resource)

# Quick look at the sentiment synsets available for a sample word.
list(swn.senti_synsets('slow'))


[nltk_data] Downloading package sentiwordnet to
[nltk_data]     C:\Users\ssotn\AppData\Roaming\nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ssotn\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ssotn\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


[SentiSynset('decelerate.v.01'),
 SentiSynset('slow.v.02'),
 SentiSynset('slow.v.03'),
 SentiSynset('slow.a.01'),
 SentiSynset('slow.a.02'),
 SentiSynset('dense.s.04'),
 SentiSynset('slow.a.04'),
 SentiSynset('boring.s.01'),
 SentiSynset('dull.s.08'),
 SentiSynset('slowly.r.01'),
 SentiSynset('behind.r.03')]

In [10]:
# Take the first sentiment synset of "slow" and list its attributes to see
# which scoring methods it exposes.
slow_senses = swn.senti_synsets('slow')
s = next(slow_senses)
print(dir(s))

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__unicode__', '__weakref__', '_neg_score', '_obj_score', '_pos_score', 'neg_score', 'obj_score', 'pos_score', 'synset', 'unicode_repr']


In [11]:
from collections import namedtuple
import nltk
from nltk.corpus import sentiwordnet as swn
from nltk.stem import WordNetLemmatizer

# Result record of is_headline_viral: one boolean flag per detected signal.
HeadlineProps = namedtuple(
    'HeadlineProps',
    'has_ents is_emotional has_superlativeness',
)
# Shared WordNet lemmatizer instance, used by is_headline_viral.
lemmatizer = WordNetLemmatizer()

def is_headline_viral(head: str, max_senses: int = 5,
                      emotion_threshold: float = 0.5) -> "HeadlineProps":
    """
    Detect three signals in a headline and return them as ``HeadlineProps``.

    * ``has_ents`` - spaCy (``nlp_en``) found at least one PERSON or ORG entity.
    * ``has_superlativeness`` - the NLTK tagger assigned at least one
      comparative/superlative adjective or adverb tag (JJR, JJS, RBR, RBS).
    * ``is_emotional`` - for at least one noun, adjective, adverb or verb the
      average positive or average negative SentiWordNet score over its first
      ``max_senses`` senses reaches ``emotion_threshold``.

    :param head: the headline text.
    :param max_senses: how many leading SentiWordNet senses of a word are
        averaged.
    :param emotion_threshold: minimal average positive or negative score for a
        word to count as emotional.
    :return: a ``HeadlineProps`` tuple with the three boolean flags.
    """
    proper_ent_types = {'PERSON', 'ORG'}
    has_ents = any(ent.label_ in proper_ent_types for ent in nlp_en(head).ents)

    pos_tags = nltk.pos_tag(nltk.word_tokenize(head))

    superl_tags = {'JJR', 'JJS', 'RBS', 'RBR'}
    has_superlativeness = any(tag in superl_tags for _, tag in pos_tags)

    # Penn Treebank tag -> WordNet part-of-speech letter, for the word classes
    # that are looked up in SentiWordNet.
    emo_tags = {'NN':'n', 'NNS':'n', 'NNP':'n', 'NNPS':'n',
                'JJ':'a', 'JJR':'a', 'JJS':'a',
                'RB':'r', 'RBR':'r', 'RBS':'r',
                'VB':'v', 'VBD':'v', 'VBG':'v', 'VBN':'v', 'VBP':'v', 'VBZ':'v' }

    is_emotional = False
    for word, tag in pos_tags:
        wordnet_pos = emo_tags.get(tag)
        if wordnet_pos is None:
            continue
        lemma = lemmatizer.lemmatize(word)
        senses = list(swn.senti_synsets(lemma, wordnet_pos))[:max_senses]
        if not senses:
            # Word is unknown to SentiWordNet for this part of speech.
            continue
        avg_neg = sum(sense.neg_score() for sense in senses) / len(senses)
        avg_pos = sum(sense.pos_score() for sense in senses) / len(senses)
        if max(avg_neg, avg_pos) >= emotion_threshold:
            # One sufficiently charged word is enough.
            is_emotional = True
            break

    return HeadlineProps(has_ents=has_ents, is_emotional=is_emotional, has_superlativeness=has_superlativeness)

In [12]:
# Try the detector on a handful of sample headlines.
viral_samples = (
    'Kate Gosselin prepares for Hurricane Sandy',
    "New 'Guardians of the Galaxy' featurette posted by James Gunn on Facebook",
    "The Land Before Income Taxes",
    'The Good, The Best & The Most Disappointing Movie Soundtracks of 2013',
)
for sample in viral_samples:
    print(is_headline_viral(sample))

HeadlineProps(has_ents=True, is_emotional=False, has_superlativeness=False)
HeadlineProps(has_ents=True, is_emotional=False, has_superlativeness=False)
HeadlineProps(has_ents=False, is_emotional=False, has_superlativeness=False)
HeadlineProps(has_ents=False, is_emotional=True, has_superlativeness=True)


In [13]:
from tqdm import tqdm_notebook as tqdm

# Count, over the whole headline file, how often each signal is detected.
total = has_ents_num = is_emotional_num = has_superlativeness_num = 0
with open('examiner-headlines.txt', 'r') as inp:
    # total=5000 is only a hint for the progress bar.
    for head in tqdm(inp, total=5000):
        head = head.strip()
        props = is_headline_viral(head)
        total += 1
        # Truthy flags add one to their counter, falsy flags add nothing.
        has_ents_num += 1 if props.has_ents else 0
        is_emotional_num += 1 if props.is_emotional else 0
        has_superlativeness_num += 1 if props.has_superlativeness else 0
    summary = (f"Total: {total}, has entities: {has_ents_num}, is emotional: {is_emotional_num}, "
               f"has superlativeness: {has_superlativeness_num}")
    print(summary)

HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))


Total: 5000, has entities: 2743, is emotional: 590, has superlativeness: 219
