# Entities

In [1]:
import spacy

from spacy import displacy
from itertools import chain
from tqdm import tqdm_notebook

## spaCy 101

In [26]:
# Short news-style passage used as the running example in the cells below.
# Adjacent literals are concatenated at compile time, so the value is one
# single line of text with no embedded newlines.
SAMPLE_TEXT = (
    'But Google is starting from behind. The company made a late push '
    'into hardware, and Apple’s Siri, available on iPhones, and Amazon’s Alexa '
    'software, which runs on its Echo and Dot devices, have clear leads in '
    'consumer adoption.'
)

In [27]:
# Load the English pipeline by its full package name. The bare 'en' shortcut
# link only exists in spaCy 2.x and was removed in spaCy 3, where
# spacy.load('en') fails; the package name works on both major versions.
# NOTE(review): assumes 'en' was linked to en_core_web_sm (what
# `python -m spacy download en` used to set up) -- confirm for this environment.
nlp = spacy.load('en_core_web_sm')
# Run the pipeline over the sample passage; the cells below inspect this Doc.
doc = nlp(SAMPLE_TEXT)

In [47]:
# One tuple per token of the sample Doc:
# (surface text, lemma, normalised form, coarse POS tag, sentiment score).
print([
    (tok.text, tok.lemma_, tok.norm_, tok.pos_, tok.sentiment)
    for tok in doc
])

[('But', 'but', 'but', 'CCONJ', 0.0), ('Google', 'google', 'google', 'PROPN', 0.0), ('is', 'be', 'is', 'VERB', 0.0), ('starting', 'start', 'starting', 'VERB', 0.0), ('from', 'from', 'from', 'ADP', 0.0), ('behind', 'behind', 'behind', 'ADV', 0.0), ('.', '.', '.', 'PUNCT', 0.0), ('The', 'the', 'the', 'DET', 0.0), ('company', 'company', 'company', 'NOUN', 0.0), ('made', 'make', 'made', 'VERB', 0.0), ('a', 'a', 'gonna', 'DET', 0.0), ('late', 'late', 'late', 'ADJ', 0.0), ('push', 'push', 'push', 'NOUN', 0.0), ('into', 'into', 'into', 'ADP', 0.0), ('hardware', 'hardware', 'hardware', 'NOUN', 0.0), (',', ',', ',', 'PUNCT', 0.0), ('and', 'and', 'and', 'CCONJ', 0.0), ('Apple', 'apple', 'apple', 'PROPN', 0.0), ('’s', '’s', "'s", 'PART', 0.0), ('Siri', 'siri', 'siri', 'PROPN', 0.0), (',', ',', ',', 'PUNCT', 0.0), ('available', 'available', 'available', 'ADJ', 0.0), ('on', 'on', 'on', 'ADP', 0.0), ('iPhones', 'iphones', 'iphones', 'PROPN', 0.0), (',', ',', ',', 'PUNCT', 0.0), ('and', 'and', 'and',

In [29]:
# Text of every noun chunk found in the sample Doc.
print([
    phrase.text
    for phrase in doc.noun_chunks
])

['Google', 'The company', 'a late push', 'hardware', 'Apple', 'Siri', 'iPhones', 'Amazon’s Alexa software', 'its Echo', 'Dot devices', 'clear leads', 'consumer adoption']


In [30]:
# Attach a title to the Doc before rendering.
# NOTE(review): displaCy is expected to show user_data['title'] as a heading
# above the visualisation -- confirm against the installed spaCy version.
doc.user_data['title'] = 'Apple vs Google'
# Render the Doc with the named-entity ('ent') style, inline in the notebook.
displacy.render(doc, style='ent', jupyter=True)

## Processing descriptions

In [60]:
# Coarse part-of-speech tags whose token lemmas are kept as entity candidates
# by fetch_entities; tokens with any other tag are skipped there.
INFORMATIVE_POSES = {
    'ADJ',
    'NOUN',
    'PROPN',
}

In [76]:
# Restore `descs` from IPython's %store persistence into this kernel.
# NOTE(review): this only works if `descs` was saved earlier with
# `%store descs` (presumably in another notebook) -- confirm where it comes from.
%store -r descs
# Number of restored descriptions.
len(descs)

171161

In [74]:
def fetch_entities(desc):
    """Extract the normalised entity vocabulary of a single description.

    The text is parsed with the module-level ``nlp`` pipeline and three kinds
    of candidates are collected from the resulting Doc:

    * the text of every named entity (``doc.ents``),
    * the lemma of every token whose coarse POS tag is in
      ``INFORMATIVE_POSES``, skipping lemmas that start with ``'-'``
      (presumably spaCy 2.x placeholder lemmas such as ``-PRON-``),
    * the text of every noun chunk (``doc.noun_chunks``).

    Each candidate is stripped of surrounding whitespace and lower-cased.
    Candidates that are empty after normalisation (for example a span made
    only of whitespace or newlines) are dropped, so the result never contains
    ``''`` as a bogus entity.

    Args:
        desc: Raw description text to analyse.

    Returns:
        A ``set`` of non-empty, lower-cased entity strings.
    """
    doc = nlp(desc)
    candidates = chain(
        (ent.text for ent in doc.ents),
        (
            token.lemma_
            for token in doc
            if token.pos_ in INFORMATIVE_POSES
            and not token.lemma_.startswith('-')
        ),
        (chunk.text for chunk in doc.noun_chunks),
    )
    normalised = (candidate.strip().lower() for candidate in candidates)
    # Whitespace-only spans normalise to '' and carry no information: skip them.
    return {entity for entity in normalised if entity}

In [77]:
%time entitites = set(chain.from_iterable(fetch_entities(desc) for desc in tqdm_notebook(descs)))
len(entitites)

KeyboardInterrupt: 

NameError: name 'entitites' is not defined