# Entities

In [19]:
import spacy
import random

from spacy import displacy
from itertools import chain
from tqdm import tqdm_notebook
from collections import Counter
from hyperdash import Experiment

## spaCy 101

In [2]:
# Short news snippet used to demo the spaCy pipeline below. It is assembled from
# adjacent string literals, so the value is one line with no embedded newlines.
SAMPLE_TEXT = (
    'But Google is starting from behind. The company made a late push '
    'into hardware, and Apple’s Siri, available on iPhones, and Amazon’s Alexa '
    'software, which runs on its Echo and Dot devices, have clear leads in '
    'consumer adoption.'
)

In [3]:
# Load the English pipeline once and parse the sample text; later cells reuse
# both `nlp` and `doc`.
# NOTE(review): 'en' is a shortcut name that presumably resolves to whichever
# English model is linked locally; newer spaCy releases expect a full package
# name such as 'en_core_web_sm' — confirm against the installed version.
nlp = spacy.load('en')
doc = nlp(SAMPLE_TEXT)

In [4]:
# One tuple per token: surface text, lemma, normalized form, coarse POS tag, sentiment.
print([
    (tok.text, tok.lemma_, tok.norm_, tok.pos_, tok.sentiment)
    for tok in doc
])

[('But', 'but', 'but', 'CCONJ', 0.0), ('Google', 'google', 'google', 'PROPN', 0.0), ('is', 'be', 'is', 'VERB', 0.0), ('starting', 'start', 'starting', 'VERB', 0.0), ('from', 'from', 'from', 'ADP', 0.0), ('behind', 'behind', 'behind', 'ADV', 0.0), ('.', '.', '.', 'PUNCT', 0.0), ('The', 'the', 'the', 'DET', 0.0), ('company', 'company', 'company', 'NOUN', 0.0), ('made', 'make', 'made', 'VERB', 0.0), ('a', 'a', 'gonna', 'DET', 0.0), ('late', 'late', 'late', 'ADJ', 0.0), ('push', 'push', 'push', 'NOUN', 0.0), ('into', 'into', 'into', 'ADP', 0.0), ('hardware', 'hardware', 'hardware', 'NOUN', 0.0), (',', ',', ',', 'PUNCT', 0.0), ('and', 'and', 'and', 'CCONJ', 0.0), ('Apple', 'apple', 'apple', 'PROPN', 0.0), ('’s', '’s', "'s", 'PART', 0.0), ('Siri', 'siri', 'siri', 'PROPN', 0.0), (',', ',', ',', 'PUNCT', 0.0), ('available', 'available', 'available', 'ADJ', 0.0), ('on', 'on', 'on', 'ADP', 0.0), ('iPhones', 'iphones', 'iphones', 'PROPN', 0.0), (',', ',', ',', 'PUNCT', 0.0), ('and', 'and', 'and',

In [5]:
# Text of every noun chunk found in the parsed sample.
print([span.text for span in doc.noun_chunks])

['Google', 'The company', 'a late push', 'hardware', 'Apple', 'Siri', 'iPhones', 'Amazon’s Alexa software', 'its Echo', 'Dot devices', 'clear leads', 'consumer adoption']


In [6]:
# Attach a title to the parsed doc, then render its named entities inline in the notebook.
# NOTE(review): displaCy is expected to use `user_data['title']` as the heading of
# the visualisation — confirm against the spaCy docs.
doc.user_data['title'] = 'Apple vs Google'
displacy.render(doc, style='ent', jupyter=True)

## Processing descriptions

In [7]:
# Coarse part-of-speech tags whose lemmas are kept as entity candidates.
INFORMATIVE_POSES = {
    'ADJ',    # adjectives
    'NOUN',   # common nouns
    'PROPN',  # proper nouns
}

# Progress is reported to the experiment tracker once every LOG_STEP descriptions.
LOG_STEP = 5000

In [8]:
# Restore `descs` from IPython's %store database (presumably saved by an upstream
# notebook with `%store descs`) and show how many descriptions it holds.
%store -r descs
len(descs)

171161

In [9]:
def fetch_entities(desc):
    """Collect the normalized entity candidates found in one description.

    The description is parsed with the module-level ``nlp`` pipeline and three
    kinds of candidates are merged:

    * the text of every named entity (``doc.ents``),
    * the lemma of every token whose POS tag is in ``INFORMATIVE_POSES``,
      skipping lemmas that start with ``'-'`` (presumably spaCy placeholders
      such as ``'-PRON-'``),
    * the text of every noun chunk (``doc.noun_chunks``).

    Each candidate is stripped of surrounding whitespace and lower-cased.
    Candidates that normalize to an empty string (e.g. whitespace-only spans)
    are dropped so that ``''`` never shows up as an entity.

    Args:
        desc: the description text to analyse.

    Returns:
        set: unique, non-empty, normalized entity strings.
    """
    doc = nlp(desc)
    candidates = chain(
        (ent.text for ent in doc.ents),
        (token.lemma_ for token in doc
         if token.pos_ in INFORMATIVE_POSES and not token.lemma_.startswith('-')),
        (chunk.text for chunk in doc.noun_chunks),
    )
    normalized = (candidate.strip().lower() for candidate in candidates)
    # An empty string carries no information and would pollute the counts downstream.
    return {entity for entity in normalized if entity}

In [None]:
def collect_entities(descs):
    """Extract the entity set of every description, logging progress to an experiment.

    A step metric is sent for every ``LOG_STEP``-th description and for the
    last one. The experiment is always closed, even when extraction raises or
    the run is interrupted part-way through.

    Args:
        descs: sized sequence of description texts (``len()`` is used to
            recognise the final item).

    Returns:
        list: one ``fetch_entities`` result per description, in input order.
    """
    # Computed once up front; it does not change while iterating.
    last_index = len(descs) - 1
    exp = Experiment('1.3: Entities', capture_io=False)
    entities = []
    try:
        for i, desc in enumerate(tqdm_notebook(descs)):
            entities.append(fetch_entities(desc))
            if i % LOG_STEP == 0 or i == last_index:
                exp.metric('step', i)
    finally:
        # Without this, a failure or a manual interrupt would leave the experiment open.
        exp.end()
    return entities

In [None]:
# Run the extraction over every description (timed), then check that each
# description produced exactly one entry before showing the count.
%time entities = collect_entities(descs)
assert len(descs) == len(entities)
len(entities)

| step:   0.000000 |
| step: 5000.000000 |
| step: 10000.000000 |
| step: 15000.000000 |
| step: 20000.000000 |
| step: 25000.000000 |
| step: 30000.000000 |
| step: 35000.000000 |
| step: 40000.000000 |
| step: 45000.000000 |
| step: 50000.000000 |
| step: 55000.000000 |
| step: 60000.000000 |
| step: 65000.000000 |
| step: 70000.000000 |
| step: 75000.000000 |
| step: 80000.000000 |
| step: 85000.000000 |
| step: 90000.000000 |
| step: 95000.000000 |
| step: 100000.000000 |
| step: 105000.000000 |
| step: 110000.000000 |
| step: 115000.000000 |
| step: 120000.000000 |
| step: 125000.000000 |
| step: 130000.000000 |
| step: 135000.000000 |
| step: 140000.000000 |
| step: 145000.000000 |


## Analysis

In [None]:
cgen = tqdm_notebook((Counter(entity) for entity in entities), total=len(descs))
%time ecnts = sum(cgen, Counter())
len(ecnts)

In [None]:
# Persist the entity counts with IPython's %store so other kernels can restore them via `%store -r ecnts`.
%store ecnts

## Save

In [None]:
# Persist the per-description entity sets with IPython's %store so other kernels can restore them via `%store -r entities`.
%store entities