# SPACY TEST LAB

### 1. Install Spacy

In [2]:
#!pip install spacy

In [5]:
#!python -m spacy download en_core_web_sm

### 2. Import spaCy and load the English model

In [2]:
import spacy
# Load the "en_core_web_sm" pipeline once; `nlp` is then called on raw text
# in the cells that follow to produce parsed documents.
nlp = spacy.load("en_core_web_sm")

### 3. Tokens, entities, chunks

In [40]:
# Sample sentence mixing an organisation, a place, a money amount, a bare
# number and a date, used as input for the token/entity/chunk cells.
mystring = (
    'Apple to build a Hong Kong factory for $6 million '
    'related to paid invoice 324546786 from 12/12/2019.'
)

In [41]:
# Run the loaded pipeline over the sample sentence; the following cells read
# tokens, entities and noun chunks from `doc`.
doc = nlp(mystring)

In [42]:
# Show the tokenization on a single line: each token's text followed by ' | '.
print(''.join(f'{tok.text} | ' for tok in doc), end='')

Apple | to | build | a | Hong | Kong | factory | for | $ | 6 | million | related | to | paid | invoice | 324546786 | from | 12/12/2019 | . | 

In [44]:
# For every entity found in `doc`, print the span with its label, then the
# human-readable description of that label on the next line.
for ent in doc.ents:
    label = ent.label_
    print(ent, label)
    print(spacy.explain(label))  # print() stringifies, so a None result still shows as 'None'

Apple ORG
Companies, agencies, institutions, etc.
Hong Kong GPE
Countries, cities, states
$6 million MONEY
Monetary values, including unit
324546786 CARDINAL
Numerals that do not fall under another type
12/12/2019 DATE
Absolute or relative dates or periods


In [45]:
# List the noun chunks found in `doc`, one per line.
for noun_chunk in doc.noun_chunks:
    print(noun_chunk)

Apple
a Hong Kong factory
paid invoice


### 4. Displacy

In [4]:
from spacy import displacy

In [50]:
# 'dep' style draws the syntactic dependency parse of `doc` inline in the notebook;
# 'distance' is passed through to displaCy as a rendering option.
dep_options = {'distance': 110}
displacy.render(doc, style='dep', jupyter=True, options=dep_options)

In [58]:
# 'ent' style renders the entities of `doc` inline in the notebook.
displacy.render(doc, style='ent', jupyter=True)

### 5. Lemmatization and stop words

In [64]:
# Sentence containing several inflections of "run" to exercise the lemmatizer.
doc1 = nlp('I am a runner running in a race because I love to run since I ran today.')
def show_lemmas(text):
    """Print one aligned row per token of ``text``.

    Each row holds four space-separated columns: the token's ``text``
    (padded to 12), its ``pos_`` (padded to 6), its ``lemma`` value
    (left-aligned, padded to 22) and its ``lemma_`` string. Values longer
    than their column are printed in full, not truncated.

    Args:
        text: an iterable of token-like objects exposing ``text``, ``pos_``,
            ``lemma`` and ``lemma_`` attributes (e.g. a parsed document).

    Returns:
        None. Rows are written to standard output.
    """
    row_template = '{:12} {:6} {:<22} {}'
    for tok in text:
        print(row_template.format(tok.text, tok.pos_, tok.lemma, tok.lemma_))
# Print the lemma table for the "run" example sentence.
show_lemmas(doc1)


I            PRON   561228191312463089     -PRON-
am           AUX    10382539506755952630   be
a            DET    11901859001352538922   a
runner       NOUN   12640964157389618806   runner
running      VERB   12767647472892411841   run
in           ADP    3002984154512732771    in
a            DET    11901859001352538922   a
race         NOUN   8048469955494714898    race
because      SCONJ  16950148841647037698   because
I            PRON   561228191312463089     -PRON-
love         VERB   3702023516439754181    love
to           PART   3791531372978436496    to
run          VERB   12767647472892411841   run
since        SCONJ  10066841407251338481   since
I            PRON   561228191312463089     -PRON-
ran          VERB   12767647472892411841   run
today        NOUN   11042482332948150395   today
.            PUNCT  12646065887601541794   .


In [69]:
# Show the pipeline's default stop words.
# NOTE(review): this prints the entire collection at once; the braces in the
# recorded output suggest it is a set, so the order is presumably arbitrary.
print(nlp.Defaults.stop_words)

{'done', 'one', 'due', 'below', 'another', 'same', 'many', 'besides', '’ve', 'hereby', 'somewhere', 'hers', 'all', 'while', 'mostly', 'very', 'amongst', 'not', 'out', 'wherever', 'least', 'nobody', 'latter', 'either', 'until', 'using', 'ever', 'off', 'forty', 'almost', 'around', 'where', 'into', 'anyway', 'yourself', 'n’t', 'afterwards', "'s", 'are', 'therein', "'ll", 'being', 'of', 'third', 'itself', 'in', 'fifty', 'there', 'your', 'and', 'might', 'here', 'hundred', 'himself', 'ourselves', 'never', 'above', 'mine', 'more', 'sometimes', 'throughout', 'whole', 'now', 'am', 'thereupon', 'moreover', 'sometime', 'made', "n't", 'upon', 'sixty', 'why', 'nine', "'ve", 'thru', 'each', 'up', 'various', '‘d', 'give', 'this', 'make', 'who', 'somehow', 'quite', 'yourselves', 'else', 'enough', 'empty', 'us', 'keep', 'our', 'towards', 'yours', 'among', 'a', 'become', 'under', 'thus', 'some', 'ours', 'him', 'myself', 'really', 'anywhere', '‘m', 'whenever', 'those', 'six', 'his', 'so', 'go', 'however'

### 6. Phrase matching and vocabulary 

In [5]:
# Read the whole file first so the handle is already closed while the pipeline runs.
# NOTE(review): open() relies on the platform default encoding here - confirm the
# file's encoding and consider passing encoding= explicitly.
with open('TextFiles/reaganomics.txt') as f:
    raw_text = f.read()

docr = nlp(raw_text)

# 'ent' style renders the entities of the parsed file inline in the notebook.
displacy.render(docr, style='ent', jupyter=True)