spaCy Basics


In [1]:
import spacy

In [2]:
# Name of the pretrained pipeline to load. "en_core_web_sm" is the small
# English pipeline trained on written web text (blogs, news, comments); it
# ships with vocabulary, syntax and named-entity components.
MODEL_NAME = "en_core_web_sm"

# Shared pipeline object used by every cell below to turn raw text into a Doc.
nlp = spacy.load(MODEL_NAME)

In [3]:
# Run the loaded pipeline on a sample sentence. The returned Doc is a sequence
# of tokens (words, symbols, punctuation) carrying the pipeline's annotations.
doc = nlp("Tesla is looking at buying U.S. startup for $6 million.")

In [4]:
# List each token next to its coarse-grained part-of-speech tag.
for token in doc:
  print(f"{token.text} {token.pos_}")

Tesla NOUN
is AUX
looking VERB
at ADP
buying VERB
U.S. PROPN
startup NOUN
for ADP
$ SYM
6 NUM
million NUM
. PUNCT


In [5]:
# Inspect the processing pipeline: a list of (name, component) pairs in the
# order they are applied to a text.
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7fba7f060b40>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7fba7f060830>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7fba7ed5c4d0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x7fba7ec99be0>),
 ('lemmatizer',
  <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7fba7ec8b0f0>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7fba7ed48ed0>)]

In [6]:
# Same pipeline as above, but only the component names (in processing order).
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [7]:
# A Doc can be indexed like a list; pos_ is the coarse-grained part-of-speech
# tag of the token, as a string.
doc[0].pos_

'NOUN'

In [8]:
# text is the verbatim text of the token as it appears in the input.
print(doc[0].text)

Tesla


In [10]:
# lemma_ is the base (dictionary) form of the token, as assigned by the
# pipeline's lemmatizer.
print(doc[0].lemma_)

tesla


In [12]:
# shape_ is the orthographic shape of the token: uppercase letters become "X",
# lowercase letters "x" (digits are mapped to "d" per the spaCy docs).
print(doc[0].shape_)

Xxxxx


In [13]:
# Sentence segmentation: Doc.sents yields one span per detected sentence.
doc2 = nlp("Hi. My name is Selen. Nice to meet you.")
for sent in doc2.sents:
  print(sent)

Hi.
My name is Selen.
Nice to meet you.


Tokenization

Tokenization is the process of breaking raw text into smaller units called tokens, such as words, contractions, and punctuation marks.

In [14]:
# Sample text wrapped in literal double quotes, containing a contraction and
# an abbreviation, to show how the tokenizer handles each of them.
mystring = "\" We're moving to L.A.! \""
print(mystring)

" We're moving to L.A.! "


In [15]:
# NOTE: this rebinds `doc`, replacing the Doc built from the Tesla sentence
# earlier in the notebook; the next cell iterates over this new Doc.
doc = nlp(mystring)
# Print one token per line to show where the tokenizer split the text.
for token in doc:
  print(token.text)

"
We
're
moving
to
L.A.
!
"


In [16]:
# Show all tokens on a single line, each followed by a " | " separator
# (no trailing newline is emitted).
print("".join(f"{tok.text} | " for tok in doc), end="")

" | We | 're | moving | to | L.A. | ! | " | 

In [23]:
# Named-entity recognition: Doc.ents holds the entity spans found in the text.
doc2 = nlp("Apple to build Hong Kong factory for $6 million")
for ent in doc2.ents:
  # One print per entity: the span text, its label, a human-readable
  # description of that label, then blank lines separating the entries.
  print(ent, ent.label_, spacy.explain(ent.label_), '\n', sep='\n')

Hong Kong
GPE
Countries, cities, states


$6 million
MONEY
Monetary values, including unit


