spaCy Basics


In [None]:
import spacy

In [None]:
# en_core_web_sm: a small English pipeline trained on written web text
# (blogs, news, comments); it includes vocabulary, syntax and entities.
MODEL_NAME = "en_core_web_sm"
nlp = spacy.load(MODEL_NAME)

In [None]:
# Run the pipeline on the sentence; the result can be iterated token by token.
doc = nlp("Tesla is looking at buying U.S. startup for $6 million.")

In [None]:
# Show each token next to its part-of-speech tag.
for tok in doc:
    print(f"{tok.text} {tok.pos_}")

Tesla NOUN
is AUX
looking VERB
at ADP
buying VERB
U.S. PROPN
startup NOUN
for ADP
$ SYM
6 NUM
million NUM
. PUNCT


In [None]:
# Components of the loaded pipeline, shown as (name, component) pairs.
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7fba7f060b40>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7fba7f060830>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7fba7ed5c4d0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x7fba7ec99be0>),
 ('lemmatizer',
  <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7fba7ec8b0f0>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7fba7ed48ed0>)]

In [None]:
# Just the names of the pipeline components.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [None]:
# Part-of-speech tag of the first token in doc.
doc[0].pos_

'NOUN'

In [None]:
# Text of the first token, exactly as it appears in the sentence.
print(doc[0].text)

Tesla


In [None]:
# Lemma (base form) of the first token.
print(doc[0].lemma_)

tesla


In [None]:
# Shape of the first token: its character pattern (X = uppercase, x = lowercase).
print(doc[0].shape_)

Xxxxx


In [None]:
# doc2.sents yields the sentences detected in the text, one span each.
doc2 = nlp("Hi. My name is Selen. Nice to meet you.")
for sent in doc2.sents:
    print(sent)

Hi.
My name is Selen.
Nice to meet you.


Tokenization

Tokenization is the process of breaking raw text into smaller units called tokens (words, punctuation marks, and so on).

In [None]:
# Sample text with surrounding quotes, a contraction, an abbreviation and
# punctuation, so the tokenizer has several edge cases to handle.
mystring = "\" We're moving to L.A.! \""
print(mystring)

" We're moving to L.A.! "


In [None]:
# Tokenize the sample string and print one token per line.
doc = nlp(mystring)
for tok in doc:
    print(tok.text)

"
We
're
moving
to
L.A.
!
"


In [None]:
# Same tokens on a single line, each followed by ' | '.
for piece in doc:
    print(piece.text, end=' | ')

" | We | 're | moving | to | L.A. | ! | " | 

In [None]:
# List each named entity with its label and a human-readable explanation
# of that label.
doc2 = nlp("Apple to build Hong Kong factory for $6 million")
for ent in doc2.ents:
    label = ent.label_
    print(ent)
    print(label)
    print(spacy.explain(label))
    print('\n')  # blank lines separate one entity from the next

Hong Kong
GPE
Countries, cities, states


$6 million
MONEY
Monetary values, including unit




In [24]:
# displaCy is spaCy's built-in visualizer
from spacy import displacy 

In [25]:
# Sentence to visualize with displaCy in the cells below.
doc = nlp("Apple is going to build a U.K. factory for $6 million")

In [27]:
# Dependency-parse view of the sentence; 'distance' controls the spacing
# between words. More styles and options: https://spacy.io/usage/visualizers
displacy.render(
    doc,
    style="dep",
    jupyter=True,
    options={"distance": 110},
)

In [28]:
# Named-entity view of the same sentence. The 'distance' option applies only
# to the dependency visualizer, so it is not passed here.
displacy.render(doc, style="ent", jupyter=True)

Stemming

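Stemming is the process of reducing a word to its stem (root form) by stripping part of it, typically a suffix. The result is not always a dictionary word: for example, 'easily' becomes 'easili'.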

In [29]:
import nltk
from nltk.stem.porter import PorterStemmer 

In [30]:
# Porter stemmer instance used to stem the sample words below.
p_stemmer = PorterStemmer()

In [39]:
# Sample words: several forms of "run" and "fair", plus an adverb.
words = [
    "runner",
    "ran",
    "runs",
    "easily",
    "run",
    "fairly",
    "fairness",
]

In [40]:
# Show each word next to its Porter stem.
for term in words:
    print(f"{term} ------> {p_stemmer.stem(term)}")

runner ------> runner
ran ------> ran
runs ------> run
easily ------> easili
run ------> run
fairly ------> fairli
fairness ------> fair


In [41]:
from nltk.stem.snowball import SnowballStemmer

In [42]:
# Snowball stemmer configured for English, for comparison with the Porter stemmer.
s_stemmer = SnowballStemmer(language='english')

In [43]:
# Show each word next to its Snowball stem. Unlike the Porter output,
# 'fairly' is reduced to 'fair' here.
for term in words:
    print(f"{term} ------> {s_stemmer.stem(term)}")

runner ------> runner
ran ------> ran
runs ------> run
easily ------> easili
run ------> run
fairly ------> fair
fairness ------> fair
