# Natural Language Processing

## spaCy Basics

In [2]:
# Import

import spacy # import the Spacy library

In [3]:
# Load the English pipeline package 'en_core_web_sm'; the resulting `nlp`
# object is called on raw text in the cells below to build Doc objects.

nlp = spacy.load('en_core_web_sm')

In [6]:
# Create a document object by running the loaded pipeline on a sentence

doc = nlp(u'Tesla is looking at buying U.S. startup for $6 million')

# NOTES:
# u   - prefix marking a unicode string literal; in Python 3 every str is
#       already unicode, so the prefix is optional and changes nothing

In [9]:
# Show each token's text, part-of-speech tag and dependency label, one per line.
for token in doc:
    print(token.text, token.pos_, token.dep_)

# NOTES:
#
# token.pos   - part of speech as a number; each number corresponds to a
#               part of speech (noun, adverb, verb, conjunction, etc.)
# token.pos_  - the part of speech as a readable name (e.g. PROPN, VERB)
# token.dep_  - syntactic dependency label (e.g. nsubj, ROOT)

Tesla PROPN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.S. PROPN compound
startup NOUN dobj
for ADP prep
$ SYM quantmod
6 NUM compound
million NUM pobj


In [10]:
# Pipeline Object
# `nlp.pipeline` lists the (name, component) pairs of the loaded pipeline.

nlp.pipeline

# NOTES:
# ner   - named entity recognizer

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7ff301f85960>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7ff301f85de0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7ff398be3300>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x7ff301f4c9c0>),
 ('lemmatizer',
  <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7ff301f7e680>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7ff398b7ed50>)]

In [13]:
# Tokenization
# The process of splitting a text into smaller, manageable units.
#
# Tokens can represent words, subwords, characters, or sentences,
# depending on the level of granularity the specific NLP task requires.

doc2 = nlp(u"Tesla isn't looking into startups anymore.")

for token in doc2:
    print(token.text, token.pos_, token.dep_)

# Notice how `isn't` has been split into two tokens: spaCy separates the
# auxiliary `is` from the negation `n't` attached to it. The period at the end
# of the sentence is assigned its own token as well. (This input has no runs of
# extra whitespace, so no whitespace token shows up in the output.)

Tesla PROPN nsubj
is AUX aux
n't PART neg
looking VERB ROOT
into ADP prep
startups NOUN pobj
anymore ADV advmod
. PUNCT punct


In [14]:
# Indexing: a Doc can be indexed by token position; doc2[0] is the first token

doc2[0].pos_

'PROPN'

In [15]:
# Indexing: doc2[1] is the second token (`is`); read its dependency label

doc2[1].dep_

'aux'

In [16]:
# Spans
# Build a longer document to slice from. The sample text is written as adjacent
# string literals, which Python joins into one string.
# NOTE: "commmonly" is misspelled in the sample text; it is kept unchanged here.

doc3 = nlp(
    u'Although commmonly attributed to John Lennon from his song "Beautiful Boy", '
    u'the phrase "Life is what happens to us while we are making other plans" was written by '
    u'cartoonist Allen Saunders and published in Reader\'s Digest in 1957, when Lennon was 17.'
)

In [17]:
# Indexing (Spans)
# Slicing a Doc by token position returns a Span; tokens 16-29 here cover the
# quoted phrase, including both quotation marks.

life_quote = doc3[16:30]
print(life_quote)

"Life is what happens to us while we are making other plans"


In [18]:
# Type: a slice of a Doc is a Span

type(life_quote)

spacy.tokens.span.Span

In [19]:
# Type: the full document returned by `nlp(...)` is a Doc

type(doc3)

spacy.tokens.doc.Doc

In [20]:
# Sentences
# `doc4.sents` yields the document's sentences one at a time.

doc4 = nlp(u'This is the first sentence. This is another sentence. This is the last sentence.')

for sentence in doc4.sents:
    print(sentence)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [26]:
# Indexing: token 6 is the first word of the second sentence

doc4[6]

This

In [23]:
# Indexing: doc4[7] is the token right after doc4[6] (`This`), so it is not
# the first token of a sentence

doc4[7].is_sent_start # Is the start of a sentence?

False