# NLTK vs. spaCy: Introduction to basic NLP Processing

Learning goals:
 - Understand simple NLP pipeline functionality for English in NLTK
 - See how a more modern and multilingual "industrial strength" framework works (spaCy)

In [1]:
import nltk

# Fetch every NLTK resource the cells below rely on, so that the notebook also
# runs after "Restart Kernel -> Run All" on a machine with an empty nltk_data:
#   punkt_tab                       -> nltk.word_tokenize
#   averaged_perceptron_tagger_eng  -> nltk.pos_tag
#   maxent_ne_chunker_tab + words   -> nltk.chunk.ne_chunk
#   wordnet                         -> nltk.stem.WordNetLemmatizer
# Packages that are already installed are reported as up-to-date and skipped.
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('maxent_ne_chunker_tab')
nltk.download('words')
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/siclemat/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     /Users/siclemat/nltk_data...
[nltk_data]   Package maxent_ne_chunker_tab is already up-to-date!


True

In [2]:
# Two-line sample sentence for the NLTK section. The first line ends in a
# space followed by a newline; both are spelled out explicitly here.
text = (
    "At eight o'clock on Thursday morning \n"
    "Arthur didn't feel very good."
)

## Tokenization

In [3]:
# Split the raw string into word-level tokens. Punctuation and the clitic
# "n't" come out as tokens of their own (see the printed list).
tokens = nltk.word_tokenize(text)
print(tokens)

['At', 'eight', "o'clock", 'on', 'Thursday', 'morning', 'Arthur', 'did', "n't", 'feel', 'very', 'good', '.']


## POS Tagging

In [4]:
# Tag every token with its part of speech; the result is a list of
# (token, tag) pairs.
tagged = nltk.pos_tag(tokens)

# Show the complete list first, then one (token, tag) pair per line.
print(tagged)
for token, tag in tagged:
    print((token, tag))

[('At', 'IN'), ('eight', 'CD'), ("o'clock", 'NN'), ('on', 'IN'), ('Thursday', 'NNP'), ('morning', 'NN'), ('Arthur', 'NNP'), ('did', 'VBD'), ("n't", 'RB'), ('feel', 'VB'), ('very', 'RB'), ('good', 'JJ'), ('.', '.')]
('At', 'IN')
('eight', 'CD')
("o'clock", 'NN')
('on', 'IN')
('Thursday', 'NNP')
('morning', 'NN')
('Arthur', 'NNP')
('did', 'VBD')
("n't", 'RB')
('feel', 'VB')
('very', 'RB')
('good', 'JJ')
('.', '.')


## Named Entity Recognition

In [5]:
# Group the POS-tagged tokens into named-entity chunks. The result prints as a
# bracketed tree in which recognised entities appear as labelled subtrees
# (e.g. PERSON).
entities = nltk.chunk.ne_chunk(tagged)
print(entities)


(S
  At/IN
  eight/CD
  o'clock/NN
  on/IN
  Thursday/NNP
  morning/NN
  (PERSON Arthur/NNP)
  did/VBD
  n't/RB
  feel/VB
  very/RB
  good/JJ
  ./.)


## Stemming
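Print each token together with its stem as tab-delimited output. The same loop is then repeated with the WordNet lemmatizer for comparison.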

In [6]:
# Porter stemmer instance used by the stemming loop in the next cell.
stemmer = nltk.stem.PorterStemmer()

In [7]:
# One output line per token: the surface form, a tab, then its Porter stem.
for word in nltk.word_tokenize('He believes in stemming.'):
    stem = stemmer.stem(word)
    print(f"{word}\t{stem}")

He	he
believes	believ
in	in
stemming	stem
.	.


In [8]:
 lemmatizer = nltk.stem.WordNetLemmatizer()

In [9]:
# One output line per token: the surface form, a tab, then its WordNet lemma.
# NOTE: lemmatize() is called without a part-of-speech argument; NLTK then
# treats every token as a noun, which is why "believes" maps to "belief".
for word in nltk.word_tokenize('He believes in stemming.'):
    lemma = lemmatizer.lemmatize(word)
    print(f"{word}\t{lemma}")

He	He
believes	belief
in	in
stemming	stemming
.	.


# Your turn... Look at how it is done in spaCy
https://spacy.io contains descriptions of the small efficient model for English https://spacy.io/models/en#en_core_web_sm and the larger model https://spacy.io/models/en#en_core_web_trf. 

Do you notice a performance difference?

In [None]:
# Only run this once (and maybe restart the kernel)
! pip install spacy
! python -m spacy download en_core_web_sm
# You can also test a more accurate transformer-based spaCy pipeline (450MB of data)
! python -m spacy download en_core_web_trf

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 46.8 MB/s 0:00:00
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
Collecting en-core-web-trf==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.8.0/en_core_web_trf-3.8.0-py3-none-any.whl (457.4 MB)


In [None]:
import spacy
# Name of the installed spaCy pipeline to load. Swap the two assignments to
# try the transformer-based pipeline instead of the small one.
modelname = "en_core_web_sm"
# modelname = "en_core_web_trf"
# Load the English pipeline: tokenizer, tagger, parser and NER.
# NOTE(review): the small model presumably ships without static word vectors —
# confirm against the spaCy model page before relying on vector similarity.
nlp = spacy.load(modelname)


In [4]:
# Process whole documents
# The adjacent string literals are concatenated into a single passage.
# NOTE: this rebinds `text`, replacing the sample sentence of the NLTK section.
text = ("When Sebastian Thrun started working on self-driving cars at "
        "Google in 2007, few people outside of the company took him "
        "seriously. “I can tell you very senior CEOs of major American "
        "car companies would shake my hand and turn away because I wasn’t "
        "worth talking to,” said Thrun, in an interview with Recode earlier "
        "this week.")
# Run the loaded pipeline on the passage; the cells below read their
# annotations (noun chunks, lemmas, POS tags, entities) from `doc`.
doc = nlp(text)

In [5]:
# Analyze syntax
print("Noun phrases:", [chunk.text for chunk in doc.noun_chunks])
print("Verbs:", [token.lemma_ for token in doc if token.pos_ == "VERB"])



Noun phrases: ['Sebastian Thrun', 'self-driving cars', 'Google', 'few people', 'the company', 'him', 'I', 'you', 'very senior CEOs', 'major American car companies', 'my hand', 'I', 'Thrun', 'an interview', 'Recode']
Verbs: ['start', 'work', 'drive', 'take', 'tell', 'shake', 'turn', 'talk', 'say']


In [16]:
# List the named entities of the document: entity text padded to 20 columns,
# label padded to 8 columns, then spaCy's explanation of that label.
for ent in doc.ents:
    label = ent.label_
    print(f"{ent.text:20} {label:8} # {spacy.explain(label)}")

Sebastian Thrun      PERSON   # People, including fictional
Google               ORG      # Companies, agencies, institutions, etc.
2007                 DATE     # Absolute or relative dates or periods
American             NORP     # Nationalities or religious or political groups
Thrun                PERSON   # People, including fictional
Recode               ORG      # Companies, agencies, institutions, etc.
earlier this week    DATE     # Absolute or relative dates or periods
