# NLTK Introduction to basic NLP Processing

Learning goals:
 - Understand simple NLP pipeline functionality for English in NLTK
 - See how a more modern and multilingual "industrial strength" framework works (SpaCy)

In [None]:
import nltk

In [None]:
# Two-line sample sentence used by the NLTK cells below.
# The line break is written explicitly so no invisible trailing whitespace
# ends up inside the string.
text = (
    "At eight o'clock on Thursday morning\n"
    "Arthur didn't feel very good."
)

## Tokenization

In [None]:
# Tokenize the sample text and show the resulting tokens.
tokens = nltk.tokenize.word_tokenize(text)
print(tokens)

## POS Tagging

In [None]:
# Part-of-speech tag the tokens, then show the result twice:
# once as a whole and once with one tagged item per line.
tagged = nltk.tag.pos_tag(tokens)
print(tagged)
for tagged_token in tagged:
    print(tagged_token)

## Named Entity Recognition

In [None]:
# Run NLTK's named-entity chunker over the POS-tagged tokens and print the result.
entities = nltk.ne_chunk(tagged)
print(entities)


## Stemming
Create tab-delimited output

In [None]:
# Porter stemmer instance used by the stemming loop in the next cell.
stemmer = nltk.PorterStemmer()

In [None]:
# Print every token of the example sentence next to its stem, tab-separated.
sentence = 'He believes in stemming.'
for token in nltk.word_tokenize(sentence):
    print(f"{token}\t{stemmer.stem(token)}")

In [None]:
 lemmatizer = nltk.stem.WordNetLemmatizer()

In [None]:
# Print every token of the example sentence next to its lemma, tab-separated.
sentence = 'He believes in stemming.'
for token in nltk.word_tokenize(sentence):
    print(f"{token}\t{lemmatizer.lemmatize(token)}")

# Your turn... Look at how it is done in spacy
https://spacy.io/

In [None]:
# Only run this once (and maybe restart the kernel)
! pip install spacy
! python -m spacy download en_core_web_sm

In [None]:
import spacy

# Load the small English spaCy pipeline; `nlp` is applied to the sample text below.
# NOTE(review): this comment previously listed "tokenizer, tagger, parser, NER
# and word vectors" - the small ("sm") model may not ship word vectors; confirm
# against the spaCy model documentation.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Process whole documents
# Sample news passage; adjacent string literals are joined into one string.
text = (
    "When Sebastian Thrun started working on self-driving cars at Google "
    "in 2007, few people outside of the company took him seriously. "
    "“I can tell you very senior CEOs of major American car companies "
    "would shake my hand and turn away because I wasn’t worth talking to,” "
    "said Thrun, in an interview with Recode earlier this week."
)
# Run the loaded pipeline over the sample text; `doc` is read by the cells below.
doc = nlp(text)

In [None]:
# Analyze syntax
# Collect the noun-chunk texts and the lemmas of all tokens tagged VERB,
# then print each list with its label.
noun_phrases = [chunk.text for chunk in doc.noun_chunks]
verb_lemmas = [token.lemma_ for token in doc if token.pos_ == "VERB"]
print("Noun phrases:", noun_phrases)
print("Verbs:", verb_lemmas)



In [None]:
# Find named entities, phrases and concepts
# One line per entity in doc.ents: its text, a space, then its label.
for ent in doc.ents:
    print(f"{ent.text} {ent.label_}")