In [1]:
import nltk
from pprint import pprint

## 1. Basic tokenization
- sent_tokenize
- word_tokenize
- wordpunct_tokenize

In [2]:
from nltk.tokenize import sent_tokenize, word_tokenize, wordpunct_tokenize

# Sentence-split a short text in which every sentence ends with a period.
sentences = "Hello. My name is Alice. I live in Seoul."
result = sent_tokenize(sentences)
pprint(result)

['Hello.', 'My name is Alice.', 'I live in Seoul.']


In [3]:
# Same words with the periods removed: the output shows the whole text coming
# back as a single sentence.
sentences = "Hello My name is Alice I live in Seoul"
result = sent_tokenize(sentences)
pprint(result)

['Hello My name is Alice I live in Seoul']


In [4]:
# Mixed punctuation: per the output, the comma does not end a sentence while
# '.' and '?' do.
sentences = "Hello, My name is Alice. How are you? I live in Seoul."
result = sent_tokenize(sentences)
pprint(result)

['Hello, My name is Alice.', 'How are you?', 'I live in Seoul.']


In [7]:
# Word-level tokenization: each period is emitted as its own token.
sentences = "Hello. My name is Alice. I live in Seoul."
result = word_tokenize(sentences)
pprint(result)

['Hello',
 '.',
 'My',
 'name',
 'is',
 'Alice',
 '.',
 'I',
 'live',
 'in',
 'Seoul',
 '.']


In [8]:
# Lyrics containing the contraction "don't", used to contrast the two word
# tokenizers.
sentences = """
All she talking bout is come and see me for once
Come and see me for once
You don't ever come to me, you don't ever come to me
All she ever say is come and see me for once
Come and see me for once
You don't ever come to me, you don't ever come to me
"""

# Per the outputs: word_tokenize splits "don't" into 'do' + "n't", whereas
# wordpunct_tokenize splits it into 'don' + "'" + 't'.
word_tokenize_result = word_tokenize(sentences)
wordpunct_tokenize_result = wordpunct_tokenize(sentences)

print("==== Result of word_tokenize ====")
pprint(word_tokenize_result)
print("\n")
print("==== Result of wordpunct_tokenize ====")
pprint(wordpunct_tokenize_result)

==== Result of word_tokenize ====
['All',
 'she',
 'talking',
 'bout',
 'is',
 'come',
 'and',
 'see',
 'me',
 'for',
 'once',
 'Come',
 'and',
 'see',
 'me',
 'for',
 'once',
 'You',
 'do',
 "n't",
 'ever',
 'come',
 'to',
 'me',
 ',',
 'you',
 'do',
 "n't",
 'ever',
 'come',
 'to',
 'me',
 'All',
 'she',
 'ever',
 'say',
 'is',
 'come',
 'and',
 'see',
 'me',
 'for',
 'once',
 'Come',
 'and',
 'see',
 'me',
 'for',
 'once',
 'You',
 'do',
 "n't",
 'ever',
 'come',
 'to',
 'me',
 ',',
 'you',
 'do',
 "n't",
 'ever',
 'come',
 'to',
 'me']


==== Result of wordpunct_tokenize ====
['All',
 'she',
 'talking',
 'bout',
 'is',
 'come',
 'and',
 'see',
 'me',
 'for',
 'once',
 'Come',
 'and',
 'see',
 'me',
 'for',
 'once',
 'You',
 'don',
 "'",
 't',
 'ever',
 'come',
 'to',
 'me',
 ',',
 'you',
 'don',
 "'",
 't',
 'ever',
 'come',
 'to',
 'me',
 'All',
 'she',
 'ever',
 'say',
 'is',
 'come',
 'and',
 'see',
 'me',
 'for',
 'once',
 'Come',
 'and',
 'see',
 'me',
 'for',
 'once',
 'You',

## 2. Part-of-speech (POS) tagging
- nltk.pos_tag

In [9]:
# Lyrics containing the contraction "don't"; redefined here so this cell does
# not depend on the tokenizer cells having been run.
sentences = """
All she talking bout is come and see me for once
Come and see me for once
You don't ever come to me, you don't ever come to me
All she ever say is come and see me for once
Come and see me for once
You don't ever come to me, you don't ever come to me
"""

# POS-tag the wordpunct tokens; the result is a list of (token, tag) pairs.
# Because wordpunct_tokenize splits "don't" into three tokens, each piece is
# tagged on its own (see 'don', "'", 't' in the output).
pos_result = nltk.pos_tag(nltk.tokenize.wordpunct_tokenize(sentences))
pprint(pos_result)

[('All', 'DT'),
 ('she', 'PRP'),
 ('talking', 'VBG'),
 ('bout', 'NN'),
 ('is', 'VBZ'),
 ('come', 'VBN'),
 ('and', 'CC'),
 ('see', 'VB'),
 ('me', 'PRP'),
 ('for', 'IN'),
 ('once', 'RB'),
 ('Come', 'NNP'),
 ('and', 'CC'),
 ('see', 'VB'),
 ('me', 'PRP'),
 ('for', 'IN'),
 ('once', 'RB'),
 ('You', 'PRP'),
 ('don', 'VBP'),
 ("'", "''"),
 ('t', 'JJ'),
 ('ever', 'RB'),
 ('come', 'VBP'),
 ('to', 'TO'),
 ('me', 'PRP'),
 (',', ','),
 ('you', 'PRP'),
 ('don', 'VBP'),
 ("'", "''"),
 ('t', 'JJ'),
 ('ever', 'RB'),
 ('come', 'VBP'),
 ('to', 'TO'),
 ('me', 'PRP'),
 ('All', 'PDT'),
 ('she', 'PRP'),
 ('ever', 'RB'),
 ('say', 'VBP'),
 ('is', 'VBZ'),
 ('come', 'JJ'),
 ('and', 'CC'),
 ('see', 'VB'),
 ('me', 'PRP'),
 ('for', 'IN'),
 ('once', 'RB'),
 ('Come', 'NNP'),
 ('and', 'CC'),
 ('see', 'VB'),
 ('me', 'PRP'),
 ('for', 'IN'),
 ('once', 'RB'),
 ('You', 'PRP'),
 ('don', 'VBP'),
 ("'", "''"),
 ('t', 'JJ'),
 ('ever', 'RB'),
 ('come', 'VBP'),
 ('to', 'TO'),
 ('me', 'PRP'),
 (',', ','),
 ('you', 'PRP'),
 ('don'

## 3. Normalization
- Stemming
- Lemmatization

In [12]:
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer

# Tokenize once; every stemmer below runs over the same token list.
text = nltk.word_tokenize(
    "The women running in the fog passed bunnies working as computer scientists."
)

snowball, lancaster, porter = SnowballStemmer('english'), LancasterStemmer(), PorterStemmer()

# For each stemmer, print the stems joined into a sentence and then as a list.
for stemmer in [snowball, lancaster, porter]:
    stemmed_text = list(map(stemmer.stem, text))
    print(" ".join(stemmed_text))
    print(stemmed_text)

the women run in the fog pass bunni work as comput scientist .
['the', 'women', 'run', 'in', 'the', 'fog', 'pass', 'bunni', 'work', 'as', 'comput', 'scientist', '.']
the wom run in the fog pass bunny work as comput sci .
['the', 'wom', 'run', 'in', 'the', 'fog', 'pass', 'bunny', 'work', 'as', 'comput', 'sci', '.']
the women run in the fog pass bunni work as comput scientist .
['the', 'women', 'run', 'in', 'the', 'fog', 'pass', 'bunni', 'work', 'as', 'comput', 'scientist', '.']


In [13]:
from nltk.stem.wordnet import WordNetLemmatizer

# Note: lemmatize() is called without a part-of-speech argument here, and the
# output shows the effect: verb forms such as "running" and "passed" are left
# untouched, and "as" is reduced to "a". Supplying a POS tag per token avoids
# this; normalize() further down does so via tagwn().
# `text` is the token list produced in the stemming cell.
lemmatizer = WordNetLemmatizer()
lemmas = [lemmatizer.lemmatize(t) for t in text]
print(" ".join(lemmas))

The woman running in the fog passed bunny working a computer scientist .


In [34]:
import string
from nltk.corpus import wordnet as wn

## Module constants
# Shared lemmatizer instance used by normalize().
lemmatizer  = WordNetLemmatizer()
# English stopwords as a set, so membership tests are exact-match lookups.
stopwords   = set(nltk.corpus.stopwords.words('english'))
# A plain str of punctuation characters (not a set): `x in punctuation` is a
# substring test.
punctuation = string.punctuation

def tagwn(tag):
    """
    Returns the WordNet tag from the Penn Treebank tag.

    Only the first character of ``tag`` is inspected: ``N`` maps to noun,
    ``V`` to verb, ``R`` to adverb and ``J`` to adjective. Any other tag,
    including an empty string, falls back to noun.
    """

    # tag[:1] instead of tag[0]: an empty tag then takes the noun fallback
    # like every other unrecognized tag, rather than raising IndexError.
    return {
        'N': wn.NOUN,
        'V': wn.VERB,
        'R': wn.ADV,
        'J': wn.ADJ
    }.get(tag[:1], wn.NOUN)


def normalize(text):
    """
    Yields the lower-cased, lemmatized content tokens of ``text``.

    The text is split with ``nltk.wordpunct_tokenize`` and tagged with
    ``nltk.pos_tag``. Tokens that are stopwords, or that consist solely of
    punctuation characters, are dropped; the remaining tokens are lemmatized
    using the WordNet POS that ``tagwn`` derives from their tag.
    """
    for token, tag in nltk.pos_tag(nltk.wordpunct_tokenize(text)):
        # Tagging was done above on the original casing; lower-case only now.
        token = token.lower()
        if token in stopwords:
            continue
        # `punctuation` is a str, so `token in punctuation` would be a
        # substring test: multi-character runs such as "..." or "?!" are not
        # substrings of it and would slip through. Check every character.
        if all(char in punctuation for char in token):
            continue
        yield lemmatizer.lemmatize(token, tagwn(tag))

# Demo: stopwords ("the", "up", "a", "at") and punctuation are dropped, and
# "flies" is lemmatized to "fly".
print(list(normalize("The eagle ! up a flies at midnight.")))

['eagle', 'fly', 'midnight']


In [33]:
# Inspect the punctuation characters that normalize() filters on.
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [15]:
# Inspect the English stopword set that normalize() filters on.
print(stopwords)

{'up', 'an', 'won', 'been', 'each', 'y', 'by', 'those', 't', 'not', 'd', 'having', 'in', 'all', 'can', 'what', 'have', 'm', 'to', 'll', 'these', 'once', 'her', 'whom', 'wouldn', 'some', 'ain', 'only', 'themselves', 'was', 'if', 'through', 'isn', 'he', 'under', 'shan', 'itself', 'they', 'yourself', 'too', 'other', 'yourselves', 'hers', 'be', 'during', 'at', 'on', 'shouldn', 'our', 'doing', 'didn', 'so', 're', 'your', 'had', 'or', 'were', 'where', 'is', 'before', 'being', 'are', 'herself', 'she', 'will', 'should', 'but', 'doesn', 'o', 'a', 'needn', 'about', 'the', 'i', 'theirs', 'you', 'again', 'below', 'above', 'did', 'his', 'its', 'their', 'as', 'same', 'weren', 'off', 'we', 'into', 'just', 'ma', 'for', 'there', 'don', 'this', 'any', 'wasn', 'them', 'am', 'hasn', 'both', 'ours', 'me', 'then', 'yours', 'that', 'further', 'who', 'couldn', 'why', 'has', 've', 'over', 'himself', 'no', 'between', 'very', 'until', 'which', 'nor', 'with', 'against', 'such', 'after', 'haven', 'most', 'ourselve

## 4. Named-entity recognition (NER)
- Maximum entropy based NER
- Stanford NER packages

In [19]:
# Pipeline: tokenize -> POS-tag -> chunk named entities. In the printed tree,
# recognized entities appear as labelled subtrees (e.g. GPE in the output).
text = "LG electronics released the smart phone 'G6' in April, 2017."
print(nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(text))))

(S
  LG/NNP
  electronics/NNS
  released/VBD
  the/DT
  smart/JJ
  phone/NN
  'G6/POS
  '/''
  in/IN
  (GPE April/NNP)
  ,/,
  2017/CD
  ./.)


Download Stanford NER packages: https://nlp.stanford.edu/software/CRF-NER.shtml#Download

In [7]:
from nltk.tag import StanfordNERTagger

# Classifier model and jar from the Stanford NER download linked above
# (paths are relative to the notebook's working directory).
stanford_data = 'stanford-ner-2016-10-31/classifiers/english.all.3class.distsim.crf.ser.gz'
stanford_jar = 'stanford-ner-2016-10-31/stanford-ner-3.7.0.jar'

text = "Samsung electronics Microsoft research GE LG Baidu Amazon"
st = StanfordNERTagger(stanford_data, stanford_jar, 'utf-8')

# Each tagged item is a (token, label) pair; print it label-first.
tagged_tokens = st.tag(text.split())
for token, label in tagged_tokens:
    print('[' + label + '] ' + token)

[ORGANIZATION] Samsung
[O] electronics
[ORGANIZATION] Microsoft
[O] research
[ORGANIZATION] GE
[ORGANIZATION] LG
[ORGANIZATION] Baidu
[ORGANIZATION] Amazon
[O] machintosh.


## 5. Parsing
- Parsing using a grammar
- StanfordParser

In [25]:
# Toy context-free grammar for noun phrases. S, NP and ADJP are the phrase
# rules; DET, N, ADJ and PUNCT enumerate the only words the grammar accepts
# (all lower-case, which is why parse() lower-cases its input).
grammar = nltk.grammar.CFG.fromstring("""

S -> NP PUNCT | NP
NP -> N N | ADJP NP | DET N | DET ADJP
ADJP -> ADJ NP | ADJ N

DET -> 'an' | 'the' | 'a' | 'that'
N -> 'airplane' | 'runway' | 'lawn' | 'chair' | 'person' 
ADJ -> 'red' | 'slow' | 'tired' | 'long'
PUNCT -> '.'
""")

In [27]:
def parse(sent):
    """
    Yields every parse tree that the module-level ``grammar`` admits for ``sent``.

    The sentence is lower-cased and split with ``nltk.word_tokenize`` before
    being handed to an ``nltk.parse.ChartParser`` built from ``grammar``.
    """
    lowered = sent.lower()
    chart_parser = nltk.parse.ChartParser(grammar)
    tokens = nltk.word_tokenize(lowered)
    yield from chart_parser.parse(tokens)

        
# Parse a phrase built only from words in the grammar and print each tree.
for tree in parse("the long runway"): 
    tree.pprint()
#     tree[0].draw()

(S (NP (DET the) (ADJP (ADJ long) (N runway))))


Download Stanford Parser packages: https://nlp.stanford.edu/software/lex-parser.shtml#Download

In [None]:
from nltk.parse.stanford import StanfordParser

# Jars from the Stanford Parser download linked above (paths are relative to
# the notebook's working directory).
stanford_model = 'stanford-parser-full-2016-10-31/stanford-parser-3.7.0-models.jar'
stanford_jar = 'stanford-parser-full-2016-10-31/stanford-parser.jar'

# StanfordParser's positional order is (path_to_jar, path_to_models_jar).
# The original call passed the models jar first, i.e. swapped; keyword
# arguments make the pairing explicit and order-independent.
st = StanfordParser(path_to_jar=stanford_jar, path_to_models_jar=stanford_model)
sent = "The man hit the building with the baseball bat."
for tree in st.parse(nltk.wordpunct_tokenize(sent)):
    tree.pprint()
#     tree.draw()