# Example 1 - Stemming

In [31]:
import nltk
#nltk.download()
from nltk.stem import PorterStemmer, LancasterStemmer, RegexpStemmer, WordNetLemmatizer

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


In [12]:
# Sample vocabulary: inflected verb forms and plural nouns.
words = ['going','went','thinking','visited','houses','elevators', 'thought']

# Regex stemmer that removes a leading "th" or a trailing "ed".
rgst = RegexpStemmer(r'(^th)|(ed$)')
pst = PorterStemmer()
lst = LancasterStemmer()
wnlt = WordNetLemmatizer()

# (label, callable) pairs, listed in the order they are reported for each word.
reducers = [
    ("Regex=", rgst.stem),
    ("Porter=", pst.stem),
    ("Lancaster=", lst.stem),
    ("WordNet=", lambda w: wnlt.lemmatize(w, pos='v')),  # lemmatize as a verb
]

for word in words:
    print("---"+word+"---")
    for label, reduce_word in reducers:
        print(label + reduce_word(word))
    print("")

---going---
Regex=going
Porter=go
Lancaster=going
WordNet=go

---went---
Regex=went
Porter=went
Lancaster=went
WordNet=go

---thinking---
Regex=inking
Porter=think
Lancaster=think
WordNet=think

---visited---
Regex=visit
Porter=visit
Lancaster=visit
WordNet=visit

---houses---
Regex=houses
Porter=hous
Lancaster=hous
WordNet=house

---elevators---
Regex=elevators
Porter=elev
Lancaster=elev
WordNet=elevators

---thought---
Regex=ought
Porter=thought
Lancaster=thought
WordNet=think



# Example 2 - Tokenizers

In [17]:
from nltk.tokenize import word_tokenize, WordPunctTokenizer, WhitespaceTokenizer, MWETokenizer
#nltk.download('punkt')

In [18]:
# One shared instance per tokenization strategy compared in this example;
# the print_words_* helpers below read these module-level names.
wpt = WordPunctTokenizer()   # emits punctuation as separate tokens ('Mr', '.')
wst = WhitespaceTokenizer()  # splits on whitespace only ('Mr.' stays intact)
mwet = MWETokenizer()        # joins registered multi-word expressions ('Computer_Science')

In [19]:
def print_words(text, wl=10):
    """Print the token count and a preview of ``word_tokenize(text)``.

    Args:
        text: String to tokenize.
        wl: Number of leading tokens to show in the preview (default 10).
    """
    # Tokenize once and reuse the result for both the count and the preview.
    tokens = word_tokenize(text)
    print("Word Tok. %d words" % len(tokens))
    print(tokens[:wl])
    print("")

def print_words_wpt(text, wl=10):
    """Print the token count and a preview using the module-level ``wpt`` tokenizer.

    Args:
        text: String to tokenize.
        wl: Number of leading tokens to show in the preview (default 10).
    """
    # Tokenize once and reuse the result for both the count and the preview.
    tokens = wpt.tokenize(text)
    print("W. Punct. Tok. %d words" % len(tokens))
    print(tokens[:wl])
    print("")

def print_words_wst(text, wl=10):
    """Print the token count and a preview using the module-level ``wst`` tokenizer.

    Args:
        text: String to tokenize.
        wl: Number of leading tokens to show in the preview (default 10).
    """
    # Tokenize once and reuse the result for both the count and the preview.
    tokens = wst.tokenize(text)
    print("Whitespace Tok. %d words" % len(tokens))
    print(tokens[:wl])
    print("")

def print_words_mwet(text, wl=10):
    """Print the token count and a preview using the module-level ``mwet`` tokenizer.

    The text is first split with ``str.split()`` and the resulting list is
    passed to ``mwet.tokenize``.

    Args:
        text: String to tokenize.
        wl: Number of leading tokens to show in the preview (default 10).
    """
    # Split and tokenize once; reuse the result for the count and the preview.
    tokens = mwet.tokenize(text.split())
    print("Multiword Tok %d words" % len(tokens))
    print(tokens[:wl])
    print("")

# Sample texts: filler prose, and a sentence containing abbreviations and an ellipsis.
lorem = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit  esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."
rob = "My name is Mr. Robinson and I hold a Msc. in Computer Science at Stanford University in USA. Nice to meet you... what was your name?"

# Register the multi-word expression the MWE tokenizer should join.
mwet.add_mwe(('Computer', 'Science'))
# Optional second expression, left disabled:
#mwet.add_mwe(('Stanford', 'University'))


print("---- Lorem ---")

# Every tokenizer on the filler text, each with the default preview length.
for show_tokens in (print_words, print_words_wpt, print_words_wst, print_words_mwet):
    show_tokens(lorem)

print("---- Mr Robinson ---")

# Default preview for the first two tokenizers ...
for show_tokens in (print_words, print_words_wpt):
    show_tokens(rob)
# ... and a longer preview for the last two.
for show_tokens in (print_words_wst, print_words_mwet):
    show_tokens(rob, wl=20)

---- Lorem ---
Word Tok. 77 words
['Lorem', 'ipsum', 'dolor', 'sit', 'amet', ',', 'consectetur', 'adipiscing', 'elit', ',']

W. Punct. Tok. 77 words
['Lorem', 'ipsum', 'dolor', 'sit', 'amet', ',', 'consectetur', 'adipiscing', 'elit', ',']

Whitespace Tok. 69 words
['Lorem', 'ipsum', 'dolor', 'sit', 'amet,', 'consectetur', 'adipiscing', 'elit,', 'sed', 'do']

Multiword Tok 69 words
['Lorem', 'ipsum', 'dolor', 'sit', 'amet,', 'consectetur', 'adipiscing', 'elit,', 'sed', 'do']

---- Mr Robinson ---
Word Tok. 30 words
['My', 'name', 'is', 'Mr.', 'Robinson', 'and', 'I', 'hold', 'a', 'Msc']

W. Punct. Tok. 31 words
['My', 'name', 'is', 'Mr', '.', 'Robinson', 'and', 'I', 'hold', 'a']

Whitespace Tok. 26 words
['My', 'name', 'is', 'Mr.', 'Robinson', 'and', 'I', 'hold', 'a', 'Msc.', 'in', 'Computer', 'Science', 'at', 'Stanford', 'University', 'in', 'USA.', 'Nice', 'to']

Multiword Tok 25 words
['My', 'name', 'is', 'Mr.', 'Robinson', 'and', 'I', 'hold', 'a', 'Msc.', 'in', 'Computer_Science', 'at', 'Stanford', 'University', 'in', 'USA.', 'Nice', 'to', 'meet']

# Example 3 - Sentence Tokenizer

In [20]:
from nltk import sent_tokenize
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
import nltk.data

In [21]:
# Load the Spanish Punkt sentence tokenizer from its NLTK data pickle.
# NOTE(review): this needs the 'punkt' data package to be installed, and
# nltk.data.load unpickles the resource -- confirm the installed NLTK version
# still ships this .pickle path.
spanish_tokenizer = nltk.data.load('tokenizers/punkt/spanish.pickle')

# Build a Punkt sentence tokenizer from hand-written parameters instead of a
# trained model. Only 'msc' is registered as an abbreviation, so in the recorded
# output "MSc." does not end a sentence while "Mr." does.
ppar = PunktParameters()
abbs = ['msc']
ppar.abbrev_types = set(abbs)
# NOTE: this rebinds `pst`, which Example 1 bound to a PorterStemmer.
pst = PunktSentenceTokenizer(ppar)

In [22]:
def print_sent(text):
    """Print the sentence count and each sentence from ``sent_tokenize(text)``.

    Args:
        text: String to split into sentences.
    """
    # Split once and reuse the result for both the count and the listing.
    sentences = sent_tokenize(text)
    print("-- Sentence Tok. %d sentences" % len(sentences))
    for s in sentences:
        print("> %s\n" % s)

def print_sent_es(text):
    """Print the sentence count and each sentence using ``spanish_tokenizer``.

    Args:
        text: String to split into sentences with the module-level
            ``spanish_tokenizer``.
    """
    # Split once and reuse the result for both the count and the listing.
    sentences = spanish_tokenizer.tokenize(text)
    print("-- Sent. Spanish Tok. %d sentences" % len(sentences))
    for s in sentences:
        print("> %s\n" % s)


def print_sent_pst(text):
    """Print the sentence count and each sentence using the module-level ``pst``.

    ``pst`` must be bound to an object with a ``tokenize`` method when this
    function is called.

    Args:
        text: String to split into sentences.
    """
    # Split once and reuse the result for both the count and the listing.
    sentences = pst.tokenize(text)
    print("-- Punkt Sentence Tok. %d sentences" % len(sentences))
    for s in sentences:
        print("> %s\n" % s)


# Sample texts: filler prose, an English sentence with abbreviations, and Spanish text.
lorem = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit  esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."
rob = "My name is Mr. Robinson and I hold a MSc. in Computer Science at Stanford University in USA. Nice to meet you... what was your name?"
hola = u"¡Hola a todos, bienvenidos a NLP! Mi nombre es Pedro. ¿Que os parece el procesamiento de lenguaje natural?"

# sent_tokenize on both English samples.
for sample in (lorem, rob):
    print_sent(sample)

# Same abbreviation-heavy sentence through the custom-parameter Punkt tokenizer.
print_sent_pst(rob)

# Spanish text through the Spanish tokenizer.
print_sent_es(hola)

-- Sentence Tok. 4 sentences
> Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.

> Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.

> Duis aute irure dolor in reprehenderit in voluptate velit  esse cillum dolore eu fugiat nulla pariatur.

> Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

-- Sentence Tok. 3 sentences
> My name is Mr. Robinson and I hold a MSc.

> in Computer Science at Stanford University in USA.

> Nice to meet you... what was your name?

-- Punkt Sentence Tok. 3 sentences
> My name is Mr.

> Robinson and I hold a MSc. in Computer Science at Stanford University in USA.

> Nice to meet you... what was your name?

-- Sent. Spanish Tok. 3 sentences
> ¡Hola a todos, bienvenidos a NLP!

> Mi nombre es Pedro.

> ¿Que os parece el procesamiento de lenguaje natural?



In [30]:
# Lemmatizer-only pass over the Example 1 word list.
words = ['going','went','thinking','visited','houses','elevators', 'thought']

# Only the WordNet lemmatizer is used in this cell, so the stemmers are not
# rebuilt here. In particular `pst` is left untouched: rebinding it to a
# PorterStemmer would break print_sent_pst, which calls pst.tokenize().
wnlt = WordNetLemmatizer()

for word in words:
    print("---"+word+"---")
    # pos='v' makes the lemmatizer treat every word as a verb.
    print("WordNet="+wnlt.lemmatize(word, pos='v'))
    print("")

---going---
WordNet=go

---went---
WordNet=go

---thinking---
WordNet=think

---visited---
WordNet=visit

---houses---
WordNet=house

---elevators---
WordNet=elevators

---thought---
WordNet=think

