In [19]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
text = "I'm learning Natural Language Processing, and I love Machine Learning!"

**White Space Tokenizer**

In [3]:
from nltk.tokenize import WhitespaceTokenizer
WhitespaceTokenizer().tokenize(text)

["I'm",
 'learning',
 'Natural',
 'Language',
 'Processing,',
 'and',
 'I',
 'love',
 'Machine',
 'Learning!']

**Word Punctuation based**

In [4]:
from nltk.tokenize import wordpunct_tokenize
wordpunct_tokenize(text)

['I',
 "'",
 'm',
 'learning',
 'Natural',
 'Language',
 'Processing',
 ',',
 'and',
 'I',
 'love',
 'Machine',
 'Learning',
 '!']

**Treebank**

In [5]:
from nltk.tokenize import TreebankWordTokenizer
TreebankWordTokenizer().tokenize(text)

['I',
 "'m",
 'learning',
 'Natural',
 'Language',
 'Processing',
 ',',
 'and',
 'I',
 'love',
 'Machine',
 'Learning',
 '!']

**Tweet**

In [7]:
from nltk.tokenize import TweetTokenizer
TweetTokenizer().tokenize(text)

["I'm",
 'learning',
 'Natural',
 'Language',
 'Processing',
 ',',
 'and',
 'I',
 'love',
 'Machine',
 'Learning',
 '!']

**MWE: Multi Word Expression**

In [8]:
from nltk.tokenize import MWETokenizer

mwe = MWETokenizer([('machine', 'learning'), ('natural', 'language', 'processing')], separator = '_')
mwe.tokenize(text.lower().split())

["i'm",
 'learning',
 'natural',
 'language',
 'processing,',
 'and',
 'i',
 'love',
 'machine',
 'learning!']

**Stemmer**
Porter Stemmer

In [9]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()
[ps.stem(w) for w in TreebankWordTokenizer().tokenize(text)]

['i',
 "'m",
 'learn',
 'natur',
 'languag',
 'process',
 ',',
 'and',
 'i',
 'love',
 'machin',
 'learn',
 '!']

Snowball Stemmer

In [10]:
from nltk.stem import SnowballStemmer
ss = SnowballStemmer('english')
[ss.stem(w) for w in TreebankWordTokenizer().tokenize(text)]

['i',
 "'m",
 'learn',
 'natur',
 'languag',
 'process',
 ',',
 'and',
 'i',
 'love',
 'machin',
 'learn',
 '!']

**Lemmatization**
Lazy

In [12]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
[lemmatizer.lemmatize(w) for w in TreebankWordTokenizer().tokenize(text)]

['I',
 "'m",
 'learning',
 'Natural',
 'Language',
 'Processing',
 ',',
 'and',
 'I',
 'love',
 'Machine',
 'Learning',
 '!']

POS based

In [13]:
from nltk import pos_tag
from nltk.corpus import wordnet

In [31]:
import nltk
nltk.download('averaged_perceptron_tagger_eng', download_dir='/root/nltk_data')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [32]:
import nltk
nltk.data.path.append('/root/nltk_data')

In [33]:
def get_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN

In [34]:
tokens = TreebankWordTokenizer().tokenize(text)
pos_tags = pos_tag(tokens)

[lemmatizer.lemmatize(w, get_pos(t)) for w, t in pos_tags]


['I',
 "'m",
 'learn',
 'Natural',
 'Language',
 'Processing',
 ',',
 'and',
 'I',
 'love',
 'Machine',
 'Learning',
 '!']