In [1]:
# Author: Roi Yehoshua <roiyeho@gmail.com>
# May 2024
# License: MIT

In [2]:
import nltk

Tokenization

In [3]:
# Word-level tokenization with NLTK's default tokenizer. As the output shows,
# the abbreviation "U.K." stays in one piece while "$" is split off from "1".
text = "Apple is looking at buying a U.K. startup for $1 billion"
tokens = nltk.tokenize.word_tokenize(text)
print(tokens)

['Apple', 'is', 'looking', 'at', 'buying', 'a', 'U.K.', 'startup', 'for', '$', '1', 'billion']


In [4]:
from nltk.tokenize import RegexpTokenizer

# Keep only runs of word characters: punctuation and symbols are discarded,
# which also breaks "U.K." into the separate tokens "U" and "K".
tokenizer = RegexpTokenizer(pattern=r'\w+')
tokens = tokenizer.tokenize(text)
print(tokens)

['Apple', 'is', 'looking', 'at', 'buying', 'a', 'U', 'K', 'startup', 'for', '1', 'billion']


Stop words

In [5]:
from nltk.corpus import stopwords

# Remove English stop words from the tokenized sentence.
tokens = nltk.word_tokenize(text)
stop_words = stopwords.words('english')

# NLTK's English stop words are all lowercase, so compare case-insensitively;
# otherwise capitalized stop words such as "The" or "Is" would slip through.
# A set gives constant-time membership tests instead of scanning the list
# once per token. `stop_words` itself is left as the original list.
stop_word_set = set(stop_words)
filtered_tokens = [token for token in tokens if token.lower() not in stop_word_set]
filtered_tokens

['Apple', 'looking', 'buying', 'U.K.', 'startup', '$', '1', 'billion']

Stemming

In [6]:
from nltk.stem import PorterStemmer

# Apply the Porter stemmer to every token. Stems are lowercased and are not
# necessarily dictionary words (e.g. "Apple" becomes "appl").
ps = PorterStemmer()
tokens = nltk.word_tokenize(text)
stems = list(map(ps.stem, tokens))
print(stems)

['appl', 'is', 'look', 'at', 'buy', 'a', 'u.k.', 'startup', 'for', '$', '1', 'billion']


Lemmatization

In [7]:
from nltk.stem import WordNetLemmatizer

# Lemmatize each token with WordNet. No part-of-speech tag is passed, so every
# token is treated with the lemmatizer's default POS (noun); that is why
# "mice" becomes "mouse" while the verbs "are" and "eating" come back unchanged.
wnl = WordNetLemmatizer()
tokens = nltk.word_tokenize("The mice are eating the apple")
lemmas = list(map(wnl.lemmatize, tokens))
print(lemmas)

['The', 'mouse', 'are', 'eating', 'the', 'apple']


N-grams

In [8]:
from nltk.util import trigrams

# Slide a window of three consecutive tokens over the sentence; the generator
# is unpacked into a list so the notebook displays every trigram.
tokens = nltk.word_tokenize(text)
[*trigrams(tokens)]

[('Apple', 'is', 'looking'),
 ('is', 'looking', 'at'),
 ('looking', 'at', 'buying'),
 ('at', 'buying', 'a'),
 ('buying', 'a', 'U.K.'),
 ('a', 'U.K.', 'startup'),
 ('U.K.', 'startup', 'for'),
 ('startup', 'for', '$'),
 ('for', '$', '1'),
 ('$', '1', 'billion')]

Part-of-Speech (POS) Tagging

In [9]:
# Tag each token with its part of speech; the result is a list of
# (token, tag) pairs such as ('Apple', 'NNP').
tokens = nltk.word_tokenize(text)
tagged = nltk.tag.pos_tag(tokens)
print(tagged)

[('Apple', 'NNP'), ('is', 'VBZ'), ('looking', 'VBG'), ('at', 'IN'), ('buying', 'VBG'), ('a', 'DT'), ('U.K.', 'NNP'), ('startup', 'NN'), ('for', 'IN'), ('$', '$'), ('1', 'CD'), ('billion', 'CD')]


Named Entity Recognition (NER)

In [10]:
# Named-entity chunking works on POS-tagged tokens: tokenize, tag, then
# group the tagged tokens into a tree whose subtrees are labelled entities
# (e.g. PERSON, GPE).
tokens = nltk.word_tokenize("Former US president Donald Trump holds rally in Indianola, Iowa")
tagged = nltk.pos_tag(tokens)
entities = nltk.ne_chunk(tagged)
print(entities)

(S
  Former/JJ
  (GPE US/NNP)
  president/NN
  (PERSON Donald/NNP Trump/NNP)
  holds/VBZ
  rally/RB
  in/IN
  (GPE Indianola/NNP)
  ,/,
  (GPE Iowa/NNP))
