## Tokenization

In [5]:
# NLTK is used by the tokenization, stemming, lemmatization and NLTK POS-tagging
# cells further down in this notebook.
import nltk
from nltk.tokenize import word_tokenize
# Fetch the 'popular' NLTK data collection (cmudict, gutenberg, movie_reviews, ...).
# Packages that are already present are reported as up-to-date rather than re-fetched.
nltk.download('popular')
# NOTE(review): 'punkt_tab' is presumably the tokenizer data that word_tokenize
# needs on recent NLTK releases — confirm against the installed NLTK version.
# Being the last expression in the cell, this call's return value (True) is displayed.
nltk.download('punkt_tab')

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package movie_reviews is already up-to-date!
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Package names is already up-to-date!
[nltk_data]    | Do

True

In [6]:
# Split a sample sentence into word-level tokens; note in the output that the
# contraction "I'm" is broken into two tokens.
sample_sentence = "I'm learning NLP"
tokens = word_tokenize(sample_sentence)
print(tokens)

['I', "'m", 'learning', 'NLP']


## Stemming

In [7]:
from nltk.stem import PorterStemmer

# Stemming chops suffixes by rule, so the result is not guaranteed to be a real
# word (see "happili" and "lazi" in the output below).
stemmer = PorterStemmer()
words = ["running", "ran", "swimming", "playing", "plays", "happily", "lazy"]

# Print one "word → stem" line per input word.
for word in words:
    stem = stemmer.stem(word)
    print(f"{word} → {stem}")

running → run
ran → ran
swimming → swim
playing → play
plays → play
happily → happili
lazy → lazi


## Lemmatization

In [12]:
# Using NLTK
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
words = ["running", "dancing", "happily", "lazy", "written","goes"]

# Every word is lemmatized with pos="v" (i.e. looked up as a verb); in the
# output, the non-verbs "happily" and "lazy" come back unchanged.
for word in words:
    lemma = lemmatizer.lemmatize(word, pos="v")
    print(f"{word} → {lemma}")

running → run
dancing → dance
happily → happily
lazy → lazy
written → write
goes → go


In [17]:
# Using spaCy
import spacy

# Load the small English pipeline and run it over one sentence.
nlp = spacy.load("en_core_web_sm")
doc = nlp("The weather in chennai is unusually romantic today.")

# Print one "text → lemma" line per token of the parsed document.
for token in doc:
    print(f"{token.text} → {token.lemma_}")

The → the
weather → weather
in → in
chennai → chennai
is → be
unusually → unusually
romantic → romantic
today → today
. → .


## Stopword Removal

In [19]:
# Parse the sentence with the spaCy pipeline loaded in an earlier cell.
doc = nlp("Let's see an example of a simple stopword removal code by using spacy")

# Keep only the tokens that spaCy does not flag as stop words.
filtered = [token for token in doc if not token.is_stop]

print(filtered)


[Let, example, simple, stopword, removal, code, spacy]


## Part-of-Speech (POS) Tagging

In [26]:
# Using NLTK
text = "The quick brown fox jumps over the lazy dog"

# Download the tagger data before nltk.pos_tag is called.
nltk.download('averaged_perceptron_tagger_eng')

# Tokenize first, then tag: pos_tag yields (token, tag) pairs as shown in the output.
tokens = nltk.word_tokenize(text)
pos_tags = nltk.pos_tag(tokens)

print(pos_tags)

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


[('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumps', 'VBZ'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN')]


In [23]:
# Using spaCy
# Relies on the `nlp` pipeline loaded in an earlier cell.
doc = nlp("The quick brown fox jumps over the lazy dog")

# Iterate over the parsed Doc, not the `nlp` pipeline object itself (the
# pipeline is not iterable, so `for token in nlp` fails on a fresh run).
for token in doc:
    # Use the underscore-suffixed attributes for the string labels (e.g. "DET");
    # `token.pos` without the underscore is the integer ID, not the label.
    print(f" {token.text} → {token.pos_} ({token.tag_})")

The → DET (DT)
quick → ADJ (JJ)
brown → ADJ (JJ)
fox → NOUN (NN)
jumps → VERB (VBZ)
over → ADP (IN)
the → DET (DT)
lazy → ADJ (JJ)
dog → NOUN (NN)
. → PUNCT (.)


## Named Entity Recognition (NER)

In [27]:
import spacy

# Load the small English pipeline and run it over one sentence.
nlp = spacy.load("en_core_web_sm")

text = "Tesla was founded by Elon Musk in California in 2003."
doc = nlp(text)

# Print one "entity text → label" line per entity span the model recognised.
for ent in doc.ents:
    print(f"{ent.text} → {ent.label_}")

Elon Musk → PERSON
California → GPE
2003 → DATE
