In [13]:
!pip install nltk



In [14]:
# Small demo text used by the tokenization cells below.
# The string starts and ends with a newline, and each sentence group sits on
# its own line.
corpus = (
    "\n"
    "Hello! My name is Rohith. I’m learning NLP, Machine Learning, and Compiler Design.\n"
    "Tokenisation helps computers understand text.\n"
    "Let’s test: How many tokens are in this sentence?\n"
)

# Sentence tokenization: the corpus is split into individual sentences.

In [15]:
import nltk
from nltk.tokenize import sent_tokenize

# sent_tokenize relies on the pre-trained Punkt models, which are not shipped
# with the pip package. Fetch them here so this cell also works on a fresh
# kernel/environment. Newer NLTK releases look the models up under
# "punkt_tab", older ones under "punkt"; requesting both covers either case.
# quiet=True keeps the cell output limited to the sentences printed below and
# avoids the interactive retry prompt if a download fails.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource, quiet=True)

# Split the raw corpus into a list of sentence strings and show one per line.
documents = sent_tokenize(corpus)
for sentence in documents:
  print(sentence)


Hello!
My name is Rohith.
I’m learning NLP, Machine Learning, and Compiler Design.
Tokenisation helps computers understand text.
Let’s test: How many tokens are in this sentence?


# Word tokenization: each sentence in `documents` is split into word tokens.

In [16]:
from nltk.tokenize import word_tokenize

# Tokenize every sentence produced by the sentence-splitting cell and print
# one token list per sentence.
for tokens in map(word_tokenize, documents):
    print(tokens)

['Hello', '!']
['My', 'name', 'is', 'Rohith', '.']
['I', '’', 'm', 'learning', 'NLP', ',', 'Machine', 'Learning', ',', 'and', 'Compiler', 'Design', '.']
['Tokenisation', 'helps', 'computers', 'understand', 'text', '.']
['Let', '’', 's', 'test', ':', 'How', 'many', 'tokens', 'are', 'in', 'this', 'sentence', '?']


In [17]:
from nltk.tokenize import wordpunct_tokenize

# Same per-sentence loop as the previous cell, this time with
# wordpunct_tokenize; one token list is printed per sentence.
for sentence in documents:
    sentence_tokens = wordpunct_tokenize(sentence)
    print(sentence_tokens)

['Hello', '!']
['My', 'name', 'is', 'Rohith', '.']
['I', '’', 'm', 'learning', 'NLP', ',', 'Machine', 'Learning', ',', 'and', 'Compiler', 'Design', '.']
['Tokenisation', 'helps', 'computers', 'understand', 'text', '.']
['Let', '’', 's', 'test', ':', 'How', 'many', 'tokens', 'are', 'in', 'this', 'sentence', '?']


In [18]:
from nltk.tokenize import TreebankWordTokenizer

# The tokenizer is applied to the whole corpus in one call (not sentence by
# sentence). In the recorded output, periods that are not at the very end of
# the text stay attached to the preceding word ('Rohith.', 'Design.', 'text.').
# NOTE(review): NLTK documents this tokenizer as expecting text that is already
# split into sentences; iterate over `documents` if per-sentence tokens are wanted.
tokenizer = TreebankWordTokenizer()
treebank_tokens = tokenizer.tokenize(corpus)
treebank_tokens

['Hello',
 '!',
 'My',
 'name',
 'is',
 'Rohith.',
 'I’m',
 'learning',
 'NLP',
 ',',
 'Machine',
 'Learning',
 ',',
 'and',
 'Compiler',
 'Design.',
 'Tokenisation',
 'helps',
 'computers',
 'understand',
 'text.',
 'Let’s',
 'test',
 ':',
 'How',
 'many',
 'tokens',
 'are',
 'in',
 'this',
 'sentence',
 '?']

# Stemming: each word is reduced to its stem with PorterStemmer.

In [19]:
# Sample vocabulary for the stemmer comparison, grouped by word family
# (a base word followed by several inflected or derived forms).
word_families = [
    ["connect", "connected", "connecting", "connection"],
    ["compute", "computer", "computing", "computation"],
    ["run", "running", "runner", "runs"],
    ["analysis", "analyzing", "analyzed", "analytical"],
    ["teach", "teacher", "teaching", "taught"],
    ["play", "playing", "played", "player"],
    ["happy", "happiness", "happily"],
    ["study", "studies", "studying", "studied"],
]

# Flat list consumed by the stemming cells; family order is preserved.
words = [member for family in word_families for member in family]

In [20]:
from nltk.stem import PorterStemmer

# Print every sample word next to the stem PorterStemmer produces for it,
# formatted as "<word>--><stem>".
stemmer = PorterStemmer()
for word in words:
    print(f"{word}-->{stemmer.stem(word)}")


connect-->connect
connected-->connect
connecting-->connect
connection-->connect
compute-->comput
computer-->comput
computing-->comput
computation-->comput
run-->run
running-->run
runner-->runner
runs-->run
analysis-->analysi
analyzing-->analyz
analyzed-->analyz
analytical-->analyt
teach-->teach
teacher-->teacher
teaching-->teach
taught-->taught
play-->play
playing-->play
played-->play
player-->player
happy-->happi
happiness-->happi
happily-->happili
study-->studi
studies-->studi
studying-->studi
studied-->studi


# Stemming: suffixes are stripped with the regular-expression-based RegexpStemmer.

In [21]:
from nltk.stem import RegexpStemmer

# The pattern removes a trailing "ing", "s", "e", "able" or "ed".
# min=4 is the minimum word length to stem; per the NLTK docs shorter words
# are returned unchanged (e.g. "run" in the sample list).
suffix_pattern = 'ing$|s$|e$|able$|ed$'
stemmer = RegexpStemmer(suffix_pattern, min=4)
for word in words:
    print("->".join((word, stemmer.stem(word))))

connect->connect
connected->connect
connecting->connect
connection->connection
compute->comput
computer->computer
computing->comput
computation->computation
run->run
running->runn
runner->runner
runs->run
analysis->analysi
analyzing->analyz
analyzed->analyz
analytical->analytical
teach->teach
teacher->teacher
teaching->teach
taught->taught
play->play
playing->play
played->play
player->player
happy->happy
happiness->happines
happily->happily
study->study
studies->studie
studying->study
studied->studi


# Stemming with SnowballStemmer (English)

In [22]:
from nltk.stem import SnowballStemmer

# Stem the sample words with the English Snowball stemmer and print each
# pair as "<word>-><stem>".
stemmer = SnowballStemmer("english")
for word in words:
    stem = stemmer.stem(word)
    print(word, stem, sep="->")

connect->connect
connected->connect
connecting->connect
connection->connect
compute->comput
computer->comput
computing->comput
computation->comput
run->run
running->run
runner->runner
runs->run
analysis->analysi
analyzing->analyz
analyzed->analyz
analytical->analyt
teach->teach
teacher->teacher
teaching->teach
taught->taught
play->play
playing->play
played->play
player->player
happy->happi
happiness->happi
happily->happili
study->studi
studies->studi
studying->studi
studied->studi
