<a href="https://colab.research.google.com/github/rohith7612/Generative-AI/blob/main/Tokeization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install nltk



In [None]:
# Sample text shared by the tokenization cells.
# The value begins and ends with a newline character.
corpus = (
    "\n"
    "Hello! My name is Rohith. I’m learning NLP, Machine Learning, and Compiler Design.\n"
    "Tokenisation helps computers understand text.\n"
    "Let’s test: How many tokens are in this sentence?\n"
)

# Sentence Tokenization is being done. Corpus is being converted to sentences.

In [None]:
import nltk
from nltk.tokenize import sent_tokenize

# sent_tokenize loads the Punkt sentence model from NLTK's data directory and
# raises LookupError when it is missing, as on a fresh runtime. Fetch it first:
# "punkt" is the resource name older NLTK releases look up, "punkt_tab" the
# one newer releases look up. quiet=True suppresses the progress messages.
for resource in ("punkt", "punkt_tab"):
  nltk.download(resource, quiet=True)

# Split the corpus into sentences and show one sentence per line.
documents = sent_tokenize(corpus)
for sentence in documents:
  print(sentence)


Hello!
My name is Rohith.
I’m learning NLP, Machine Learning, and Compiler Design.
Tokenisation helps computers understand text.
Let’s test: How many tokens are in this sentence?


# Word Tokenization is being done. Each sentence in documents is being converted to words.

In [None]:
from nltk.tokenize import word_tokenize

# Tokenize the sentences one at a time; each token list is printed on its own line.
for tokens in map(word_tokenize, documents):
  print(tokens)

['Hello', '!']
['My', 'name', 'is', 'Rohith', '.']
['I', '’', 'm', 'learning', 'NLP', ',', 'Machine', 'Learning', ',', 'and', 'Compiler', 'Design', '.']
['Tokenisation', 'helps', 'computers', 'understand', 'text', '.']
['Let', '’', 's', 'test', ':', 'How', 'many', 'tokens', 'are', 'in', 'this', 'sentence', '?']


In [None]:
from nltk.tokenize import wordpunct_tokenize

# Same per-sentence walk as the word_tokenize cell, using wordpunct_tokenize.
for sentence in documents:
  punct_tokens = wordpunct_tokenize(sentence)
  print(punct_tokens)

['Hello', '!']
['My', 'name', 'is', 'Rohith', '.']
['I', '’', 'm', 'learning', 'NLP', ',', 'Machine', 'Learning', ',', 'and', 'Compiler', 'Design', '.']
['Tokenisation', 'helps', 'computers', 'understand', 'text', '.']
['Let', '’', 's', 'test', ':', 'How', 'many', 'tokens', 'are', 'in', 'this', 'sentence', '?']


In [None]:
from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()

# The whole corpus is tokenized in one call here, not sentence by sentence.
# In the recorded output, periods inside the text stay attached to the
# preceding word (e.g. 'Rohith.', 'Design.', 'text.').
treebank_tokens = tokenizer.tokenize(corpus)
treebank_tokens

['Hello',
 '!',
 'My',
 'name',
 'is',
 'Rohith.',
 'I’m',
 'learning',
 'NLP',
 ',',
 'Machine',
 'Learning',
 ',',
 'and',
 'Compiler',
 'Design.',
 'Tokenisation',
 'helps',
 'computers',
 'understand',
 'text.',
 'Let’s',
 'test',
 ':',
 'How',
 'many',
 'tokens',
 'are',
 'in',
 'this',
 'sentence',
 '?']

# Stemming is being done. PorterStemmer is being used.

In [None]:
# Word list shared by the stemming and lemmatization cells, grouped by the
# base word each row is built around.
_WORD_FAMILIES = (
    ("connect", "connected", "connecting", "connection"),
    ("compute", "computer", "computing", "computation"),
    ("run", "running", "runner", "runs"),
    ("analysis", "analyzing", "analyzed", "analytical"),
    ("teach", "teacher", "teaching", "taught"),
    ("play", "playing", "played", "player"),
    ("happy", "happiness", "happily"),
    ("study", "studies", "studying", "studied"),
)

# Flatten the groups into one list, keeping the row-by-row order.
words = [form for family in _WORD_FAMILIES for form in family]

In [None]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

# Print one "word-->stem" line for every entry of the shared word list.
for word in words:
  stem = stemmer.stem(word)
  print(f"{word}-->{stem}")


connect-->connect
connected-->connect
connecting-->connect
connection-->connect
compute-->comput
computer-->comput
computing-->comput
computation-->comput
run-->run
running-->run
runner-->runner
runs-->run
analysis-->analysi
analyzing-->analyz
analyzed-->analyz
analytical-->analyt
teach-->teach
teacher-->teacher
teaching-->teach
taught-->taught
play-->play
playing-->play
played-->play
player-->player
happy-->happi
happiness-->happi
happily-->happili
study-->studi
studies-->studi
studying-->studi
studied-->studi


# Stemming is being done. RegexpStemmer is being used.

In [None]:
from nltk.stem import RegexpStemmer

# Suffix alternatives handed to the stemmer; min=4 is passed through unchanged.
suffix_pattern = 'ing$|s$|e$|able$|ed$'
stemmer = RegexpStemmer(suffix_pattern, min=4)

# Print one "word->stem" line per word.
for word in words:
  print(word, stemmer.stem(word), sep="->")

connect->connect
connected->connect
connecting->connect
connection->connection
compute->comput
computer->computer
computing->comput
computation->computation
run->run
running->runn
runner->runner
runs->run
analysis->analysi
analyzing->analyz
analyzed->analyz
analytical->analytical
teach->teach
teacher->teacher
teaching->teach
taught->taught
play->play
playing->play
played->play
player->player
happy->happy
happiness->happines
happily->happily
study->study
studies->studie
studying->study
studied->studi


# Stemming is being done. SnowballStemmer is being used.

In [None]:
from nltk.stem import SnowballStemmer

stemmer = SnowballStemmer("english")

# Build each "word->stem" line lazily, then print them in list order.
stem_lines = (f"{word}->{stemmer.stem(word)}" for word in words)
for line in stem_lines:
  print(line)

connect->connect
connected->connect
connecting->connect
connection->connect
compute->comput
computer->comput
computing->comput
computation->comput
run->run
running->run
runner->runner
runs->run
analysis->analysi
analyzing->analyz
analyzed->analyz
analytical->analyt
teach->teach
teacher->teacher
teaching->teach
taught->taught
play->play
playing->play
played->play
player->player
happy->happi
happiness->happi
happily->happili
study->studi
studies->studi
studying->studi
studied->studi


# Lemmatization. Uses the word's part of speech to convert it into its root form (lemma).

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')  # fetch the WordNet data before lemmatizing
lemmatizer = WordNetLemmatizer()

# Every word is lemmatized with the verb tag (pos="v"), whatever its actual
# part of speech is.
for word in words:
  lemma = lemmatizer.lemmatize(word, pos="v")
  print("->".join((word, lemma)))

[nltk_data] Downloading package wordnet to /root/nltk_data...


connect->connect
connected->connect
connecting->connect
connection->connection
compute->compute
computer->computer
computing->compute
computation->computation
run->run
running->run
runner->runner
runs->run
analysis->analysis
analyzing->analyze
analyzed->analyze
analytical->analytical
teach->teach
teacher->teacher
teaching->teach
taught->teach
play->play
playing->play
played->play
player->player
happy->happy
happiness->happiness
happily->happily
study->study
studies->study
studying->study
studied->study


# Stop words and how they are removed

In [None]:
# Sample speech for the stop-word cell. The lines are joined with newline
# characters; there is no leading or trailing newline.
_SPEECH_LINES = (
    "My dear young friends, I stand before you today with great hope and confidence in the power of youth.",
    "Each one of you has a unique role to play in shaping the destiny of our nation, and this responsibility",
    "demands hard work, integrity, and continuous learning. Dreams are not mere thoughts that appear during",
    "sleep, but powerful visions that inspire you to think beyond limitations and act with courage.",
    "When you set a clear goal and work with discipline and determination, challenges will transform into",
    "opportunities for growth. Education is not just about acquiring knowledge, but about igniting curiosity,",
    "building character, and developing the ability to serve society. If the youth of India commit themselves",
    "to innovation, ethical leadership, and national development, our country will surely emerge as a",
    "knowledge-driven and self-reliant nation.",
)
speech = "\n".join(_SPEECH_LINES)

In [None]:
import nltk
from nltk.stem import SnowballStemmer
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize

# Fetch the NLTK data this cell reads so that it also runs on a fresh runtime:
# the Punkt sentence model (named "punkt" in older releases, "punkt_tab" in
# newer ones), the stop-word lists, and WordNet for the lemmatizer.
for resource in ("punkt", "punkt_tab", "stopwords", "wordnet"):
  nltk.download(resource, quiet=True)

stemmer = SnowballStemmer("english")  # not used below; kept so the name stays bound as before
lemmatizer = WordNetLemmatizer()

# Read the stop-word list once, as a set, instead of re-reading it for every token.
english_stopwords = set(stopwords.words('english'))

# For each sentence: tokenize, lowercase, drop stop words, lemmatize the rest
# as verbs, and store the re-joined text back into `documents`.
documents = sent_tokenize(speech)
for i, sentence in enumerate(documents):
  # Lowercase BEFORE the stop-word test. The NLTK list is lowercase, so
  # testing the original token lets capitalised stop words such as "My", "I",
  # "Each", "When" and "If" slip through.
  tokens = [token.lower() for token in word_tokenize(sentence)]
  # Cell-specific names are used so the shared `words` list that the stemming
  # cells iterate over is not overwritten.
  kept_lemmas = [lemmatizer.lemmatize(token, pos="v") for token in tokens if token not in english_stopwords]
  documents[i] = ' '.join(kept_lemmas)
print(documents)

['my dear young friends , i stand today great hope confidence power youth .', 'each one unique role play shape destiny nation , responsibility demand hard work , integrity , continuous learn .', 'dream mere thoughts appear sleep , powerful visions inspire think beyond limitations act courage .', 'when set clear goal work discipline determination , challenge transform opportunities growth .', 'education acquire knowledge , ignite curiosity , build character , develop ability serve society .', 'if youth india commit innovation , ethical leadership , national development , country surely emerge knowledge-driven self-reliant nation .']
