<a href="https://colab.research.google.com/github/ritiksharmasde/PROJECTS/blob/main/tokenization_in_nlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Task 1: Sentence Tokenization

In [4]:
# --- Install spaCy model in Colab ---
!pip install -q spacy
!python -m spacy download en_core_web_sm

# --- Import libraries ---
import nltk
from nltk.tokenize import sent_tokenize
import spacy

# Download NLTK data
nltk.download("punkt_tab")

# --- Text input ---
text = """Artificial Intelligence is transforming the world.
NLP is a key part of AI.
Machines are learning to understand human language."""

# --- NLTK Sentence Tokenization ---
print("NLTK Sentence Tokenization:")
print(sent_tokenize(text))

# --- spaCy Sentence Tokenization ---
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
print("\nspaCy Sentence Tokenization:")
print([sent.text for sent in doc.sents])


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m65.9 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


NLTK Sentence Tokenization:
['Artificial Intelligence is transforming the world.', 'NLP is a key part of AI.', 'Machines are learning to understand human language.']

spaCy Sentence Tokenization:
['Artificial Intelligence is transforming the world. \n', 'NLP is a key part of AI. \n', 'Machines are learning to understand human language.']


In [None]:
# Task 2: Word Tokenization

In [5]:
import nltk
from nltk.tokenize import word_tokenize
import spacy

nltk.download("punkt_tab")

text = """Artificial Intelligence is transforming the world.
NLP is a key part of AI.
Machines are learning to understand human language."""

print("NLTK Word Tokens:")
print(word_tokenize(text))

nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
tokens = [token.text for token in doc]
print("\nspaCy Word Tokens:")
print(tokens)
print("\nUnique tokens (spaCy):")
print(set(tokens))


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


NLTK Word Tokens:
['Artificial', 'Intelligence', 'is', 'transforming', 'the', 'world', '.', 'NLP', 'is', 'a', 'key', 'part', 'of', 'AI', '.', 'Machines', 'are', 'learning', 'to', 'understand', 'human', 'language', '.']

spaCy Word Tokens:
['Artificial', 'Intelligence', 'is', 'transforming', 'the', 'world', '.', '\n', 'NLP', 'is', 'a', 'key', 'part', 'of', 'AI', '.', '\n', 'Machines', 'are', 'learning', 'to', 'understand', 'human', 'language', '.']

Unique tokens (spaCy):
{'is', 'language', 'of', 'key', '\n', 'a', 'Machines', '.', 'NLP', 'are', 'to', 'Artificial', 'the', 'Intelligence', 'AI', 'part', 'understand', 'world', 'transforming', 'learning', 'human'}


In [None]:
# Part B: Stemming
# Task 3: Apply Stemming


In [6]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

words = ["playing", "played", "plays", "happily", "happiness", "better"]

stems = [stemmer.stem(w) for w in words]

print("Stemming Results:", stems)


Stemming Results: ['play', 'play', 'play', 'happili', 'happi', 'better']


In [None]:
# Part C: Lemmatization
# Task 4: Lemmatization with NLTK

In [7]:
from nltk.stem import WordNetLemmatizer
import nltk

nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()

print("running →", lemmatizer.lemmatize("running", pos="v"))
print("played →", lemmatizer.lemmatize("played", pos="v"))
print("happily →", lemmatizer.lemmatize("happily", pos="r"))
print("happiness →", lemmatizer.lemmatize("happiness", pos="n"))
print("better →", lemmatizer.lemmatize("better", pos="a"))


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


running → run
played → play
happily → happily
happiness → happiness
better → good
