<a href="https://colab.research.google.com/github/sreehari31580/Nlp-tutorial/blob/main/NLP_Text_Processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NLP Text Processing: Tokenization, Stop Word Removal, Stemming, and Lemmatization
This notebook demonstrates basic NLP preprocessing steps using NLTK and spaCy.

In [1]:
# Install required packages
!pip install nltk spacy
!python -m nltk.downloader punkt stopwords wordnet
!python -m spacy download en_core_web_sm

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m116.8 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime

In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import spacy

# spaCy pipeline for the English model `en_core_web_sm` (installed in the setup cell)
nlp = spacy.load("en_core_web_sm")

# NLTK helpers shared by the later cells: Porter stemmer and WordNet lemmatizer
stemmer, lemmatizer = PorterStemmer(), WordNetLemmatizer()

In [3]:
# Sample passage processed by every step below (hard-coded, not read from the user).
# Adjacent string literals are concatenated into a single string.
text = (
    "The latest trends in AI include agentic AI, where AI systems collaborate "
    "to perform tasks independently. "
    "Also, AI is being used for intelligent process automation, cybersecurity "
    "enhancements, personalized services, automated AI development, and "
    "advancements in areas like autonomous vehicles and facial recognition. "
    "Furthermore, AI is increasingly being used in healthcare and is converging "
    "with the Internet of Things"
)

In [5]:
# Fetch the 'punkt_tab' tokenizer data before tokenizing.
# NOTE(review): presumably word_tokenize in the installed NLTK release needs
# 'punkt_tab' in addition to the 'punkt' package downloaded in the setup cell — confirm.
# The call's return value is displayed as this cell's output.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [6]:
# Tokenization
# Split the sample text into tokens with NLTK's word_tokenize, then show the list
# under a heading (in the recorded run, punctuation marks appear as separate tokens).
tokens = word_tokenize(text)
print("\nTokenized Words:", tokens, sep="\n")


Tokenized Words:
['The', 'latest', 'trends', 'in', 'AI', 'include', 'agentic', 'AI', ',', 'where', 'AI', 'systems', 'collaborate', 'to', 'perform', 'tasks', 'independently', '.', 'Also', ',', 'AI', 'is', 'being', 'used', 'for', 'intelligent', 'process', 'automation', ',', 'cybersecurity', 'enhancements', ',', 'personalized', 'services', ',', 'automated', 'AI', 'development', ',', 'and', 'advancements', 'in', 'areas', 'like', 'autonomous', 'vehicles', 'and', 'facial', 'recognition', '.', 'Furthermore', ',', 'AI', 'is', 'increasingly', 'being', 'used', 'in', 'healthcare', 'and', 'is', 'converging', 'with', 'the', 'Internet', 'of', 'Things']


In [7]:
# Stop Word Removal
# Build NLTK's English stop-word list as a set, then keep only the tokens whose
# lowercase form is not in it (each kept token retains its original casing).
stop_words = {*stopwords.words('english')}
filtered_tokens = list(filter(lambda tok: tok.lower() not in stop_words, tokens))
print("\nAfter Stop Word Removal:", filtered_tokens, sep="\n")


After Stop Word Removal:
['latest', 'trends', 'AI', 'include', 'agentic', 'AI', ',', 'AI', 'systems', 'collaborate', 'perform', 'tasks', 'independently', '.', 'Also', ',', 'AI', 'used', 'intelligent', 'process', 'automation', ',', 'cybersecurity', 'enhancements', ',', 'personalized', 'services', ',', 'automated', 'AI', 'development', ',', 'advancements', 'areas', 'like', 'autonomous', 'vehicles', 'facial', 'recognition', '.', 'Furthermore', ',', 'AI', 'increasingly', 'used', 'healthcare', 'converging', 'Internet', 'Things']


In [8]:
# Stemming
# Run the Porter stemmer over every token that survived stop-word removal.
# In the recorded run the stems are lowercased and need not be real words
# (e.g. 'include' -> 'includ').
stemmed_words = list(map(stemmer.stem, filtered_tokens))
print("\nAfter Stemming:", stemmed_words, sep="\n")


After Stemming:
['latest', 'trend', 'ai', 'includ', 'agent', 'ai', ',', 'ai', 'system', 'collabor', 'perform', 'task', 'independ', '.', 'also', ',', 'ai', 'use', 'intellig', 'process', 'autom', ',', 'cybersecur', 'enhanc', ',', 'person', 'servic', ',', 'autom', 'ai', 'develop', ',', 'advanc', 'area', 'like', 'autonom', 'vehicl', 'facial', 'recognit', '.', 'furthermor', ',', 'ai', 'increasingli', 'use', 'healthcar', 'converg', 'internet', 'thing']


In [9]:
# Lemmatization using WordNetLemmatizer
# Each filtered token is lemmatized on its own, with no `pos` argument supplied.
# NOTE(review): in the recorded run plural nouns are reduced ('trends' -> 'trend')
# while verb forms such as 'used' and 'converging' come back unchanged; passing a
# part-of-speech tag per token would be needed to change that.
lemmatized_words_nltk = list(map(lemmatizer.lemmatize, filtered_tokens))
print("\nAfter Lemmatization (NLTK):", lemmatized_words_nltk, sep="\n")


After Lemmatization (NLTK):
['latest', 'trend', 'AI', 'include', 'agentic', 'AI', ',', 'AI', 'system', 'collaborate', 'perform', 'task', 'independently', '.', 'Also', ',', 'AI', 'used', 'intelligent', 'process', 'automation', ',', 'cybersecurity', 'enhancement', ',', 'personalized', 'service', ',', 'automated', 'AI', 'development', ',', 'advancement', 'area', 'like', 'autonomous', 'vehicle', 'facial', 'recognition', '.', 'Furthermore', ',', 'AI', 'increasingly', 'used', 'healthcare', 'converging', 'Internet', 'Things']


In [10]:
# Lemmatization using spaCy
# Run the spaCy pipeline on the raw text and collect each token's lemma.
# Tokens are filtered with the NLTK stop-word set built in the stop-word cell,
# compared against the token's lowercased text.
doc = nlp(text)
lemmatized_words_spacy = [
    tok.lemma_
    for tok in doc
    if tok.text.lower() not in stop_words
]
print("\nAfter Lemmatization (spaCy):", lemmatized_words_spacy, sep="\n")


After Lemmatization (spaCy):
['late', 'trend', 'AI', 'include', 'agentic', 'AI', ',', 'AI', 'system', 'collaborate', 'perform', 'task', 'independently', '.', 'also', ',', 'AI', 'use', 'intelligent', 'process', 'automation', ',', 'cybersecurity', 'enhancement', ',', 'personalized', 'service', ',', 'automate', 'AI', 'development', ',', 'advancement', 'area', 'like', 'autonomous', 'vehicle', 'facial', 'recognition', '.', 'furthermore', ',', 'AI', 'increasingly', 'use', 'healthcare', 'converge', 'internet', 'thing']
