<a href="https://colab.research.google.com/github/saxonmahar/Deeplearning/blob/main/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import nltk
# Fetch the NLTK data packages this notebook relies on: the tokenizer
# models, the stopword lists, and the WordNet corpus.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# Kept as the cell's final expression so its return value is still echoed
# as the cell output.
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [3]:
# Sample passage that the preprocessing cells below operate on.
text = (
    "Natural Language Processing helps computers understand human language. "
    "It powers chatbots, voice assistants, and more."
)

print("Original Text:\n", text)

Original Text:
 Natural Language Processing helps computers understand human language. It powers chatbots, voice assistants, and more.


In [4]:
# Step 3: Tokenization
# Split the passage into word and punctuation tokens; the language is spelled
# out explicitly and matches the tokenizer's default.
tokens = word_tokenize(text, language="english")
print("Tokens:\n", tokens)

Tokens:
 ['Natural', 'Language', 'Processing', 'helps', 'computers', 'understand', 'human', 'language', '.', 'It', 'powers', 'chatbots', ',', 'voice', 'assistants', ',', 'and', 'more', '.']


In [5]:
# Step 4: Stopword Removal
# Hold the English stopwords in a set so each membership test is a direct lookup.
stop_words = {*stopwords.words('english')}

# Compare in lowercase so capitalised stopwords are dropped as well, while the
# kept tokens retain their original casing.
filtered_tokens = [
    token
    for token in tokens
    if token.lower() not in stop_words
]
print("After Stopword Removal:\n", filtered_tokens)


After Stopword Removal:
 ['Natural', 'Language', 'Processing', 'helps', 'computers', 'understand', 'human', 'language', '.', 'powers', 'chatbots', ',', 'voice', 'assistants', ',', '.']


In [6]:
# Step 5: Stemming
# Run every remaining token through the Porter stemmer, preserving token order.
stemmer = PorterStemmer()
stemmed_words = list(map(stemmer.stem, filtered_tokens))
print("Stemmed Words:\n", stemmed_words)


Stemmed Words:
 ['natur', 'languag', 'process', 'help', 'comput', 'understand', 'human', 'languag', '.', 'power', 'chatbot', ',', 'voic', 'assist', ',', '.']


In [7]:
# Step 6: Lemmatization
# Each token is lemmatized on its own with no part-of-speech tag supplied, so
# the lemmatizer's default POS applies (noun, per the NLTK docs).
lemmatizer = WordNetLemmatizer()
lemmatized_words = list(map(lemmatizer.lemmatize, filtered_tokens))
print("Lemmatized Words:\n", lemmatized_words)


Lemmatized Words:
 ['Natural', 'Language', 'Processing', 'help', 'computer', 'understand', 'human', 'language', '.', 'power', 'chatbots', ',', 'voice', 'assistant', ',', '.']


In [8]:
# Step 7: Vectorization – Bag of Words (BoW)
# Three short example documents; each one becomes a row of the count matrix.
corpus = [
    "NLP makes machines understand language.",
    "Voice assistants and chatbots are NLP applications.",
    "Text can be transformed into vectors using TF-IDF.",
]

# Learn the vocabulary from the corpus and encode every document in one pass.
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(corpus)

# Show the learned vocabulary, then the matrix converted to a dense array.
print(
    "Bag of Words Feature Names:\n",
    vectorizer.get_feature_names_out(),
)
print(
    "BoW Matrix:\n",
    X_bow.toarray(),
)


Bag of Words Feature Names:
 ['and' 'applications' 'are' 'assistants' 'be' 'can' 'chatbots' 'idf'
 'into' 'language' 'machines' 'makes' 'nlp' 'text' 'tf' 'transformed'
 'understand' 'using' 'vectors' 'voice']
BoW Matrix:
 [[0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 1 0 0 0]
 [1 1 1 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1]
 [0 0 0 0 1 1 0 1 1 0 0 0 0 1 1 1 0 1 1 0]]


In [9]:
# Step 8: Vectorization – TF-IDF
# Encode the documents in `corpus` again, this time with a TF-IDF vectorizer.
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(corpus)

# Show the learned vocabulary, then the matrix converted to a dense array.
print(
    "TF-IDF Feature Names:\n",
    tfidf.get_feature_names_out(),
)
print(
    "TF-IDF Matrix:\n",
    X_tfidf.toarray(),
)


TF-IDF Feature Names:
 ['and' 'applications' 'are' 'assistants' 'be' 'can' 'chatbots' 'idf'
 'into' 'language' 'machines' 'makes' 'nlp' 'text' 'tf' 'transformed'
 'understand' 'using' 'vectors' 'voice']
TF-IDF Matrix:
 [[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.46735098 0.46735098 0.46735098
  0.35543247 0.         0.         0.         0.46735098 0.
  0.         0.        ]
 [0.38988801 0.38988801 0.38988801 0.38988801 0.         0.
  0.38988801 0.         0.         0.         0.         0.
  0.29651988 0.         0.         0.         0.         0.
  0.         0.38988801]
 [0.         0.         0.         0.         0.33333333 0.33333333
  0.         0.33333333 0.33333333 0.         0.         0.
  0.         0.33333333 0.33333333 0.33333333 0.         0.33333333
  0.33333333 0.        ]]
