<a href="https://colab.research.google.com/github/sudama-inc/nlp/blob/main/nlp_removestopwords_bagofwords_tfidf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Remove Stop Words**
> Libraries
*   NLTK
*   SpaCy
*   Sklearn

1.   Stop Words : **NLTK**

In [None]:
# 1. NLTK

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
# Fetch the NLTK resources used by the cells below: the stop-word corpus and
# the Punkt tokenizer data that word_tokenize() relies on.
nltk.download("stopwords")
# NLTK 3.9+ loads the Punkt models from the "punkt_tab" package, while older
# releases read the original "punkt" package. Requesting both keeps
# word_tokenize() working regardless of the installed NLTK version.
nltk.download("punkt_tab")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Sample sentence used as input for the NLTK stop-word removal steps that follow
text = "This is an example sentence with some stopwords."

In [None]:
# Split the sentence into word and punctuation tokens with NLTK
words = word_tokenize(text)
# Bare expression: displays the token list as the cell output
words

['This', 'is', 'an', 'example', 'sentence', 'with', 'some', 'stopwords', '.']

In [None]:
# Build the English stop-word collection as a set so that the membership
# checks performed during filtering are constant-time lookups.
stop_words = set(stopwords.words("english"))
# A set has no stable iteration order, so print a sorted copy to get output
# that is identical from run to run; `stop_words` itself remains a set.
print(sorted(stop_words))

{'wouldn', 'whom', 'below', 'few', 'from', 'that', 'was', 'at', 'than', 'ain', 'an', 'her', "haven't", 'can', 'aren', 'nor', 'how', 'has', 'don', 'in', "aren't", "hadn't", 'to', 'before', "mightn't", 'him', 'been', 'where', 'yourselves', 'other', 'on', 'my', 'be', "shouldn't", 'when', 'above', 'up', 'y', 'haven', 'who', 'yourself', "you've", 'should', 'further', 'between', "hasn't", "needn't", 'down', 'by', 'once', 'with', "won't", "it's", 'they', 'hasn', 'she', 'he', 'our', 'did', "she's", 'but', 'or', 'there', "you'd", 'ma', 'just', 'were', 'having', 'is', 'during', "doesn't", 'mustn', 'about', 'off', 'will', 'didn', 'through', "should've", 'o', 'each', 'do', 'too', 'it', 'this', 'isn', 'for', 'their', 'only', 'have', 'of', "wasn't", 'more', 'here', 'those', 'own', 's', "wouldn't", "you're", 'out', 'its', 'what', 'these', 'any', 'until', 'd', 'doing', "mustn't", 'shan', 'ourselves', 'why', 've', 'doesn', 'won', 'you', 'into', 'couldn', "that'll", "you'll", 'which', 'am', 'shouldn', '

In [None]:
# Drop every token whose lower-cased form appears in the stop-word set; the
# surviving tokens keep their original casing and order.
filtered_words = list(filter(lambda tok: tok.lower() not in stop_words, words))
filtered_words

['example', 'sentence', 'stopwords', '.']

In [None]:
# Join the surviving tokens with single spaces. Punctuation tokens are kept as
# separate items, so the result has a space before the final period.
filtered_text = " ".join(filtered_words)
print(filtered_text)

example sentence stopwords .


2.   Stop Words : **SpaCy**

In [None]:
import spacy

In [None]:
# Load the spaCy English pipeline "en_core_web_sm"; the tokens it produces
# carry the `is_stop` flag used for stop-word filtering further down.
nlp = spacy.load('en_core_web_sm')
# Bare expression: displays the loaded pipeline object as the cell output
nlp

<spacy.lang.en.English at 0x7c56694b72e0>

In [None]:
# Sample sentence for the spaCy variant of stop-word removal
# (the same sentence that the NLTK example uses)
text = "This is an example sentence with some stopwords."

In [None]:
# Run the sentence through the spaCy pipeline to obtain a sequence of tokens
doc = nlp(text)
# Bare expression: displays the processed document (rendered as its text)
doc

This is an example sentence with some stopwords.

In [None]:
# Keep the surface text of every token that spaCy does not flag as a stop
# word, then stitch the survivors back together with single spaces.
filtered_text_spacy = " ".join(
    tok.text
    for tok in doc
    if not tok.is_stop
)
print(filtered_text_spacy)

example sentence stopwords .


3.   Stop Words : **Using scikit-learn's CountVectorizer**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Two-document toy corpus for the CountVectorizer stop-word example
documents = [
    "This is an example sentence with some stopwords.",
    "Another example with stopwords.",
]

In [None]:
# Build a CountVectorizer that discards English stop words while tokenizing
# (stop_words='english' selects scikit-learn's built-in English list)
vectorizer = CountVectorizer(stop_words='english')
# Bare expression: displays the configured estimator as the cell output
vectorizer

In [None]:
# Learn the vocabulary from `documents` and count each remaining term per document
X = vectorizer.fit_transform(documents)
# Show the counts as a dense array: one row per document, one column per term
X.toarray()

array([[1, 1, 1],
       [1, 0, 1]])

In [None]:
# Vocabulary learned by the vectorizer; entry i names column i of the
# document-term matrix produced by fit_transform
terms = vectorizer.get_feature_names_out()
# Bare expression: displays the term array as the cell output
terms

array(['example', 'sentence', 'stopwords'], dtype=object)

**Bag of Words**
> Libraries
*   NLTK
*   SpaCy
*   Sklearn

1.   Bag-of-Words : **Using scikit-learn's CountVectorizer**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus: one string per document
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# Learn the vocabulary and count term occurrences in a single step.
# The result is a sparse matrix: one row per document, one column per term.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(documents)

# Densify the sparse matrix so the counts print as a plain 2-D array
X_dense = X.toarray()

# Vocabulary in column order
terms = vectorizer.get_feature_names_out()

# Emit the matrix, a divider and the vocabulary, one item per line
print(
    "Document-Term Matrix:",
    X_dense,
    "-" * 30,
    "List of Terms (Words):",
    terms,
    sep="\n",
)


Document-Term Matrix:
[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]
------------------------------
List of Terms (Words):
['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']


2.   Bag-of-Words : **Using NLTK**

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

# Single-document input for the NLTK Bag-of-Words example
text = "This is a simple example for creating a Bag of Words representation using NLTK."

# Tokenize, then count how often each distinct token occurs.
# Counting is case-sensitive and punctuation tokens are counted as well.
words = word_tokenize(text)
freq_dist = FreqDist(words)

# Distinct tokens in order of first appearance
terms = list(freq_dist.keys())

# One "token: count" line per distinct token
print("Document-Term Matrix (Word Frequencies):")
for term, count in freq_dist.items():
    print(f"{term}: {count}")

# Divider followed by the vocabulary itself
print("-" * 30, "List of Terms (Words):", terms, sep="\n")


Document-Term Matrix (Word Frequencies):
This: 1
is: 1
a: 2
simple: 1
example: 1
for: 1
creating: 1
Bag: 1
of: 1
Words: 1
representation: 1
using: 1
NLTK: 1
.: 1
------------------------------
List of Terms (Words):
['This', 'is', 'a', 'simple', 'example', 'for', 'creating', 'Bag', 'of', 'Words', 'representation', 'using', 'NLTK', '.']


3.   Bag-of-Words : **Using SpaCy**

In [None]:
import spacy

# English pipeline providing tokenization, lemmas and stop-word flags
nlp = spacy.load("en_core_web_sm")

# Single-document input for the spaCy example
text = "This is a simple example for creating a Bag of Words representation using spaCy."

# Run the pipeline over the sentence
doc = nlp(text)

# Collect the lemma of every token that is not flagged as a stop word.
# Punctuation is not a stop word, so the final period is kept.
terms = [
    tok.lemma_
    for tok in doc
    if not tok.is_stop
]

# Heading and term list on separate lines
print("List of Terms (Words):", terms, sep="\n")


List of Terms (Words):
['simple', 'example', 'create', 'Bag', 'Words', 'representation', 'spacy', '.']


**TF-IDF**
> Libraries
*   NLTK
*   SpaCy
*   Sklearn

1.   TF-IDF : **Using scikit-learn's TfidfVectorizer**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample text data: one string per document
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# Create the TF-IDF vectorizer. The same name is used for fitting and for
# reading the vocabulary so both refer to the object created in this cell.
vectorizer = TfidfVectorizer()

# Fit the vectorizer on the text data and transform it into a TF-IDF matrix
X = vectorizer.fit_transform(documents)

# X is a sparse matrix where rows represent documents and columns represent terms;
# convert it to a dense array for better readability
X_dense = X.toarray()

# Vocabulary of the vectorizer fitted above; entry i labels column i of X
terms = vectorizer.get_feature_names_out()

# Print the TF-IDF matrix
print("TF-IDF Matrix:")
print(X_dense)

print('-'*50)

# Print the list of terms
print("List of Terms (Words):")
print(terms)

TF-IDF Matrix:
[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]
--------------------------------------------------
List of Terms (Words):
['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']


2.   TF-IDF : **Using NLTK & scikit-learn**

In [None]:
from nltk.tokenize import word_tokenize

# Toy corpus: one string per document
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# Run each document through NLTK's tokenizer and glue the tokens back together
# with spaces, which separates punctuation from the neighbouring word.
tokenized_documents = [" ".join(word_tokenize(raw_doc)) for raw_doc in documents]
print(tokenized_documents)

# Build the TF-IDF model on the re-joined strings. TfidfVectorizer still
# applies its own default analyzer to them, which is why the stand-alone
# punctuation tokens do not appear in the printed term list.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(tokenized_documents)

# Dense copy of the sparse matrix (rows: documents, columns: terms)
X_dense = X.toarray()

# Vocabulary in column order
terms = vectorizer.get_feature_names_out()

# Divider, matrix, divider, vocabulary - one item per line
print(
    '-' * 50,
    "TF-IDF Matrix:",
    X_dense,
    '-' * 50,
    "List of Terms (Words):",
    terms,
    sep="\n",
)

['This is the first document .', 'This document is the second document .', 'And this is the third one .', 'Is this the first document ?']
--------------------------------------------------
TF-IDF Matrix:
[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]
--------------------------------------------------
List of Terms (Words):
['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']


3.   TF-IDF : **Using SpaCy & scikit-learn**

In [None]:
# English pipeline providing tokenization, lemmas and stop-word flags
nlp = spacy.load("en_core_web_sm")

# Toy corpus: one string per document
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# For each document, keep the lemmas of the tokens that are not stop words and
# join them with spaces. A document made up only of stop words and punctuation
# shrinks to its punctuation and ends up as an all-zero row in the matrix.
tokenized_documents = [
    " ".join(tok.lemma_ for tok in nlp(raw_doc) if not tok.is_stop)
    for raw_doc in documents
]
print('Tokenized Documents : ', tokenized_documents, sep="\n")

# Fit the TF-IDF model on the reduced documents
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(tokenized_documents)

# Dense copy of the sparse matrix (rows: documents, columns: terms)
X_dense = X.toarray()

# Vocabulary in column order
terms = vectorizer.get_feature_names_out()

# Divider, matrix, divider, vocabulary - one item per line
print(
    '-' * 50,
    "TF-IDF Matrix:",
    X_dense,
    '-' * 50,
    "List of Terms (Words):",
    terms,
    sep="\n",
)

Tokenized Documents : 
['document .', 'document second document .', '.', 'document ?']
--------------------------------------------------
TF-IDF Matrix:
[[1.         0.        ]
 [0.78722298 0.61666846]
 [0.         0.        ]
 [1.         0.        ]]
--------------------------------------------------
List of Terms (Words):
['document' 'second']


**Stemming & Lemmatization**
> Libraries
*   NLTK
*   SpaCy

1.   Stemming & Lemmatization : **Using NLTK**

In [None]:
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Resources needed by this cell: Punkt data for word_tokenize() and the
# WordNet corpus for the lemmatizer. NLTK 3.9+ reads the tokenizer models from
# "punkt_tab" while older releases use "punkt", so both are requested.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')

# Sample text data
text = "Stemming and lemmatization are text processing techniques used for text analysis."

# Tokenize the text
words = word_tokenize(text)
print(words)

print('-'*50)

# Stemming using the Porter Stemmer: strips suffixes by rule, so the results
# are lower-cased stems that need not be dictionary words (e.g. 'techniqu')
stemmer = PorterStemmer()
stemmed_words = [stemmer.stem(word) for word in words]
print("Stemming using the Porter Stemmer : ")
print(stemmed_words)
print('-'*50)

# Lemmatization using the WordNet Lemmatizer. lemmatize() is called without a
# part-of-speech argument, so every word is treated as a noun; verb forms such
# as 'are' and 'used' therefore come back unchanged.
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

print("Lemmatization using the WordNet Lemmatizer : ")
print(lemmatized_words)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


['Stemming', 'and', 'lemmatization', 'are', 'text', 'processing', 'techniques', 'used', 'for', 'text', 'analysis', '.']
--------------------------------------------------
Stemming using the Porter Stemmer : 
['stem', 'and', 'lemmat', 'are', 'text', 'process', 'techniqu', 'use', 'for', 'text', 'analysi', '.']
--------------------------------------------------
Lemmatization using the WordNet Lemmatizer : 
['Stemming', 'and', 'lemmatization', 'are', 'text', 'processing', 'technique', 'used', 'for', 'text', 'analysis', '.']


2.   Lemmatization : **Using SpaCy**

In [None]:
# English pipeline that assigns a lemma to every token
nlp = spacy.load("en_core_web_sm")

# Same sentence as in the NLTK stemming/lemmatization example
text = "Stemming and lemmatization are text processing techniques used for text analysis."

# Run the pipeline over the sentence
doc = nlp(text)

# One lemma per token, punctuation included
lemmatized_words = [
    token.lemma_
    for token in doc
]

# Show the lemmas on a single line after the label
print("Lemmatized Words:", lemmatized_words)

Lemmatized Words: ['stemming', 'and', 'lemmatization', 'be', 'text', 'processing', 'technique', 'use', 'for', 'text', 'analysis', '.']
