In [1]:
# Dependencies: NLTK itself plus the two stemmers being compared
import nltk
from nltk.stem import PorterStemmer, LancasterStemmer
nltk.download('punkt')

# One instance of each stemming algorithm
porter = PorterStemmer()
lancaster = LancasterStemmer()

# Sample vocabulary passed through both stemmers
words = ["running", "ran", "runs", "easily", "fairly"]

# Print a "word -> stem" line per word, one section per stemmer
for heading, stem_fn in (
    ("Porter Stemmer:", porter.stem),
    ("\nLancaster Stemmer:", lancaster.stem),
):
    print(heading)
    for word in words:
        print(f"{word} -> {stem_fn(word)}")


Porter Stemmer:
running -> run
ran -> ran
runs -> run
easily -> easili
fairly -> fairli

Lancaster Stemmer:
running -> run
ran -> ran
runs -> run
easily -> easy
fairly -> fair


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\henil\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [3]:
# Sample text for the tokenizer cells; it starts with a newline and the last
# sentence is split across two lines.
example_string = """
Muad'Dib learned rapidly because his first training was in how to learn. And the first lesson of all was the basic trust that he could learn.
It's shocking to find how many people do not believe they can learn,
and how many more believe learning to be difficult."""

In [4]:
# Split the sample text into sentences and display the resulting list
sent_tokenize(example_string)


["\nMuad'Dib learned rapidly because his first training was in how to learn.",
 'And the first lesson of all was the basic trust that he could learn.',
 "It's shocking to find how many people do not believe they can learn,\nand how many more believe learning to be difficult."]

In [5]:
# Split the same sample text into word and punctuation tokens
word_tokenize(example_string)

["Muad'Dib",
 'learned',
 'rapidly',
 'because',
 'his',
 'first',
 'training',
 'was',
 'in',
 'how',
 'to',
 'learn',
 '.',
 'And',
 'the',
 'first',
 'lesson',
 'of',
 'all',
 'was',
 'the',
 'basic',
 'trust',
 'that',
 'he',
 'could',
 'learn',
 '.',
 'It',
 "'s",
 'shocking',
 'to',
 'find',
 'how',
 'many',
 'people',
 'do',
 'not',
 'believe',
 'they',
 'can',
 'learn',
 ',',
 'and',
 'how',
 'many',
 'more',
 'believe',
 'learning',
 'to',
 'be',
 'difficult',
 '.']

In [6]:
# Download the NLTK "stopwords" package, then import the corpus reader and tokenizer
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\henil\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# Short sample sentence used for the stop-word filtering cells
worf_quote = "Sir, I protest. I am not a merry man!"

In [8]:
# Tokenize the quote; the bare name on the last line displays the token list
words_in_quote = word_tokenize(worf_quote)
words_in_quote

['Sir', ',', 'I', 'protest', '.', 'I', 'am', 'not', 'a', 'merry', 'man', '!']

In [9]:
# English stop words collected into a set for the membership test used when filtering
stop_words = set(stopwords.words("english"))

In [10]:
# Empty starting value; the list comprehension in the following cell rebinds this name
filtered_list = []

In [11]:
# Keep only the tokens whose case-folded form is not an English stop word
filtered_list = list(
    filter(lambda token: token.casefold() not in stop_words, words_in_quote)
)

In [12]:
# Display the tokens that survived stop-word filtering
filtered_list

['Sir', ',', 'protest', '.', 'merry', 'man', '!']

Stemming

In [13]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
# Porter stemmer instance used by the stemming cells
stemmer = PorterStemmer()
# Two-sentence sample containing several words built on "discover"
string_for_stemming = """The crew of the USS Discovery discovered many discoveries.
Discovering is what explorers do."""

In [14]:
# Tokenize the stemming sample (this rebinds `words`, which the first cell used for its word list)
words = word_tokenize(string_for_stemming)

In [15]:
# Run every token through the Porter stemmer and display the stems
stemmed_words = list(map(stemmer.stem, words))
stemmed_words

['the',
 'crew',
 'of',
 'the',
 'uss',
 'discoveri',
 'discov',
 'mani',
 'discoveri',
 '.',
 'discov',
 'is',
 'what',
 'explor',
 'do',
 '.']

Tagging Parts of Speech


In [16]:
from nltk.tokenize import word_tokenize

In [17]:
# Sentence that the following cells tokenize and POS-tag.
# The name is kept because later cells refer to `sagan_quote`; the earlier
# Carl Sagan quotation was a dead store, overwritten before it was ever used.
sagan_quote = """Sahil is living in Surat"""

In [18]:
# Tokenize the sentence and display the tokens
words_in_sagan_quote = word_tokenize(sagan_quote)
words_in_sagan_quote

['Sahil', 'is', 'living', 'in', 'Surat']

In [20]:
# Import first so this cell does not use `nltk` before its own import statement
import nltk

# Packages downloaded before tagging: the perceptron tagger and punkt
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

# Tag each token with its part of speech and display the (token, tag) pairs
nltk.pos_tag(words_in_sagan_quote)


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\henil\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\henil\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[('Sahil', 'NNP'),
 ('is', 'VBZ'),
 ('living', 'VBG'),
 ('in', 'IN'),
 ('Surat', 'NNP')]

Lemmatizing

In [21]:
from nltk.stem import WordNetLemmatizer

In [22]:
# Single lemmatizer instance shared by the lemmatizing cells
lemmatizer = WordNetLemmatizer()

In [23]:
# Lemmatize a single plural word
lemmatizer.lemmatize("scarves")

'scarf'

In [24]:
# Sample text for lemmatizing; same wording as the stemming sample so the
# two techniques can be compared. (An earlier assignment to this name was a
# dead store, overwritten before use, and has been dropped.)
string_for_lemmatizing = """The crew of the USS Discovery discovered many discoveries.
Discovering is what explorers do."""

In [25]:
# Tokenize the lemmatizing sample (rebinds `words` again) and display the tokens
words = word_tokenize(string_for_lemmatizing)
words

['The',
 'crew',
 'of',
 'the',
 'USS',
 'Discovery',
 'discovered',
 'many',
 'discoveries',
 '.',
 'Discovering',
 'is',
 'what',
 'explorers',
 'do',
 '.']

In [26]:
# Lemmatize each token with the lemmatizer's default settings and display the result
lemmatized_words = list(map(lemmatizer.lemmatize, words))
lemmatized_words

['The',
 'crew',
 'of',
 'the',
 'USS',
 'Discovery',
 'discovered',
 'many',
 'discovery',
 '.',
 'Discovering',
 'is',
 'what',
 'explorer',
 'do',
 '.']

In [27]:
# With no pos argument, "worst" comes back unchanged
lemmatizer.lemmatize("worst")

'worst'

In [28]:
# Passing pos="a" makes the lemmatizer return "bad" for the same word
lemmatizer.lemmatize("worst", pos="a")

'bad'

Using Named Entity Recognition (NER)


In [29]:
# NE type	Examples
# ORGANIZATION	Georgia-Pacific Corp., WHO
# PERSON	Eddy Bonte, President Obama
# LOCATION	Murray River, Mount Everest
# DATE	June, 2008-06-29
# TIME	two fifty a m, 1:30 p.m.
# MONEY	175 million Canadian dollars, GBP 10.40
# PERCENT	twenty pct, 18.75 %
# FACILITY	Washington Monument, Stonehenge
# GPE	South East Asia, Midlothian

In [30]:
# Download the "maxent_ne_chunker" and "words" packages
# (presumably both are needed by nltk.ne_chunk in the cells below; confirm)
nltk.download("maxent_ne_chunker")
nltk.download("words")

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\henil\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping chunkers\maxent_ne_chunker.zip.
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\henil\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\words.zip.


True

In [31]:
# Passage containing several proper nouns; passed to extract_ne() as its input
quote = """
Men like Schiaparelli watched the red planet—it is odd, by-the-bye, that
for countless centuries Mars has been the star of war—but failed to
interpret the fluctuating appearances of the markings they mapped so well.
All that time the Martians must have been getting ready.
During the opposition of 1894 a great light was seen on the illuminated
part of the disk, first at the Lick Observatory, then by Perrotin of Nice,
and then by other observers. English readers heard of it first in the
issue of Nature dated August 2."""

In [32]:
def extract_ne(quote, verbose=True):
    """Return the set of named-entity strings found in ``quote``.

    The text is tokenized with ``word_tokenize``, tagged with
    ``nltk.pos_tag`` and chunked with ``nltk.ne_chunk(..., binary=True)``.
    Each top-level chunk labelled ``"NE"`` is joined into a single
    space-separated string.

    Args:
        quote: Text to analyse.
        verbose: When true (the default, which matches the previous
            behaviour) the POS-tagged tokens are printed before chunking.
            Pass ``False`` to keep the cell output down to the result.

    Returns:
        A set with one string per distinct ``"NE"`` chunk; empty when no
        such chunk is produced.
    """
    tokens = word_tokenize(quote, language='english')
    tagged_tokens = nltk.pos_tag(tokens)
    if verbose:
        print(tagged_tokens)
    chunk_tree = nltk.ne_chunk(tagged_tokens, binary=True)
    # Plain (token, tag) tuples have no label(); the hasattr() guard skips
    # them so only chunk subtrees are inspected.
    return {
        " ".join(leaf[0] for leaf in subtree)
        for subtree in chunk_tree
        if hasattr(subtree, "label") and subtree.label() == "NE"
    }

In [33]:
# Run the extractor on the passage; it prints the tagged tokens and the cell displays the returned set
extract_ne(quote)

[('Men', 'NNS'), ('like', 'IN'), ('Schiaparelli', 'NNP'), ('watched', 'VBD'), ('the', 'DT'), ('red', 'JJ'), ('planet—it', 'NN'), ('is', 'VBZ'), ('odd', 'JJ'), (',', ','), ('by-the-bye', 'JJ'), (',', ','), ('that', 'IN'), ('for', 'IN'), ('countless', 'JJ'), ('centuries', 'NNS'), ('Mars', 'NNP'), ('has', 'VBZ'), ('been', 'VBN'), ('the', 'DT'), ('star', 'NN'), ('of', 'IN'), ('war—but', 'NN'), ('failed', 'VBD'), ('to', 'TO'), ('interpret', 'VB'), ('the', 'DT'), ('fluctuating', 'NN'), ('appearances', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('markings', 'NNS'), ('they', 'PRP'), ('mapped', 'VBD'), ('so', 'RB'), ('well', 'RB'), ('.', '.'), ('All', 'PDT'), ('that', 'DT'), ('time', 'NN'), ('the', 'DT'), ('Martians', 'NNPS'), ('must', 'MD'), ('have', 'VB'), ('been', 'VBN'), ('getting', 'VBG'), ('ready', 'JJ'), ('.', '.'), ('During', 'IN'), ('the', 'DT'), ('opposition', 'NN'), ('of', 'IN'), ('1894', 'CD'), ('a', 'DT'), ('great', 'JJ'), ('light', 'NN'), ('was', 'VBD'), ('seen', 'VBN'), ('on', 'IN'), (

{'Lick Observatory', 'Mars', 'Nature', 'Perrotin', 'Schiaparelli'}

NER using spaCy

In [41]:
# %pip install spacy
# !python -m spacy download en_core_web_sm


In [42]:
import spacy

# Load the small English model
nlp = spacy.load("en_core_web_sm")

# Example text (an earlier sample assigned to `text` was overwritten before
# use, so only the sentence that is actually processed is kept)
text = "Shail is living in India. Sahil has 200 dollar in his account"

# Process the text
doc = nlp(text)
print(doc)


Shail is living in India. Sahil has 200 dollar in his account


In [43]:

# List each entity found in the processed text, followed by its label
for ent in doc.ents:
    print(f"{ent.text} {ent.label_}")


Shail PERSON
India GPE
Sahil PERSON
200 dollar MONEY


Implementing BoW and TF-IDF in Python

In [44]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Tiny two-document corpus
docs = ["I love NLP", "NLP is great!"]

# Fit both representations up front: raw counts (BoW) and TF-IDF weights
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(docs)
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(docs)

# Report each representation the same way: heading, dense matrix, vocabulary
for heading, fitted, matrix in (
    ("Bag of Words (BoW):", vectorizer, bow_matrix),
    ("\nTF-IDF:", tfidf_vectorizer, tfidf_matrix),
):
    print(heading)
    print(matrix.toarray())
    print(fitted.get_feature_names_out())


Bag of Words (BoW):
[[0 0 1 1]
 [1 1 0 1]]
['great' 'is' 'love' 'nlp']

TF-IDF:
[[0.         0.         0.81480247 0.57973867]
 [0.6316672  0.6316672  0.         0.44943642]]
['great' 'is' 'love' 'nlp']
