##### Downloading punkt tokenizer models 

In [29]:
import nltk 
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\prady\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

#### Importing modules

In [30]:
import time  # Timer module for performance measurement
import wikipediaapi  # Wikipedia API to fetch text
import spacy  # NLP library for lemmatization
from nltk.stem import PorterStemmer  # Stemming module from nltk
from nltk.tokenize import word_tokenize  # Tokenization modul
from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()

#### Fetching Wikipedia Content

In [31]:
# Step 1: Fetch Wikipedia content
def get_wikipedia_text(page_title):
    wiki_wiki = wikipediaapi.Wikipedia(user_agent="MyNLPProject/1.0", language="en")
    page = wiki_wiki.page(page_title)
    return page.summary if page.exists() else ""
text = get_wikipedia_text("Keshav_Memorial_Institute_of_Technology")

#### Tokenization

In [32]:

tokens = tokenizer.tokenize(text)

#### Applying stemming using PortStemmer in NLTK

In [33]:
# Step 2: Apply Stemming
stemmer = PorterStemmer()
start_stem = time.time()  # Start timer
stemmed_words = [stemmer.stem(word) for word in tokens]
end_stem = time.time()  # End timer
stemmed_words

['keshav',
 'memori',
 'institut',
 'of',
 'technolog',
 'is',
 'a',
 'privat',
 'engin',
 'colleg',
 'in',
 'hyderabad',
 'in',
 'telangana',
 ',',
 'india.',
 'it',
 'offer',
 'b.tech',
 'degre',
 'in',
 'comput',
 'scienc',
 'and',
 'engin',
 ',',
 'artifici',
 'intellig',
 'and',
 'machin',
 'learn',
 ',',
 'data',
 'scienc',
 ',',
 'and',
 'inform',
 'technolog',
 '.']

#### Applying Lemmitization using spaCy

In [34]:
# Step 3: Apply Lemmatization
nlp = spacy.load("en_core_web_sm")
start_lem = time.time()  # Start timer
doc = nlp(" ".join(tokens))
lemmatized_words = [token.lemma_ for token in doc]
end_lem = time.time()  # End timer
lemmatized_words

['Keshav',
 'Memorial',
 'Institute',
 'of',
 'Technology',
 'be',
 'a',
 'private',
 'engineering',
 'college',
 'in',
 'Hyderabad',
 'in',
 'Telangana',
 ',',
 'India',
 '.',
 'it',
 'offer',
 'B.Tech',
 'degree',
 'in',
 'computer',
 'science',
 'and',
 'engineering',
 ',',
 'artificial',
 'intelligence',
 'and',
 'machine',
 'learning',
 ',',
 'data',
 'science',
 ',',
 'and',
 'information',
 'technology',
 '.']

#### Displaying results

In [35]:
# Step 4: Display Results
print("Original Text Sample:", tokens[:10])
print("Stemmed Words:", stemmed_words[:10])
print("Lemmatized Words:", lemmatized_words[:10])
# Step 5: Performance Comparison
print("\nPerformance Analysis:")
print(f"Stemming Execution Time: {end_stem - start_stem:.5f} seconds")
print(f"Lemmatization Execution Time: {end_lem - start_lem:.5f} seconds")

Original Text Sample: ['Keshav', 'Memorial', 'Institute', 'of', 'Technology', 'is', 'a', 'private', 'engineering', 'college']
Stemmed Words: ['keshav', 'memori', 'institut', 'of', 'technolog', 'is', 'a', 'privat', 'engin', 'colleg']
Lemmatized Words: ['Keshav', 'Memorial', 'Institute', 'of', 'Technology', 'be', 'a', 'private', 'engineering', 'college']

Performance Analysis:
Stemming Execution Time: 0.00100 seconds
Lemmatization Execution Time: 0.01400 seconds


#### Applying Stemming using Lancaster Stemming in NLTK

In [36]:
from nltk.stem import LancasterStemmer
import time

# Step 2: Apply Lancaster Stemming
stemmer = LancasterStemmer()
start_stem = time.time()  # Start timer
stemmed_words = [stemmer.stem(word) for word in tokens]
end_stem = time.time()  # End timer
print("Stemmed Words:", stemmed_words[:10])
print(f"Stemming Execution Time: {end_stem - start_stem:.5f} seconds")



Stemmed Words: ['keshav', 'mem', 'institut', 'of', 'technolog', 'is', 'a', 'priv', 'engin', 'colleg']
Stemming Execution Time: 0.00100 seconds


#### Applying Stemming using Snowball Stemmer in NLTK

In [37]:
from nltk.stem import SnowballStemmer
import time

# Step 2: Apply Snowball Stemming
stemmer = SnowballStemmer("english")
start_stem = time.time()  # Start timer
stemmed_words = [stemmer.stem(word) for word in tokens]
end_stem = time.time()  # End timer
print("Stemmed Words:", stemmed_words[:10])
print(f"Stemming Execution Time: {end_stem - start_stem:.5f} seconds")

Stemmed Words: ['keshav', 'memori', 'institut', 'of', 'technolog', 'is', 'a', 'privat', 'engin', 'colleg']
Stemming Execution Time: 0.00100 seconds


#### Applying Lemmitization using NLTK

In [39]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()
start_stem = time.time() 
# Apply Lemmatization
lemmatized_words = [lemmatizer.lemmatize(word) for word in tokens]
end_stem = time.time()  # End timer

print("Lemmatized Words:", lemmatized_words[:10])
print(f"Lemmatization Execution Time: {end_stem - start_stem:.5f} seconds")

Lemmatized Words: ['Keshav', 'Memorial', 'Institute', 'of', 'Technology', 'is', 'a', 'private', 'engineering', 'college']
Lemmatization Execution Time: 0.00000 seconds
