## Program 3
Download Wikipedia's page on open source and reduce the words in its text to their base (root) forms. Try it with various stemming and lemmatization modules. Use Python's `time` module to measure their performance.

In [9]:
import wikipediaapi
import time
import nltk
import spacy
from nltk.stem import PorterStemmer, LancasterStemmer,SnowballStemmer
from nltk.stem import WordNetLemmatizer


In [10]:
# Download the NLTK resources the notebook relies on.
# 'wordnet' and 'omw-1.4' back the WordNet lemmatizer; 'punkt' / 'punkt_tab'
# back nltk.word_tokenize. Newer NLTK releases load 'punkt_tab' instead of the
# older pickled 'punkt' models, so both are fetched to keep a fresh
# Restart & Run All working on either version.
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tangu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\tangu\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tangu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
# Fetch Wikipedia page
def get_wikipedia_page(title):
    """Fetch the English Wikipedia page named ``title``.

    Returns the page's ``text`` attribute when the page exists, otherwise
    ``None``.
    """
    client = wikipediaapi.Wikipedia(user_agent="MyWikipediaBot/1.0", language='en')
    article = client.page(title)
    if not article.exists():
        return None
    return article.text

In [12]:
def tokenize(text):
    """Return the tokens produced by ``nltk.word_tokenize`` for ``text``."""
    word_tokens = nltk.word_tokenize(text)
    return word_tokens


In [21]:
# Apply stemming and lemmatization with performance measurement
def process_text(tokens, sample_size=50):
    """Reduce ``tokens`` with five stemmers/lemmatizers; print samples and timings.

    The Snowball, Porter and Lancaster stemmers, the WordNet lemmatizer and the
    spaCy lemmatizer are each run once over the full token list and timed with
    ``time.perf_counter``. Object construction and the spaCy model load happen
    before any timer starts, so they are excluded from the measurements.

    Parameters
    ----------
    tokens : list of str
        Tokens to reduce.
    sample_size : int, optional
        Number of leading items of each result to print (default 50).

    Returns
    -------
    None
        Results are printed, not returned.
    """
    ps = PorterStemmer()
    ls = LancasterStemmer()
    lemmatizer = WordNetLemmatizer()
    nlp = spacy.load("en_core_web_sm")
    ss = SnowballStemmer("english")

    # NLTK loads the WordNet corpus lazily on first use; do one throwaway call
    # so that one-time load is not charged to the timed WordNet run below.
    lemmatizer.lemmatize("warm-up")

    snowball_stems, snowball_time = _timed(lambda: [ss.stem(word) for word in tokens])
    porter_stems, porter_time = _timed(lambda: [ps.stem(word) for word in tokens])
    lancaster_stems, lancaster_time = _timed(lambda: [ls.stem(word) for word in tokens])
    wordnet_lems, wordnet_time = _timed(
        lambda: [lemmatizer.lemmatize(word) for word in tokens]
    )
    # spaCy is given one joined string, so this timing covers the whole
    # nlp(...) pipeline call and the lemmas are read from the resulting doc
    # (its token count need not match len(tokens)).
    spacy_lems, spacy_time = _timed(
        lambda: [token.lemma_ for token in nlp(" ".join(tokens))]
    )

    # Show the unreduced input first so each reducer's output can be compared to it.
    print("\nOriginal Tokens:", tokens[:sample_size])

    # Print only a leading sample of each result for readability.
    print("\nReduced Tokens:")
    print("\nPorterStemmer:", porter_stems[:sample_size])
    print("\nLancasterStemmer:", lancaster_stems[:sample_size])
    print("\nSnowballStemmer:", snowball_stems[:sample_size])
    print("\nWordNet Lemmatizer:", wordnet_lems[:sample_size])
    print("\nSpacy Lemmatizer:", spacy_lems[:sample_size])

    print('\n\n\n\n')

    # Print timings
    print(f"PorterStemmer Time: {porter_time:.4f} sec")
    print(f"LancasterStemmer Time: {lancaster_time:.4f} sec")
    print(f"SnowballStemmer Time: {snowball_time:.4f} sec")
    print(f"WordNetLemmatizer Time: {wordnet_time:.4f} sec")
    print(f"Spacy Lemmatizer Time: {spacy_time:.4f} sec")


def _timed(func):
    """Call ``func()`` once and return ``(result, elapsed_seconds)``.

    Uses ``time.perf_counter``, the clock intended for measuring short
    durations, rather than the wall clock ``time.time``.
    """
    start = time.perf_counter()
    result = func()
    return result, time.perf_counter() - start


In [23]:
# NOTE(review): the task statement at the top of the notebook asks for the
# "open source" article, but this cell fetches "MS_Dhoni" — confirm which
# title is intended.
PAGE_TITLE = "MS_Dhoni"

text = get_wikipedia_page(PAGE_TITLE)
# get_wikipedia_page returns None when the page does not exist; stop here with
# a clear message instead of handing None to the tokenizer.
if text is None:
    raise ValueError(f"Wikipedia page not found: {PAGE_TITLE!r}")
tokens = tokenize(text)
process_text(tokens)


Reduced Tokens:

PorterStemmer: ['mahendra', 'singh', 'dhoni', '(', ';', 'born', '7', 'juli', '1981', ')', 'is', 'an', 'indian', 'profession', 'cricket', 'who', 'play', 'as', 'a', 'right-hand', 'batter', 'and', 'a', 'wicket-keep', '.', 'wide', 'regard', 'as', 'one', 'of', 'the', 'most', 'prolif', 'wicket-keep', 'batsmen', 'and', 'captain', ',', 'he', 'repres', 'the', 'indian', 'cricket', 'team', 'and', 'wa', 'the', 'captain', 'of', 'the']

LancasterStemmer: ['mahendr', 'singh', 'dhon', '(', ';', 'born', '7', 'july', '1981', ')', 'is', 'an', 'ind', 'profess', 'cricket', 'who', 'play', 'as', 'a', 'right-handed', 'bat', 'and', 'a', 'wicket-keeper', '.', 'wid', 'regard', 'as', 'on', 'of', 'the', 'most', 'prol', 'wicket-keeper', 'batsm', 'and', 'captain', ',', 'he', 'repres', 'the', 'ind', 'cricket', 'team', 'and', 'was', 'the', 'captain', 'of', 'the']

SnowballStemmer: ['mahendra', 'singh', 'dhoni', '(', ';', 'born', '7', 'juli', '1981', ')', 'is', 'an', 'indian', 'profession', 'cricket',