<a href="https://colab.research.google.com/github/sahanyafernando/My_NLP_Learning/blob/main/RuleBasedStemmingAndPorterStemmer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Rule Based Stemming and Porter Stemmer

In [1]:
import pandas as pd
# pandas is imported to handle data manipulation and analysis, particularly for reading the CSV file containing customer or text data.
import nltk
# nltk (Natural Language Toolkit) is imported to provide tools for text processing, including tokenization, stemming, and linguistic analysis.
from nltk.tokenize import word_tokenize
# word_tokenize is used to split text into individual words (tokens), which is a prerequisite for many NLP tasks such as stemming and frequency analysis.
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer
# These are various stemming classes from NLTK used to reduce words to their root or base form, helping normalize different word variations.
from string import punctuation
# string.punctuation is imported to provide a predefined list of punctuation characters, which is useful for removing punctuation during text cleaning.


In [2]:
# Download the necessary NLTK data for tokenization
nltk.download('punkt')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [3]:
# Read the CSV file
df = pd.read_csv('test_dataset.csv')
df.head()

Unnamed: 0,id,sentence
0,1,The cats are chasing the mice
1,2,Dogs were barking loudly in the streets
2,3,He is running faster than others
3,4,She studies computer science at the university
4,5,The children are playing in the playground


In [4]:
# Define a custom rule-based stemmer that removes punctuation and common suffixes.
def rule_based_stem(word):
  word = word.strip(punctuation) # Clean the word from punctuation.
  # Check and remove common suffixes if the word (without the suffix) remains longer than 2 characters
  for suffix in ["ing", "ed", "ly", "s", "es"]:
      if word.endswith(suffix) and len(word) - len(suffix) > 2:
        return word[:-len(suffix)]
  return word


In [5]:
# Define a minimal custom Lovins stemmer by removing a lsit of common sufixes.
lovins_suffixes = sorted(["ization", "ational", "fulness", "iveness", "ousness", "tional", "ing", "ed", "ly", "s"], key = len, reverse = True)
def lovins_stem(word):
    word = word.lower().strip(punctuation) # Convert to lowercase and remove punctuations
    for suffix in lovins_suffixes:
        if word.endswith(suffix) and len(word)-len(suffix) > 2:
            return word[:-len(suffix)]
    return word

In [6]:
# Initialize the NLTK stemmers.
porter = PorterStemmer() # For Porter stemming
snowball = SnowballStemmer('english') # For Snowball stemming
lancaster = LancasterStemmer() # For Lancaster stemming

In [7]:
# Process and print the first two reviews for each stemming method.
# For each review, tokenize the text, apply the coresponding stemmer to every token, and print the results.

In [8]:
# Rule Based Stemming
print("Rule Based Stemming Output for data:")
for index, sentence in df['sentence'].head(5).items():
    tokens = word_tokenize(sentence) # Tokenize the sentence into words
    stemmed_tokens = [rule_based_stem(token) for token in tokens] # Apply custom rule-based stemmer
    print(f"Original: {sentence}")
    print(f"Stemmed: {' '.join(stemmed_tokens)}\n") # Join tokens back to a string for display


Rule Based Stemming Output for data:
Original: The cats are chasing the mice
Stemmed: The cat are chas the mice

Original: Dogs were barking loudly in the streets
Stemmed: Dog were bark loud in the street

Original: He is running faster than others
Stemmed: He is runn faster than other

Original: She studies computer science at the university
Stemmed: She studie computer science at the university

Original: The children are playing in the playground
Stemmed: The children are play in the playground



# Snowball, Lancaster and Lovins

In [9]:
import pandas as pd # For loading and manipulating CSV data
import nltk # For NLP tasks such as Tokenization and POS tagging
from nltk.tokenize import  word_tokenize
from nltk.stem import WordNetLemmatizer # To perform lemmatization using the WordNet database
from nltk.corpus import wordnet # To map POS tags to the format required by WordNetLemmatizer
from textblob import TextBlob # High-level NLP library that simplifies lemmatization among other tasks
import spacy # Powerful NLP library with its own lemmatizer based on neural networks
!pip install stanza # Download stanza because it is not in google colab b default
import stanza # Neural NLP toolkit for tokenization, POS tagging and Lemmatization

Collecting stanza
  Downloading stanza-1.11.0-py3-none-any.whl.metadata (14 kB)
Collecting emoji (from stanza)
  Downloading emoji-2.15.0-py3-none-any.whl.metadata (5.7 kB)
Downloading stanza-1.11.0-py3-none-any.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading emoji-2.15.0-py3-none-any.whl (608 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m608.4/608.4 kB[0m [31m40.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji, stanza
Successfully installed emoji-2.15.0 stanza-1.11.0


In [10]:
# Download neccessary NLTK data for tokenization, POS tagging and Lemmatization
nltk.download('punkt') # Data for word_tokenize
nltk.download('punkt_tab')
nltk.download('wordnet') # WordNet database for lemmatization
nltk.download('averaged_perceptron_tagger') # POS tagger data for accurate tagging
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [11]:
# Load spaCy's small English model which includes a lemmatizer
nlp_spacy = spacy.load("en_core_web_sm") # Load spaCy model for English

In [None]:
# Download and initialize Stanza's English pipeline for neural-based lemmatization
stanza.download('en') # Download Stanza's English models (only needed once)
nlp_stanza = stanza.Pipeline('en', processors='tokenize,pos,lemma', use_gpu=False) # Initialize Stanza pipeline


In [13]:
# Read the CSV file
df = pd.read_csv('test_dataset.csv')
df.head()

Unnamed: 0,id,sentence
0,1,The cats are chasing the mice
1,2,Dogs were barking loudly in the streets
2,3,He is running faster than others
3,4,She studies computer science at the university
4,5,The children are playing in the playground


In [14]:
# Function to convert NLTK POS tags to WordNet POS tags required by WordNetLemmatizer
def get_wordnt_pos(tag):
  # This function checks the starting letter of the POS tag and returns the corresponding WordNet tag.
  if tag.startswith('J'):
    return wordnet.ADJ # Adjective
  elif tag.startswith('V'):
    return wordnet.VERB # Verb
  elif tag.startswith('N'):
    return wordnet.NOUN # Noun
  elif tag.startswith('R'):
    return wordnet.ADV # Adverb
  else:
    return wordnet.NOUN # Default to noun if no matching tag is found


In [15]:
# Lemmatization using WordNet with POS Tagging (NLTK)
lemmatizer = WordNetLemmatizer() # Initialize the WordNet lemmatizer
# Process each sentence from the sentence colunm
for index, sentence in df['sentence'].head(5).items():
  tokens = word_tokenize(sentence) # Tokenize sentence to words
  pos_tags = nltk.pos_tag(tokens) # Get POS tags for each token
  # Lemmatize each token using its corresponding POS tag
  lemmatized_words = [lemmatizer.lemmatize(token, get_wordnt_pos(pos)) for token, pos in pos_tags]
  print(f"Original: {sentence}") # Display Original Sentence
  print(f"Lemmatized: {' '.join(lemmatized_words)}\n") # Display Lemmatized Sentence

Original: The cats are chasing the mice
Lemmatized: The cat be chase the mouse

Original: Dogs were barking loudly in the streets
Lemmatized: Dogs be bark loudly in the street

Original: He is running faster than others
Lemmatized: He be run faster than others

Original: She studies computer science at the university
Lemmatized: She study computer science at the university

Original: The children are playing in the playground
Lemmatized: The child be play in the playground



In [16]:
# Lemmatiation usin spaCy
# Process each sentence from the sentence cloumn using NLP pipeline
for index, sentence in df['sentence'].head(5).items():
  doc = nlp_spacy(sentence) # Process sentence text into a spaCy document
  spacy_lemmas = [token.lemma_ for token in doc] # Extract lemmas for each token from the spaCy document
  print(f"Original: {sentence}") # Display Original Sentence
  print(f"Lemmatized: {' '.join(spacy_lemmas)}\n") # Display Lemmatized Sentence

Original: The cats are chasing the mice
Lemmatized: the cat be chase the mouse

Original: Dogs were barking loudly in the streets
Lemmatized: Dogs be bark loudly in the street

Original: He is running faster than others
Lemmatized: he be run fast than other

Original: She studies computer science at the university
Lemmatized: she study computer science at the university

Original: The children are playing in the playground
Lemmatized: the child be play in the playground



In [17]:
# Snowball Stemming
print("Snowball Stemming Output for reviews:")
for index, sentence in df['sentence'].head(5).items():
  tokens = word_tokenize(sentence) # Tokenize sentence to words
  snowball_stems = [snowball.stem(token) for token in tokens] # Apply Snowball stemmer to each token
  print(f"Original: {sentence}") # Display Original Sentence
  print(f"Stemmed: {' '.join(snowball_stems)}\n")

Snowball Stemming Output for reviews:
Original: The cats are chasing the mice
Stemmed: the cat are chase the mice

Original: Dogs were barking loudly in the streets
Stemmed: dog were bark loud in the street

Original: He is running faster than others
Stemmed: he is run faster than other

Original: She studies computer science at the university
Stemmed: she studi comput scienc at the univers

Original: The children are playing in the playground
Stemmed: the children are play in the playground



In [18]:
# Lancaster Stemming
print("Lancaster Stemming Output for reviews:")
for index, sentence in df['sentence'].head(5).items():
  tokens = word_tokenize(sentence) # Tokenize sentence to words
  lancaster_stems = [lancaster.stem(token) for token in tokens]
  print(f"Original: {sentence}") # Display Original Sentence
  print(f"Stemmed: {' '.join(lancaster_stems)}\n")

Lancaster Stemming Output for reviews:
Original: The cats are chasing the mice
Stemmed: the cat ar chas the mic

Original: Dogs were barking loudly in the streets
Stemmed: dog wer bark loud in the streets

Original: He is running faster than others
Stemmed: he is run fast than oth

Original: She studies computer science at the university
Stemmed: she study comput sci at the univers

Original: The children are playing in the playground
Stemmed: the childr ar play in the playground



In [19]:
# Lovins Stemming
print("Lovins Stemming Output for reviews:")
for index, sentence in df['sentence'].head(5).items():
  tokens = word_tokenize(sentence) # Tokenize
  lovins_stems = [lovins_stem(token) for token in tokens]
  print(f"Original: {sentence}") # Display Original Sentence
  print(f"Stemmed: {' '.join(lovins_stems)}\n")


Lovins Stemming Output for reviews:
Original: The cats are chasing the mice
Stemmed: the cat are chas the mice

Original: Dogs were barking loudly in the streets
Stemmed: dog were bark loud in the street

Original: He is running faster than others
Stemmed: he is runn faster than other

Original: She studies computer science at the university
Stemmed: she studie computer science at the university

Original: The children are playing in the playground
Stemmed: the children are play in the playground



## Demonstration: Text Blob, WordNet, and Neural Lemmatizer using Stanza

In [20]:
# Lemmatization using TextBlob
# Process each sentence from the sentence column using TextBlob
for index, sentence in df['sentence'].head(5).items():
  blob = TextBlob(sentence) # Create a TextBlob object for each sentence
  textblob_lemmas = [word.lemmatize() for word in blob.words] # Lemmatize each word in the TextBlob
  print(f"Original: {sentence}") # Display Original Sentence
  print(f"Lemmatized: {' '.join(textblob_lemmas)}\n") # Display the lemmatized version

Original: The cats are chasing the mice
Lemmatized: The cat are chasing the mouse

Original: Dogs were barking loudly in the streets
Lemmatized: Dogs were barking loudly in the street

Original: He is running faster than others
Lemmatized: He is running faster than others

Original: She studies computer science at the university
Lemmatized: She study computer science at the university

Original: The children are playing in the playground
Lemmatized: The child are playing in the playground



In [21]:
# Standard NLTK WordNet Lemmatizer (Default as Noun)
# Process each sentence from the sentence column without POS tagging (assumes each word is a noun)
for index, sentence in df['sentence'].head(5).items():
  tokens = word_tokenize(sentence) # Tokenize sentence to words
  # Lemmatize each tokens assuming it is a nown (default behaviour)
  nltk_lemmas = [lemmatizer.lemmatize(token) for token in tokens]
  print(f"Original: {sentence}") # Display Original Sentence
  print(f"Lemmatized: {' '.join(nltk_lemmas)}\n") # Display lemmatized version

Original: The cats are chasing the mice
Lemmatized: The cat are chasing the mouse

Original: Dogs were barking loudly in the streets
Lemmatized: Dogs were barking loudly in the street

Original: He is running faster than others
Lemmatized: He is running faster than others

Original: She studies computer science at the university
Lemmatized: She study computer science at the university

Original: The children are playing in the playground
Lemmatized: The child are playing in the playground



In [23]:
# Neural Lemmatization using Stanza
# Process each sentence from the sentence column using Stanza's neural pipeline
for index, sentence in df['sentence'].head(5).items():
  doc = nlp_stanza(sentence) # Process sentence text with Stanza
  # Extract lemmas from every word in each sentence of the processed document
  stanza_lemmas = [word.lemma for sentence in doc.sentences for word in sentence.words]
  print(f"Original: {sentence}") # Display Original Sentence
  print(f"Lemmatized: {' '.join(stanza_lemmas)}\n") # Display lemmatized version

Original: The cats are chasing the mice
Lemmatized: the cat be chase the mouse

Original: Dogs were barking loudly in the streets
Lemmatized: dog be bark loudly in the street

Original: He is running faster than others
Lemmatized: he be run fast than other

Original: She studies computer science at the university
Lemmatized: she study computer science at the university

Original: The children are playing in the playground
Lemmatized: the child be play in the playground

