<a href="https://colab.research.google.com/github/sahanyafernando/My_NLP_Learning/blob/main/RuleBasedStemmingAndPorterStemmer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Rule Based Stemming and Porter Stemmer

In [12]:
import pandas as pd
# pandas is imported to handle data manipulation and analysis, particularly for reading the CSV file containing customer or text data.
import nltk
# nltk (Natural Language Toolkit) is imported to provide tools for text processing, including tokenization, stemming, and linguistic analysis.
from nltk.tokenize import word_tokenize
# word_tokenize is used to split text into individual words (tokens), which is a prerequisite for many NLP tasks such as stemming and frequency analysis.
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer
# These are various stemming classes from NLTK used to reduce words to their root or base form, helping normalize different word variations.
from string import punctuation
# string.punctuation is imported to provide a predefined list of punctuation characters, which is useful for removing punctuation during text cleaning.


In [13]:
# Download the necessary NLTK data for tokenization
nltk.download('punkt')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [14]:
# Read the CSV file
df = pd.read_csv('test_dataset.csv')
df.head()

Unnamed: 0,id,sentence
0,1,The cats are chasing the mice
1,2,Dogs were barking loudly in the streets
2,3,He is running faster than others
3,4,She studies computer science at the university
4,5,The children are playing in the playground


In [15]:
# Define a custom rule-based stemmer that removes punctuation and common suffixes.
def rule_based_stem(word):
  word = word.strip(string.punctuation) # Clean the word from punctuation.
  # Check and remove common suffixes if the word (without the suffix) remains longer than 2 characters
  for suffix in ["ing", "ed", "ly", "s", "es"]:
      if word.endswith(suffix) and len(word) - len(suffix) > 2:
        return word[:-len(suffix)]
  return word


In [16]:
# Define a minimal custom Lovins stemmer by removing a lsit of common sufixes.
lovins_suffixes = sorted(["ization", "ational", "fulness", "iveness", "ousness", "tional", "ing", "ed", "ly", "s"], key = len, reverse = True)
def lovins_stem(word):
    word = word.lower().strip(string.punctuation) # Convert to lowercase and remove punctuations
    for suffix in lovins_suffixes:
        if word.endswith(suffix) and len(word)-len(suffix) > 2:
            return word[:-len(suffix)]
    return word

In [17]:
# Initialize the NLTK stemmers.
porter = PorterStemmer() # For Porter stemming
snowball = SnowballStemmer('english') # For Snowball stemming
lancaster = LancasterStemmer() # For Lancaster stemming

In [18]:
# Process and print the first two reviews for each stemming method.
# For each review, tokenize the text, apply the coresponding stemmer to every token, and print the results.

In [20]:
# Rule Based Stemming
print("Rule Based Stemming Output for data:")
for index, sentence in df['sentence'].head(5).items():
    tokens = word_tokenize(sentence) # Tokenize the sentence into words
    stemmed_tokens = [rule_based_stem(token) for token in tokens] # Apply custom rule-based stemmer
    print(f"Original: {sentence}")
    print(f"Stemmed: {' '.join(stemmed_tokens)}\n") # Join tokens back to a string for display


Rule Based Stemming Output for data:


NameError: name 'string' is not defined