In [2]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp39-cp39-win_amd64.whl (24.0 MB)
Installing collected packages: gensim
Successfully installed gensim-4.3.3


In [7]:
import gensim
import gensim.downloader as api
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import remove_stopwords, preprocess_string, strip_punctuation, strip_numeric, stem_text
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import spacy
import nltk

# Fetch the NLTK "punkt" tokenizer data; word_tokenize() in stem_text_nltk()
# relies on NLTK tokenizer data. When the package is already present the call
# only logs an "already up-to-date" message.
nltk.download('punkt')

# Load the small English spaCy pipeline once at module level; lemmatize_text()
# reads this global to obtain token lemmas.
# NOTE(review): no visible cell installs "en_core_web_sm" -- confirm it is
# available in the kernel's environment before running.
nlp = spacy.load("en_core_web_sm")

# Load sample text from a file
def load_text(file_path):
    """Read a UTF-8 encoded text file and return everything in it.

    Args:
        file_path: Location of the file to read.

    Returns:
        The complete file contents as one string.
    """
    with open(file_path, mode="r", encoding="utf-8") as source:
        contents = source.read()
    return contents

# Tokenization using Gensim
def tokenize_text(text):
    """Break a string into tokens using gensim's ``simple_preprocess``.

    Args:
        text: The string to tokenize.

    Returns:
        The list of tokens produced by ``simple_preprocess`` with its
        default settings.
    """
    tokens = simple_preprocess(text)
    return tokens

# Stemming using NLTK's Porter Stemmer
def stem_text_nltk(text):
    """Stem every word of ``text`` with NLTK's Porter stemmer.

    The text is split with ``word_tokenize`` and each resulting token is
    stemmed individually.

    Args:
        text: The string whose words should be stemmed.

    Returns:
        A single string containing the stems separated by one space each.
    """
    stemmer = PorterStemmer()
    stems = []
    for token in word_tokenize(text):
        stems.append(stemmer.stem(token))
    return " ".join(stems)

# Lemmatization using SpaCy
def lemmatize_text(text):
    """Lemmatize ``text`` with the module-level spaCy pipeline ``nlp``.

    Args:
        text: The string to lemmatize.

    Returns:
        A single string made of each token's ``lemma_`` joined by spaces.
    """
    lemmas = (token.lemma_ for token in nlp(text))
    return " ".join(lemmas)

# Full preprocessing function
def preprocess_text(text):
    """Clean ``text`` and return its tokenized, stemmed and lemmatized forms.

    Cleaning order: lowercase, strip punctuation, strip digits, then remove
    stopwords. Stopword removal compares whole whitespace-separated words, so
    it has to run *after* punctuation stripping; otherwise a stopword that
    touches punctuation (e.g. "it." or "the,") does not match the stopword
    list and leaks into the output once the punctuation is removed later.

    Args:
        text: Raw input string.

    Returns:
        dict with three keys:
            "tokenized":  list of tokens from ``tokenize_text``.
            "stemmed":    space-joined stems from ``stem_text_nltk``.
            "lemmatized": space-joined lemmas from ``lemmatize_text``.
    """
    # Filters are applied in list order to the lowercased string, and the
    # result is split on whitespace.
    cleaned_words = preprocess_string(
        text.lower(),
        filters=[strip_punctuation, strip_numeric, remove_stopwords],
    )

    tokens = tokenize_text(" ".join(cleaned_words))  # Tokenization

    # Both the stemmer and the lemmatizer take a plain string, so build it once.
    joined_tokens = " ".join(tokens)

    return {
        "tokenized": tokens,
        "stemmed": stem_text_nltk(joined_tokens),
        "lemmatized": lemmatize_text(joined_tokens),
    }

# Example usage: read a sample file, run the full preprocessing pipeline on
# it, and print each of the three results.
# NOTE(review): the path below is an absolute, machine-specific Windows path
# (it embeds a user profile directory); the cell only runs where that file exists.
file_path = r"C:\Users\K HARSHINI DEVI\OneDrive\Desktop\sample.txt"  # Replace with your actual text file
text_data = load_text(file_path)
# processed_data is a dict with the keys "tokenized", "stemmed" and "lemmatized".
processed_data = preprocess_text(text_data)

# "tokenized" is a list of tokens; "stemmed" and "lemmatized" are space-joined strings.
print("Tokenized Text:", processed_data["tokenized"])
print("Stemmed Text:", processed_data["stemmed"])
print("Lemmatized Text:", processed_data["lemmatized"])


[nltk_data] Downloading package punkt to C:\Users\K HARSHINI
[nltk_data]     DEVI\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Tokenized Text: ['sample', 'file', 'love', 'python', 'programming']
Stemmed Text: sampl file love python program
Lemmatized Text: sample file love python programming
