In [16]:
import nltk
import re
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from nltk import pos_tag

In [17]:
def preprocess_text(corpus, verbose=True, remove_emails=False):
    """Clean, tokenize and normalise a collection of raw text documents.

    Steps, applied to every document in order:
      1. strip URLs (``http://``/``https://`` links and ``www.`` hosts);
      2. optionally strip e-mail addresses (``remove_emails=True``);
      3. delete every character that is not an ASCII letter or whitespace;
      4. lowercase the text and tokenize it with ``word_tokenize``;
      5. POS-tag the tokens (only printed, never used in the result);
      6. drop English stop words;
      7. Porter-stem each token and pass the stem to the WordNet lemmatizer.

    Args:
        corpus: Iterable of document strings. A single ``str`` is treated as
            a one-document corpus instead of being iterated character by
            character.
        verbose: When True (default) the corpus is printed after each stage,
            exactly as before. When False nothing is printed and the
            POS-tagging pass, whose only purpose is that printout, is skipped.
        remove_emails: When True, e-mail-like tokens are removed right after
            the URLs. With the default False they go through step 3 as-is,
            so ``test@example.com`` ends up as the token ``testexamplecom``.

    Returns:
        list[list[str]]: one list of stemmed/lemmatized tokens per document.

    NOTE(review): step 7 lemmatizes tokens that were already stemmed, which
    likely leaves the lemmatizer little to do - confirm whether both passes
    are intended.
    """
    # A bare string is itself iterable; without this guard every character
    # would be processed as its own document.
    if isinstance(corpus, str):
        corpus = [corpus]

    def _report(label, value):
        # Same output format as the original unconditional print calls.
        if verbose:
            print(label, value, "\n")

    # Removing URLs
    corpus = [re.sub(r'https?://\S+|www\.\S+', '', doc) for doc in corpus]
    _report("Removing URLs: ", corpus)

    # Removing e-mail addresses (opt-in); has to happen before '@' and '.'
    # are deleted by the special-character step below.
    if remove_emails:
        corpus = [re.sub(r'\S+@\S+\.\S+', '', doc) for doc in corpus]
        _report("Removing emails: ", corpus)

    # Removing special characters
    corpus = [re.sub(r'[^a-zA-Z\s]', '', doc) for doc in corpus]
    _report("Removing special characters: ", corpus)

    # Tokenization
    tokenized_corpus = [word_tokenize(doc.lower()) for doc in corpus]
    _report("Tokenized text: ", tokenized_corpus)

    # Checking pos tags for each word (diagnostic output only)
    if verbose:
        pos_tagged_corpus = [pos_tag(doc) for doc in tokenized_corpus]
        _report("POS tagging: ", pos_tagged_corpus)

    # Removing stopwords
    stop_words = set(stopwords.words('english'))
    tokenized_corpus = [[word for word in doc if word not in stop_words] for doc in tokenized_corpus]
    _report("After stop word removal: ", tokenized_corpus)

    # Stemming and Lemmatization
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    stemmed_lemmatized_corpus = [[lemmatizer.lemmatize(stemmer.stem(word)) for word in doc] for doc in tokenized_corpus]

    return stemmed_lemmatized_corpus

In [18]:
# Sample documents for exercising the cleaning pipeline. Between them they
# contain http(s) and www URLs, e-mail addresses, an @mention, a #hashtag,
# digits and runs of punctuation.
# NOTE(review): the non-ASCII sequences after "World!", "test@example.com"
# and "assistance!" look like mis-decoded emoji - confirm against the
# original notebook before relying on them.
corpus = [
    "Hello World! üåç Welcome to NLP. Visit https://example.com for more info.",
    "Python is amazing!!! @user #MachineLearning",
    "Email me at test@example.com üòä or visit www.website.org.",
    "Numbers 123456 and special characters $#@!* should be removed.",
    "Follow the updates at https://newsportal.com/latest-news and stay informed!",
    "Breaking news: AI is revolutionizing industries! Read more at www.technews.com.",
    "Contact support at support@helpdesk.com if you need assistance! üì©"
]

# Run the full pipeline; processed_corpus holds one token list per document
# and is reused by the statistics cell further down.
processed_corpus = preprocess_text(corpus)
print("Stemmed and Lemmatized Text:", processed_corpus)

Removing URLs:  ['Hello World! üåç Welcome to NLP. Visit  for more info.', 'Python is amazing!!! @user #MachineLearning', 'Email me at test@example.com üòä or visit ', 'Numbers 123456 and special characters $#@!* should be removed.', 'Follow the updates at  and stay informed!', 'Breaking news: AI is revolutionizing industries! Read more at ', 'Contact support at support@helpdesk.com if you need assistance! üì©'] 

Removing special characters:  ['Hello World  Welcome to NLP Visit  for more info', 'Python is amazing user MachineLearning', 'Email me at testexamplecom  or visit ', 'Numbers  and special characters  should be removed', 'Follow the updates at  and stay informed', 'Breaking news AI is revolutionizing industries Read more at ', 'Contact support at supporthelpdeskcom if you need assistance '] 

Tokenized text:  [['hello', 'world', 'welcome', 'to', 'nlp', 'visit', 'for', 'more', 'info'], ['python', 'is', 'amazing', 'user', 'machinelearning'], ['email', 'me', 'at', 'testexample

In [19]:
# How many documents went through the pipeline
num_documents = len(processed_corpus)

# Every surviving token from every document, in document order
all_words = [token for document in processed_corpus for token in document]

# Token -> frequency table for the whole corpus
vocab = Counter()
vocab.update(all_words)

# Total token count (duplicates included): the frequencies sum to the
# length of the flattened list
num_words = sum(vocab.values())

print("Vocabulary:", vocab)
print("Number of words:", num_words)
print("Number of documents:", num_documents)


Vocabulary: Counter({'visit': 2, 'hello': 1, 'world': 1, 'welcom': 1, 'nlp': 1, 'info': 1, 'python': 1, 'amaz': 1, 'user': 1, 'machinelearn': 1, 'email': 1, 'testexamplecom': 1, 'number': 1, 'special': 1, 'charact': 1, 'remov': 1, 'follow': 1, 'updat': 1, 'stay': 1, 'inform': 1, 'break': 1, 'news': 1, 'ai': 1, 'revolution': 1, 'industri': 1, 'read': 1, 'contact': 1, 'support': 1, 'supporthelpdeskcom': 1, 'need': 1, 'assist': 1})
Number of words: 32
Number of documents: 7
