In [2]:
import gensim
from gensim.utils import simple_preprocess
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk import download
from nltk.stem import WordNetLemmatizer
import string

# Fetch the NLTK resources this script relies on. download() reports its
# progress and leaves packages that are already up to date untouched.
#   punkt / punkt_tab -> tokenizer models behind word_tokenize; NLTK 3.9+
#                        loads 'punkt_tab', older releases load 'punkt'
#   stopwords         -> stop-word lists (the English one is used below)
#   wordnet           -> data used by WordNetLemmatizer
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    download(resource)

# Initialize NLTK tools
stop_words = set(stopwords.words('english'))  # set for fast membership tests
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Sample text used as the input to every preprocessing step below
# (inlined here since we don't have the file)
text = """
Natural language processing (NLP) is a subfield of artificial intelligence concerned with the interactions between computers and human (natural) languages.
It involves tasks such as language translation, sentiment analysis, and speech recognition.
"""

# Tokenization: split the raw text into word and punctuation tokens
tokens = word_tokenize(text)

# Lowercase and remove punctuation.
# A token is kept only if it contains at least one non-punctuation character.
# A plain `word not in string.punctuation` would be a *substring* test against
# the punctuation string, so multi-character punctuation tokens such as
# '``', "''", '...' or '--' would slip through the filter.
tokens = [
    word.lower()
    for word in tokens
    if any(char not in string.punctuation for char in word)
]

# Remove stopwords (tokens are already lowercased at this point)
tokens = [word for word in tokens if word not in stop_words]

# Stemming: reduce each remaining token with the Porter stemmer
stemmed_tokens = [stemmer.stem(word) for word in tokens]

# Lemmatization: lemmatize() is called without a part-of-speech argument
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]

# Using Gensim's simple_preprocess function to preprocess the text
# (lowercasing, tokenization). It runs on the raw text, independently of the
# NLTK steps above, so its result can be compared with theirs.
gensim_tokens = simple_preprocess(text)

# Display results: the raw text first, then at most the first 20 entries of
# each token list under its own heading.
print("Original Text:")
print(text)
for heading, values in (
    ("\nTokens after Tokenization:", tokens),
    ("\nStemmed Tokens:", stemmed_tokens),
    ("\nLemmatized Tokens:", lemmatized_tokens),
    ("\nTokens using Gensim Simple Preprocess:", gensim_tokens),
):
    print(heading)
    print(values[:20])


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Original Text:

Natural language processing (NLP) is a subfield of artificial intelligence concerned with the interactions between computers and human (natural) languages.
It involves tasks such as language translation, sentiment analysis, and speech recognition.


Tokens after Tokenization:
['natural', 'language', 'processing', 'nlp', 'subfield', 'artificial', 'intelligence', 'concerned', 'interactions', 'computers', 'human', 'natural', 'languages', 'involves', 'tasks', 'language', 'translation', 'sentiment', 'analysis', 'speech']

Stemmed Tokens:
['natur', 'languag', 'process', 'nlp', 'subfield', 'artifici', 'intellig', 'concern', 'interact', 'comput', 'human', 'natur', 'languag', 'involv', 'task', 'languag', 'translat', 'sentiment', 'analysi', 'speech']

Lemmatized Tokens:
['natural', 'language', 'processing', 'nlp', 'subfield', 'artificial', 'intelligence', 'concerned', 'interaction', 'computer', 'human', 'natural', 'language', 'involves', 'task', 'language', 'translation', 'sentim