In [None]:
# Import necessary libraries
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Download the NLTK data files used by the cells below.
# - 'punkt' / 'punkt_tab': models behind word_tokenize and sent_tokenize.
#   Recent NLTK releases (3.9 and later) load 'punkt_tab' instead of the
#   pickled 'punkt' models, so both are requested to keep the notebook
#   runnable from a fresh kernel on old and new NLTK versions.
# - 'stopwords': the English stop-word list.
# - 'wordnet': the lexical database used by WordNetLemmatizer.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Sample text for demonstration.
# Note: the triple-quoted literal begins and ends with a newline character.
sample_text = """
Natural language processing (NLP) is a field of artificial intelligence in which computers analyze, understand, and derive meaning from human language in a smart and useful way. By utilizing NLP, developers can organize and structure knowledge to perform tasks such as automatic summarization, translation, named entity recognition, relationship extraction, sentiment analysis, speech recognition, and topic segmentation.
"""

In [2]:
# Break the sample text into word-level tokens; punctuation marks come out
# as separate tokens. The language is spelled out (it is the default).
word_tokens = word_tokenize(sample_text, language="english")
word_tokens

['Natural',
 'language',
 'processing',
 '(',
 'NLP',
 ')',
 'is',
 'a',
 'field',
 'of',
 'artificial',
 'intelligence',
 'in',
 'which',
 'computers',
 'analyze',
 ',',
 'understand',
 ',',
 'and',
 'derive',
 'meaning',
 'from',
 'human',
 'language',
 'in',
 'a',
 'smart',
 'and',
 'useful',
 'way',
 '.',
 'By',
 'utilizing',
 'NLP',
 ',',
 'developers',
 'can',
 'organize',
 'and',
 'structure',
 'knowledge',
 'to',
 'perform',
 'tasks',
 'such',
 'as',
 'automatic',
 'summarization',
 ',',
 'translation',
 ',',
 'named',
 'entity',
 'recognition',
 ',',
 'relationship',
 'extraction',
 ',',
 'sentiment',
 'analysis',
 ',',
 'speech',
 'recognition',
 ',',
 'and',
 'topic',
 'segmentation',
 '.']

In [3]:
# Split the sample text into sentences. The language is spelled out (it is
# the default).
sent_tokens = sent_tokenize(sample_text, language="english")
sent_tokens

['\nNatural language processing (NLP) is a field of artificial intelligence in which computers analyze, understand, and derive meaning from human language in a smart and useful way.',
 'By utilizing NLP, developers can organize and structure knowledge to perform tasks such as automatic summarization, translation, named entity recognition, relationship extraction, sentiment analysis, speech recognition, and topic segmentation.']

In [4]:
# English stop words, held in a set for fast membership checks.
stop_words = {*stopwords.words("english")}

# Keep every token whose lowercase form is not a stop word. Punctuation
# tokens are still present at this stage.
useful_words = list(
    filter(lambda token: token.lower() not in stop_words, word_tokens)
)
useful_words

['Natural',
 'language',
 'processing',
 '(',
 'NLP',
 ')',
 'field',
 'artificial',
 'intelligence',
 'computers',
 'analyze',
 ',',
 'understand',
 ',',
 'derive',
 'meaning',
 'human',
 'language',
 'smart',
 'useful',
 'way',
 '.',
 'utilizing',
 'NLP',
 ',',
 'developers',
 'organize',
 'structure',
 'knowledge',
 'perform',
 'tasks',
 'automatic',
 'summarization',
 ',',
 'translation',
 ',',
 'named',
 'entity',
 'recognition',
 ',',
 'relationship',
 'extraction',
 ',',
 'sentiment',
 'analysis',
 ',',
 'speech',
 'recognition',
 ',',
 'topic',
 'segmentation',
 '.']

In [5]:
# Punctuation tokens to discard alongside the stop words.
punctuation_marks = {".", ",", "(", ")", "``", "''", "'", "!", "?", ":", ";", "-", "--"}
extended_stop_words = stop_words | punctuation_marks

# Re-filter the original tokens (not the previous useful_words) so this cell
# gives the same result however many times it is run.
useful_words = list(
    filter(lambda token: token.lower() not in extended_stop_words, word_tokens)
)
useful_words

['Natural',
 'language',
 'processing',
 'NLP',
 'field',
 'artificial',
 'intelligence',
 'computers',
 'analyze',
 'understand',
 'derive',
 'meaning',
 'human',
 'language',
 'smart',
 'useful',
 'way',
 'utilizing',
 'NLP',
 'developers',
 'organize',
 'structure',
 'knowledge',
 'perform',
 'tasks',
 'automatic',
 'summarization',
 'translation',
 'named',
 'entity',
 'recognition',
 'relationship',
 'extraction',
 'sentiment',
 'analysis',
 'speech',
 'recognition',
 'topic',
 'segmentation']

In [6]:
# (number of raw tokens, number of tokens kept after filtering)
tuple(map(len, (word_tokens, useful_words)))

(69, 39)

In [7]:
# Reduce each remaining token to its Porter stem. The stemmer lowercases its
# input by default, so the stems below are all lowercase.
stemmer = PorterStemmer()
stemmed_words = list(map(stemmer.stem, useful_words))
print(*stemmed_words, sep=" ")

natur languag process nlp field artifici intellig comput analyz understand deriv mean human languag smart use way util nlp develop organ structur knowledg perform task automat summar translat name entiti recognit relationship extract sentiment analysi speech recognit topic segment


In [8]:
# Map each remaining token to its WordNet lemma. lemmatize() treats every
# word as a noun unless a part of speech is passed (pos defaults to "n"), so
# plural nouns are reduced while verb forms such as "utilizing" pass through
# unchanged. Case is preserved.
lemmatizer = WordNetLemmatizer()
lemmatized_words = list(map(lemmatizer.lemmatize, useful_words))
print(*lemmatized_words, sep=" ")

Natural language processing NLP field artificial intelligence computer analyze understand derive meaning human language smart useful way utilizing NLP developer organize structure knowledge perform task automatic summarization translation named entity recognition relationship extraction sentiment analysis speech recognition topic segmentation
