In [1]:
import nltk #nltk stands for natural language toolkit used for natural language processing tasks
from nltk.tokenize import word_tokenize
import spacy #Spacy is a natural language processing (NLP) library for Python. It is used to build information extraction or natural language understanding systems, or to pre-process text for deep learning.

In [2]:
# Download NLTK's pre-trained Punkt tokenizer data, which NLTK's tokenizers use
# to find sentence boundaries.
# NOTE(review): newer NLTK releases may look up 'punkt_tab' instead of 'punkt'
# for word_tokenize -- confirm against the installed NLTK version.
nltk.download("punkt")

# Load spaCy's small English pipeline. Per the spaCy model documentation its
# training sources include OntoNotes 5 (not the CoNLL 2000 shared task).
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
text = "Natural Language Processing is fascinating!"

# NLTK: word_tokenize splits the string into a list of token strings;
# punctuation comes back as its own token.
tokens_nltk = word_tokenize(text)

# spaCy: run the text through the loaded pipeline and keep each token's raw text.
tokens_spacy = [tok.text for tok in nlp(text)]

In [4]:
# Print both token lists so the NLTK and spaCy tokenizations can be compared.
print("NLTK tokens:", tokens_nltk)
print("spaCy tokens:", tokens_spacy)

NLTK tokens: ['Natural', 'Language', 'Processing', 'is', 'fascinating', '!']
spaCy tokens: ['Natural', 'Language', 'Processing', 'is', 'fascinating', '!']


In [5]:
from nltk.corpus import stopwords # stopwords gives access to NLTK's stopword lists (via stopwords.words(...)): commonly used words that are usually not useful for text analysis. A corpus is a collection of text documents.

In [6]:
# Download the stopword lists that the NLTK stopwords corpus reads from.
nltk.download("stopwords")

# stopwords.words("english") returns the English stopwords as a list; wrapping
# it in a set makes the membership checks done later fast.
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [7]:
# Keep every NLTK token whose lowercased form is not a stopword. The comparison
# is done in lowercase, but the token itself is kept with its original casing.
filtered_tokens = [
    tok
    for tok in tokens_nltk
    if tok.lower() not in stop_words
]

In [8]:
# Print the tokens that remain after stopword removal.
print("Filtered tokens:", filtered_tokens)

Filtered tokens: ['Natural', 'Language', 'Processing', 'fascinating', '!']


In [9]:
# Run the spaCy pipeline on the text; iterating over the result yields its tokens.
doc = nlp(text)

# Pair each token's text with its part-of-speech tag (token.pos_).
pos_tags = [(tok.text, tok.pos_) for tok in doc]

In [10]:
# Print the (token text, part-of-speech tag) pairs.
print("POS tags:", pos_tags)

POS tags: [('Natural', 'PROPN'), ('Language', 'PROPN'), ('Processing', 'PROPN'), ('is', 'AUX'), ('fascinating', 'ADJ'), ('!', 'PUNCT')]


# End