### Tokenize

In [1]:
from nltk import word_tokenize, download
# Fetch the NLTK data packages requested by this notebook: 'punkt',
# 'averaged_perceptron_tagger' and 'stopwords'.
# NOTE(review): newer NLTK releases may look for differently named resources
# (e.g. 'punkt_tab', 'averaged_perceptron_tagger_eng') — confirm against the
# installed NLTK version.
download(['punkt', 'averaged_perceptron_tagger', 'stopwords'])

def get_tokens(sentence):
    """Split `sentence` into tokens with NLTK's `word_tokenize`.

    Args:
        sentence: The text to tokenize.

    Returns:
        The tokens produced by `word_tokenize`; for the sample sentence in
        this cell that is a list of strings with the final period as its own
        token.
    """
    return word_tokenize(sentence)

# Demo: tokenize a sample sentence and print the resulting tokens
sentence = "This is an example sentence."
print(get_tokens(sentence))


['This', 'is', 'an', 'example', 'sentence', '.']


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sophiasarica/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/sophiasarica/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sophiasarica/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### POS tagging

In [4]:
from nltk import word_tokenize, pos_tag

# Define the function for tokenizing sentences
def get_tokens(sentence):
    """Return the tokens that NLTK's `word_tokenize` produces for `sentence`."""
    # NOTE(review): this repeats the get_tokens definition from the Tokenize
    # cell; consider defining it once and reusing it here.
    return word_tokenize(sentence)

# Define the function for getting part-of-speech tags
def get_pos(words):
    """Tag each token in `words` with its part of speech using NLTK's `pos_tag`.

    Args:
        words: The tokens to tag, e.g. the result of `get_tokens`.

    Returns:
        Whatever `pos_tag` returns; for the sample sentence in this cell that
        is a list of (token, tag) tuples.
    """
    tagged = pos_tag(words)
    return tagged

# Sample input for the tagging demo
sentence = "This is an example sentence."

# Split the sentence into tokens first; the tagger is fed tokens, not raw text
words = get_tokens(sentence)

# Attach a part-of-speech tag to every token
pos_tags = get_pos(words)

# Show the resulting (token, tag) pairs
print(pos_tags)


[('This', 'DT'), ('is', 'VBZ'), ('an', 'DT'), ('example', 'NN'), ('sentence', 'NN'), ('.', '.')]


### Remove Stopwords

In [5]:
from nltk import download
# Make sure the stopwords corpus is available before it is loaded below.
# NOTE(review): 'stopwords' is also requested by the download call in the
# Tokenize cell; presumably repeated so this cell can run on its own.
download('stopwords')
from nltk import word_tokenize
from nltk.corpus import stopwords
import pprint

# Load NLTK's English stopword list and pretty-print it in full
# (4-space indent) so the reader can see which words will be filtered out
stop_words = stopwords.words('english')
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(stop_words)

# Sample input for the stopword-removal demo
sentence = "This is an example sentence."

# Tokenize the sentence and show the tokens before any filtering
sentence_words = word_tokenize(sentence)
print(sentence_words)

# Define the function to remove stopwords
def remove_stop_words(words, stop_words):
    """Join the tokens of `words` that are not stopwords into one string.

    Each token is lower-cased before the membership check, so a token matches
    a stopword regardless of the token's casing. Entries of `stop_words` are
    compared as given (they are not lower-cased).

    Args:
        words: Iterable of string tokens.
        stop_words: Iterable of stopword strings to drop.

    Returns:
        The surviving tokens, in their original order and casing, separated by
        single spaces. An empty string is returned when no token survives.
    """
    # Build the lookup set once: membership tests become constant-time instead
    # of scanning the whole stopword list for every token, and a one-shot
    # iterable of stopwords is consumed exactly once rather than being drained
    # by the first membership test.
    stop_set = frozenset(stop_words)
    return ' '.join(word for word in words if word.lower() not in stop_set)

# Filter the sample tokens with the stock stopword list and print what is left
filtered_sentence = remove_stop_words(sentence_words, stop_words)
print(filtered_sentence)

# Add a custom entry to the stopword list; this mutates stop_words in place.
# NOTE(review): 'as' already appears in the list printed above, so this only
# appends a duplicate and the second result cannot differ from the first —
# consider a word that is not already present to make the effect visible.
stop_words.extend(['as'])

# Filter again with the extended list and print the result
filtered_sentence_extended = remove_stop_words(sentence_words, stop_words)
print(filtered_sentence_extended)


[   'i',
    'me',
    'my',
    'myself',
    'we',
    'our',
    'ours',
    'ourselves',
    'you',
    "you're",
    "you've",
    "you'll",
    "you'd",
    'your',
    'yours',
    'yourself',
    'yourselves',
    'he',
    'him',
    'his',
    'himself',
    'she',
    "she's",
    'her',
    'hers',
    'herself',
    'it',
    "it's",
    'its',
    'itself',
    'they',
    'them',
    'their',
    'theirs',
    'themselves',
    'what',
    'which',
    'who',
    'whom',
    'this',
    'that',
    "that'll",
    'these',
    'those',
    'am',
    'is',
    'are',
    'was',
    'were',
    'be',
    'been',
    'being',
    'have',
    'has',
    'had',
    'having',
    'do',
    'does',
    'did',
    'doing',
    'a',
    'an',
    'the',
    'and',
    'but',
    'if',
    'or',
    'because',
    'as',
    'until',
    'while',
    'of',
    'at',
    'by',
    'for',
    'with',
    'about',
    'against',
    'between',
    'into',
    'through',
    'during',
 

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sophiasarica/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
