In [1]:
# Import the dependencies
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK resources this notebook relies on: 'punkt' backs
# nltk.word_tokenize and 'stopwords' backs stopwords.words(). Without the
# 'stopwords' download, the stopwords.words('english') call below raises
# LookupError on a machine that has never fetched that corpus.
# NOTE(review): newer NLTK releases load the tokenizer from 'punkt_tab';
# add that download if word_tokenize raises LookupError in your environment.
nltk.download('punkt')
nltk.download('stopwords')
# Initialize the stopwords as a set so that membership checks are fast
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/bradleywise/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# The three example sentences that serve as the corpus for this notebook.
sentence_1, sentence_2, sentence_3 = (
    "I want to invest for retirement.",
    "Should I invest in mutual funds, or should I invest in stocks?",
    "I should schedule an appointment with a financial planner.",
)

In [5]:
# Import regex
import re

# Regex matching every character that is neither an ASCII letter nor
# whitespace; replacing each match with '' strips punctuation and digits.
# (\s already covers the space character, so no literal space is needed.)
pattern = r'[^a-zA-Z\s]'


def clean_and_tokenize(sentence):
    """Strip punctuation from a sentence, lowercase it, and split it into words.

    Args:
        sentence: The raw sentence string.

    Returns:
        The list of lowercase tokens produced by ``nltk.word_tokenize``.
    """
    cleaned_sentence = re.sub(pattern, '', sentence)
    return nltk.word_tokenize(cleaned_sentence.lower())


# Tokenize every sentence with the same helper instead of repeating the
# clean/lowercase/tokenize steps once per sentence. Each entry of `tokens`
# is the token list for one sentence, in sentence order.
tokens = [
    clean_and_tokenize(sentence)
    for sentence in (sentence_1, sentence_2, sentence_3)
]

# Display the tokens.
tokens

[['i', 'want', 'to', 'invest', 'for', 'retirement'],
 ['should',
  'i',
  'invest',
  'in',
  'mutual',
  'funds',
  'or',
  'should',
  'i',
  'invest',
  'in',
  'stocks'],
 ['i',
  'should',
  'schedule',
  'an',
  'appointment',
  'with',
  'a',
  'financial',
  'planner']]

In [6]:
# Drop every stopword from each sentence's token list, keeping the remaining
# tokens in their original order (one filtered list per sentence).
filtered_tokens = [
    [token for token in sentence_tokens if token not in stop_words]
    for sentence_tokens in tokens
]

# Display the filtered tokens.
filtered_tokens

[['want', 'invest', 'retirement'],
 ['invest', 'mutual', 'funds', 'invest', 'stocks'],
 ['schedule', 'appointment', 'financial', 'planner']]

In [9]:
# Create a dictionary that will be our bag-of-words: each key is a word and
# each value is how many times that word occurs across all filtered sentences.
bag_of_words = {}
for sentence_tokens in filtered_tokens:
    for word in sentence_tokens:
        # .get() supplies 0 the first time a word is seen, so no separate
        # initialization branch is needed.
        bag_of_words[word] = bag_of_words.get(word, 0) + 1

# Display the bag-of-words.
bag_of_words

{'want': 1,
 'invest': 3,
 'retirement': 1,
 'mutual': 1,
 'funds': 1,
 'stocks': 1,
 'schedule': 1,
 'appointment': 1,
 'financial': 1,
 'planner': 1}

### Use scikit-learn's `CountVectorizer` to demonstrate how a BoW is created.

In [11]:
# Import the dependencies
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

In [13]:
# Gather the example sentences into the corpus the vectorizer will learn from.
sentences = [sentence_1, sentence_2, sentence_3]

# Build a CountVectorizer that discards scikit-learn's built-in English stop words.
vectorizer = CountVectorizer(stop_words='english')

# Learn the vocabulary and build the document-term matrix in a single step
# (one row per sentence, one column per vocabulary word).
dtm = vectorizer.fit_transform(sentences)

# Print the resulting bag of words as a dense array.
print(dtm.toarray())

[[0 0 0 1 0 0 1 0 0 1]
 [0 0 1 2 1 0 0 0 1 0]
 [1 1 0 0 0 1 0 1 0 0]]


In [15]:
# Show the vocabulary learned by the vectorizer; each entry labels the
# matching column of the count matrix.
vectorizer.get_feature_names_out()

array(['appointment', 'financial', 'funds', 'invest', 'mutual', 'planner',
       'retirement', 'schedule', 'stocks', 'want'], dtype=object)

In [18]:
# Wrap the dense count matrix in a DataFrame, labelling each column with its
# vocabulary word so the counts are easier to read.
df = pd.DataFrame(
    data=dtm.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
df.head()

Unnamed: 0,appointment,financial,funds,invest,mutual,planner,retirement,schedule,stocks,want
0,0,0,0,1,0,0,1,0,0,1
1,0,0,1,2,1,0,0,0,1,0
2,1,1,0,0,0,1,0,1,0,0


In [21]:
# Display the vocabulary learned by the vectorizer.
vectorizer.get_feature_names_out()

array(['appointment', 'financial', 'funds', 'invest', 'mutual', 'planner',
       'retirement', 'schedule', 'stocks', 'want'], dtype=object)

In [23]:
# Sum each column to get the total number of times each vocabulary word
# appears across all of the sentences.
df.sum(axis=0)

appointment    1
financial      1
funds          1
invest         3
mutual         1
planner        1
retirement     1
schedule       1
stocks         1
want           1
dtype: int64