In [1]:
import nltk
import pandas as pd

# Bag of words

In [2]:
# Sample corpus: hand-hygiene / COVID-19 guidance text kept as one multi-line string.
# It is split into individual sentences by nltk.sent_tokenize in the following cell.
sentences = """
Regularly and thoroughly clean your hands with an alcohol-based hand rub or wash them with soap and water. 
This eliminates germs including viruses that may be on your hands.
Avoid touching your eyes, nose and mouth. 
Hands touch many surfaces and can pick up viruses. 
Once contaminated, hands can transfer the virus to your eyes, nose or mouth. 
From there, the virus can enter your body and infect you.
Cover your mouth and nose with your bent elbow or tissue when you cough or sneeze. 
Then dispose of the used tissue immediately into a closed bin and wash your hands.
By following good ‘respiratory hygiene’, you protect the people around you from viruses, which cause colds, flu and COVID-19.
Clean and disinfect surfaces frequently especially those which are regularly touched, 
such as door handles, faucets and phone screens or mouth. 
From there, the virus can enter your body and infect you.
"""

In [3]:
# Split the raw text into a list of sentence strings; every vectorizer below
# treats each sentence as one document.
# NOTE(review): sent_tokenize presumably needs NLTK's Punkt tokenizer data to be
# installed already (no nltk.download call is visible) — confirm on a fresh kernel.
tokenized_sent = nltk.sent_tokenize(sentences)

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

### Let's try `vectorizer.fit` first: it only learns the vocabulary

In [5]:
# CountVectorizer with default settings; the vocabulary printed below is all lower-case.
vectorizer = CountVectorizer()
# fit() is used here only to learn the vocabulary. The object bound to X exposes
# vocabulary_, a dict mapping each token to an integer index; it is not a count matrix
# (scikit-learn documents fit() as returning the fitted vectorizer itself).
X = vectorizer.fit(tokenized_sent)
print(X.vocabulary_)

{'regularly': 58, 'and': 3, 'thoroughly': 72, 'clean': 16, 'your': 90, 'hands': 40, 'with': 88, 'an': 2, 'alcohol': 1, 'based': 8, 'hand': 38, 'rub': 60, 'or': 53, 'wash': 84, 'them': 68, 'soap': 63, 'water': 85, 'this': 71, 'eliminates': 27, 'germs': 36, 'including': 43, 'viruses': 83, 'that': 66, 'may': 47, 'be': 9, 'on': 51, 'avoid': 7, 'touching': 78, 'eyes': 30, 'nose': 49, 'mouth': 48, 'touch': 76, 'many': 46, 'surfaces': 65, 'can': 14, 'pick': 56, 'up': 80, 'once': 52, 'contaminated': 19, 'transfer': 79, 'the': 67, 'virus': 82, 'to': 75, 'from': 35, 'there': 70, 'enter': 28, 'body': 12, 'infect': 44, 'you': 89, 'cover': 21, 'bent': 10, 'elbow': 26, 'tissue': 74, 'when': 86, 'cough': 20, 'sneeze': 62, 'then': 69, 'dispose': 24, 'of': 50, 'used': 81, 'immediately': 42, 'into': 45, 'closed': 17, 'bin': 11, 'by': 13, 'following': 33, 'good': 37, 'respiratory': 59, 'hygiene': 41, 'protect': 57, 'people': 54, 'around': 5, 'which': 87, 'cause': 15, 'colds': 18, 'flu': 32, 'covid': 22, 

In [6]:
# Same experiment with lowercase=False: tokens keep their original casing, so
# capitalised and lower-case spellings (e.g. 'From' and 'from') get separate entries.
vectorizer = CountVectorizer(lowercase=False)
X = vectorizer.fit(tokenized_sent)
print(X.vocabulary_)

{'Regularly': 9, 'and': 14, 'thoroughly': 76, 'clean': 25, 'your': 94, 'hands': 47, 'with': 92, 'an': 13, 'alcohol': 12, 'based': 18, 'hand': 45, 'rub': 66, 'or': 59, 'wash': 88, 'them': 74, 'soap': 69, 'water': 89, 'This': 11, 'eliminates': 34, 'germs': 43, 'including': 50, 'viruses': 87, 'that': 72, 'may': 54, 'be': 19, 'on': 58, 'Avoid': 1, 'touching': 82, 'eyes': 37, 'nose': 56, 'mouth': 55, 'Hands': 7, 'touch': 80, 'many': 53, 'surfaces': 71, 'can': 23, 'pick': 62, 'up': 84, 'Once': 8, 'contaminated': 28, 'transfer': 83, 'the': 73, 'virus': 86, 'to': 79, 'From': 6, 'there': 75, 'enter': 35, 'body': 22, 'infect': 51, 'you': 93, 'Cover': 5, 'bent': 20, 'elbow': 33, 'tissue': 78, 'when': 90, 'cough': 29, 'sneeze': 68, 'Then': 10, 'dispose': 31, 'of': 57, 'used': 85, 'immediately': 49, 'into': 52, 'closed': 26, 'bin': 21, 'By': 2, 'following': 40, 'good': 44, 'respiratory': 65, 'hygiene': 48, 'protect': 63, 'people': 60, 'around': 16, 'from': 42, 'which': 91, 'cause': 24, 'colds': 27,

In [7]:
# stop_words='english' filters common English words (e.g. 'and', 'your', 'with')
# out of the vocabulary; the printed dict then uses indices 0-59.
vectorizer = CountVectorizer(stop_words='english')
# The result is bound to lower-case `x` (earlier cells used `X`); the next cell reads `x`.
x = vectorizer.fit(tokenized_sent)
print(x.vocabulary_)

{'regularly': 42, 'thoroughly': 49, 'clean': 8, 'hands': 31, 'alcohol': 1, 'based': 3, 'hand': 29, 'rub': 44, 'wash': 58, 'soap': 47, 'water': 59, 'eliminates': 19, 'germs': 27, 'including': 34, 'viruses': 57, 'avoid': 2, 'touching': 53, 'eyes': 22, 'nose': 37, 'mouth': 36, 'touch': 51, 'surfaces': 48, 'pick': 40, 'contaminated': 11, 'transfer': 54, 'virus': 56, 'enter': 20, 'body': 6, 'infect': 35, 'cover': 13, 'bent': 4, 'elbow': 18, 'tissue': 50, 'cough': 12, 'sneeze': 46, 'dispose': 16, 'used': 55, 'immediately': 33, 'closed': 9, 'bin': 5, 'following': 25, 'good': 28, 'respiratory': 43, 'hygiene': 32, 'protect': 41, 'people': 38, 'cause': 7, 'colds': 10, 'flu': 24, 'covid': 14, '19': 0, 'disinfect': 15, 'frequently': 26, 'especially': 21, 'touched': 52, 'door': 17, 'handles': 30, 'faucets': 23, 'phone': 39, 'screens': 45}


In [8]:
# Print the full set of stop words this vectorizer filters out (a frozenset).
print(vectorizer.get_stop_words())
# Look up the index assigned to the token 'clean' in the vocabulary fitted in the previous cell.
x.vocabulary_.get('clean')

frozenset({'few', 'amoungst', 'because', 'either', 'whereby', 'yours', 'upon', 'system', 'be', 'will', 'mill', 'most', 'yourselves', 'two', 'though', 'up', 'rather', 'bill', 'everything', 'why', 'would', 'of', 'how', 'towards', 'ten', 'sincere', 'hereby', 'already', 'so', 'anyhow', 'nine', 'hereafter', 'mostly', 'herself', 'hasnt', 'see', 'any', 'every', 'next', 'moreover', 're', 'which', 'after', 'eleven', 'twenty', 'around', 'each', 'however', 'per', 'might', 'yet', 'latterly', 'when', 'your', 'even', 'yourself', 'full', 'beyond', 'formerly', 'both', 'some', 'almost', 'had', 'off', 'hereupon', 'thereupon', 'front', 'or', 'fifty', 'never', 'out', 'part', 'besides', 'ever', 'now', 'eg', 'thereafter', 'anything', 'back', 'below', 'put', 'between', 'whereafter', 'behind', 'to', 'whenever', 'they', 'alone', 'made', 'once', 'often', 'hers', 'his', 'cannot', 'latter', 'still', 'fire', 'for', 'was', 'ie', 'along', 'meanwhile', 'at', 'via', 'its', 'itself', 'more', 'many', 'give', 'not', 'whe

8

### Now let's try `fit_transform`: learn the vocabulary and build the count matrix in one step

In [9]:
# fit_transform() learns the vocabulary and returns the per-sentence token counts in one call.
# `vectorizer` is whichever instance the most recently run cell left behind
# (the stop_words='english' one when the notebook is run top to bottom).
ft = vectorizer.fit_transform(tokenized_sent)
# Convert the result to a dense array so the raw counts can be printed:
# one row per sentence, one column per vocabulary token.
count_array = ft.toarray()
print(count_array)

[[0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0
  0 0 0 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 1 1]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 1 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]
 [0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
  1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
  0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0
  1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0]
 [0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  1 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0]


In [10]:
# Rows: the 11 sentences. Columns: the 60 distinct tokens that remain after
# English stop-word removal (the unfiltered vocabulary printed earlier is larger).
print(count_array.shape)

(11, 60)


In [11]:
# Tokens in column order: entry i names column i of count_array
# (e.g. '19' is index 0 and 'alcohol' is index 1 in the printed vocabulary).
count_token = vectorizer.get_feature_names_out()
count_token

array(['19', 'alcohol', 'avoid', 'based', 'bent', 'bin', 'body', 'cause',
       'clean', 'closed', 'colds', 'contaminated', 'cough', 'cover',
       'covid', 'disinfect', 'dispose', 'door', 'elbow', 'eliminates',
       'enter', 'especially', 'eyes', 'faucets', 'flu', 'following',
       'frequently', 'germs', 'good', 'hand', 'handles', 'hands',
       'hygiene', 'immediately', 'including', 'infect', 'mouth', 'nose',
       'people', 'phone', 'pick', 'protect', 'regularly', 'respiratory',
       'rub', 'screens', 'sneeze', 'soap', 'surfaces', 'thoroughly',
       'tissue', 'touch', 'touched', 'touching', 'transfer', 'used',
       'virus', 'viruses', 'wash', 'water'], dtype=object)

In [12]:
# Label the count matrix for readability: one row per sentence, one column per token.
df_countvect = pd.DataFrame(data=count_array, columns=count_token)
df_countvect

Unnamed: 0,19,alcohol,avoid,based,bent,bin,body,cause,clean,closed,...,tissue,touch,touched,touching,transfer,used,virus,viruses,wash,water
0,0,1,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
5,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
6,0,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,1,0,0,0,1,...,1,0,0,0,0,1,0,0,1,0
8,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
9,0,0,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0


# TF-IDF

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer configured with the same stop_words='english' filtering
# used for the count vectorizer in the bag-of-words section.
tfidf = TfidfVectorizer(stop_words='english')

In [14]:
# Fit on the sentences and get back their TF-IDF weights (one row per sentence);
# the result is converted with .toarray() later when building the DataFrame.
tf_results = tfidf.fit_transform(tokenized_sent)

In [15]:
# Feature names in column order; used below as the column labels for tf_results.
tfidf_token = tfidf.get_feature_names_out()

In [16]:
# Dense TF-IDF table: one row per sentence (0-10), one column per token.
df_tfidfvect = pd.DataFrame(data=tf_results.toarray(), columns=tfidf_token)
# print() renders the frame as plain text, wrapping the columns across several blocks.
print(df_tfidfvect)

          19   alcohol     avoid     based      bent       bin      body  \
0   0.000000  0.323429  0.000000  0.323429  0.000000  0.000000  0.000000   
1   0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
2   0.000000  0.000000  0.516605  0.000000  0.000000  0.000000  0.000000   
3   0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
4   0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
5   0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.514793   
6   0.000000  0.000000  0.000000  0.000000  0.384986  0.000000  0.000000   
7   0.000000  0.000000  0.000000  0.000000  0.000000  0.382666  0.000000   
8   0.294053  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
9   0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
10  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.514793   

       cause     clean    closed  ...    tissue     touch   touched  touching  \
0   0.

In [17]:
# Per-token summary table. The 'tf-idf score' column is the column-wise sum
# (axis=0) of tf_results, i.e. each token's TF-IDF weight added up over all
# sentences, not the weight from any single sentence.
word_count = pd.DataFrame({
    'words': tfidf_token,
    # .flat flattens the summed result so it lines up one-to-one with tfidf_token.
    'tf-idf score': tf_results.sum(axis=0).flat
})
word_count

Unnamed: 0,words,tf-idf score
0,19,0.294053
1,alcohol,0.323429
2,avoid,0.516605
3,based,0.323429
4,bent,0.384986
5,bin,0.382666
6,body,1.029585
7,cause,0.294053
8,clean,0.526957
9,closed,0.382666


In [18]:
# Rank tokens from highest to lowest summed TF-IDF score. The sorted frame is
# only displayed; word_count itself is left unchanged.
word_count.sort_values('tf-idf score', ascending=False)

Unnamed: 0,words,tf-idf score
31,hands,1.331257
56,virus,1.252948
36,mouth,1.113092
20,enter,1.029585
35,infect,1.029585
6,body,1.029585
37,nose,1.025226
57,viruses,0.992836
22,eyes,0.836694
48,surfaces,0.697079
