<a href="https://colab.research.google.com/github/prajaktakini/Language-Models/blob/main/bag_of_words.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



```
Bag Of Words Implementation
References:
1. Medium Blog: https://medium.com/free-code-camp/an-introduction-to-bag-of-words-and-how-to-code-it-in-python-for-nlp-282e87a9da04
```



In [19]:
# Download stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [20]:
# All imports
import string
import numpy
from typing import List
import nltk
from nltk.corpus import stopwords

In [21]:
# Returns corpus of tokens for a given text
def tokenize_text(text: list) -> List[str]:
  words = []
  for sentence in text:
    tokenized_sentence = generate_tokens(sentence)
    words.extend(tokenized_sentence)

  # Sort all words in ASC order
  words = sorted(list(set(words)))
  return words


In [22]:
# For a given sentence, generate tokens post pre-processing (Removes punctuation, stopwords, lowercases the words, etc)
def generate_tokens(sentence: str) -> List[str]:
  words = sentence.split()
  stopwords_set = set(stopwords.words('english'))
  tokens = [w.lower() for w in words if w not in string.punctuation and w.lower() for w in words if w not in stopwords_set]
  return tokens


In [27]:
# Generates bag of words for given text
def generate_bag_of_words(text: list):
  words_corpus = tokenize_text(text)
  print(f'Vocabulary \n {words_corpus} \n')

  for sentence in text:
    tokens = generate_tokens(sentence)

    # Initialises BOW vector for a given sentence
    encoded_vector = numpy.zeros(len(words_corpus))

    for word in tokens:
      for i, w in enumerate(words_corpus):
        if word == w:
          encoded_vector[i] += 1
    print("{0} \n{1}\n".format(sentence, numpy.array(encoded_vector)))

In [29]:
text = ["Various boroughs have made forward steps in the introduction of the cinematograph in the school", "The Birmingham Juvenile Organization Committee has prepared an exceedingly readable and interesting report for presentation to the Birmingham Education Committee", "To expand upon this theory, an open exhibition is to be arranged and will be attended by thousands of children from the senior departments of the schools, the younger element being excluded",
            "Teachers and officials of the Local Education Authority will lend their support.", "A synopsis of prepared notes was given to the teachers and scholars", "The schools are to be formed into groups, so that pupils may attend a special performance at a convenient centre at regular intervals", "The programme of exhibition is to last for one hour; the films selected coming under five headings"]

generate_bag_of_words(text)

Vocabulary 
 ['a', 'arranged', 'attend', 'attended', 'authority', 'birmingham', 'boroughs', 'centre', 'children', 'cinematograph', 'coming', 'committee', 'convenient', 'departments', 'education', 'element', 'exceedingly', 'excluded', 'exhibition', 'expand', 'films', 'five', 'formed', 'forward', 'given', 'groups,', 'headings', 'hour;', 'interesting', 'intervals', 'introduction', 'juvenile', 'last', 'lend', 'local', 'made', 'may', 'notes', 'officials', 'one', 'open', 'organization', 'performance', 'prepared', 'presentation', 'programme', 'pupils', 'readable', 'regular', 'report', 'scholars', 'school', 'schools', 'schools,', 'selected', 'senior', 'special', 'steps', 'support.', 'synopsis', 'teachers', 'the', 'theory,', 'thousands', 'to', 'upon', 'various', 'younger'] 

Various boroughs have made forward steps in the introduction of the cinematograph in the school 
[ 0.  0.  0.  0.  0.  0. 15.  0.  0. 15.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0. 15.  0.  0.  0.  0.  0.  0. 15. 