# Implementation and analysis of **BoW, TF-IDF** from scratch

In [None]:
from collections import Counter

import numpy as np

In [None]:
# Toy corpus: each string is one document. The functions below tokenise
# documents with str.split(), so words are the whitespace-separated tokens.
corpus = ['i like that the software can support multiple user and groups',
          'adding people to the groups was easy',
          'and there is an easy scheduling option',
          'had some issues had to work it out via chat which was annoying',
          'the best feature is the new text to speech',
          'the customer service is insane',
          'they will respond even on sundays']

## BoW

In [None]:
## Build a vocabulary from the given corpus
def build_vocab(corpus):
  """Collect the distinct whitespace-separated words found in `corpus`.

  Args:
    corpus: iterable of document strings.

  Returns:
    A set containing every word that occurs in at least one document.
  """
  return {token for text in corpus for token in text.split()}

## Return the count of each word in given document
def bow(doc, vocab):
  """Return the bag-of-words count vector of `doc` over `vocab`.

  Args:
    doc: document string; words are the whitespace-separated tokens.
    vocab: iterable of vocabulary terms (the notebook passes a set).

  Returns:
    A list of floats, one per distinct vocabulary term in the iteration
    order of `vocab`. Each entry is the number of times that term occurs
    in `doc`; words of `doc` that are not in `vocab` are ignored.
  """
  # Tokenise and count once instead of re-scanning the document per term.
  counts = Counter(doc.split())
  # dict.fromkeys drops repeated terms while keeping first-seen order, so the
  # vector has exactly one slot per distinct term.
  return [float(counts[term]) for term in dict.fromkeys(vocab)]

## Compute distance between two vectors
def vectors_distance(vec1, vec2):
  """Return the Euclidean (L2) distance between two equal-length vectors.

  Args:
    vec1: sequence of numbers.
    vec2: sequence of numbers with the same length as `vec1`.

  Returns:
    The square root of the summed squared element-wise differences
    (0.0 when both vectors are empty).

  Raises:
    ValueError: if the two vectors differ in length.
  """
  # Without this check a longer vec2 would be silently truncated and the
  # function would return a distance that ignores the extra components.
  if len(vec1) != len(vec2):
    raise ValueError(
        f"vectors must have the same length, got {len(vec1)} and {len(vec2)}")

  squared_sum = sum((a - b) ** 2 for a, b in zip(vec1, vec2))
  return np.sqrt(squared_sum)

In [None]:
## Build vocab from corpus
## (build_vocab returns a set, so the position of each word in the vectors
## follows the set's iteration order, which is not guaranteed to be the same
## after a kernel restart)
vocab = build_vocab(corpus)

## BoW for a given doc
doc = 'the best feature is the new text to speech'
print(f"Vector form of '{doc}'\n {bow(doc, vocab)}")


Vector form of 'the best feature is the new text to speech'
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [None]:
## How similar are two reviews?
## doc1 and doc3 are taken from the corpus; doc2 is a variation of doc1 whose
## words 'he' and 'weekends' are not in the vocabulary, so bow() ignores them.
doc1 = 'they will respond even on sundays'
doc2 = 'he will respond even on weekends'
doc3 = 'had some issues had to work it out via chat which was annoying'

## Pairwise Euclidean distances between the BoW vectors (smaller = more similar)
print(f"doc1, doc2 ==> {vectors_distance( bow(doc1, vocab), bow(doc2, vocab) )}")
print(f"doc2, doc3 ==> {vectors_distance( bow(doc2, vocab), bow(doc3, vocab) )}")
print(f"doc3, doc1 ==> {vectors_distance( bow(doc3, vocab), bow(doc1, vocab) )}")

doc1, doc2 ==> 1.4142135623730951
doc2, doc3 ==> 4.358898943540674
doc3, doc1 ==> 4.58257569495584


* The distance between bow(doc1) and bow(doc2) is small (i.e. the two vectors are close), which implies that doc1 and doc2 are much more similar to each other than either of them is to doc3.

## TF - IDF

In [None]:
## Build a vocabulary from the given corpus
def build_vocab(corpus):
  """Return the set of distinct whitespace-separated words in `corpus`.

  NOTE(review): this repeats the `build_vocab` already defined in the BoW
  section of the notebook; consider keeping a single definition.

  Args:
    corpus: iterable of document strings.

  Returns:
    A set with every word that appears in at least one document.
  """
  words = set()
  for text in corpus:
    # set.update adds every token of the document in one call
    words.update(text.split())
  return words

def tf(word, doc):
  """Return the term frequency of `word` in `doc`.

  Args:
    word: the term to look for (compared by exact equality with each token).
    doc: document string; tokens are its whitespace-separated words.

  Returns:
    Occurrences of `word` divided by the number of tokens in `doc`, as a
    float. An empty or whitespace-only document yields 0.0.
  """
  tokens = doc.split()
  # A document without tokens cannot contain the word; returning 0.0 avoids
  # the division by zero the ratio below would otherwise hit.
  if not tokens:
    return 0.0
  return tokens.count(word) / len(tokens)

def idf(word, corpus):
  """Return the inverse document frequency of `word` in `corpus`.

  Computed as log10(N / df), where N is the number of documents and df is
  the number of documents that contain `word` as a whitespace-separated token.

  Args:
    word: the term to score.
    corpus: sized collection of document strings.

  Returns:
    log10(N / df) when at least one document contains `word`. When no
    document contains it (including an empty corpus) the ratio is undefined,
    and 0.0 is returned so the term gets zero tf-idf weight.
  """
  n_docs = len(corpus)
  doc_freq = sum(1 for text in corpus if word in text.split())
  # Guard the division: df == 0 would otherwise raise ZeroDivisionError.
  if doc_freq == 0:
    return 0.0
  return np.log10(n_docs / doc_freq)

print(idf('the', corpus), idf('scheduling', corpus))
## output: 0.24303804868629442 0.8450980400142568
## The word 'the' is more frequent in the corpus, so its idf value is small
## compared to that of the word 'scheduling', which occurs only once


def tf_idf(doc, vocab, docs=None):
  """Return the tf-idf vector of `doc` over `vocab`.

  Args:
    doc: document string to encode.
    vocab: iterable of vocabulary terms (the notebook passes a set).
    docs: optional collection of document strings used to compute idf.
      When omitted, the notebook-level `corpus` variable is used, which
      keeps existing two-argument calls working. Pass it explicitly to make
      the result independent of whatever `corpus` is currently bound to.

  Returns:
    A list with one value per distinct vocabulary term, in the iteration
    order of `vocab`: tf(term, doc) * idf(term, docs), rounded to 2 decimals.
  """
  # Resolve the reference documents once; `corpus` is only looked up when the
  # caller did not supply `docs`.
  reference_docs = corpus if docs is None else docs
  # dict.fromkeys keeps one slot per distinct term, in first-seen order.
  return [
      round(tf(term, doc) * idf(term, reference_docs), 2)
      for term in dict.fromkeys(vocab)
  ]

0.24303804868629442 0.8450980400142568


## BoW v/s TF-IDF

In [None]:
# Rebinds the notebook-level `corpus`. tf_idf() reads this variable when it
# computes idf, so tf_idf calls made after this cell are scored against
# these five documents instead of the earlier corpus.
corpus = [
          'the best offer mobile',
          'the best iphone ever',
          'this phone is the best',
          'great deal on electronics',
          'budget friendly it is the best'
]

In [None]:
# Rebuild the vocabulary from the new corpus, then encode the same document
# with both schemes. bow() and tf_idf() both iterate over `vocab`, so the two
# printed vectors line up position by position with the printed set.
vocab = build_vocab(corpus)
doc = 'the best mobile offer is the best'
print(vocab)
print(bow(doc, vocab))
print(tf_idf(doc, vocab))

{'friendly', 'mobile', 'phone', 'deal', 'best', 'electronics', 'is', 'it', 'ever', 'great', 'on', 'budget', 'iphone', 'offer', 'this', 'the'}
[0.0, 1.0, 0.0, 0.0, 2.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 2.0]
[0.0, 0.1, 0.0, 0.0, 0.03, 0.0, 0.06, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.03]


* For the word like 'the' 
    * bow --> 2.0 and tf-idf --> 0.03
    * this means *`bow`* treats 'the' as a very important feature,
    * whereas *`tf-idf`* treats it as a weak feature.

* The tf-idf value '0.1' (the largest tf-idf value) belongs to the words 'offer' and 'mobile'
* bow value '2.0' (largest bow value) is for the words 'the', 'best'
* implies, the review is talking about *`mobile and offer`* not about *`the`*