### Bag of words model

Considers all the words in corpus as entities disregarding their position in a sentence, and hence, context.

In [1]:
import itertools
import numpy as np
import pandas as pd

In [2]:
#each sentence can be considered as document

corpus = ['king is a strong man', 
          'queen is a wise woman', 
          'boy is a young man',
          'girl is a young woman',
          'prince is a young king',
          'princess is a young queen',
          'man is strong', 
          'woman is pretty',
          'prince is a boy will be king',
          'princess is a girl will be queen']

### Boolean Retrieval

1. Checks whether a term is present in document or not. 
2. However, it does not care about the importance of words.

### Methods:

1. Term-document matrix
2. Posting lists

Local and discrete representation of words. 
- Local because it does not capture the context between words. 
- Discrete because the information about word is not distributed across the vector (like word embeddings).

### **Term - document matrix**

In [3]:
words = list(map(lambda x: x.split(), corpus))
vocab = list(set(itertools.chain(*words)))
vocab

['boy',
 'girl',
 'strong',
 'pretty',
 'be',
 'a',
 'king',
 'will',
 'prince',
 'princess',
 'wise',
 'queen',
 'woman',
 'is',
 'man',
 'young']

In [4]:
def check_occurence(word, document):
    
    document=document.split()
    
    if (word in document):
        return 1
    else:
        return 0

occurences = np.array(list(map(lambda y: list(map(lambda x: check_occurence(x, y), vocab)), corpus)))
df = pd.DataFrame(occurences.transpose(), index=vocab, columns=range(len(corpus))) 

# (term, document) matrix
df.shape

(16, 10)

In [5]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
boy,0,0,1,0,0,0,0,0,1,0
girl,0,0,0,1,0,0,0,0,0,1
strong,1,0,0,0,0,0,1,0,0,0
pretty,0,0,0,0,0,0,0,1,0,0
be,0,0,0,0,0,0,0,0,1,1
a,1,1,1,1,1,1,0,0,1,1
king,1,0,0,0,1,0,0,0,1,0
will,0,0,0,0,0,0,0,0,1,1
prince,0,0,0,0,1,0,0,0,1,0
princess,0,0,0,0,0,1,0,0,0,1


In [6]:
#checking sparsity of matrix
sum(df==1) / (df.shape[0] * df.shape[1]) * 100

28.125

### Inverted Index

Since the term-document matrix can be very sparse, we can store the document_ids in lists. 

#### Postings lists

In [7]:
def posting_list(word, corpus):
    doc_ids = np.array(list(range(len(corpus))))
    occurence = np.array(list(map(lambda x: check_occurence(word, x), corpus)))

    idx = (occurence==1)*doc_ids
    return idx[idx!=0].tolist()

In [8]:
posting_dict={}

for word in vocab:
    posting_dict[word]=posting_list(word, corpus)
    
posting_dict

{'boy': [2, 8],
 'girl': [3, 9],
 'strong': [6],
 'pretty': [7],
 'be': [8, 9],
 'a': [1, 2, 3, 4, 5, 8, 9],
 'king': [4, 8],
 'will': [8, 9],
 'prince': [4, 8],
 'princess': [5, 9],
 'wise': [1],
 'queen': [1, 5, 9],
 'woman': [1, 3, 7],
 'is': [1, 2, 3, 4, 5, 6, 7, 8, 9],
 'man': [2, 6],
 'young': [2, 3, 4, 5]}

### Positional Index

As described before, **we are not considering the position of words** and so for **phrase queries** like **'Indian Statistical Institute'**, we need the words to appear in the exact order in a document to be relevant.

And so, we can keep the information of position of words (positional index) in document in the posting list.

In [9]:
def get_positional_index(word, doc_id, document):
    
    positions = (np.array(document)==word).nonzero()[0].tolist()
    
    return {doc_id: positions}
    
def get_posting_list(word, corpus):
    
    doc_ids = np.array(list(map(lambda x: check_occurence(word, x), corpus))).nonzero()[0].tolist()
    documents = list(map(lambda x: x.split(), corpus))
    
    return list(map(lambda x: get_positional_index(word, x, documents[x]), doc_ids))

In [10]:
word='man'
np.array(list(map(lambda x: check_occurence(word, x), corpus))).nonzero()[0].tolist()

[0, 2, 6]

In [11]:
from collections import ChainMap

posting_dict={}

for word in vocab:
    docs = get_posting_list(word, corpus)
    posting_dict[word] = dict(ChainMap(*docs))
    
posting_dict

{'boy': {8: [3], 2: [0]},
 'girl': {9: [3], 3: [0]},
 'strong': {6: [2], 0: [3]},
 'pretty': {7: [2]},
 'be': {9: [5], 8: [5]},
 'a': {9: [2], 8: [2], 5: [2], 4: [2], 3: [2], 2: [2], 1: [2], 0: [2]},
 'king': {8: [6], 4: [4], 0: [0]},
 'will': {9: [4], 8: [4]},
 'prince': {8: [0], 4: [0]},
 'princess': {9: [0], 5: [0]},
 'wise': {1: [3]},
 'queen': {9: [6], 5: [4], 1: [0]},
 'woman': {7: [0], 3: [4], 1: [4]},
 'is': {9: [1],
  8: [1],
  7: [1],
  6: [1],
  5: [1],
  4: [1],
  3: [1],
  2: [1],
  1: [1],
  0: [1]},
 'man': {6: [0], 2: [4], 0: [4]},
 'young': {5: [3], 4: [3], 3: [3], 2: [3]}}

### TF-iDF weighting

Captures how important words are relative to corpus.

Local and continuous representation of words. Local because it does not capture the context of words.

In [12]:
def calculate_term_freq(word, document):
    words = np.array(document.split())
    return sum(words==word) / len(words)
    
def calculate_inverse_doc_freq(word, corpus):
    n_occurence = sum(map(lambda x: check_occurence(word, x), corpus))
    
    if (n_occurence==0):
        n_occurence = 1
        
    return np.log(len(corpus) / n_occurence)

tf = np.array(list(map(lambda y: list(map(lambda x: calculate_term_freq(x, y), vocab)), corpus)))
idf = list(map(lambda x: calculate_inverse_doc_freq(x, corpus), vocab))
idf = np.array([idf]*len(corpus))

tf.shape, idf.shape

((10, 16), (10, 16))

In [13]:
df_tfidf = pd.DataFrame((tf*idf).transpose(), index=vocab, columns=range(len(corpus))).round(2)
df_tfidf

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
boy,0.0,0.0,0.32,0.0,0.0,0.0,0.0,0.0,0.23,0.0
girl,0.0,0.0,0.0,0.32,0.0,0.0,0.0,0.0,0.0,0.23
strong,0.32,0.0,0.0,0.0,0.0,0.0,0.54,0.0,0.0,0.0
pretty,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.77,0.0,0.0
be,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.23,0.23
a,0.04,0.04,0.04,0.04,0.04,0.04,0.0,0.0,0.03,0.03
king,0.24,0.0,0.0,0.0,0.24,0.0,0.0,0.0,0.17,0.0
will,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.23,0.23
prince,0.0,0.0,0.0,0.0,0.32,0.0,0.0,0.0,0.23,0.0
princess,0.0,0.0,0.0,0.0,0.0,0.32,0.0,0.0,0.0,0.23
