# Term-Document Matrix
Sparse representation of the occurence of terms in a document.

Used in Latent Semantic Analysis (LSA)

In [1]:
# Collection of documents (corpus)

review_1 = "The Glider II is a great soccer ball"
review_2 = "What a bad soccer ball"
review_3 = "I am happy with The glider"

In [2]:
docs = [review_1, review_2, review_3]
docs

['The Glider II is a great soccer ball',
 'What a bad soccer ball',
 'I am happy with The glider']

In [3]:
# Gather the set of all unique terms

unique_terms = {term for doc in docs for term in doc.split()}
unique_terms

{'Glider',
 'I',
 'II',
 'The',
 'What',
 'a',
 'am',
 'bad',
 'ball',
 'glider',
 'great',
 'happy',
 'is',
 'soccer',
 'with'}

In [4]:
# Construct a term-document matrix
# here as a Python dictionary for ease of interpretability

doc_term_matrix = {}

for term in unique_terms:
    doc_term_matrix[term] = []
    
    for doc in docs:
        if term in doc:
            doc_term_matrix[term].append(1)
        else:
             doc_term_matrix[term].append(0)

doc_term_matrix

{'is': [1, 0, 0],
 'glider': [0, 0, 1],
 'I': [1, 0, 1],
 'ball': [1, 1, 0],
 'am': [0, 0, 1],
 'The': [1, 0, 1],
 'II': [1, 0, 0],
 'Glider': [1, 0, 0],
 'bad': [0, 1, 0],
 'with': [0, 0, 1],
 'happy': [0, 0, 1],
 'great': [1, 0, 0],
 'What': [0, 1, 0],
 'a': [1, 1, 1],
 'soccer': [1, 1, 0]}

In [5]:
# The query to find all documents containing "Glider" AND "soccer"
# is just a bitwise AND:

import numpy as np

docs_array = np.array(docs, dtype='object')

v1 = np.array(doc_term_matrix['Glider'])
v2 = np.array(doc_term_matrix['soccer'])

print(v1)
print(v2)
print('-------')
v3 = v1 & v2
print(v3)

[1 0 0]
[1 1 0]
-------
[1 0 0]


In [6]:
# We can now get the matching documents from our corpus with the result
[doc for doc in v3 * docs_array if doc]

['The Glider II is a great soccer ball']

In [7]:
v1 = np.array(doc_term_matrix['a'])    
v2 = np.array(doc_term_matrix['ball'])

print(v1)
print(v2)
print('-------')
v3 = v1 & v2
print(v3)

[1 1 1]
[1 1 0]
-------
[1 1 0]


In [8]:
[doc for doc in v3 * docs_array if doc]

['The Glider II is a great soccer ball', 'What a bad soccer ball']

In [9]:
# Bitwise OR to construct 'this' or 'that' queries.

v1 = np.array(doc_term_matrix['great'])    
v2 = np.array(doc_term_matrix['bad'])

print(v1)
print(v2)
print('-------')
v3 = v1 | v2
print(v3)

[1 0 0]
[0 1 0]
-------
[1 1 0]


In [10]:
[doc for doc in v3 * docs_array if doc]

['The Glider II is a great soccer ball', 'What a bad soccer ball']