In [None]:
"""vsm.py implements a toy search engine to illustrate the vector
space model for documents.

It asks you to enter a search query, and then returns all documents
matching the query, in decreasing order of cosine similarity,
according to the vector space model."""

from collections import defaultdict
from functools import reduce
import math
import sys


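# The corpus is not hard-coded in this file.  Each document has an id,
# and these ids are the keys of document_filenames, which is loaded from
# document_filenames.txt below.  Despite the name, the values are passed
# straight to tokenize(), i.e., they are treated as document text.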



# dictionary: a set to contain all terms (i.e., words) in the document
# corpus.  It starts out empty and is filled in by
# initialize_terms_and_postings().
dictionary = set()

# postings: a mapping (a plain dict once it has been loaded from
# postings.txt below) whose keys are terms, and whose corresponding
# values are the so-called "postings list" for that term, i.e., the
# list of documents the term appears in.
#
# The way we implement the postings list is actually not as a Python
# list.  Rather, it's as a dict whose keys are the document ids of
# documents that the term appears in, with corresponding values equal
# to the frequency with which the term occurs in the document.
#
# As a result, postings[term] is the postings list for term, and
# postings[term][id] is the frequency with which term appears in
# document id.


# Retrieving #################################################################
# Load the index structures from their JSON dumps.  They are not built in
# this file (presumably an earlier indexing run wrote them -- confirm).
import json


def _load_json(path):
    """Returns the object decoded from the UTF-8 JSON file at path.  The
    file is closed again as soon as it has been read."""
    with open(path, 'r', encoding="utf-8") as handle:
        return json.load(handle)


# postings_normdoc[term][id]: document-side weight read by similarity().
postings_normdoc = _load_json('postings_normdoc.txt')

# document_id / document_title: indexed with int(id) when do_search()
# prints its results.
document_id = _load_json('document_id.txt')
document_title = _load_json('document_title.txt')

# document_filenames: keys are document ids; the values are tokenized
# directly by initialize_terms_and_postings().
document_filenames = _load_json('document_filenames.txt')

# postings[term][id]: frequency of term in document id.
postings = _load_json('postings.txt')

##########################################################################################

# The size of the corpus, i.e., the number of entries in
# document_filenames.  Used as the numerator in
# inverse_document_frequency().
N = len(document_filenames)

# document_frequency: a defaultdict whose keys are terms, with
# corresponding values equal to the number of documents which contain
# the key, i.e., the document frequency.  Filled in by
# initialize_document_frequencies(); a term that was never stored reads
# as 0.
document_frequency = defaultdict(int)

# length: a defaultdict whose keys are document ids, with values equal
# to the Euclidean length of the corresponding document vector.  Filled
# in by initialize_lengths(); an id that was never stored reads as 0.0.
length = defaultdict(float)

# The list of characters (mostly, punctuation) we want to strip out of
# terms in the document.  tokenize() passes this to str.strip(), so the
# characters are only removed from the two ends of each term, never from
# its middle.  The repeated "!" is harmless.
characters = " .,!#$%^&*();:\n\t\\\"?!{}[]<>"

def main():
    """Runs the three index-initialization steps in order and then
    answers search queries in an endless loop.  The loop itself never
    terminates; do_search() ends the process through sys.exit() when an
    empty query is entered."""
    setup_steps = (
        initialize_terms_and_postings,
        initialize_document_frequencies,
        initialize_lengths,
    )
    for run_step in setup_steps:
        run_step()
    while True:
        do_search()

def initialize_terms_and_postings():
    """Collects the vocabulary of the corpus into the global dictionary.

    Every value in document_filenames is split into terms (i.e.,
    tokenized) and its distinct terms are added to dictionary.

    Despite the name, no postings are built here: postings and
    postings_normdoc are loaded from JSON files at module level, so
    this function only has to rebuild the set of known terms."""
    for document in document_filenames.values():
        # Update in place: building a fresh union for every document
        # would copy the whole vocabulary each time round the loop.
        dictionary.update(tokenize(document))
            
def tokenize(document):
    """Splits document into a list of terms.  The text is case-folded
    (lowercased) and split on whitespace, and every resulting piece has
    the punctuation listed in characters stripped from both of its
    ends.  A piece made up only of such characters becomes the empty
    string; it is not dropped from the result."""
    lowered = document.lower()
    pieces = lowered.split()
    return [piece.strip(characters) for piece in pieces]

def initialize_document_frequencies():
    """Fills document_frequency: for each term in the dictionary, the
    stored value is the number of entries in that term's postings list,
    i.e., the number of documents the term appears in."""
    document_frequency.update(
        (term, len(postings[term])) for term in dictionary)

def initialize_lengths():
    """Fills length: for each document id, stores the Euclidean length
    of its vector, i.e., the square root of the sum of imp(term, id)
    squared over every term in the dictionary."""
    for doc_key in document_filenames:
        squared_total = 0
        # Accumulate term by term so the summation order stays fixed.
        for weight in (imp(term, doc_key) for term in dictionary):
            squared_total = squared_total + weight ** 2
        length[doc_key] = math.sqrt(squared_total)


def imp(term,id):
    """Returns the importance of term in document id: the frequency
    stored in postings[term][id] multiplied by the inverse document
    frequency of term.  Returns 0.0 when the document is not in the
    term's postings list."""
    term_postings = postings[term]
    if id not in term_postings:
        return 0.0
    return term_postings[id] * inverse_document_frequency(term)

def inverse_document_frequency(term):
    """Returns the inverse document frequency of term, i.e., the base-2
    logarithm of N divided by the term's document frequency.  By
    convention a term that isn't in the dictionary gets 0.0."""
    if term not in dictionary:
        return 0.0
    ratio = N / document_frequency[term]
    return math.log(ratio, 2)

def do_search():
    """Asks the user what they would like to search for, and prints up
    to ten matching documents in decreasing order of similarity score.

    Only documents containing every query term are considered.  An
    empty query ends the program through sys.exit()."""
    query = tokenize(input("Search query >> "))
    if not query:
        sys.exit()
    # Find document ids containing all query terms.  Works by
    # intersecting the posting lists for all query terms.  postings is a
    # plain dict once loaded from JSON, so a term missing from the index
    # has to contribute an empty set here rather than raise KeyError.
    relevant_document_ids = intersection(
            [set(postings.get(term, ())) for term in query])
    if not relevant_document_ids:
        print("No documents matched all query terms.")
        return
    scores = sorted(((doc_key, similarity(query, doc_key))
                     for doc_key in relevant_document_ids),
                    key=lambda pair: pair[1],
                    reverse=True)
    print("Score: filename")
    for doc_key, score in scores[:10]:
        # The ids are strings (JSON object keys), while document_id and
        # document_title are indexed by position.
        position = int(doc_key)
        # Formatting (instead of "+") also copes with non-string entries.
        print(f"Ltc.lnc score:{score} Doc_id:{document_id[position]}"
              f" Title:{document_title[position]}")
            
def intersection(sets):
    """Returns the intersection of all sets in the iterable sets, as a
    new set.  Requires that sets contains at least one element,
    otherwise it raises a TypeError.

    The result is always a fresh object, even when only one set is
    passed in, so callers can modify it without touching the input."""
    return set.intersection(*sets)

def similarity(query,id):
    """Returns the similarity score between query and document id.

    For every query term that is in the dictionary, the document-side
    weight postings_normdoc[term][id] is multiplied by a query-side
    weight, and the products are summed.  The query-side weights are
    divided by the Euclidean norm of the complete query-side vector, so
    the score does not depend on the order of the terms in query.

    Returns 0.0 when no query term is in the dictionary or when every
    query-side weight is zero (e.g. a term that occurs in every
    document has an inverse document frequency of 0).

    NOTE(review): the query-side weight is
    inverse_document_frequency(term) * imp(term, id), which already
    contains the document's term frequency and a second idf factor.
    That is not the textbook "ltc" query weighting -- confirm that it
    is intended."""
    weight_pairs = [
        (postings_normdoc[term][id],
         inverse_document_frequency(term) * imp(term, id))
        for term in query
        if term in dictionary
    ]
    # Normalize by the norm of the whole vector, computed once up front.
    query_norm = math.sqrt(sum(q_weight ** 2 for _, q_weight in weight_pairs))
    if query_norm == 0:
        # Nothing to normalize; avoids dividing by zero.
        return 0.0
    dot_product = sum(d_weight * q_weight
                      for d_weight, q_weight in weight_pairs)
    return dot_product / query_norm

# Start the interactive search loop only when this code is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()


Search query >> hey
Score: filename
Ltc.lnc score:0.023059538305076375 Doc_id:798 Title:Aries (constellation)
Ltc.lnc score:0.014379033357917384 Doc_id:880 Title:ABBA
Search query >> this
Score: filename
Ltc.lnc score:0.159352395890582 Doc_id:1332 Title:August 7
Ltc.lnc score:0.13526729247657066 Doc_id:1110 Title:Demographics of American Samoa
Ltc.lnc score:0.1323765622184754 Doc_id:994 Title:Arecales
Ltc.lnc score:0.10853661145627949 Doc_id:1158 Title:Algebraic number
Ltc.lnc score:0.10399699746209526 Doc_id:1392 Title:Dasyproctidae
Ltc.lnc score:0.09895435240736543 Doc_id:675 Title:Affirming the consequent
Ltc.lnc score:0.09789255946551233 Doc_id:966 Title:American shot
Ltc.lnc score:0.09587478630628096 Doc_id:788 Title:Apiales
Ltc.lnc score:0.09479893125876564 Doc_id:1262 Title:Argot
Ltc.lnc score:0.09184241996377372 Doc_id:779 Title:Anthophyta
Search query >> Anarchism
Score: filename
Ltc.lnc score:0.04560908291467853 Doc_id:12 Title:Anarchism
Ltc.lnc score:0.03524632696093616 Doc_