In [1]:
import re
import collections
import os
import time
import math

In [2]:
# Directory whose files are indexed.
path = "practice_collection"

# Next document ID to assign; advanced once per indexed file.
curDocID = 0

# docID -> file name
indToDoc = dict()

# term -> [(docID, [pos1, pos2, ...]), ...]
postingList = collections.defaultdict(list)

In [3]:
def tokenize(f, path):
    """Read one file and return its alphabetic tokens with their positions.

    Every character that is neither an ASCII letter nor whitespace is deleted
    (so "Cat's" becomes "cats" and "2nd" becomes "nd"); the remainder is split
    on whitespace and lower-cased. Positions are 0-based and count tokens
    across the whole file, not per line.

    Args:
        f: File name, relative to ``path``.
        path: Directory containing ``f``.

    Returns:
        A list ``[(term, pos), (term, pos), ...]`` in reading order.
    """
    # Compiled once per call instead of once per line.
    nonLetters = re.compile(r'[^A-Za-z\s]+')
    terms = []
    # Decode explicitly so the result does not depend on the platform's default
    # encoding. Undecodable bytes are dropped rather than raising: anything
    # non-ASCII would be stripped by the regex below anyway.
    with open(os.path.join(path, f), encoding="utf-8", errors="ignore") as file:
        for line in file:
            for word in nonLetters.sub('', line).split():
                # len(terms) is the running token index within the file.
                terms.append((word.lower(), len(terms)))
    return terms

In [4]:
def createOldPostingList(path):
    """Index every file in ``path`` into the global positional posting list.

    For each regular file a new document ID is taken from the global
    ``curDocID`` counter, recorded in ``indToDoc``, and one
    ``(docID, [pos1, pos2, ...])`` entry per distinct term is appended to the
    global ``postingList``. Calling this again appends further documents; it
    does not reset the globals.

    Args:
        path: Directory whose files are indexed.

    Returns:
        The global ``postingList``: ``{term: [(docID, [positions...]), ...]}``.
    """
    # TODO: remove in real code
    global curDocID

    # perf_counter is monotonic, which makes it the right clock for intervals.
    startTime = time.perf_counter()

    # Loaded for the stop-word filter below, which is currently switched off.
    stopList = {t for t, _ in tokenize("stop-list.txt", ".")}

    # os.listdir returns entries in arbitrary order; sorting makes the
    # docID assignment reproducible from run to run.
    for f in sorted(os.listdir(path)):
        # Skip sub-directories (e.g. Jupyter's .ipynb_checkpoints), which
        # cannot be opened as documents.
        if not os.path.isfile(os.path.join(path, f)):
            continue

        indToDoc[curDocID] = f

        # Group this document's token positions by term:
        # {word: [pos1, pos2, pos3]}
        wordToPos = collections.defaultdict(list)
        for word, pos in tokenize(f, path):
            # if word in stopList: TODO remove in real code
            #     continue
            wordToPos[word].append(pos)

        # Append to posting list {term: [(docID1, [pos1, pos2, pos3, pos4])]}
        for term, positions in wordToPos.items():
            postingList[term].append((curDocID, positions))

        # Every indexed file gets its own ID.
        curDocID += 1

    elapsed = time.perf_counter() - startTime
    print(f"Index built in {elapsed} seconds.")
    return postingList

Create new posting list

In [5]:
# Next term ID to assign; advanced once per indexed term.
curTermIndex = 0

# termID -> term
idToTerm = dict()

# term -> termID (inverse of idToTerm)
termToId = dict()

In [6]:
def createNewPostringList():
    """Build the weighted, term-ID-keyed index from the positional index.

    Runs ``createOldPostingList(path)`` and, for every term it returns, takes
    the next ID from the global ``curTermIndex`` counter, records it in
    ``idToTerm`` and stores an entry of the form
    ``[idf, (docID, 1 + log10(tf), [positions...]), ...]`` where
    ``idf = log10(N / df)``, ``N`` is the number of entries in ``indToDoc``
    and ``df`` is the number of documents containing the term.
    ``termToId`` is refreshed from ``idToTerm`` before returning.

    Returns:
        A ``defaultdict(list)`` mapping term ID to the entry described above.
    """
    global curTermIndex  # TODO REMOVE
    weightedIndex = collections.defaultdict(list)

    # No stop-word filtering is applied here; every indexed term gets an ID.
    for term, postings in createOldPostingList(path).items():
        idToTerm[curTermIndex] = term

        idf = math.log10(len(indToDoc)/len(postings))
        weightedPostings = [
            (docId, 1 + math.log10(len(positions)), positions)
            for docId, positions in postings
        ]
        weightedIndex[curTermIndex] = [idf] + weightedPostings

        curTermIndex += 1

    # Rebuild the inverse mapping term -> termID.
    termToId.update((term, termId) for termId, term in idToTerm.items())
    return weightedIndex

In [7]:
# Build the index once; the cells below read this module-level
# `newPostingList` (termID -> [idf, (docID, tf weight, positions), ...]).
newPostingList = createNewPostringList()

Index built in 0.0026581287384033203 seconds.


In [8]:
# Show the weighted index, one term per line:
# "termID : [idf, (docID, tf weight, positions), ...]".
for k, v in newPostingList.items():
    print(f"{k} : {v}")
# Show the termID -> term mapping needed to read the IDs above.
print(idToTerm)

0 : [0.17609125905568124, (0, 1.0, [0]), (1, 1.0, [0])]
1 : [0.17609125905568124, (0, 1.0, [1]), (1, 1.0, [1])]
2 : [0.47712125471966244, (0, 1.0, [2])]
3 : [0.17609125905568124, (1, 1.0, [2]), (2, 1.0, [2])]
4 : [0.47712125471966244, (2, 1.0, [0])]
5 : [0.47712125471966244, (2, 1.0, [1])]
{0: 'the', 1: 'dog', 2: 'barked', 3: 'jumped', 4: 'a', 5: 'cat'}


In [9]:
def convertDocumentsToVector():
    """Turn the weighted index into one dense tf-idf vector per document.

    Reads the globals ``indToDoc``, ``idToTerm`` and ``newPostingList``.
    Each vector has ``len(idToTerm)`` components and is indexed by term ID;
    component ``t`` of document ``d`` is ``idf(t) * tfWeight(t, d)``, or
    ``0.0`` when the term has no posting for that document.

    Returns:
        ``{docID: [weight for termID 0, weight for termID 1, ...]}``.
    """
    dimensions = len(idToTerm)
    # Start every document at the zero vector (one fresh list per document).
    docToVec = {docId: dimensions*[float(0)] for docId in indToDoc.keys()}

    for termId, entry in newPostingList.items():
        # entry[0] is the term's idf; the postings follow it.
        idf = entry[0]
        for posting in entry[1:]:
            docId, tfWeight = posting[0], posting[1]
            docToVec[docId][termId] = idf*tfWeight
    return docToVec

In [10]:
def cosine_similarity(v1,v2):
    """Compute the cosine similarity of v1 to v2: (v1 dot v2) / (||v1|| * ||v2||).

    Args:
        v1: Sequence of numbers.
        v2: Sequence of numbers, indexed in parallel with ``v1``; it must be
            at least as long as ``v1``.

    Returns:
        The cosine of the angle between the two vectors, or ``0.0`` when
        either vector has zero magnitude (the cosine is undefined there, and
        a zero vector shares no weighted term with anything).
    """
    sumxx, sumxy, sumyy = 0, 0, 0
    for i, x in enumerate(v1):
        y = v2[i]
        sumxx += x*x
        sumyy += y*y
        sumxy += x*y
    denominator = math.sqrt(sumxx*sumyy)
    # An all-zero vector (e.g. a query with no indexed terms) would otherwise
    # raise ZeroDivisionError.
    if denominator == 0:
        return 0.0
    return sumxy/denominator

In [19]:
# Materialise the tf-idf document vectors; exact_query reads this
# module-level `docToVec`.
docToVec = convertDocumentsToVector()
# Sanity check: similarity between the documents with IDs 0 and 1.
print(cosine_similarity(docToVec[0], docToVec[1]))

0.37780020399389935


In [32]:
# Print the termID -> term mapping, then each document's vector
# (component i is the weight of the term with ID i).
print(idToTerm)
for k, v in docToVec.items():
    print(f"{k}: {v}")

{0: 'the', 1: 'dog', 2: 'barked', 3: 'jumped', 4: 'a', 5: 'cat'}
0: [0.17609125905568124, 0.17609125905568124, 0.47712125471966244, 0.0, 0.0, 0.0]
1: [0.17609125905568124, 0.17609125905568124, 0.0, 0.17609125905568124, 0.0, 0.0]
2: [0.0, 0.0, 0.0, 0.17609125905568124, 0.47712125471966244, 0.47712125471966244]


In [30]:
def exact_query(query_terms, k):
    """Rank every document against the query by cosine similarity.

    The query is turned into a tf-idf vector in the same term-ID space as the
    document vectors: each indexed query term gets weight
    ``(1 + log10(tf in query)) * idf``; terms that are not in the index are
    ignored. Reads the globals ``idToTerm``, ``termToId``, ``newPostingList``,
    ``docToVec`` and ``indToDoc``, and prints the similarity of every
    document before ranking.

    Args:
        query_terms: Iterable of query words (matched case-insensitively).
        k: Maximum number of documents to return; values below 0 are
            treated as 0.

    Returns:
        The file names of the ``k`` most similar documents, best first.
        Documents with similarity 0 are still eligible when fewer than ``k``
        documents match.
    """
    query_terms = [word.lower() for word in query_terms]

    termToFreq = {}
    for t in query_terms:
        termToFreq[t] = termToFreq.get(t, 0) + 1

    queryVector = len(idToTerm)*[float(0)]
    for term, freq in termToFreq.items():
        if term in termToId:
            termId = termToId[term]
            w = 1 + math.log10(freq)
            idf = newPostingList[termId][0]
            queryVector[termId] = w*idf

    docToSimilarity = []
    for doc, vec in docToVec.items():
        try:
            similarity = cosine_similarity(queryVector, vec)
        except ZeroDivisionError:
            # Raised when the query vector or the document vector is all
            # zeros (no indexed query term, or only idf-0 terms); such a pair
            # has nothing in common, so score it 0 instead of aborting.
            similarity = 0.0
        docToSimilarity.append((doc, similarity))
    for d, s in docToSimilarity:
        print(f"doc: {indToDoc[d]}, similarity: {s}")
    # The sort is stable, so ties keep their docToVec iteration order.
    docToSimilarity.sort(key = lambda x:x[1], reverse=True)
    # max(k, 0): a negative k would otherwise slice from the wrong end.
    return [indToDoc[t[0]] for t in docToSimilarity[:max(k, 0)]]

In [31]:
# Example: rank the collection against a three-term query and print the
# file names of the top 2 documents (exact_query also prints every score).
query_terms = ["a", "cat", "jumped"]
print(exact_query(query_terms, 2))

doc: d1.txt, similarity: 0.0
doc: d2.txt, similarity: 0.14578946632810494
doc: d3.txt, similarity: 1.0
['d3.txt', 'd2.txt']


In [50]:
# Champion lists: for every term keep its idf followed by the (at most) r
# postings with the highest tf weight.
# Resulting shape: {termID: [idf, (docID, tf weight, positions), ...]}.
championList = collections.defaultdict(list)
r = 2
for termId, postList in newPostingList.items():
    idf = postList[0]
    # Highest tf weight first, so the [:r] slice keeps the best documents
    # (an ascending sort would keep the r weakest). sorted() returns a new
    # list and is stable, so ties keep their posting-list order.
    documents = sorted(postList[1:], key=lambda x: x[1], reverse=True)
    print(documents)
    championList[termId].append(idf)
    championList[termId].extend(documents[:r])

[(0, 1.0, [0]), (1, 1.0, [0])]
[(0, 1.0, [1]), (1, 1.0, [1])]
[(0, 1.0, [2])]
[(1, 1.0, [2]), (2, 1.0, [2])]
[(2, 1.0, [0])]
[(2, 1.0, [1])]


In [51]:
# Show the champion lists: termID -> [idf, up to r postings].
print(championList)

defaultdict(<class 'list'>, {0: [0.17609125905568124, (0, 1.0, [0]), (1, 1.0, [0])], 1: [0.17609125905568124, (0, 1.0, [1]), (1, 1.0, [1])], 2: [0.47712125471966244, (0, 1.0, [2])], 3: [0.17609125905568124, (1, 1.0, [2]), (2, 1.0, [2])], 4: [0.47712125471966244, (2, 1.0, [0])], 5: [0.47712125471966244, (2, 1.0, [1])]})
