In [1]:
import re
import collections
import os
import time
import math

In [2]:
# Directory that holds the document collection to index.
path = "practice3_collection"
# Next document ID to hand out; createPostingList bumps it once per file.
curDocID = 0
# docID -> file name of the indexed document.
indToDoc = dict()
# term -> [(docID, [pos1, pos2, ...]), ...]
postingList = collections.defaultdict(list)

In [4]:
def tokenize(f, path):
    """Read a text file and return its lower-cased tokens with their positions.

    Every character that is not an ASCII letter or whitespace is deleted from
    each line (so "don't" becomes "dont" and digits vanish); what remains is
    split on whitespace and lower-cased.

    Args:
        f: Name of the file, relative to ``path``.
        path: Directory that contains ``f``.

    Returns:
        [(term, pos), (term, pos), ...] where ``pos`` is a 0-based token index
        that keeps counting across lines of the file.
    """
    terms = []
    pos = 0
    # os.path.join copes with a trailing separator on `path` and uses the
    # platform's separator instead of a hard-coded "/".
    with open(os.path.join(path, f)) as file:
        for line in file:
            # Keep only letters and whitespace before splitting into words.
            letters_only = re.sub(r'[^A-Za-z\s]+', '', line)
            for word in letters_only.split():
                terms.append((word.lower(), pos))
                pos += 1
    return terms

In [5]:
def createPostingList(path):
    """Index every regular file in ``path`` into the global positional index.

    Side effects on module-level state:
        indToDoc    -- gains ``docID -> file name`` for each indexed file.
        postingList -- gains ``(docID, [pos, ...])`` under every term of each file.
        curDocID    -- advanced by one per indexed file. Calling this function
                       again therefore continues the numbering and appends to
                       the existing postings instead of rebuilding them.

    Args:
        path: Directory whose files are tokenized with ``tokenize``.

    Returns:
        The global ``postingList``:
        {term: [(docID1, [pos1, pos2, ...]), (docID2, [...]), ...]}.
    """
    global curDocID

    # perf_counter is a monotonic clock intended for measuring elapsed time.
    startTime = time.perf_counter()

    # os.listdir returns names in arbitrary order; sorting makes the docID
    # assignment reproducible across runs and machines.
    for f in sorted(os.listdir(path)):
        # Skip sub-directories (e.g. Jupyter's .ipynb_checkpoints): open()
        # cannot read them and would abort the whole indexing run.
        if not os.path.isfile(os.path.join(path, f)):
            continue

        indToDoc[curDocID] = f

        # Group this document's token positions: {word: [pos1, pos2, pos3]}
        wordToPos = collections.defaultdict(list)
        for word, pos in tokenize(f, path):
            wordToPos[word].append(pos)

        # Append to posting list {term : [(docID1, [pos1, pos2, pos3, pos4])]}
        for term, positions in wordToPos.items():
            postingList[term].append((curDocID, positions))

        # Every indexed file consumes one ID.
        curDocID += 1

    endTime = time.perf_counter()
    print(f"Index built in {endTime - startTime} seconds.")
    return postingList

In [6]:
# Build the index over the collection directory. createPostingList returns the
# global postingList, so `result` refers to that same object.
result = createPostingList(path)

Index built in 0.0018129348754882812 seconds.


In [7]:
# Counter that supplies the next integer term ID.
curTermIndex = 0
# termID -> term string.
idToTerm = dict()
# termID -> [idf, (docID, tf weight, [positions...]), ...]
newPostingList = collections.defaultdict(list)

In [8]:
# Load the stop words from ./stop-list.txt. tokenize also returns positions,
# which are not needed for a plain membership set.
stopTuples = tokenize("stop-list.txt", ".")
stopList = {word for word, _pos in stopTuples}
print(stopList)

{'on', 'and', 'as', 'from', 'in', 'a', 'with', 'will', 'is', 'were', 'an', 'at', 'are', 'he', 'by', 'it', 'its', 'of', 'has', 'that', 'be', 'was', 'to', 'for', 'the'}


In [9]:
# Re-key the index by integer term ID. Each entry has the layout
#   [idf, (docID, tf weight, positions), (docID, tf weight, positions), ...]
# with idf = log10(N / df) and tf weight = 1 + log10(tf).
# Stop-word filtering (skipping terms found in stopList) is currently disabled,
# so every term in `result` receives an ID.
for term, postings in result.items():
    idToTerm[curTermIndex] = term

    entry = newPostingList[curTermIndex]
    # First slot: inverse document frequency of the term.
    entry.append(math.log10(len(indToDoc)/len(postings)))
    # Remaining slots: one weighted posting per document containing the term.
    entry.extend(
        (docId, 1 + math.log10(len(posList)), posList)
        for docId, posList in postings
    )

    curTermIndex += 1

In [10]:
# CHAMP
# Champion lists: for each term keep its idf followed by the r postings with
# the highest tf weight.
r = 2
championsList = collections.defaultdict(list)
for termId, entry in newPostingList.items():
    # entry[0] is the idf; the (docID, weight, positions) postings follow.
    termIdf, postings = entry[0], entry[1:]
    ranked = sorted(postings, key = lambda x: x[1], reverse = True)
    championsList[termId] = [termIdf] + ranked[:r]

for k in championsList:
    v = championsList[k]
    print(f"{k} : {v}")

0 : [0.17609125905568124, (0, 1.3010299956639813, [0, 4]), (1, 1.3010299956639813, [0, 4])]
1 : [0.17609125905568124, (0, 1.3010299956639813, [1, 6]), (1, 1.0, [1])]
2 : [0.47712125471966244, (0, 1.0, [2])]
3 : [0.17609125905568124, (0, 1.0, [3]), (1, 1.0, [3])]
4 : [0.47712125471966244, (0, 1.0, [5])]
5 : [0.17609125905568124, (1, 1.0, [2]), (2, 1.0, [2])]
6 : [0.17609125905568124, (1, 1.0, [5]), (2, 1.0, [1])]
7 : [0.47712125471966244, (2, 1.0, [0])]


In [11]:
# CHAMP
# Gather the IDs of all documents that appear in at least one champion list.
champDocs = set()
for champions in championsList.values():
    # champions[0] is the idf; each later element starts with a docID.
    champDocs.update(posting[0] for posting in champions[1:])
print(champDocs)

{0, 1, 2}


In [12]:
# CHAMP
# One independent all-zero vector per champion document, with one component
# for every term ID in idToTerm.
docToVecChamp = {
    docId: [float(0)] * len(idToTerm)
    for docId in champDocs
}

In [13]:
# CHAMP
# Write idf * tf weight into the component of each champion posting.
# Components for (term, doc) pairs outside the champion lists stay 0.
for termId, champions in championsList.items():
    termIdf = champions[0]
    for docId, tfWeight, _positions in champions[1:]:
        docToVecChamp[docId][termId] = termIdf*tfWeight

In [14]:
# CHAMP
# Print every document vector, labelled with the document's file name.
for k in docToVecChamp:
    v = docToVecChamp[k]
    print(f"{indToDoc[k]} : {v}")

d1.txt : [0.22910001000567795, 0.22910001000567795, 0.47712125471966244, 0.17609125905568124, 0.47712125471966244, 0.0, 0.0, 0.0]
d2.txt : [0.22910001000567795, 0.17609125905568124, 0.0, 0.17609125905568124, 0.0, 0.17609125905568124, 0.17609125905568124, 0.0]
d3.txt : [0.0, 0.0, 0.0, 0.0, 0.0, 0.17609125905568124, 0.17609125905568124, 0.47712125471966244]


In [15]:
def cosine_similarity(v1,v2):
    """Compute the cosine similarity of v1 to v2: (v1 dot v2) / (||v1|| * ||v2||).

    Args:
        v1: Sequence of numbers; its length determines how many components
            are compared.
        v2: Sequence of numbers with at least ``len(v1)`` entries. Entries
            beyond ``len(v1)`` are ignored.

    Returns:
        The cosine of the angle between the vectors, or 0.0 when either vector
        has zero norm (e.g. a query containing no indexed term), for which the
        cosine is undefined.

    Raises:
        IndexError: If v2 is shorter than v1.
    """
    sumxx = sumxy = sumyy = 0
    for i, x in enumerate(v1):
        y = v2[i]
        sumxx += x*x
        sumyy += y*y
        sumxy += x*y

    norm_product = math.sqrt(sumxx*sumyy)
    # A zero vector has no direction; report "no similarity" instead of
    # raising ZeroDivisionError.
    if norm_product == 0:
        return 0.0
    return sumxy/norm_product

In [16]:
query = "the dog barked jumped"

# Tokenize the query the same way as the documents: drop everything except
# letters and whitespace, lower-case, and number the tokens from 0.
line = re.sub(r'[^A-Za-z\s]+', '', query)
terms = [(word.lower(), i) for i, word in enumerate(line.split())]
print(terms)

# Raw frequency of each term within the query.
termToFreq = {}
for term, _pos in terms:
    if term not in termToFreq:
        termToFreq[term] = 0
    termToFreq[term] += 1
print(termToFreq)

[('the', 0), ('dog', 1), ('barked', 2), ('jumped', 3)]
{'the': 1, 'dog': 1, 'barked': 1, 'jumped': 1}


In [18]:
# Invert idToTerm (termID -> term) so query terms can be mapped to their
# vector component. A comprehension keeps the loop variables local: a plain
# for-loop with a variable named `id` would leave a global that shadows the
# builtin id() for the rest of the notebook session.
termToId = {term: termId for termId, term in idToTerm.items()}

In [19]:
#CHAMP
# Query vector in the same term-ID space as the document vectors:
# (1 + log10(tf)) * idf for terms present in the index, 0 everywhere else.
queryVectorChamp = [float(0)] * len(idToTerm)
for term, freq in termToFreq.items():
    termId = termToId.get(term)
    if termId is None:
        # Term does not occur in the index; it has no component.
        continue
    # Slot 0 of a newPostingList entry holds the term's idf.
    queryVectorChamp[termId] = (1 + math.log10(freq)) * newPostingList[termId][0]

In [20]:
# CHAMP
# (docID, cosine similarity to the query) for every champion document, in
# docToVecChamp iteration order (not sorted by score).
docSimilarityChamp = [
    (doc, cosine_similarity(queryVectorChamp, vec))
    for doc, vec in docToVecChamp.items()
]

In [21]:
# Show the (docID, similarity) pairs; docIDs map to file names through indToDoc.
print(docSimilarityChamp)

[(0, 0.708098191523152), (1, 0.43022968507652687), (2, 0.1017424218677466)]
