In [135]:
import re
import collections
import os
import time
import math

In [136]:
indToDoc = {}
postingList = collections.defaultdict(list)
curDocID = 0
path = "practice3_collection"

In [137]:
def tokenize(f, path):
    #Returns [(term, pos), (term, pos) ...]
    terms = []
    with open(path + "/" + f) as file:
        line = file.readline()
        i = 0
        while line:
            # Regex to match only strings and spaces 
            line = re.sub(r'[^A-Za-z\s]+', '', line)
            for word in line.split():
                terms.append((word.lower(), i))
                i += 1
            line = file.readline() 
    return terms

In [138]:
def createPostingList(path):
    startTime = time.time()
    for f in os.listdir(path):
        # TODO: remove in real code
        global curDocID
        
        indToDoc[curDocID] = f

        # Returns [(word, pos), (word, pos) ...]
        terms = tokenize(f, path)

        # create map {word:[pos1, pos2, pos3]}
        wordToPos = collections.defaultdict(list)
        for word, pos in terms:
            wordToPos[word].append(pos)

        # append to posting list {term : [(docID1, [pos1, pos2, pos3, pos4])]}
        for term, arr in wordToPos.items():
            postingList[term].append((curDocID, wordToPos[term]))

        # For every file update id 
        curDocID += 1
    
    endTime = time.time()
    print(f"Index built in {endTime - startTime} seconds.")
    return postingList

In [139]:
result = createPostingList(path)

Index built in 0.0010921955108642578 seconds.


In [140]:
newPostingList = collections.defaultdict(list)
idToTerm = {}
curTermIndex = 0

In [141]:
stopList = set()
stopTuples = tokenize("stop-list.txt", ".")
for t, p in stopTuples:
    stopList.add(t)
print(stopList)

{'for', 'will', 'as', 'by', 'with', 'is', 'were', 'the', 'a', 'be', 'from', 'and', 'are', 'he', 'it', 'at', 'an', 'in', 'that', 'to', 'has', 'of', 'on', 'its', 'was'}


In [142]:
for k,v in result.items():
    # if k in stopList:
    #     continue
    idToTerm[curTermIndex] = k
    newPostingList[curTermIndex].append(math.log10(len(indToDoc)/len(v)))
    
    for docId, posList in v:
        newPostingList[curTermIndex].append((docId, 1 + math.log10(len(posList)), posList))
    curTermIndex += 1



In [143]:
print(idToTerm)

{0: 'the', 1: 'dog', 2: 'barked', 3: 'at', 4: 'other', 5: 'jumped', 6: 'cat', 7: 'a'}


In [144]:
for k, v in newPostingList.items():
    print(f"{k} : {v}")

0 : [0.17609125905568124, (0, 1.3010299956639813, [0, 4]), (1, 1.3010299956639813, [0, 4])]
1 : [0.17609125905568124, (0, 1.3010299956639813, [1, 6]), (1, 1.0, [1])]
2 : [0.47712125471966244, (0, 1.0, [2])]
3 : [0.17609125905568124, (0, 1.0, [3]), (1, 1.0, [3])]
4 : [0.47712125471966244, (0, 1.0, [5])]
5 : [0.17609125905568124, (1, 1.0, [2]), (2, 1.0, [2])]
6 : [0.17609125905568124, (1, 1.0, [5]), (2, 1.0, [1])]
7 : [0.47712125471966244, (2, 1.0, [0])]


In [145]:
# CHAMP
championsList = collections.defaultdict(list)
r = 2
# For each term, compute the r docs of highest weight in t's postrings
for k, v in newPostingList.items():
    championsList[k].append(v[0])
    docs = v[1:]
    docs.sort(key = lambda x: x[1], reverse = True)
    topR = docs[:r]
    championsList[k].extend(topR)

for k, v in championsList.items():
    print(f"{k} : {v}")

0 : [0.17609125905568124, (0, 1.3010299956639813, [0, 4]), (1, 1.3010299956639813, [0, 4])]
1 : [0.17609125905568124, (0, 1.3010299956639813, [1, 6]), (1, 1.0, [1])]
2 : [0.47712125471966244, (0, 1.0, [2])]
3 : [0.17609125905568124, (0, 1.0, [3]), (1, 1.0, [3])]
4 : [0.47712125471966244, (0, 1.0, [5])]
5 : [0.17609125905568124, (1, 1.0, [2]), (2, 1.0, [2])]
6 : [0.17609125905568124, (1, 1.0, [5]), (2, 1.0, [1])]
7 : [0.47712125471966244, (2, 1.0, [0])]


In [146]:
# CHAMP
champDocs = set()
for k, v in championsList.items():
    docs = v[1:]
    justDocs = [d[0] for d in docs]
    champDocs = champDocs.union(set(justDocs))
print(champDocs)

{0, 1, 2}


In [147]:
# CHAMP
docToVecChamp = {}
for k in champDocs:
    docToVecChamp[k] = len(idToTerm)*[float(0)]

In [148]:
termToId = {}
for id, term in idToTerm.items():
    termToId[term] = id

In [149]:
docToVec = {}
for k in indToDoc.keys():
    docToVec[k] = len(idToTerm)*[float(0)]

In [150]:
print(docToVecChamp)

{0: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 1: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 2: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}


In [151]:
print(docToVec)

{0: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 1: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 2: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}


In [152]:
# CHAMP
for k, v in championsList.items():
    idf = v[0]
    for i in range(1, len(v)):
        t = v[i]
        doc, w = t[0], t[1]
        docToVecChamp[doc][k] = idf*w

In [153]:
# CHAMP
for k, v in docToVecChamp.items():
    print(f"{indToDoc[k]} : {v}")

d1.txt : [0.22910001000567795, 0.22910001000567795, 0.47712125471966244, 0.17609125905568124, 0.47712125471966244, 0.0, 0.0, 0.0]
d2.txt : [0.22910001000567795, 0.17609125905568124, 0.0, 0.17609125905568124, 0.0, 0.17609125905568124, 0.17609125905568124, 0.0]
d3.txt : [0.0, 0.0, 0.0, 0.0, 0.0, 0.17609125905568124, 0.17609125905568124, 0.47712125471966244]


In [154]:
for k, v in newPostingList.items():
    idf = v[0]
    for i in range(1, len(v)):
        t = v[i]
        doc, w = t[0], t[1]
        docToVec[doc][k] = idf*w

In [155]:
for k, v in docToVec.items():
    print(f"{indToDoc[k]} : {v}")

d1.txt : [0.22910001000567795, 0.22910001000567795, 0.47712125471966244, 0.17609125905568124, 0.47712125471966244, 0.0, 0.0, 0.0]
d2.txt : [0.22910001000567795, 0.17609125905568124, 0.0, 0.17609125905568124, 0.0, 0.17609125905568124, 0.17609125905568124, 0.0]
d3.txt : [0.0, 0.0, 0.0, 0.0, 0.0, 0.17609125905568124, 0.17609125905568124, 0.47712125471966244]


In [156]:
def cosine_similarity(v1,v2):
    "compute cosine similarity of v1 to v2: (v1 dot v2)/{||v1||*||v2||)"
    sumxx, sumxy, sumyy = 0, 0, 0
    for i in range(len(v1)):
        x = v1[i]; y = v2[i]
        sumxx += x*x
        sumyy += y*y
        sumxy += x*y
    return sumxy/math.sqrt(sumxx*sumyy)

In [157]:
d1Vec = docToVec[0]
d2Vec = docToVec[1]

print(cosine_similarity(d1Vec, d2Vec))

0.38332092608865265


In [158]:
print(newPostingList)


defaultdict(<class 'list'>, {0: [0.17609125905568124, (0, 1.3010299956639813, [0, 4]), (1, 1.3010299956639813, [0, 4])], 1: [0.17609125905568124, (0, 1.3010299956639813, [1, 6]), (1, 1.0, [1])], 2: [0.47712125471966244, (0, 1.0, [2])], 3: [0.17609125905568124, (0, 1.0, [3]), (1, 1.0, [3])], 4: [0.47712125471966244, (0, 1.0, [5])], 5: [0.17609125905568124, (1, 1.0, [2]), (2, 1.0, [2])], 6: [0.17609125905568124, (1, 1.0, [5]), (2, 1.0, [1])], 7: [0.47712125471966244, (2, 1.0, [0])]})


In [159]:
query = "the dog barked jumped"
terms = []
line = re.sub(r'[^A-Za-z\s]+', '', query)
for i, word in enumerate(line.split()):
    terms.append((word.lower(), i))
print(terms)

[('the', 0), ('dog', 1), ('barked', 2), ('jumped', 3)]


In [160]:
query_terms = query.split(" ")
termToIDF = []
for term in query_terms:
    if term in termToId:
        termToIDF.append((term, newPostingList[termToId[term]][0]))

termToIDF.sort(key=lambda x:x[1], reverse=True)
query_terms = [tuple[0] for tuple in termToIDF[:len(termToIDF)//2]] 
print(query_terms)

['barked', 'the']


In [161]:
termToFreq = {}
for t, p in terms:
    termToFreq[t] = termToFreq.get(t, 0) + 1
print(termToFreq)

{'the': 1, 'dog': 1, 'barked': 1, 'jumped': 1}


In [162]:
#CHAMP
queryVectorChamp = len(idToTerm)*[float(0)]
for term, freq in termToFreq.items():
    if term in termToId:
        w = 1 + math.log10(freq)
        idf = newPostingList[termToId[term]][0]
        queryVectorChamp[termToId[term]] = w*idf

In [163]:
print(queryVectorChamp)

[0.17609125905568124, 0.17609125905568124, 0.47712125471966244, 0.0, 0.0, 0.17609125905568124, 0.0, 0.0]


In [164]:
queryVector = len(idToTerm)*[float(0)]
for term, freq in termToFreq.items():
    if term in termToId:
        w = 1 + math.log10(freq)
        idf = championsList[termToId[term]][0]
        queryVector[termToId[term]] = w*idf

In [165]:
print(queryVector)

[0.17609125905568124, 0.17609125905568124, 0.47712125471966244, 0.0, 0.0, 0.17609125905568124, 0.0, 0.0]


In [166]:
import random
leadersArr = random.sample(range(0, len(indToDoc)), int(math.sqrt(len(indToDoc))))
leadersSet = set(leadersArr)

docToNearestLeader = {}
for doc in range(len(indToDoc)):
    docVec = docToVec[doc]
    distNearest = float("-inf")
    nearest = None
    for l in leadersArr:
        newDist = cosine_similarity(docVec, docToVec[l])
        if newDist >= distNearest:
            distNearest = newDist
            nearest = l
    docToNearestLeader[doc] = nearest

print(docToNearestLeader)

{0: 1, 1: 1, 2: 1}


In [167]:
leaderAndFollowers = collections.defaultdict(list)
for k, v in docToNearestLeader.items():
    leaderAndFollowers[v].append(k)
print(leaderAndFollowers)

defaultdict(<class 'list'>, {1: [0, 1, 2]})


In [170]:
# Sort leaders by distance to query vector
leadersAndDist = []
for l in leadersArr:
    v = docToVec[l]
    distToQuery = cosine_similarity(v, queryVector)
    leadersAndDist.append((l, distToQuery))
leadersAndDist.sort(key=lambda x:x[1])
print(leadersAndDist)

[(1, 0.43022968507652687)]


In [178]:
res = []
k = 2
for l, dist in leadersAndDist:
    cluster = leaderAndFollowers[l]
    # Sort each element in cluster by distance to query vector
    clusterDistToQuery = []
    for n in cluster:
        dist = cosine_similarity(docToVec[n], queryVector)
        clusterDistToQuery.append((n, dist))
    clusterDistToQuery.sort(key=lambda x:x[1])
    res.extend([x[0] for x in clusterDistToQuery])
    if len(res) >= k:
        break
print(res[:k])


[2, 1]


In [78]:
# CHAMP
docSimilarityChamp = []
for doc, vec in docToVecChamp.items():
    docSimilarityChamp.append((doc, cosine_similarity(queryVectorChamp, vec)))

In [208]:
docToSimilarity = []
for doc, vec in docToVec.items():
    docToSimilarity.append((doc, cosine_similarity(queryVector, vec)))

In [80]:
print(docSimilarityChamp)

[(0, 0.708098191523152), (1, 0.43022968507652687), (2, 0.1017424218677466)]


In [77]:
for k, v in newPostingList.items():
    print(f"{idToTerm[k]} : {v}")

the : [0.0, (0, 1.3010299956639813, [37, 39]), (1, 1.3010299956639813, [48, 53]), (2, 1.0, [47]), (3, 1.0, [45])]
best : [0.0, (0, 1.3010299956639813, [38, 40]), (1, 1.3010299956639813, [49, 54]), (2, 1.0, [48]), (3, 1.0, [46])]
american : [0.3010299956639812, (0, 1.0, [41]), (2, 1.0, [44])]
restaurant : [0.0, (0, 1.0, [42]), (1, 1.0, [51]), (2, 1.0, [45]), (3, 1.0, [43])]
italian : [0.6020599913279624, (1, 1.0, [50])]
enjoy : [0.12493873660829993, (1, 1.0, [52]), (2, 1.0, [46]), (3, 1.0, [44])]
pasta : [0.6020599913279624, (1, 1.0, [55])]
hamburger : [0.6020599913279624, (2, 1.0, [49])]
korean : [0.6020599913279624, (3, 1.0, [42])]
bibimbap : [0.6020599913279624, (3, 1.0, [47])]


In [12]:
for k,v in result.items():
    print(f"{k} : {v}")

the : [(0, [14]), (1, [14])]
dog : [(0, [15]), (1, [15])]
barked : [(0, [16])]
jumped : [(1, [16]), (2, [14])]
a : [(2, [12])]
cat : [(2, [13])]


In [11]:
for k, v in result.items():
    print(f"{k} : {v} idf: {math.log10(len(indToDoc)/len(v))}")

the : [(0, [14]), (1, [14])] idf: 0.17609125905568124
dog : [(0, [15]), (1, [15])] idf: 0.17609125905568124
barked : [(0, [16])] idf: 0.47712125471966244
jumped : [(1, [16]), (2, [14])] idf: 0.17609125905568124
a : [(2, [12])] idf: 0.47712125471966244
cat : [(2, [13])] idf: 0.47712125471966244


In [17]:
for k, v in result.items():
    for docId, posList in v:
        print(f"{docId}, {posList} term frequency = {len(posList)} w = {1 + math.log10(len(posList))}")

0, [14] term frequency = 1 w = 1.0
1, [14] term frequency = 1 w = 1.0
0, [15] term frequency = 1 w = 1.0
1, [15] term frequency = 1 w = 1.0
0, [16] term frequency = 1 w = 1.0
1, [16] term frequency = 1 w = 1.0
2, [14] term frequency = 1 w = 1.0
2, [12] term frequency = 1 w = 1.0
2, [13] term frequency = 1 w = 1.0


In [6]:
N = 1000000
dft = 100
print(math.log10(N/dft))

4.0


In [15]:
print(math.log10(10/0))

ZeroDivisionError: division by zero