In [171]:
import re
import collections
import os
import time

In [172]:
def tokenize(f):
    #Returns [(term, pos), (term, pos) ...]
    terms = []
    with open("collection/" + f) as file:
        line = file.readline()
        while line:
            # Regex to match only strings and spaces 
            line = re.sub(r'[^A-Za-z\s]+', '', line)
            for i, word in enumerate(line.split()):
                terms.append((word.lower(), file.tell()+i))
            line = file.readline() 
    return terms

In [173]:
indToDoc = {}
def createPostingList():
    postingList = collections.defaultdict(list)
    curDocID = 0

    for f in os.listdir("collection"):
        indToDoc[curDocID] = f
        terms = tokenize(f)

        # create map {word:[pos1, pos2, pos3]}
        wordToPos = collections.defaultdict(list)
        for word, pos in terms:
            wordToPos[word].append(pos)

        # append to posting list {term : [(docID1, [pos1, pos2, pos3, pos4])]}
        for term, arr in wordToPos.items():
            postingList[term].append((curDocID, wordToPos[term]))

        # For every file update id 
        curDocID += 1
    
    return postingList
    

In [174]:
startTime = time.time()
result = createPostingList()
endTime = time.time()

In [175]:
print(f"Index built in {endTime - startTime} seconds.")

Index built in 1.0126490592956543 seconds.


In [182]:
def mergeTwo(A, B):
    if not B:
        return A

    a = b = 0
    res = []
    while a < len(A) and b < len(B):
        if A[a][0] > B[b][0]:
            b += 1
        elif A[a][0] < B[b][0]:
            a += 1
        else:
            res.append((A[a][0], []))
            a += 1
            b += 1
    return res

In [184]:
def andQuery(queryTerms):
    postingLists = [result[q] for q in queryTerms]

    while len(postingLists) > 1:
        temp = []
        for i in range(0, len(postingLists), 2):
            if i == len(postingLists) - 1:
                temp.append(postingLists[i])
            else:
                temp.append(mergeTwo(postingLists[i], postingLists[i+1]))
        postingLists = temp

    res = []
    for d, a in postingLists[0]:
        res.append(indToDoc[d])
    return res

In [191]:
"""
query = "with without yemen"
fixedQuery = re.sub(r'[^A-Za-z\s]+', '', query)
queryTerms = fixedQuery.lower().split()
"""
queryTerms = ['with', 'without', 'yemen']

queryTimeStart = time.time()
resultAnd = andQuery(queryTerms)
queryTimeEnd = time.time()

andStr = " AND ".join(queryTerms)
print(f"Results for the Query: {andStr}")
print(f"Total Docs retrieved: {len(resultAnd)}")
for doc in resultAnd:
    print(doc)
print(f"Retreived in {queryTimeEnd - queryTimeStart} seconds")

Results for the Query: with AND without AND yemen
Total Docs retrieved: 6
Text-159.txt
Text-86.txt
Text-115.txt
Text-117.txt
Text-121.txt
Text-99.txt
Retreived in 0.006040096282958984 seconds
