In [1]:
import re
import collections
import os
import time

# Create Posting list
goal: {term: [(ID1,[pos1,pos2,..]), (ID2, [pos1,pos2,…]),….]}

1. We have text documents in the collection folder
2. [(term1, pos), (term2, pos) ...] for each txt file
3. create {word1: [pos1, pos2, pos3], word2: [pos1, pos2, pos3]} for each document
4. For all documents create {word1: [(doc1, [pos1, pos2, pos3]), (doc2, [pos1, pos2, pos3])}


In [2]:
indToDoc = {}
postingList = collections.defaultdict(list)
curDocID = 0

In [3]:
def tokenize(f):
    #Returns [(term, pos), (term, pos) ...]
    terms = []
    with open("collection/" + f) as file:
        line = file.readline()
        while line:
            # Regex to match only strings and spaces 
            line = re.sub(r'[^A-Za-z\s]+', '', line)
            for i, word in enumerate(line.split()):
                terms.append((word.lower(), file.tell()+i))
            line = file.readline() 
    return terms

In [4]:
def createPostingList():
    startTime = time.time()
    for f in os.listdir("collection"):
        # TODO: remove in real code
        global curDocID
        
        indToDoc[curDocID] = f

        # Returns [(word, pos), (word, pos) ...]
        terms = tokenize(f)

        # create map {word:[pos1, pos2, pos3]}
        wordToPos = collections.defaultdict(list)
        for word, pos in terms:
            wordToPos[word].append(pos)

        # append to posting list {term : [(docID1, [pos1, pos2, pos3, pos4])]}
        for term, arr in wordToPos.items():
            postingList[term].append((curDocID, wordToPos[term]))

        # For every file update id 
        curDocID += 1
    
    endTime = time.time()
    print(f"Index built in {endTime - startTime} seconds.")
    return postingList
    

In [5]:
result = createPostingList()

Index built in 0.7220439910888672 seconds.


# Merge posting lists


In [6]:
def mergeTwo(A, B):
    if not B:
        return A

    a = b = 0
    res = []
    while a < len(A) and b < len(B):
        if A[a][0] > B[b][0]:
            b += 1
        elif A[a][0] < B[b][0]:
            a += 1
        else:
            res.append((A[a][0], []))
            a += 1
            b += 1
    return res

In [13]:
def andQuery(queryTerms):
    queryTimeStart = time.time()
    postingLists = [result[q] for q in queryTerms]

    while len(postingLists) > 1:
        temp = []
        for i in range(0, len(postingLists), 2):
            if i == len(postingLists) - 1:
                temp.append(postingLists[i])
            else:
                temp.append(mergeTwo(postingLists[i], postingLists[i+1]))
        postingLists = temp

    res = []
    for d, a in postingLists[0]:
        res.append(indToDoc[d])
    
    queryTimeEnd = time.time()
    print(f"Retreived in {queryTimeEnd - queryTimeStart} seconds")

    andStr = " AND ".join(queryTerms)
    print(f"Results for the Query: {andStr}")
    print(f"Total Docs retrieved: {len(res)}")
    for doc in res:
        print(doc)

    return res

In [14]:
queryTerms = ['with', 'without', 'yemen']
resultAnd = andQuery(queryTerms)


Retreived in 0.00019311904907226562 seconds
Results for the Query: with AND without AND yemen
Total Docs retrieved: 6
Text-159.txt
Text-86.txt
Text-115.txt
Text-117.txt
Text-121.txt
Text-99.txt


In [16]:
def printDict():
    for term, list in postingList.items():
        print(term, list)

In [None]:
printDict()

In [19]:
def printDocList():
    for doc, name in indToDoc.items():
        print(f"Doc ID: {doc} ==> {name}")

In [20]:
printDocList()

Doc ID: 0 ==> Text-163.txt
Doc ID: 1 ==> Text-177.txt
Doc ID: 2 ==> Text-188.txt
Doc ID: 3 ==> Text-407.txt
Doc ID: 4 ==> Text-361.txt
Doc ID: 5 ==> Text-375.txt
Doc ID: 6 ==> Text-413.txt
Doc ID: 7 ==> Text-43.txt
Doc ID: 8 ==> Text-349.txt
Doc ID: 9 ==> Text-57.txt
Doc ID: 10 ==> Text-80.txt
Doc ID: 11 ==> Text-94.txt
Doc ID: 12 ==> Text-215.txt
Doc ID: 13 ==> Text-201.txt
Doc ID: 14 ==> Text-229.txt
Doc ID: 15 ==> Text-1.txt
Doc ID: 16 ==> Text-228.txt
Doc ID: 17 ==> Text-200.txt
Doc ID: 18 ==> Text-214.txt
Doc ID: 19 ==> Text-95.txt
Doc ID: 20 ==> Text-81.txt
Doc ID: 21 ==> Text-56.txt
Doc ID: 22 ==> Text-348.txt
Doc ID: 23 ==> Text-42.txt
Doc ID: 24 ==> Text-374.txt
Doc ID: 25 ==> Text-412.txt
Doc ID: 26 ==> Text-406.txt
Doc ID: 27 ==> Text-360.txt
Doc ID: 28 ==> Text-189.txt
Doc ID: 29 ==> Text-176.txt
Doc ID: 30 ==> Text-162.txt
Doc ID: 31 ==> Text-174.txt
Doc ID: 32 ==> Text-160.txt
Doc ID: 33 ==> Text-148.txt
Doc ID: 34 ==> Text-410.txt
Doc ID: 35 ==> Text-376.txt
Doc ID: 36 =