In [1]:
# Importing necessary modules
from prettytable import PrettyTable
import requests
from bs4 import BeautifulSoup

In [2]:
# Preprocessing a list of words of a document
def preprocess(words):
    """Normalise a list of raw tokens in place and return that same list.

    Each token has surrounding whitespace stripped, then at most one leading
    and at most one trailing non-alphanumeric character removed, and is
    finally lower-cased.

    The list keeps its length, so positions still line up with the input.
    A token with nothing left after cleaning (empty, whitespace-only, or
    one/two punctuation characters) becomes the empty string instead of
    raising IndexError.

    Args:
        words: list of str tokens; modified in place.

    Returns:
        The same list object, holding the normalised tokens.
    """
    for position, raw in enumerate(words):
        word = raw.strip()
        # Guard each index access: the token may already be empty, or become
        # empty once its leading character has been removed.
        if word and not word[0].isalnum():
            word = word[1:]
        if word and not word[-1].isalnum():
            word = word[:-1]
        words[position] = word.lower()
    return words

# Get the offsets of a word in a list of words
def getoffsets(words, word):
    """Return every position in `words` whose element equals `word`.

    Positions are zero-based and listed in ascending order; the result is an
    empty list when `word` does not occur.
    """
    return [position for position, candidate in enumerate(words) if candidate == word]

# Create index of passed document 'doc' based on its 'doc_id'
def createindex(doc, doc_id):
    """Build a positional index for one document.

    The document is split on any run of whitespace (spaces, tabs, newlines),
    the tokens are normalised with `preprocess`, and every distinct word is
    mapped to its frequency and its positions in the token list.

    Args:
        doc: the document text.
        doc_id: identifier stored in each posting of this document.

    Returns:
        dict mapping word -> [frequency, [[doc_id, offsets]]], where
        `offsets` is the ascending list of token positions of the word.
    """
    # split() with no argument splits on every kind of whitespace and never
    # yields empty tokens, so multi-line documents are tokenised correctly.
    words = preprocess(doc.split())

    # Single pass: collect the positions of each word as we go.
    offsets_by_word = {}
    for offset, word in enumerate(words):
        if not word:
            # Token was reduced to nothing by preprocessing; it still occupies
            # its position but is not indexed as a word.
            continue
        offsets_by_word.setdefault(word, []).append(offset)

    return {
        word: [len(offsets), [[doc_id, offsets]]]
        for word, offsets in offsets_by_word.items()
    }

# Merge the doc_index and index
def appendindex(doc_index, index):
    """Merge a single-document index into the accumulated index.

    For a word already present in `index`, the frequency is added and the
    postings are appended. A new word is inserted as a copy of its entry,
    so later merges into `index` never modify `doc_index`.

    Args:
        doc_index: dict word -> [frequency, postings] for one document;
            left unmodified.
        index: accumulated dict of the same shape; updated in place.

    Returns:
        The same `index` object, after merging.
    """
    for word, entry in doc_index.items():
        frequency, postings = entry[0], entry[1]
        if word in index:
            index[word][0] += frequency
            index[word][1] += postings
        else:
            # Store a fresh entry with its own postings list rather than
            # aliasing the one owned by doc_index.
            index[word] = [frequency, list(postings)]
    return index

# Print the index in tabular format
def printindex(index):
    """Print the index as a table, one row per word in sorted word order.

    Each row shows the word, its frequency (first element of the entry) and
    its postings (second element of the entry).
    """
    table = PrettyTable(['Word', 'Frequency', 'Postings'])
    for word in sorted(index):
        entry = index[word]
        table.add_row([word, entry[0], entry[1]])
    print(table)

In [3]:
# Create the index of sample input files
files = ['input1.txt', 'input2.txt', 'input3.txt']
index = {}
# Document ids are 1-based and follow the order of `files`
for doc_id, file in enumerate(files, start=1):
    with open(file, 'r') as content_file:
        content = content_file.read()
    # Index this document, then fold it into the running index
    doc_index = createindex(content, doc_id)
    index = appendindex(doc_index, index)

In [4]:
# Show the merged index of the sample input files as a table
printindex(index)

+--------------+-----------+---------------------------------+
|     Word     | Frequency |             Postings            |
+--------------+-----------+---------------------------------+
|     and      |     1     |            [[2, [8]]]           |
|    black     |     3     | [[1, [3]], [2, [11]], [3, [0]]] |
|     cash     |     1     |            [[2, [1]]]           |
|  corruption  |     1     |            [[1, [0]]]           |
|   creation   |     1     |            [[2, [9]]]           |
|   currency   |     1     |            [[3, [8]]]           |
| denomination |     1     |            [[3, [7]]]           |
|  excessive   |     1     |            [[2, [0]]]           |
|  generation  |     1     |            [[1, [5]]]           |
|     high     |     1     |            [[3, [6]]]           |
|      in      |     3     |  [[1, [2]], [2, [4]], [3, [5]]] |
|      is      |     1     |            [[3, [2]]]           |
|    money     |     3     | [[1, [4]], [2, [12]], [3, 

In [5]:
# Create the index of four of Robert Frost's poems
# Pages to fetch, one poem per URL; the position in this list is used as the poem's id
links = [
    'https://www.poemhunter.com/poem/the-road-not-taken/',
    'https://www.poemhunter.com/poem/stopping-by-woods-on-a-snowy-evening-2/',
    'https://www.poemhunter.com/poem/nothing-gold-can-stay/',
    'https://www.poemhunter.com/poem/a-question/',
]
# Start from an empty index for the poems
index = {}

In [6]:
# Give up on a page that does not respond within this many seconds
REQUEST_TIMEOUT_SECONDS = 10

# Fetch each poem page, extract the poem text and merge it into the index.
# The loop variable is `poem_id` rather than `id` so the builtin id() is not shadowed.
for poem_id, link in enumerate(links):
    # Without a timeout a stalled connection would hang the notebook indefinitely
    response = requests.get(link, timeout=REQUEST_TIMEOUT_SECONDS)
    # Fail loudly on an HTTP error instead of indexing an error page
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    # NOTE(review): assumes the poem is the second <p> element on the page — confirm against the site markup
    poem = str(soup.find_all('p')[1])
    # Drop the first and last four characters (presumably the wrapping paragraph
    # markup — TODO confirm) and turn <br/> line breaks into spaces
    poem = poem[4:-4].replace('<br/>', ' ').strip()
    print('Parsed poem #{}:\n{}\n'.format(poem_id, poem))
    poem_index = createindex(poem, poem_id)
    index = appendindex(poem_index, index)

Parsed poem #0:
Two roads diverged in a yellow wood, And sorry I could not travel both And be one traveler, long I stood And looked down one as far as I could To where it bent in the undergrowth;   Then took the other, as just as fair, And having perhaps the better claim Because it was grassy and wanted wear, Though as for that the passing there Had worn them really about the same,  And both that morning equally lay In leaves no step had trodden black. Oh, I kept the first for another day!  Yet knowing how way leads on to way I doubted if I should ever come back.  I shall be telling this with a sigh Somewhere ages and ages hence: Two roads diverged in a wood, and I, I took the one less traveled by, And that has made all the difference.

Parsed poem #1:
Whose woods these are I think I know. His house is in the village, though;  He will not see me stopping here To watch his woods fill up with snow.  My little horse must think it queer To stop without a farmhouse near Between the woods an

In [7]:
# List every indexed word alphabetically with its total frequency,
# followed by one tab-indented line per posting and a blank separator line
for word in sorted(index):
    entry = index[word]
    freq, postings = entry[0], entry[1]
    print('{}: {}'.format(word, freq))
    for posting in postings:
        print('\t{}'.format(posting))
    print()

a: 7
	[0, [4, 115, 126]]
	[1, [40, 60]]
	[2, [13]]
	[3, [0]]

about: 1
	[0, [69]]

ages: 2
	[0, [118, 120]]

all: 2
	[0, [141]]
	[3, [16]]

an: 1
	[2, [18]]

and: 15
	[0, [7, 14, 21, 45, 55, 72, 119, 128, 137]]
	[1, [46, 78, 86, 94, 101]]
	[3, [8]]

another: 1
	[0, [91]]

are: 2
	[1, [3, 83]]

as: 5
	[0, [25, 27, 41, 43, 59]]

ask: 1
	[1, [63]]

back: 1
	[0, [108]]

be: 2
	[0, [15, 111]]

because: 1
	[0, [51]]

before: 2
	[1, [98, 105]]

bells: 1
	[1, [59]]

bent: 1
	[0, [33]]

better: 1
	[0, [49]]

between: 1
	[1, [43]]

birth: 1
	[3, [27]]

black: 1
	[0, [84]]

both: 2
	[0, [13, 73]]

but: 2
	[1, [88]]
	[2, [15]]

by: 1
	[0, [136]]

can: 1
	[2, [38]]

claim: 1
	[0, [50]]

come: 1
	[0, [107]]

could: 2
	[0, [10, 29]]

dark: 1
	[1, [85]]

darkest: 1
	[1, [50]]

dawn: 1
	[2, [31]]

day: 2
	[0, [92]]
	[2, [35]]

deep: 1
	[1, [87]]

difference: 1
	[0, [143]]

diverged: 2
	[0, [2, 124]]

doubted: 1
	[0, [102]]

down: 2
	[0, [23]]
	[2, [33]]

downy: 1
	[1, [79]]

early: 1
	[2, [11]]

earth: