In [69]:
import nltk

def word_index(text):
    '''Tokenize ``text`` with ``nltk.word_tokenize`` and number the tokens.

    Returns a list of ``(index, word)`` tuples, where ``index`` is the
    zero-based position of ``word`` in the token sequence.
    '''
    return list(enumerate(nltk.word_tokenize(text)))

def inverted_positions(text):
    '''Map each distinct token of ``text`` to the positions where it occurs.

    Returns a dictionary keyed by token; each value is the list of token
    positions for that token, in the order ``word_index`` yields them.
    '''
    positions_by_word = {}
    for position, token in word_index(text):
        # First sighting of a token starts a fresh position list.
        if token not in positions_by_word:
            positions_by_word[token] = []
        positions_by_word[token].append(position)
    return positions_by_word

def inverted_index_add(inverted, doc_id, doc_index):
    '''Merge one document's positional index into the inverted index.

    inverted  -- dict mapping word -> {doc_id: positions}; updated in place
    doc_id    -- key under which this document's positions are stored
    doc_index -- dict mapping word -> positions of that word in the document

    Returns the same ``inverted`` object that was passed in. An existing
    entry for the same word and ``doc_id`` is overwritten.
    '''
    for term, positions in doc_index.items():
        if term not in inverted:
            inverted[term] = {}
        # The positions object is stored as-is (not copied).
        inverted[term][doc_id] = positions
    return inverted

def create_inverted_index(docs):
    '''Build an inverted index over an iterable of document strings.

    Each document is labelled ``'doc'`` followed by its zero-based position
    in ``docs`` (``'doc0'``, ``'doc1'``, ...). The returned dictionary maps
    every word to ``{label: positions}`` for the documents containing it.
    '''
    index = {}
    for position, document in enumerate(docs):
        label = 'doc' + str(position)
        per_doc_positions = inverted_positions(document)
        index = inverted_index_add(index, label, per_doc_positions)
    return index


In [70]:
# Three sample documents (plain strings) used to demonstrate the index.
# Gotcha: create_inverted_index labels documents by zero-based list position,
# so doc1 appears as 'doc0', doc2 as 'doc1' and doc3 as 'doc2' in the result.
doc1='In a speech to senior Russian officials in Moscow, Putin said the possible deployment of missiles that could reach Moscow in 10 minutes was dangerous for Russia,and that Moscow would be forced to review symmetrical and asymmetrical actions'

doc2='Russia will Moscow be forced to create and deploy types of weapons, which can be used not just against those territories, from which the direct threat will come, but also against those, where the centres of decision-making for using these missile systems will come,the Russian president said'

doc3='The treaty, concluded by Ronald Reagan and Mikhail Gorbachev, banned the development and deployment of land-based missiles with a range of 500-5,500km and was widely credited with banishing nuclear missiles from Europe. The US, led by the national security adviser, John Bolton'


# Last expression of the cell: the resulting index is displayed as the cell output.
create_inverted_index([doc1,doc2,doc3])

{'In': {'doc0': [0]},
 'a': {'doc0': [1], 'doc2': [20]},
 'speech': {'doc0': [2]},
 'to': {'doc0': [3, 35], 'doc1': [5]},
 'senior': {'doc0': [4]},
 'Russian': {'doc0': [5], 'doc1': [50]},
 'officials': {'doc0': [6]},
 'in': {'doc0': [7, 21]},
 'Moscow': {'doc0': [8, 20, 31], 'doc1': [2]},
 ',': {'doc0': [9, 28], 'doc1': [12, 22, 30, 35, 48], 'doc2': [2, 10, 37, 44]},
 'Putin': {'doc0': [10]},
 'said': {'doc0': [11], 'doc1': [52]},
 'the': {'doc0': [12], 'doc1': [25, 37, 49], 'doc2': [12, 40]},
 'possible': {'doc0': [13]},
 'deployment': {'doc0': [14], 'doc2': [15]},
 'of': {'doc0': [15], 'doc1': [10, 39], 'doc2': [16, 22]},
 'missiles': {'doc0': [16], 'doc2': [18, 31]},
 'that': {'doc0': [17, 30]},
 'could': {'doc0': [18]},
 'reach': {'doc0': [19]},
 '10': {'doc0': [22]},
 'minutes': {'doc0': [23]},
 'was': {'doc0': [24], 'doc2': [25]},
 'dangerous': {'doc0': [25]},
 'for': {'doc0': [26], 'doc1': [41]},
 'Russia': {'doc0': [27], 'doc1': [0]},
 'and': {'doc0': [29, 38], 'doc1': [7], 'd