## load dataset

In [1]:
import pandas as pd

In [2]:
import xml.etree.ElementTree as ET

In [3]:
def load_xml(file):
    """Load a collection of XML documents into a one-column DataFrame.

    The file is expected to contain a sequence of sibling elements (one per
    document) without a single enclosing root, so the raw text is wrapped in
    a synthetic ``<ROOT>`` element before parsing.

    Args:
        file: Path to the UTF-8 encoded XML collection.

    Returns:
        A ``pandas.DataFrame`` indexed by the text of each document's
        ``DOCNO`` element, with a single column ``'doc'`` holding the
        ``HEADLINE`` text immediately followed by the ``TEXT`` text.

    Raises:
        ValueError: If a document has no ``DOCNO`` element.
        xml.etree.ElementTree.ParseError: If the wrapped content is not
            well-formed XML.
    """
    with open(file, encoding='utf-8') as f:
        raw = f.read()

    contents = {}
    for item in ET.fromstring('<ROOT>' + raw + '</ROOT>'):
        doc_id = item.findtext('DOCNO')
        if doc_id is None:
            raise ValueError(f'<{item.tag}> element has no DOCNO')
        # A missing HEADLINE or TEXT contributes an empty string instead of
        # None, which would make the concatenation raise TypeError.
        headline = item.findtext('HEADLINE', default='')
        text = item.findtext('TEXT', default='')
        # NOTE(review): headline and text are joined without a separator,
        # as before; confirm the source data always has whitespace between.
        contents[doc_id] = headline + text

    # Build the frame once instead of growing it row by row with .loc.
    return pd.Series(contents, dtype=object, name='doc').to_frame()

In [4]:
def load_stop_words(file):
    """Read a stop-word file and return its lines as a list of strings.

    The file is decoded as ``utf-8-sig``, so a leading byte-order mark is
    dropped. Newline characters are stripped from both ends of every line;
    other whitespace is kept, and blank lines yield empty strings.
    """
    words = []
    with open(file, encoding='utf-8-sig') as handle:
        for line in handle:
            words.append(line.strip('\n'))
    return words

## Preprocess text

In [5]:
import re

In [6]:
from nltk.stem import PorterStemmer

In [7]:
def preprocess(text):
    """Turn raw text into a list of stemmed, lower-cased terms.

    Tokens are the maximal runs matched by ``\\w+``. Each token is
    lower-cased, dropped if it appears in the module-level ``stop_words``
    collection, and otherwise reduced with the Porter stemmer. The order of
    the surviving tokens is preserved.
    """
    porter = PorterStemmer()
    lowered = (word.lower() for word in re.findall(r"\w+", text))
    return [porter.stem(word) for word in lowered if word not in stop_words]

## Creates a positional inverted index

In [8]:
from collections import defaultdict, OrderedDict

In [9]:
def positional_inverted_index(documents=None, tokenize=None):
    """Build a positional inverted index over a document collection.

    Args:
        documents: DataFrame with a ``'doc'`` column of raw text, indexed by
            document ID. Defaults to the module-level ``docs``.
        tokenize: Callable mapping a document's text to its list of terms.
            Defaults to the module-level ``preprocess``.

    Returns:
        An ``OrderedDict`` whose keys are the terms in sorted order. Each
        value is a dict mapping document ID to the list of 1-based positions
        at which the term occurs in that document's token sequence.
    """
    if documents is None:
        documents = docs
    if tokenize is None:
        tokenize = preprocess

    postings = defaultdict(lambda: defaultdict(list))
    for doc_id, content in documents['doc'].items():
        # Positions are 1-based and count only the tokens that survive
        # tokenization, not offsets in the raw text.
        for position, term in enumerate(tokenize(content), start=1):
            postings[term][doc_id].append(position)

    # Freeze the inner defaultdicts into plain dicts so that a later lookup
    # of an unknown document ID cannot silently insert an empty entry.
    return OrderedDict(
        (term, dict(postings[term])) for term in sorted(postings)
    )

In [10]:
def write_index_to_file(index, path='index.txt'):
    """Write a positional inverted index to a UTF-8 text file.

    Each term is written as ``term:`` on its own line, followed by one
    tab-indented line per document of the form ``docID: p1,p2,...``.

    Args:
        index: Mapping of term -> mapping of document ID -> list of
            positions. Terms and documents are written in iteration order.
        path: Destination file; it is created or overwritten. Defaults to
            ``'index.txt'`` in the current working directory.
    """
    with open(path, 'w', encoding='utf-8') as f:
        for term, postings in index.items():
            lines = [f'{term}:\n']
            lines.extend(
                f"\t{doc_id}: {','.join(map(str, positions))}\n"
                for doc_id, positions in postings.items()
            )
            f.writelines(lines)

## Build the positional inverted index and write it to file

In [11]:
# Input locations. Both are empty placeholders here and must be set to real
# file paths before this cell is run.
xml_path = ""
stop_word_path = ""
# `docs` and `stop_words` are module-level names: positional_inverted_index()
# reads `docs` and preprocess() reads `stop_words` as globals, so both must be
# assigned before the index is built.
docs = load_xml(xml_path)
stop_words = load_stop_words(stop_word_path)
# Build the index from the loaded documents, then write it out.
index = positional_inverted_index()
write_index_to_file(index)