## Load dataset

In [1]:
from xml.etree.ElementTree import parse

In [2]:
def load_xml(file):
    """Parse an XML document collection into a ``{docID: content}`` dict.

    Every child of the XML root is treated as one document. Fields are read
    by position: child 0 holds the document ID, child 3 the headline and
    child 4 the body text.

    Args:
        file: Path or file object accepted by
            ``xml.etree.ElementTree.parse``.

    Returns:
        dict mapping each document ID (as found in the XML) to the headline
        and body text joined by a newline.
    """
    docs_dict = {}
    root = parse(file).getroot()
    for item in root:
        docID = item[0].text
        # An empty element (e.g. <HEADLINE/>) has ``.text is None``; fall
        # back to '' so the concatenation below cannot raise TypeError.
        headline = item[3].text or ''
        text = item[4].text or ''
        # Join with a newline so the last headline word and the first body
        # word are never fused into a single token by the tokenizer.
        docs_dict[docID] = headline + '\n' + text
    return docs_dict

In [3]:
def load_stop_words(file):
    """Read a stop-word list from a text file, one word per line.

    The file is decoded as ``utf-8-sig`` so that a leading byte-order mark,
    if present, does not end up glued to the first word.

    Args:
        file: Path of the stop-word file.

    Returns:
        list of stop words in file order, with surrounding whitespace
        removed and blank lines skipped.
    """
    with open(file, encoding='utf-8-sig') as f:
        # Strip all surrounding whitespace (not just '\n'): a trailing space
        # or tab would otherwise make the entry impossible to match.
        stripped = (line.strip() for line in f)
        return [word for word in stripped if word]

## Preprocess text

In [4]:
import re

In [5]:
from nltk.stem import PorterStemmer

In [6]:
def preprocess(text):
    """Tokenise, case-fold, drop stop words from, and stem ``text``.

    Tokens are maximal runs of ``\\w`` characters. Each token is lowercased,
    discarded if it appears in the module-level ``stop_words`` collection,
    and otherwise reduced to its Porter stem.

    Args:
        text: The raw string to process.

    Returns:
        list of stemmed tokens in their original order.
    """
    porter = PorterStemmer()
    stems = []
    for raw in re.findall(r"\w+", text):
        word = raw.lower()
        if word in stop_words:
            continue
        stems.append(porter.stem(word))
    return stems

## Create a positional inverted index

In [7]:
from collections import defaultdict, OrderedDict

In [16]:
def positional_inverted_index():
    """Build a positional inverted index over the module-level ``docs_dict``.

    Each document's content is run through ``preprocess``; every resulting
    token is recorded together with its 1-based position in that document's
    token sequence.

    Returns:
        OrderedDict sorted by term, mapping ``term -> {docID: [positions]}``
        where each inner mapping is a ``defaultdict(list)``.
    """
    postings = defaultdict(lambda: defaultdict(list))
    for doc_id, body in docs_dict.items():
        # start=1 gives 1-based positions directly.
        for pos, term in enumerate(preprocess(body), start=1):
            postings[term][doc_id].append(pos)
    return OrderedDict(sorted(postings.items()))

In [9]:
def write_index_to_file(index):
    """Write a positional inverted index to ``index.txt`` in the working dir.

    Output format, one entry per term::

        term:<number of documents containing the term>
        <TAB>docID: pos1,pos2,...
        <TAB>docID: pos1,pos2,...

    Args:
        index: Mapping ``term -> {docID: [positions]}``. Terms and documents
            are written in the mapping's iteration order.
    """
    with open('index.txt', 'w', encoding='utf-8') as f:
        for term, postings in index.items():
            # Document frequency is the number of docIDs in the postings;
            # it must be converted to str before concatenation.
            parts = [term + ':' + str(len(postings)) + '\n']
            for docID, position_list in postings.items():
                positions = ','.join(str(position) for position in position_list)
                parts.append('\t' + str(docID) + ': ' + positions + '\n')
            f.write(''.join(parts))

## Build the positional inverted index and write it to file

In [17]:
# Driver cell: load the collection and stop-word list, build the index and
# write it to index.txt.
# NOTE(review): both paths are absolute and machine-specific; adjust them
# before running elsewhere.
xml_path = "/home/congw/Projects/IR/sample/trec.sample.xml"
stop_word_path = "/home/congw/Projects/IR/englishST.txt"
# docs_dict and stop_words must be module-level names: they are read as
# globals by positional_inverted_index() and preprocess() respectively, so
# both have to be assigned before the index is built.
docs_dict = load_xml(xml_path)
stop_words = load_stop_words(stop_word_path)
index = positional_inverted_index()
write_index_to_file(index)

In [18]:
index

OrderedDict([('0',
              defaultdict(list,
                          {'15': [20, 31, 34, 285],
                           '59': [11],
                           '128': [181],
                           '319': [93],
                           '351': [1198,
                            1202,
                            1240,
                            1284,
                            1560,
                            1732,
                            1747,
                            1750,
                            1758,
                            1828],
                           '3351': [32, 34, 276, 285],
                           '3419': [69, 79, 91, 133, 135, 155],
                           '3461': [64, 165, 273],
                           '3489': [17],
                           '3559': [37, 159, 172, 180, 193, 199],
                           '3609': [11],
                           '3610': [62],
                           '3614': [13],
                           '3

## Boolean search