## load dataset

In [1]:
from xml.etree.ElementTree import parse

In [2]:
def load_xml(file):
    """Load documents from an XML collection into a ``{doc_id: content}`` dict.

    Every child of the root element is treated as one document.  Fields are
    read by position: child 0 holds the document ID, child 3 the headline
    and child 4 the body text.

    Args:
        file: Path or file object accepted by
            ``xml.etree.ElementTree.parse``.

    Returns:
        dict: Maps each document ID to its headline and body text joined by
        a single space.
    """
    docs_dict = {}
    root = parse(file).getroot()
    for item in root:
        doc_id = item[0].text
        # ``.text`` is None for an empty element; fall back to '' so the
        # concatenation below cannot raise TypeError.
        headline = item[3].text or ''
        text = item[4].text or ''
        # Keep a separator so the last headline word and the first body word
        # are not fused into a single token.
        docs_dict[doc_id] = headline + ' ' + text
    return docs_dict

In [3]:
def load_stop_words(file):
    """Read a stop-word file and return its lines as a list of strings.

    The file is decoded as ``utf-8-sig``, so a leading byte-order mark is
    dropped.  Only newline characters are stripped from each line; any other
    surrounding whitespace is kept as-is.

    Args:
        file: Path of the stop-word file (one entry per line).

    Returns:
        list[str]: The lines of the file, in file order.
    """
    words = []
    with open(file, encoding='utf-8-sig') as handle:
        for raw_line in handle:
            words.append(raw_line.strip('\n'))
    return words

## Pre-process text

In [4]:
import re

In [5]:
from nltk.stem import PorterStemmer

In [6]:
def preprocess(text):
    """Turn raw text into a list of stemmed, lower-cased tokens.

    Tokens are runs of ``\\w`` characters.  Each token is lower-cased,
    dropped if it appears in the module-level ``stop_words`` collection, and
    otherwise reduced with a Porter stemmer.

    Args:
        text: The string to tokenise.

    Returns:
        list[str]: Stems of the surviving tokens, in order of occurrence.
    """
    porter = PorterStemmer()
    stems = []
    for word in re.findall(r"\w+", text):
        lowered = word.lower()
        if lowered in stop_words:
            continue
        stems.append(porter.stem(lowered))
    return stems

## Create a positional inverted index

In [7]:
from collections import defaultdict, OrderedDict

In [8]:
def positional_inverted_index():
    """Build a positional inverted index from the module-level ``docs_dict``.

    Every document's content is run through ``preprocess``; the 1-based
    position of each resulting token is recorded under
    ``index[term][doc_id]``.

    Returns:
        OrderedDict: Terms in sorted order, each mapped to a
        ``defaultdict(list)`` of ``doc_id -> [positions]``.
    """
    postings = defaultdict(lambda: defaultdict(list))
    for doc_id in docs_dict:
        doc_tokens = preprocess(docs_dict[doc_id])
        for pos, term in enumerate(doc_tokens, start=1):
            postings[term][doc_id].append(pos)

    # Re-insert the terms in sorted order so iteration is alphabetical.
    sorted_index = OrderedDict()
    for term in sorted(postings):
        sorted_index[term] = postings[term]
    return sorted_index

In [9]:
def write_index_to_file(index, path='index.txt'):
    """Write a positional inverted index to a text file.

    For every term the output contains a header line ``term:df`` (``df`` is
    the number of documents containing the term) followed by one
    tab-indented line per document: ``\\tdoc_id: pos1,pos2,...``.

    Args:
        index: Mapping of ``term -> {doc_id: [positions]}``.
        path: Destination file.  Defaults to ``index.txt`` in the current
            working directory.
    """
    with open(path, 'w', encoding='utf-8') as f:
        for term, postings in index.items():
            lines = [term + ':' + str(len(postings)) + '\n']
            for doc_id, positions in postings.items():
                joined = ','.join(str(position) for position in positions)
                lines.append('\t' + str(doc_id) + ': ' + joined + '\n')
            f.writelines(lines)

## write positional inverted index to file

In [10]:
# Build the index and write it to disk.  The names bound here (`docs_dict`,
# `stop_words`, `index`) are module-level globals that the functions in this
# file read directly.
# NOTE(review): the two path assignments below are commented out, so
# `xml_path` and `stop_word_path` must already be defined before this cell
# runs; otherwise the load calls raise NameError.
# xml_path = "/home/congw/Projects/IR/sample/trec.sample.xml"
# stop_word_path = "/home/congw/Projects/IR/englishST.txt"
docs_dict = load_xml(xml_path)
stop_words = load_stop_words(stop_word_path)
index = positional_inverted_index()
write_index_to_file(index)

## Boolean search

In [30]:
def tokenize_query(query):
    """Split a boolean query string into operand and operator tokens.

    A double-quoted phrase (``"income taxes"``) and a proximity expression
    (``#10(income, taxes)``) are each kept as a single token even when they
    contain spaces; everything else is split on whitespace.  Runs of
    whitespace never produce empty tokens.

    Args:
        query: The raw query string.

    Returns:
        list[str]: The tokens in query order.
    """
    # Alternatives are tried left to right: quoted phrase, proximity
    # expression, then any run of non-whitespace characters.
    return re.findall(r'"[^"]*"|#\d+\([^)]*\)|\S+', query)

In [31]:
def parse_proximity_query(query):
    """Parse a proximity query of the form ``#N(term1, term2)``.

    Args:
        query: String containing a ``#N(...)`` expression; only the first
            such expression is used.

    Returns:
        tuple: ``(stemmed_terms, max_distance)`` where ``stemmed_terms`` is
        the list of lower-cased, Porter-stemmed terms and ``max_distance``
        is ``N`` as an int.

    Raises:
        IndexError: If ``query`` contains no ``#N(...)`` expression.
    """
    proximity_parse = re.findall(r'#([0-9]+?)\((.+?)\)', query)
    max_distance, raw_terms = proximity_parse[0]
    stemmer = PorterStemmer()
    # Strip each term: "#10(income, taxes)" would otherwise yield " taxes",
    # which can never match an index entry.  Empty pieces are dropped.
    query_terms = [term.strip() for term in raw_terms.split(',')]
    preprocessed_query_terms = [
        stemmer.stem(term.lower()) for term in query_terms if term
    ]
    return preprocessed_query_terms, int(max_distance)
def parse_phrasal_query(query):
    """Parse a phrase query of the form ``"term1 term2"``.

    Args:
        query: String containing a double-quoted phrase; only the first
            quoted phrase is used.

    Returns:
        list[str]: The lower-cased, Porter-stemmed words of the phrase.

    Raises:
        IndexError: If ``query`` contains no quoted phrase.
    """
    phrasal_parse = re.findall(r'\"(.+?)\"', query)
    stemmer = PorterStemmer()
    # split() with no argument collapses repeated whitespace, so a double
    # space inside the phrase does not create an empty term.
    return [stemmer.stem(term.lower()) for term in phrasal_parse[0].split()]

In [33]:
def linear_merge(terms,max_dist,search="Phrasal"):
    """Find documents in which the given terms occur close to each other.

    Looks the terms up in the module-level ``index``
    (``term -> {doc_id: [positions]}``).

    Args:
        terms: Index terms (already stemmed).  Every term must be a key of
            ``index``.
        max_dist: For ``"Phrasal"`` search, the exact gap between successive
            terms (1 means adjacent words).  For any other search mode, the
            maximum absolute distance allowed between the two terms.
        search: ``"Phrasal"`` for an ordered phrase match over all terms;
            any other value runs an unordered proximity match over the
            first two terms (further terms are ignored).

    Returns:
        list: Sorted IDs of the matching documents.

    Raises:
        KeyError: If a term is not in ``index``.
        IndexError: If a proximity search is given fewer than two terms.
    """
    posting_lists = [index[term] for term in terms]
    if not posting_lists:
        return []

    is_phrase = search == "Phrasal"
    if not is_phrase:
        posting_lists = posting_lists[:2]
        if len(posting_lists) < 2:
            raise IndexError("proximity search needs two terms")

    # Only documents containing every term can match.  Intersecting the keys
    # also avoids looking up a missing doc ID in a defaultdict posting list,
    # which would silently insert an empty entry into the index.
    common_docs = set(posting_lists[0])
    for posting_list in posting_lists[1:]:
        common_docs &= set(posting_list)

    result = []
    for doc_id in sorted(common_docs):
        first_positions = posting_lists[0][doc_id]
        if is_phrase:
            # Order matters: the k-th following term must sit exactly
            # k * max_dist positions after the first term.
            following = [set(plist[doc_id]) for plist in posting_lists[1:]]
            matched = any(
                all(
                    start + step * max_dist in positions
                    for step, positions in enumerate(following, start=1)
                )
                for start in first_positions
            )
        else:
            second_positions = posting_lists[1][doc_id]
            matched = any(
                abs(j - i) <= max_dist
                for i in first_positions
                for j in second_positions
            )
        if matched:
            result.append(doc_id)
    return result

In [34]:
def single_search(query):
    """Evaluate one query operand against the module-level ``index``.

    The operand is dispatched on its form:

    * contains ``"``  -> phrase search via ``parse_phrasal_query``;
    * contains ``#``  -> proximity search via ``parse_proximity_query``;
    * otherwise       -> single term, lower-cased and Porter-stemmed.

    Args:
        query: A single operand (term, quoted phrase or ``#N(a,b)``).

    Returns:
        list: Matching document IDs; an empty list when any query term is
        missing from the index.
    """
    def is_find(words):
        # True only when every term has an entry in the index.
        return all(word in index for word in words)

    if '"' in query:
        query_terms = parse_phrasal_query(query)
        if is_find(query_terms):
            # Pass the parsed terms, not the raw query string.
            return linear_merge(query_terms, 1, "Phrasal")
    elif '#' in query:
        query_terms, max_distance = parse_proximity_query(query)
        if is_find(query_terms):
            return linear_merge(query_terms, max_distance, "Proximity")
    else:
        query_term = PorterStemmer().stem(query.lower())
        if query_term in index:
            return sorted(index[query_term])

    # Return an empty result rather than None so callers can always wrap the
    # value in set().
    return []
    

In [None]:
def bool_search(query):
    """Evaluate a boolean query of the form ``[NOT] A [AND|OR [NOT] B]``.

    Operands are evaluated with ``single_search``.  ``NOT`` is taken as the
    complement with respect to the document IDs in the module-level
    ``docs_dict``.

    Args:
        query: The raw query string; it is split with ``tokenize_query``.

    Returns:
        For a single operand, the list returned by ``single_search`` (empty
        if nothing matches).  For a negated or two-operand query, a set of
        document IDs.  ``None`` when the query does not have one of the
        supported shapes.
    """
    query_lst = tokenize_query(query)
    if len(query_lst) == 1:
        return single_search(query_lst[0]) or []

    def take_operand(tokens):
        # Consume "[NOT] operand" from the front of tokens and return
        # (doc_id_set, remaining_tokens); doc_id_set is None if no operand
        # is available.
        negated = bool(tokens) and tokens[0] == "NOT"
        if negated:
            tokens = tokens[1:]
        if not tokens:
            return None, tokens
        # "or []" guards against a None result from single_search.
        docs = set(single_search(tokens[0]) or [])
        if negated:
            docs = set(docs_dict) - docs
        return docs, tokens[1:]

    left, rest = take_operand(query_lst)
    if left is None:
        return None
    if not rest:
        # "NOT A" on its own.
        return left

    operator, rest = rest[0], rest[1:]
    right, rest = take_operand(rest)
    if right is None or rest or operator not in ("AND", "OR"):
        return None
    if operator == "AND":
        return left & right
    return left | right
    

## tf-idf