In [1]:
import sys 
sys.path.append('..')

In [2]:
# NOTE(review): STARTING_DOCUMENT_INDEX is not referenced by any cell visible in
# this notebook — confirm whether it is still needed.
STARTING_DOCUMENT_INDEX = 54711
# Passed as N (collection size) to calculate_idf further down.
# NOTE(review): presumably equals len(corpus) for the full collection — confirm.
TOTAL_NUMBER_OF_DOCUMENTS = 293856

# Relative paths to the corpus file and the query (topic) file read below.
CORPUS_PATH = '../data/ohsumed.88-91'
QUERY_PATH = '../data/query.ohsu.1-63'

In [45]:
import re
from collections import Counter 
import numpy as np 

def collect_document_ids(corpus):
    '''Collect the ID found after each record's ``.U`` header line.

    Args:
        corpus: iterable of raw document strings (as produced by
            split_into_documents).

    Returns:
        list of ID strings, in corpus order.

    NOTE(review): records without a ``.U`` header are skipped, so the returned
    list can be shorter than ``corpus``; callers that index it by corpus
    position rely on every record having an ID.
    '''
    # Raw string: "\." in a normal literal is an invalid escape sequence.
    # Compiled once instead of being rebuilt for every document.
    id_pattern = re.compile(r"\.U\n(.*?)\n")
    matches = (id_pattern.search(document) for document in corpus)
    return [match.group(1) for match in matches if match]


def split_into_documents(database_file):
    '''Split the single corpus file into a list of raw document strings.

    Args:
        database_file: path of the corpus file. Records are delimited by
            lines of the form ``.I <number>``.

    Returns:
        list of record strings with the ``.I`` delimiter lines removed.
    '''
    # Context manager so the file handle is closed (the handle used to leak).
    with open(database_file) as corpus_file:
        data = corpus_file.read()
    record_delimiter = re.compile(r"\.I\s\d*\n")
    # The first item is whatever precedes the first delimiter ('' for a
    # well-formed file), so it is dropped.
    return record_delimiter.split(data)[1:]



def clean_entry(entry):
    '''Normalise one document or query string.

    Steps, in order: drop ``.X`` header markers (a dot followed by one
    upper-case letter), replace punctuation and newlines/tabs with spaces,
    delete digits, collapse runs of spaces, delete isolated single lower-case
    letters, and strip leading/trailing spaces.

    Args:
        entry: text to clean.

    Returns:
        The cleaned string (may be empty).
    '''
    # Remove . headers (raw string: "\." is an invalid escape otherwise)
    entry = re.sub(r'\.[A-Z]', '', entry)

    # Replace punctuation with a space
    entry = re.sub(r'[^\w\s]', ' ', entry)

    # Replace \n, \r and \t with a space
    entry = re.sub(r'[\n\t\r]', ' ', entry)

    # Remove digits
    entry = re.sub(r'\d', '', entry)

    # Collapse repeated spaces
    entry = re.sub(' +', ' ', entry)

    # Remove isolated lower-case letters together with trailing spaces.
    # (The former "\b\d+\b *" alternative could never match: no digits are
    # left at this point.)
    entry = re.sub(r'\b[a-z]\b *', '', entry)

    # Remove leading and trailing spaces
    return entry.strip()

def process_query(query):
    '''Turn one raw topic block into a list of query tokens plus its ID.

    The ID is taken from the ``<num> Number: ...`` line. The ``<num>`` and
    ``<desc>`` lines and any remaining ``<...>`` tags are blanked out, the
    text is lower-cased, stopwords are removed, and the remainder is cleaned
    with clean_entry and split on single spaces.

    Args:
        query: raw text of one topic.

    Returns:
        (tokens, query_id). ``query_id`` is None when no ``<num>`` line is
        found (this used to raise UnboundLocalError). ``tokens`` is ``['']``
        when nothing survives cleaning, because of ``''.split(' ')``.
    '''
    query_id_find = re.search(r'<num>\sNumber:\s(.+?)\n', query)
    query_id = query_id_find.group(1) if query_id_find else None

    for pattern in ('<num>.*\n', '<desc>.*\n', '<.*>'):
        query = re.sub(pattern, ' ', query)

    # Lower-case and drop stopwords before cleaning
    tokens = [tok.lower() for tok in query.split(' ')]
    stop_words_list = read_stopwords_file('stop_words_english.txt')
    tokens = remove_stopwords(tokens, stop_words_list)

    # Join again for the cleaning step, then split into tokens
    cleaned = clean_entry(' '.join(tokens))
    return cleaned.split(' '), query_id

def read_stopwords_file(filename):
    '''Read stopwords (one per line) from an external file.

    Args:
        filename: path of the stopword file.

    Returns:
        list of the file's lines, split on ``'\\n'`` (a trailing newline in
        the file therefore yields a final empty string).

    The ``filename`` argument used to be ignored and ``'stopwords.txt'`` was
    always opened. The argument is now honoured; when it does not name an
    existing file the legacy ``'stopwords.txt'`` is used, so existing callers
    keep working.
    '''
    import os

    path = filename if filename and os.path.isfile(filename) else 'stopwords.txt'
    with open(path) as f:
        return f.read().split('\n')

def remove_stopwords(tokens, stop_words_list):
    '''Return the tokens that are not stopwords, preserving order.

    Args:
        tokens: iterable of tokens.
        stop_words_list: collection of stopwords (list, set, ...).

    Returns:
        list of tokens not present in ``stop_words_list``.
    '''
    # Membership in a list is a linear scan per token; use a set when possible.
    if isinstance(stop_words_list, (set, frozenset)):
        stop_words = stop_words_list
    else:
        try:
            stop_words = frozenset(stop_words_list)
        except TypeError:
            # Unhashable entries: keep the original container semantics.
            stop_words = stop_words_list
    return [tok for tok in tokens if tok not in stop_words]

In [46]:
from collections import Counter
# from src.data_processing import clean_entry, read_stopwords_file, remove_stopwords
import numpy as np
import json 



def create_inverted_index(corpus, doc_ids):
    '''Build an inverted index mapping each term to its posting list.

    Args:
        corpus: list of raw document strings.
        doc_ids: unused; postings refer to documents by their position in
            ``corpus``.

    Returns:
        (inverted_index, document_lengths) where ``inverted_index[term]`` is
        a list of ``(corpus_position, term_frequency)`` tuples and
        ``document_lengths[i]`` is the token count of document ``i`` after
        preprocessing.
    '''
    inverted_index = {}
    document_lengths = np.zeros(len(corpus))

    # A set makes the per-token stopword test O(1).
    stopwords = set(read_stopwords_file('stop_words_english.txt'))

    for doc_index, document in enumerate(corpus):
        # Lower-case BEFORE stopword removal (as process_query does), so
        # capitalised stopwords are removed as well.
        raw_tokens = [token.lower() for token in document.split(" ")]
        kept_tokens = remove_stopwords(raw_tokens, stopwords)

        # Clean the re-joined text, then tokenize on single spaces
        tokens = clean_entry(' '.join(kept_tokens)).split(" ")

        document_lengths[doc_index] = len(tokens)

        # One posting per unique term of the document
        for term, frequency in Counter(tokens).items():
            inverted_index.setdefault(term, []).append((doc_index, frequency))

    return inverted_index, document_lengths

def get_document_norm(corpus):
    '''Return the Euclidean norm of each document's element-count vector.

    Each document is counted with ``Counter``: a list of tokens gives term
    counts, whereas a plain string gives character counts.

    Args:
        corpus: sequence of documents.

    Returns:
        numpy array with one norm per document.
    '''
    norms = np.zeros(len(corpus))
    for position, document in enumerate(corpus):
        squared_counts = [count ** 2 for count in Counter(document).values()]
        norms[position] = np.sqrt(np.sum(np.array(squared_counts)))
    return norms

def calculate_idf(inverted_index, num_documents):
    '''Compute the inverse document frequency of every indexed term.

    IDF_t = log(N / df_t), where df_t is the length of the term's posting
    list.

    Args:
        inverted_index: dict mapping term -> posting list.
        num_documents: N, the collection size.

    Returns:
        dict mapping term -> IDF value.
    '''
    return {
        term: np.log(num_documents / (len(postings)))
        for term, postings in inverted_index.items()
    }

def save_inverted_index(inverted_index):
    '''Write the inverted index to ``inverted_index.json`` in the working directory.'''
    with open('inverted_index.json', 'w') as index_file:
        json.dump(obj=inverted_index, fp=index_file)

In [79]:
def get_relevant_documents(rel_file):
    '''Read a relevance-judgement file (unfinished: currently returns None).

    Args:
        rel_file: path of the relevance file to read.

    NOTE(review): this function has no return statement and ``rel_docs`` is
    never filled, so callers receive None. The intended output format cannot
    be determined from the code as written — TODO complete or remove.
    '''
    
    rel_docs = []  # never appended to or returned
    with open(rel_file) as f:
        rel_file_string = f.read()
    # NOTE(review): the loop index is unused and the pattern is hard-coded to
    # 'OHSU1', so the same search runs 63 times and its result is discarded.
    # Presumably the query number was meant to vary with ``i`` — confirm.
    for i in range(63):
        all_current_docs = re.search('OHSU1\t0(.*?)\t[0-9]', rel_file_string)



In [82]:
# NOTE(review): get_relevant_documents has no return statement, so this name is bound to None.
rel_file_string = get_relevant_documents('../data/qrels.ohsu.88-91')

In [101]:
class config:
    # Divisor applied to every document term frequency in the tf-idf style
    # scoring functions (1.0 leaves the raw frequency unchanged).
    norm = 1.0
    # Weight of the original query in discounted relevance feedback; the
    # feedback document at rank r is weighted by (1 - alpha) ** r.
    alpha = 0.7

In [98]:
import numpy as np 
# from src.config import TOTAL_NUMBER_OF_DOCUMENTS, STARTING_DOCUMENT_INDEX
# from src.data_processing import process_query, remove_stopwords, read_stopwords_file, clean_entry
from collections import Counter 


def _accumulate_weighted_scores(scores, term_counts, inverted_index, idf, weight=1.0):
    '''Add ``weight * (tf_td / config.norm) * tf_t * idf_t**2`` to ``scores[d]`` for every posting of every counted term.'''
    for term, count in term_counts.items():
        if term not in inverted_index:
            continue
        w_tq = count * (idf[term] ** 2)
        for d, tf_td in inverted_index[term]:
            # Document term frequency is scaled by the global config.norm
            # (not by the per-document norm).
            scores[d] += weight * (tf_td / config.norm) * w_tq


def _normalize_by_length(scores, doc_ids, Lengths):
    '''Divide each of the first ``len(doc_ids)`` scores by the matching document length, in place.'''
    scores /= np.asarray(Lengths, dtype=float)[:len(doc_ids)]
    return scores


def compute_tf_idf_scores(inverted_index, query, idf, doc_ids, norms, Lengths):
    '''Score every document against the query with length-normalised tf-idf.

    Args:
        inverted_index: dict term -> list of (doc_position, term_frequency).
        query: list of query tokens.
        idf: dict term -> inverse document frequency.
        doc_ids: list of document IDs; only its length is used.
        norms: unused (kept for interface compatibility).
        Lengths: per-document token counts used for normalisation.

    Returns:
        numpy array of ``len(doc_ids)`` scores, indexed by document position.
    '''
    scores = np.zeros(len(doc_ids))
    _accumulate_weighted_scores(scores, Counter(query), inverted_index, idf)
    return _normalize_by_length(scores, doc_ids, Lengths)


def compute_relevance_feedback_scores(inverted_index, query, relevant_docs, idf, doc_ids, norms, Lengths):
    '''Score documents against the query plus the terms of the feedback documents.

    Every feedback document contributes its term counts with the same weight
    as the query itself.

    Args:
        inverted_index: dict term -> list of (doc_position, term_frequency).
        query: list of query tokens.
        relevant_docs: list of token lists, one per feedback document.
        idf: dict term -> inverse document frequency.
        doc_ids: list of document IDs; only its length is used.
        norms: unused (kept for interface compatibility).
        Lengths: per-document token counts used for normalisation.

    Returns:
        numpy array of ``len(doc_ids)`` scores, indexed by document position.
    '''
    scores = np.zeros(len(doc_ids))
    _accumulate_weighted_scores(scores, Counter(query), inverted_index, idf)
    for document in relevant_docs:
        _accumulate_weighted_scores(scores, Counter(document), inverted_index, idf)
    return _normalize_by_length(scores, doc_ids, Lengths)


def compute_discounted_relevance_feedback_scores(inverted_index, query, relevant_docs, idf, doc_ids, norms, Lengths):
    '''Relevance-feedback scoring with rank-discounted feedback documents.

    The query contribution is weighted by ``config.alpha``; the feedback
    document at 1-based rank ``r`` is weighted by ``(1 - config.alpha) ** r``.

    Args:
        inverted_index: dict term -> list of (doc_position, term_frequency).
        query: list of query tokens.
        relevant_docs: list of token lists, ordered by rank.
        idf: dict term -> inverse document frequency.
        doc_ids: list of document IDs; only its length is used.
        norms: unused (kept for interface compatibility).
        Lengths: per-document token counts used for normalisation.

    Returns:
        numpy array of ``len(doc_ids)`` scores, indexed by document position.
    '''
    scores = np.zeros(len(doc_ids))
    _accumulate_weighted_scores(scores, Counter(query), inverted_index, idf, weight=config.alpha)
    for rank, document in enumerate(relevant_docs, start=1):
        # Discount factor scaled by the rank of the feedback document
        discount = (1 - config.alpha) ** rank
        _accumulate_weighted_scores(scores, Counter(document), inverted_index, idf, weight=discount)
    return _normalize_by_length(scores, doc_ids, Lengths)

def compute_tf_scores(inverted_index, query, doc_ids, Lengths):
    '''Score documents by raw term-frequency overlap with the query (no IDF).

    Args:
        inverted_index: dict term -> list of (doc_position, term_frequency).
        query: list of query tokens.
        doc_ids: list of document IDs; only its length is used.
        Lengths: per-document token counts used for normalisation.

    Returns:
        numpy array of ``len(doc_ids)`` length-normalised scores.
    '''
    num_docs = len(doc_ids)
    scores = np.zeros(num_docs)

    for term, query_count in Counter(query).items():
        if term not in inverted_index:
            continue
        query_weight = query_count * 1.  # No IDF multiplication
        for doc_index, doc_tf in inverted_index[term]:
            scores[doc_index] += doc_tf * query_weight

    # Normalise every score by its document's length
    for doc_index in range(num_docs):
        scores[doc_index] /= Lengths[doc_index]

    return scores

def compute_boolean_scores(inverted_index, query, doc_ids):
    '''Boolean retrieval with a 75% tolerance.

    A document scores 1 when it contains more than 75% of the query tokens,
    otherwise 0.

    Args:
        inverted_index: dict term -> list of (doc_position, term_frequency).
        query: list of query tokens (stopwords already removed).
        doc_ids: list of document IDs; only its length is used.

    Returns:
        numpy integer array of ``len(doc_ids)`` zeros and ones, indexed by
        document position, so it can be passed to get_top_k_documents.

    Fixes over the previous version: each match now adds one hit (it used to
    add the document index), the result keeps one entry per document (boolean
    filtering used to drop the position information and return a list), and an
    empty query no longer raises ZeroDivisionError.
    '''
    num_docs = len(doc_ids)
    if not query:
        return np.zeros(num_docs, dtype=int)

    matched_terms = np.zeros(num_docs)
    for t in query:
        for d, _ in inverted_index.get(t, ()):
            matched_terms[d] += 1

    # Tolerance of 0.75: strictly more than 75% of the query tokens must match
    return (matched_terms / len(query) > 0.75).astype(int)


def get_top_k_documents(doc_ids, scores, k):
    '''Return the ``k`` highest-scoring documents, best first.

    Args:
        doc_ids: list of document IDs, aligned with ``scores``.
        scores: numpy array of scores.
        k: number of documents to return.

    Returns:
        list of ``(position, doc_id, score)`` tuples.
    '''
    best_positions = np.argsort(-scores)[:k]
    return [
        (position, doc_ids[position], scores[position])
        for position in best_positions
    ]

def generate_log_string(top_k_docs, query_id, run_tag='tf-idf'):
    '''Format ranked results as ``<query_id> 0 <doc_id> <rank> <score> <run_tag>`` lines.

    Args:
        top_k_docs: list of ``(position, doc_id, score)`` tuples, best first.
        query_id: identifier written in the first column.
        run_tag: label written in the last column. Defaults to ``'tf-idf'``,
            the value that used to be hard-coded for every retrieval method.

    Returns:
        One newline-terminated line per document (``''`` for no documents).
    '''
    return ''.join(
        f'{query_id} 0 {doc_id} {rank} {score} {run_tag}\n'
        for rank, (_relative_index, doc_id, score) in enumerate(top_k_docs, start=1)
    )

def _tokenize_feedback_documents(corpus, ranked_docs):
    '''Preprocess the documents named in ``ranked_docs`` into token lists for relevance feedback.'''
    stopwords = set(read_stopwords_file('stopwords.txt'))
    processed = []
    for relative_index, _document_id, _score in ranked_docs:
        # Lower-case before stopword removal (as process_query does), so
        # capitalised stopwords are removed too.
        raw_tokens = [token.lower() for token in corpus[relative_index].split(" ")]
        kept_tokens = remove_stopwords(raw_tokens, stopwords)
        # Clean the re-joined text, then tokenize on single spaces
        processed.append(clean_entry(' '.join(kept_tokens)).split(" "))
    return processed


def retrieve_documents(method, corpus, query, inverted_index, idf, doc_ids, norms, Lengths, top_k=50, feedback_k=5):
    '''Rank the corpus for one query with the chosen retrieval method.

    Args:
        method: one of ``'tf-idf'``, ``'tf'``, ``'boolean'``,
            ``'relevance-feedback'`` or ``'discounted-relevance-feedback'``.
        corpus: list of raw document strings (read only by the feedback methods).
        query: list of query tokens.
        inverted_index: dict term -> list of (doc_position, term_frequency).
        idf: dict term -> inverse document frequency.
        doc_ids: list of document IDs aligned with ``corpus``.
        norms: forwarded to the scoring functions.
        Lengths: per-document token counts.
        top_k: number of documents returned (default 50, the former constant).
        feedback_k: number of top tf-idf documents used as pseudo-relevant
            feedback (default 5, the former constant).

    Returns:
        list of ``(position, doc_id, score)`` tuples, best first.

    Raises:
        ValueError: for an unknown ``method`` (this used to surface as an
            UnboundLocalError on the return statement).
    '''
    if method == 'tf-idf':
        scores = compute_tf_idf_scores(inverted_index, query, idf, doc_ids, norms, Lengths)

    elif method == 'tf':
        scores = compute_tf_scores(inverted_index, query, doc_ids, Lengths)

    elif method == 'boolean':
        scores = compute_boolean_scores(inverted_index, query, doc_ids)

    elif method in ('relevance-feedback', 'discounted-relevance-feedback'):
        # First pass: plain tf-idf picks the pseudo-relevant documents
        initial_scores = compute_tf_idf_scores(inverted_index, query, idf, doc_ids, norms, Lengths)
        relevant_docs = get_top_k_documents(doc_ids, initial_scores, feedback_k)
        relevant_docs_processed = _tokenize_feedback_documents(corpus, relevant_docs)

        # Second pass: rescore with the feedback documents' terms
        if method == 'relevance-feedback':
            rescore = compute_relevance_feedback_scores
        else:
            rescore = compute_discounted_relevance_feedback_scores
        scores = rescore(inverted_index, query, relevant_docs_processed, idf, doc_ids, norms, Lengths)

    else:
        raise ValueError(f'Unknown retrieval method: {method!r}')

    return get_top_k_documents(doc_ids, scores, top_k)

        

In [48]:
# Load the corpus as a list of raw record strings and collect each record's ID.
corpus = split_into_documents(CORPUS_PATH)
doc_ids = collect_document_ids(corpus)

# Topics are split on the '</top>' terminator; the piece after the last
# terminator is dropped by the [:-1] slice.
with open(QUERY_PATH) as f:
    query_string = f.read()
queries = re.split('</top>\n', query_string)[:-1]

# Uncomment to work on a small subset of the corpus:
# corpus = corpus[:500]
# doc_ids = doc_ids[:500]

# Build the inverted index; Lengths holds the token count of every document.
inverted_index, Lengths = create_inverted_index(corpus, doc_ids)

In [11]:
# NOTE(review): corpus holds raw strings here, so get_document_norm counts
# characters rather than terms; the scoring functions accept `norms` but never read it.
norms = get_document_norm(corpus)

In [41]:
# Inspect the median document norm (config.norm is set to 1. independently of this value).
np.median(norms)

271.9669097456896

In [64]:
# N comes from the TOTAL_NUMBER_OF_DOCUMENTS constant rather than len(corpus).
# NOTE(review): confirm the two agree, especially when running on a corpus subset.
idf = calculate_idf(inverted_index, TOTAL_NUMBER_OF_DOCUMENTS)

In [105]:
# Plain tf-idf run: accumulate one result line per retrieved document and query.
# NOTE(review): no visible cell writes this run's log_string to disk before it is overwritten.
log_string = ''
for query in queries:
    # `query` is rebound from the raw topic text to its token list
    query, query_id = process_query(query)
    top_k_docs = retrieve_documents('tf-idf', corpus, query, inverted_index, idf, doc_ids, norms, Lengths)
    log_string += generate_log_string(top_k_docs, query_id)


In [102]:
# Relevance-feedback run. Result lines still carry the 'tf-idf' tag that
# generate_log_string writes by default.
# NOTE(review): the recorded output of this cell is a KeyboardInterrupt, so
# log_string may hold only part of the queries after this execution.
log_string = ''
for query in queries:
    # `query` is rebound from the raw topic text to its token list
    query, query_id = process_query(query)
    top_k_docs = retrieve_documents('relevance-feedback', corpus, query, inverted_index, idf, doc_ids, norms, Lengths)
    log_string += generate_log_string(top_k_docs, query_id)


KeyboardInterrupt: 

In [100]:
# Persist whatever log_string currently holds (depends on which run cell was executed last).
with open('../logs/relevance_feedback_log_file.txt', 'w') as f:
    f.write(log_string)

In [103]:
# Discounted relevance-feedback run (uses config.alpha). Result lines still
# carry the 'tf-idf' tag that generate_log_string writes by default.
log_string = ''
for query in queries:
    # `query` is rebound from the raw topic text to its token list
    query, query_id = process_query(query)
    top_k_docs = retrieve_documents('discounted-relevance-feedback', corpus, query, inverted_index, idf, doc_ids, norms, Lengths)
    log_string += generate_log_string(top_k_docs, query_id)


In [104]:
# Persist the current log_string; the file name encodes alpha = 0.7 and must
# be changed by hand if config.alpha changes.
with open('../logs/discounted_relevance_feedback_log_file_alpha_7.txt', 'w') as f:
    f.write(log_string)