In [2]:
import re 
import string 
from collections import Counter
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
sw = stopwords.words('english')
import csv
import sys 
sys.path.append('..')
import json 
import numpy as np 
from src.data_processing import collect_document_ids, split_into_documents, clean_entry, remove_stopwords, read_stopwords_file
from src.inverted_index import create_inverted_index, calculate_idf, save_inverted_index
from src.config import TOTAL_NUMBER_OF_DOCUMENTS, STARTING_DOCUMENT_INDEX
from src.document_ranking import compute_tf_idf_scores
import pickle

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/saaket/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Rebinds the name imported from src.config in this notebook's namespace only.
# No later cell shown in this notebook reads it.
STARTING_DOCUMENT_INDEX = 54711

In [8]:
# A single sample record kept for reference; its fields are introduced by
# dotted marker lines (.I, .U, .S, .M, .T, .P, .W, .A).
# No later cell shown in this notebook reads example_string.
example_string = '''.I 54711
.U
88000001
.S
Alcohol Alcohol 8801; 22(2):103-12
.M
Acetaldehyde/*ME; Buffers; Catalysis; HEPES/PD; Nuclear Magnetic Resonance; Phosphates/*PD; Protein Binding; Ribonuclease, Pancreatic/AI/*ME; Support, U.S. Gov't, Non-P.H.S.; Support, U.S. Gov't, P.H.S..
.T
The binding of acetaldehyde to the active site of ribonuclease: alterations in catalytic activity and effects of phosphate.
.P
JOURNAL ARTICLE.
.W
Ribonuclease A was reacted with [1-13C,1,2-14C]acetaldehyde and sodium cyanoborohydride in the presence or absence of 0.2 M phosphate. After several hours of incubation at 4 degrees C (pH 7.4) stable acetaldehyde-RNase adducts were formed, and the extent of their formation was similar regardless of the presence of phosphate. Although the total amount of covalent binding was comparable in the absence or presence of phosphate, this active site ligand prevented the inhibition of enzymatic activity seen in its absence. This protective action of phosphate diminished with progressive ethylation of RNase, indicating that the reversible association of phosphate with the active site lysyl residue was overcome by the irreversible process of reductive ethylation. Modified RNase was analysed using 13C proton decoupled NMR spectroscopy. Peaks arising from the covalent binding of enriched acetaldehyde to free amino groups in the absence of phosphate were as follows: NH2-terminal alpha amino group, 47.3 ppm; bulk ethylation at epsilon amino groups of nonessential lysyl residues, 43.0 ppm; and the epsilon amino group of lysine-41 at the active site, 47.4 ppm. In the spectrum of RNase ethylated in the presence of phosphate, the peak at 47.4 ppm was absent. When RNase was selectively premethylated in the presence of phosphate, to block all but the active site lysyl residues and then ethylated in its absence, the signal at 43.0 ppm was greatly diminished, and that arising from the active site lysyl residue at 47.4 ppm was enhanced. These results indicate that phosphate specifically protected the active site lysine from reaction with acetaldehyde, and that modification of this lysine by acetaldehyde adduct formation resulted in inhibition of catalytic activity.
.A
Mauch TJ; Tuma DJ; Sorrell MF.
'''


In [3]:
# Load the collection. Later cells iterate over `corpus`, take len(corpus) and
# run regex searches on each item, so it is used as a sequence of document strings.
corpus = split_into_documents('../data/ohsumed.88-91')

In [4]:
def collect_document_ids(corpus):
    '''Collect the document id that follows each document's ``.U`` marker.

    Args:
        corpus: iterable of document strings.

    Returns:
        list[str]: the text of the line immediately after a ``.U`` line, one
        entry per document in which that pattern is found, in corpus order.
        Documents without a match are skipped, so the result can be shorter
        than the corpus.
    '''
    # Raw string: the previous non-raw "\.U" relied on an invalid escape
    # sequence, which newer Python versions warn about.
    # Compiled once instead of being rebuilt for every document.
    doc_id_pattern = re.compile(r"\.U\n(.*?)\n")

    document_ids = []
    for document in corpus:
        doc_id_find = doc_id_pattern.search(document)
        if doc_id_find:
            document_ids.append(doc_id_find.group(1))
    return document_ids

In [5]:
# When cells run top to bottom, this calls the collect_document_ids defined in
# this notebook, which shadows the one imported from src.data_processing.
doc_ids = collect_document_ids(corpus)

In [7]:
def create_inverted_index(corpus, doc_ids):
    '''Build an inverted index of Euclidean-normalized term frequencies.

    Each document is cleaned, split on single spaces, lowercased and stripped
    of stopwords. Documents are identified by their position in ``corpus``.

    Args:
        corpus: sequence of raw document strings.
        doc_ids: accepted as part of the signature; not used by this function.

    Returns:
        tuple: ``(inverted_index, document_lengths)`` where ``inverted_index``
        maps term -> list of ``(document_position, normalized_tf)`` postings
        and ``document_lengths[i]`` is the token count of document ``i`` after
        stopword removal.
    '''
    index = {}
    lengths = np.zeros(len(corpus))

    # Stopwords are loaded once, up front, from the stopword file
    stop_terms = read_stopwords_file('stopwords.txt')

    for position, raw_document in enumerate(corpus):
        # Clean -> tokenize -> lowercase -> drop stopwords
        words = clean_entry(raw_document).split(" ")
        words = remove_stopwords([word.lower() for word in words], stop_terms)

        lengths[position] = len(words)

        # Raw counts per term and the L2 norm of the count vector
        counts = Counter(words)
        squared_counts = np.array([count ** 2 for count in counts.values()])
        norm = np.sqrt(np.sum(squared_counts))

        # One posting per unique term of this document; a term seen for the
        # first time gets a fresh postings list
        for term, count in counts.items():
            posting = (position, count / norm)
            index.setdefault(term, []).append(posting)
    return index, lengths

In [8]:
# When cells run top to bottom, this calls the create_inverted_index defined in
# this notebook, which shadows the one imported from src.inverted_index.
# `Lengths` holds the per-document token counts returned alongside the index.
inverted_index, Lengths = create_inverted_index(corpus, doc_ids)

In [9]:
# TOTAL_NUMBER_OF_DOCUMENTS is imported from src.config.
# NOTE(review): presumably it should equal len(corpus) — confirm.
idf = calculate_idf(inverted_index, TOTAL_NUMBER_OF_DOCUMENTS)

In [11]:
def compute_tf_idf_scores(inverted_index, query, idf, doc_ids, Lengths):
    '''Score every document against a query with length-normalized tf-idf.

    Args:
        inverted_index: dict mapping term -> list of ``(doc_position, tf)``
            postings, where ``doc_position`` indexes into the score array.
        query: query string; terms are separated by single spaces.
        idf: dict mapping term -> inverse document frequency.
        doc_ids: sequence with one entry per document; only its length is used.
        Lengths: per-document lengths, indexable by document position.

    Returns:
        numpy.ndarray: one score per document. Documents whose length is 0
        get a score of 0 instead of NaN.
    '''
    num_docs = len(doc_ids)
    scores = np.zeros(num_docs)

    # Iterate over *unique* query terms. The query weight already contains the
    # term's count, so looping over every occurrence would count a repeated
    # term count**2 times.
    tf_query = Counter(query.split(' '))
    for term, query_count in tf_query.items():
        if term not in inverted_index:
            continue
        w_tq = query_count * idf[term]
        for d, tf_td in inverted_index[term]:
            scores[d] += tf_td * w_tq

    # Length-normalize; empty documents are set to 0 rather than divided by zero
    lengths = np.asarray(Lengths, dtype=float)[:num_docs]
    nonempty = lengths > 0
    scores[nonempty] /= lengths[nonempty]
    scores[~nonempty] = 0.0
    return scores

In [15]:
def get_top_k_documents(doc_ids, scores, k):
    '''Return the ``k`` highest-scoring documents, best first.

    Args:
        doc_ids: sequence of document ids aligned with ``scores`` by position.
        scores: numpy array of per-document scores.
        k: maximum number of results to return.

    Returns:
        list of ``(doc_id, score)`` tuples ordered by descending score.
    '''
    # Sorting the negated scores ascending yields a descending ranking
    ranked_positions = np.argsort(-scores)[:k]
    ranking = []
    for position in ranked_positions:
        ranking.append((doc_ids[position], scores[position]))
    return ranking

In [25]:
# NOTE(review): `scores` is not assigned at notebook level by any cell shown
# here (it only exists as a local inside functions), so this cell raises
# NameError on a fresh kernel unless `scores` is defined elsewhere.
top_k_docs = get_top_k_documents(doc_ids, scores, 50)

In [16]:
def process_query(query):
    '''Extract the query id and normalize the text of one query entry.

    The ``<num>`` line, the ``<desc>`` header line and any remaining tags are
    blanked out; the rest is cleaned, lowercased and stripped of stopwords.

    Args:
        query: raw text of a single query entry.

    Returns:
        tuple: ``(query_text, query_id)`` where ``query_text`` is the
        remaining tokens joined by single spaces and ``query_id`` is the text
        captured after ``<num> Number: ``.

    Raises:
        ValueError: if the entry has no ``<num> Number: ...`` line. Previously
            this surfaced as an UnboundLocalError at the return statement.
    '''
    # Raw strings: '\s' in a non-raw literal is an invalid escape sequence
    query_id_find = re.search(r'<num>\sNumber:\s(.+?)\n', query)
    if query_id_find is None:
        raise ValueError('query has no "<num> Number: ..." line')
    query_id = query_id_find.group(1)

    # Order matters: whole <num>/<desc> lines go first, then leftover tags
    for pattern in (r'<num>.*\n', r'<desc>.*\n', r'<.*>'):
        query = re.sub(pattern, ' ', query)

    tokens = [token.lower() for token in clean_entry(query).split(' ')]
    stop_words_list = read_stopwords_file('stopwords.txt')
    tokens = remove_stopwords(tokens, stop_words_list)
    return ' '.join(tokens), query_id

In [17]:
# Read the whole query file into one string; at this point `queries` is a str.
with open('../data/query.ohsu.1-63') as f:
    queries = f.read()

In [18]:
# Split on the closing tag; `queries` changes from a str to a list of entries.
# Whatever follows the last '</top>\n' (possibly an empty string) becomes the
# final list element.
queries = re.split('</top>\n', queries)

In [28]:
def retrieve_documents(method, query, inverted_index, idf, doc_ids, Lengths, k=1000):
    '''Rank documents for one raw query and format the ranking as text lines.

    Args:
        method: retrieval method name; only ``'tf-idf'`` is supported.
        query: raw text of a single query entry (parsed by ``process_query``).
        inverted_index: term -> postings mapping passed to the scorer.
        idf: term -> inverse document frequency mapping.
        doc_ids: document ids aligned with score positions.
        Lengths: per-document lengths used for normalization.
        k: number of top documents to report (default 1000).

    Returns:
        str: one line per ranked document in the form
        ``"<query_id> 0 <doc_id> <rank> <score> tf-idf\\n"``, ranks starting at 1.

    Raises:
        ValueError: if ``method`` is not supported. Previously this surfaced
            as an UnboundLocalError on the return statement.
    '''
    if method != 'tf-idf':
        raise ValueError(f'unsupported retrieval method: {method!r}')

    query, query_id = process_query(query)
    scores = compute_tf_idf_scores(inverted_index, query, idf, doc_ids, Lengths)
    top_k_docs = get_top_k_documents(doc_ids, scores, k)

    # Build all lines first and join once instead of growing a string in a loop
    lines = [
        f'{query_id} 0 {doc_id} {rank} {score} tf-idf\n'
        for rank, (doc_id, score) in enumerate(top_k_docs, start=1)
    ]
    return ''.join(lines)

        

In [21]:
# Pick the first query entry; the trailing bare expression displays it.
query = queries[0]; query 

'<top>\n<num> Number: OHSU1\n<title> 60 year old menopausal woman without hormone replacement therapy\n<desc> Description:\nAre there adverse effects on lipids when progesterone is given with estrogen replacement therapy\n'

In [22]:
# Run the full retrieval pipeline on the single entry held in `query`.
l_string = retrieve_documents('tf-idf', query, inverted_index, idf, doc_ids, Lengths)

In [29]:
# Drop the last list element — presumably the remainder that re.split leaves
# after the final '</top>\n'; confirm.
# NOTE(review): not idempotent — every re-run of this cell removes one more entry.
queries = queries[:-1]

In [30]:
# Concatenate the formatted ranking lines of every query into one string.
log_string = ''
for query in queries:
    log_string += retrieve_documents('tf-idf', query, inverted_index, idf, doc_ids, Lengths)

In [27]:
# Write the accumulated result lines to log_file.txt in the current working
# directory; mode 'w' replaces any existing file content.
with open('log_file.txt', 'w') as f:
    f.write(log_string)