In [10]:
import re 
import string 
from collections import Counter
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
sw = stopwords.words('english')
import csv
import sys 
sys.path.append('..')
import json 
import numpy as np 
from src.data_processing import collect_document_ids, split_into_documents, clean_entry, remove_stopwords, read_stopwords_file
from src.inverted_index import create_inverted_index, calculate_idf, save_inverted_index
from src.config import TOTAL_NUMBER_OF_DOCUMENTS, STARTING_DOCUMENT_INDEX
from src.document_ranking import compute_tf_idf_scores

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/saaket/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
# Notebook-local override: this rebinds the STARTING_DOCUMENT_INDEX name that the
# import cell pulled in from src.config. 54711 equals the `.I` record number of the
# sample record stored in `example_string`; presumably it is the id of the first
# document in the corpus -- TODO confirm against src.config.
STARTING_DOCUMENT_INDEX = 54711

In [2]:
example_string = '''.I 54711
.U
88000001
.S
Alcohol Alcohol 8801; 22(2):103-12
.M
Acetaldehyde/*ME; Buffers; Catalysis; HEPES/PD; Nuclear Magnetic Resonance; Phosphates/*PD; Protein Binding; Ribonuclease, Pancreatic/AI/*ME; Support, U.S. Gov't, Non-P.H.S.; Support, U.S. Gov't, P.H.S..
.T
The binding of acetaldehyde to the active site of ribonuclease: alterations in catalytic activity and effects of phosphate.
.P
JOURNAL ARTICLE.
.W
Ribonuclease A was reacted with [1-13C,1,2-14C]acetaldehyde and sodium cyanoborohydride in the presence or absence of 0.2 M phosphate. After several hours of incubation at 4 degrees C (pH 7.4) stable acetaldehyde-RNase adducts were formed, and the extent of their formation was similar regardless of the presence of phosphate. Although the total amount of covalent binding was comparable in the absence or presence of phosphate, this active site ligand prevented the inhibition of enzymatic activity seen in its absence. This protective action of phosphate diminished with progressive ethylation of RNase, indicating that the reversible association of phosphate with the active site lysyl residue was overcome by the irreversible process of reductive ethylation. Modified RNase was analysed using 13C proton decoupled NMR spectroscopy. Peaks arising from the covalent binding of enriched acetaldehyde to free amino groups in the absence of phosphate were as follows: NH2-terminal alpha amino group, 47.3 ppm; bulk ethylation at epsilon amino groups of nonessential lysyl residues, 43.0 ppm; and the epsilon amino group of lysine-41 at the active site, 47.4 ppm. In the spectrum of RNase ethylated in the presence of phosphate, the peak at 47.4 ppm was absent. When RNase was selectively premethylated in the presence of phosphate, to block all but the active site lysyl residues and then ethylated in its absence, the signal at 43.0 ppm was greatly diminished, and that arising from the active site lysyl residue at 47.4 ppm was enhanced. These results indicate that phosphate specifically protected the active site lysine from reaction with acetaldehyde, and that modification of this lysine by acetaldehyde adduct formation resulted in inhibition of catalytic activity.
.A
Mauch TJ; Tuma DJ; Sorrell MF.
'''


In [3]:
# Collect the document ids from the OHSUMED 88-91 file (helper from src.data_processing).
# Downstream cells only rely on len(doc_ids) to size the score array.
doc_ids = collect_document_ids('../data/ohsumed.88-91')

In [4]:
# Split the same file into one raw string per document. A later cell indexes
# `corpus` by position, using (document id - STARTING_DOCUMENT_INDEX).
corpus = split_into_documents('../data/ohsumed.88-91')

In [5]:
# Build the inverted index. The scoring functions below iterate inverted_index[term]
# as (doc_id, term_frequency) pairs and divide each document's score by
# Lengths[str(doc_id)].
inverted_index, Lengths = create_inverted_index(corpus, doc_ids)

In [6]:
# Per-term idf values, looked up as idf[term] during scoring. The document count
# comes from src.config (TOTAL_NUMBER_OF_DOCUMENTS), not from len(doc_ids).
idf = calculate_idf(inverted_index, TOTAL_NUMBER_OF_DOCUMENTS)

In [66]:
def compute_document_scores(inverted_index, query, idf, doc_ids, Lengths, start_index=None):
    """Score every document against a free-text query with tf-idf weights.

    For each distinct query term the query-side weight is
    ``tf(term in query) * idf[term]``. That weight is multiplied by the term
    frequency stored in each posting and accumulated per document. Every
    accumulated score is finally divided by the document's entry in
    ``Lengths``.

    Parameters
    ----------
    inverted_index : mapping
        term -> iterable of ``(doc_id, term_frequency)`` postings. ``doc_id``
        must be convertible with ``int``.
    query : str
        Whitespace-separated query terms.
    idf : mapping
        term -> inverse document frequency.
    doc_ids : sized
        Only ``len(doc_ids)`` is used, to size the result array.
    Lengths : mapping
        ``str(doc_id)`` -> normalisation factor for that document.
    start_index : int, optional
        Document id stored at position 0 of the result. Defaults to the
        notebook-level ``STARTING_DOCUMENT_INDEX``.

    Returns
    -------
    numpy.ndarray
        One score per document; position ``i`` belongs to document id
        ``start_index + i``.

    Raises
    ------
    KeyError
        If ``Lengths`` has no entry for one of the document ids.
    """
    if start_index is None:
        start_index = STARTING_DOCUMENT_INDEX
    start_index = int(start_index)

    scores = np.zeros(len(doc_ids))
    # Count first, then iterate over *distinct* terms: looping over the raw
    # token list would apply the tf-weighted contribution once per repetition.
    tf_query = Counter(query.split())
    for term, tf_tq in tf_query.items():
        # Terms unknown to the collection cannot contribute to any document.
        if term not in inverted_index or term not in idf:
            continue
        w_tq = tf_tq * idf[term]
        for d, tf_td in inverted_index[term]:
            scores[int(d) - start_index] += tf_td * w_tq

    for i in range(len(doc_ids)):
        scores[i] /= Lengths[str(start_index + i)]

    return scores
        

In [8]:
query ="pathophysiology treatment disseminated intravascular coagulation"

In [14]:
STARTING_DOCUMENT_INDEX

54711

In [15]:
def compute_tf_idf_scores_(inverted_index, query, idf, doc_ids, Lengths, start_index=None):
    """Return a length-normalised tf-idf score for every document.

    The query is tokenised on whitespace. Each distinct term contributes
    ``tf_td * (tf_tq * idf[term])`` to every document in its postings list,
    where ``tf_tq`` is the term's count in the query and ``tf_td`` the
    frequency stored in the posting. Each document's total is then divided
    by ``Lengths[str(doc_id)]``.

    Parameters
    ----------
    inverted_index : mapping
        term -> iterable of ``(doc_id, term_frequency)`` postings. ``doc_id``
        must be convertible with ``int``.
    query : str
        Whitespace-separated query terms.
    idf : mapping
        term -> inverse document frequency.
    doc_ids : sized
        Only ``len(doc_ids)`` is used, to size the result array.
    Lengths : mapping
        ``str(doc_id)`` -> normalisation factor for that document.
    start_index : int, optional
        Document id stored at position 0 of the result. Defaults to the
        notebook-level ``STARTING_DOCUMENT_INDEX``.

    Returns
    -------
    numpy.ndarray
        One score per document; position ``i`` belongs to document id
        ``start_index + i``.

    Raises
    ------
    KeyError
        If ``Lengths`` has no entry for one of the document ids.
    """
    if start_index is None:
        start_index = STARTING_DOCUMENT_INDEX

    scores = np.zeros(len(doc_ids))
    # Iterate over distinct terms only. Walking the raw token list while also
    # multiplying by the query term count would weight a repeated term by
    # its count squared.
    term_counts = Counter(query.split())
    for term, tf_tq in term_counts.items():
        # A term absent from the collection has no postings to score.
        if term not in inverted_index or term not in idf:
            continue
        w_tq = tf_tq * idf[term]
        for d, tf_td in inverted_index[term]:
            scores[int(d) - start_index] += tf_td * w_tq

    for i in range(len(doc_ids)):
        scores[i] /= Lengths[str(start_index + i)]

    return scores
        

In [16]:
# Score all documents against the hand-written `query` string defined in an earlier cell.
scores = compute_tf_idf_scores_(inverted_index, query, idf, doc_ids, Lengths)

In [24]:
def get_top_k_document(doc_ids, scores, k, start_index=None):
    """Return the ids of the ``k`` highest-scoring documents, best first.

    Parameters
    ----------
    doc_ids :
        Unused; kept so existing callers keep working.
    scores : array-like of float
        One score per document; position ``i`` belongs to document id
        ``start_index + i``.
    k : int
        Number of document ids to return.
    start_index : int, optional
        Document id that corresponds to position 0 of ``scores``. Defaults to
        the notebook-level ``STARTING_DOCUMENT_INDEX``.

    Returns
    -------
    list of int
        Document ids ordered by descending score. Documents with equal scores
        keep their ascending-id order.
    """
    if start_index is None:
        start_index = STARTING_DOCUMENT_INDEX
    start_index = int(start_index)

    # The default argsort algorithm is not stable, so tied scores (for example
    # the many documents scoring 0) could come back in any order. A stable
    # sort makes the ranking reproducible.
    ranked_positions = np.argsort(-np.asarray(scores), kind='stable')[:k]
    # Convert to plain ints so the ids can be written out with json/csv.
    return [start_index + int(pos) for pos in ranked_positions]

In [25]:
# Document ids of the 50 best-scoring documents for the current `scores` array.
top_k_docs = get_top_k_document(doc_ids, scores, 50)

In [74]:
def process_query(query):
    """Turn one raw topic block from the query file into a plain query string.

    Steps, in order:
      1. Replace the whole ``<num>`` line, the whole ``<desc>`` header line and
         any remaining ``<...>`` tag with a single space.
      2. Pass the text through ``clean_entry`` (project helper; its exact
         cleaning rules are not visible here).
      3. Split on single spaces and lowercase every token.
      4. Drop the tokens listed in ``stopwords.txt`` via ``remove_stopwords``.
      5. Re-join the surviving tokens with single spaces.

    Note that ``stopwords.txt`` is resolved relative to the current working
    directory and is re-read on every call.
    """
    for tag_regex in ('<num>.*\n', '<desc>.*\n', '<.*>'):
        query = re.sub(tag_regex, ' ', query)

    lowered_tokens = [token.lower() for token in clean_entry(query).split(' ')]
    kept_tokens = remove_stopwords(lowered_tokens, read_stopwords_file('stopwords.txt'))
    return ' '.join(kept_tokens)

In [53]:
# Read the whole topic/query file into one string (platform default text encoding).
with open('../data/query.ohsu.1-63') as f:
    queries = f.read()

In [54]:
# Cut the file at every closing </top> tag so each list element holds one topic.
# `queries` is rebound from a str to a list here, so this cell cannot be re-run
# without first re-running the cell that reads the file.
queries = re.split('</top>\n', queries)

In [70]:
queries[0]

'<top>\n<num> Number: OHSU1\n<title> 60 year old menopausal woman without hormone replacement therapy\n<desc> Description:\nAre there adverse effects on lipids when progesterone is given with estrogen replacement therapy\n'

In [76]:
# Clean the first topic and display it; this rebinds the `query` name that
# earlier held the hand-written example query.
query = process_query(queries[0]); query

'year old menopausal woman without hormone replacement therapy adverse effects lipids progesterone given estrogen replacement therapy'

In [77]:
# Re-score all documents, now against the processed first topic; overwrites the earlier `scores`.
scores = compute_tf_idf_scores_(inverted_index, query, idf, doc_ids, Lengths)

In [78]:
# Ids of the 50 highest-scoring documents for the first topic.
top_50 = get_top_k_document(doc_ids, scores, 50)

In [84]:
# Spot check: show the raw record for document id 237978 (presumably one of the
# ids in `top_50` -- the notebook does not display that list, so verify).
corpus[237978 - STARTING_DOCUMENT_INDEX]

'.U\n90300564\n.S\nJAMA 9010; 264(3):314, 317\n.M\nForecasting; Gene Therapy/*/TD; Human.\n.T\nHuman gene therapy in coming months? [news]\n.P\nNEWS.\n.A\nMarwick C.\n'

In [None]:
def create_log_file(top_k_docs):