## CSC 575 HW#4
### Programming: Vector-space Retrieval & Evaluation -- Partially filled code

### (1) Step 1: Load Inverted Index (H) and compute DocLen (DL).

In [1]:
import csv
import math

# Step 1 of vector-space retrieval: build the inverted index (H_invindex) from
# the term-index and inverted-index files, and compute the tf-idf vector
# length of every document (DL_doclen).
tindexfile = 'medline_term_index.csv'
invindexfile = 'medline_inverted_index.csv'
dindexfile = 'medline_doc_index.csv'

# Number of documents in the corpus (hard-coded for this corpus)
N = 1033

# Major data structures
H_invindex = {} # inverted index; term -> (idf, L: dict of docID -> raw tf)
DL_doclen = {}  # document lengths; docID -> tf-idf vector length

## (1) Read the term index file and populate the invindex first
tid2term_map = {} # temporary storage to hold mappings of termID -> term

# Each row is read as: term <TAB> termID <TAB> df.
# 'with' guarantees the file is closed even if a row fails to parse.
with open(tindexfile, 'r', encoding='utf-8', newline='') as fin:
    for line in csv.reader(fin, delimiter='\t'):
        if not line:
            continue  # tolerate blank lines (e.g. a trailing empty line)
        term = line[0]    # term string
        termID = line[1]  # termID (kept as a string; used only as a key)
        df = int(line[2]) # document frequency
        idf = math.log10(N/df) # idf
        # record term -> (idf, emptyL) in H
        H_invindex[term] = (idf, dict())
        # record termID -> term
        tid2term_map[termID] = term

## (2) Read the inverted index file and add postings lists in H.
## Document lengths are accumulated incrementally in DL along the way.
# Each row is read as: termID <TAB> docID <TAB> tf <TAB> docID <TAB> tf ...
# Same explicit encoding as the term index, so parsing does not depend on
# the platform's default encoding.
with open(invindexfile, 'r', encoding='utf-8', newline='') as fin:
    for line in csv.reader(fin, delimiter='\t'):
        if not line:
            continue  # tolerate blank lines
        termID = line[0]
        # Look up the term's idf and postings dict once per row rather than
        # once per posting.
        idf, L = H_invindex[tid2term_map[termID]]
        # Walk the (docID, tf) pairs; stopping at len(line)-1 ensures that
        # line[idx+1] always exists.
        for idx in range(1, len(line) - 1, 2):
            docID = line[idx]
            tf = int(line[idx+1]) # raw tf of the term in this document
            # Record docID -> tf in term's L
            L[docID] = tf

            # Accumulate the squared tf*idf component for the document
            tfidfsq = math.pow(tf * idf, 2.0)
            DL_doclen[docID] = DL_doclen.get(docID, 0.0) + tfidfsq

# Fix the DL entries by applying sqrt to turn the sum of squares into the
# vector length. Only values are replaced, so iterating the dict is safe.
for docID, sumsq in DL_doclen.items():
    DL_doclen[docID] = math.sqrt(sumsq)


print ('Total # terms: %d' % len(H_invindex))
for term in ['pentobarbit', 'defici', 'treatment']:
    print (' - Entry for \'%s\': df=%s, idf=%s' % (term, len(H_invindex[term][1]), H_invindex[term][0]))

print ('\nTotal # documents: %d' % len(DL_doclen))
for docID in ['59', '1033']:
    print (' - Vector len for Doc %s = %s' % (docID, DL_doclen[docID]))


Total # terms: 11463
 - Entry for 'pentobarbit': df=4, idf=2.412040330191658
 - Entry for 'defici': df=39, idf=1.4230357144931214
 - Entry for 'treatment': df=172, idf=0.7785718746120717

Total # documents: 1033
 - Vector len for Doc 59 = 13.811725366348801
 - Vector len for Doc 1033 = 31.163653356034512


### (2) Step 2: Queries as Vectors

In [2]:
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

# Step 2: turn every query in the query file into a term -> tf map, applying
# lowercasing, tokenization, stopword removal and Porter stemming.
queryfile = 'medline.query'

# A list of queries. Each Query is a tuple, (qID, Q:term->tf map)
Queries_list = []

porter = nltk.PorterStemmer()
# Load the stopword list once and keep it as a set: calling
# stopwords.words('english') inside the comprehension re-reads the list and
# scans it linearly for every single token.
english_stopwords = set(stopwords.words('english'))

# 'with' guarantees the file is closed even if processing a line fails.
with open(queryfile, 'r', encoding='utf-8') as fin:
    for line in fin:
        # Expected line shape: "<queryID> <query text>"
        matchObj = re.match(r'^(\d+)\s+(.*)', line)
        if not matchObj:
            print ("ERROR with line -- %s" % line)
            continue
        queryID = matchObj.group(1) # queryID (kept as a string)
        text = matchObj.group(2)    # query string (ignoring sentences)

        # process text string -- same processing as one applied to documents.
        tokens = word_tokenize(text.lower())
        # drop stopwords and single-character tokens, then stem # (**)
        terms = [porter.stem(w) for w in tokens
                 if w not in english_stopwords and len(w) > 1]
        # term frequencies of the terms in this query are obtained by NLTK's FreqDist
        fdist = nltk.FreqDist(terms)
        # append the Query in the list
        Queries_list.append((queryID, dict(fdist)))

print ('Total # queries: %d' % len(Queries_list))
# NOTE: these are 0-based positions in Queries_list, not query IDs; the query
# ID actually printed is whatever is stored in the tuple at that position.
for list_pos in [1, 21]:
    print (' - Query %s: %s' % (Queries_list[list_pos][0], Queries_list[list_pos][1]))

Total # queries: 30
 - Query 2: {'relationship': 1, 'blood': 1, 'cerebrospin': 1, 'fluid': 1, 'oxygen': 1, 'concentr': 1, 'partial': 1, 'pressur': 1, 'method': 1, 'interest': 1, 'polarographi': 1}
 - Query 22: {'mycoplasma': 1, 'infect': 1, 'presenc': 1, 'embryo': 1, 'fetu': 1, 'newborn': 1, 'infant': 1, 'anim': 1, 'pregnanc': 1, 'gynecolog': 1, 'diseas': 1, 'relat': 1, 'chromosom': 2, 'abnorm': 1}


In [8]:
# Sanity check: re-inspect the structures built by the earlier cells
# (H_invindex, DL_doclen and Queries_list must already be defined).
print ('Total # terms: %d' % len(H_invindex))
for term in ('pentobarbit',):
    entry = H_invindex[term]            # (idf, postings dict) for the term
    entry_idf, entry_postings = entry
    print (' - Entry for \'%s\': df=%s, idf=%s' % (term, len(entry_postings), entry_idf))
    print(entry)
    print()

print ('\nTotal # documents: %d' % len(DL_doclen))
for docID in ('59', '1033'):
    doc_vector_len = DL_doclen[docID]
    print (' - Vector len for Doc %s = %s' % (docID, doc_vector_len))

# Last expression of the cell: displayed as the cell's output.
Queries_list

Total # terms: 11463
 - Entry for 'pentobarbit': df=4, idf=2.412040330191658
(2.412040330191658, {'187': 1, '301': 1, '416': 1, '419': 1})


Total # documents: 1033
 - Vector len for Doc 59 = 13.811725366348801
 - Vector len for Doc 1033 = 31.163653356034512


[('1', {'crystallin': 1, 'len': 1, 'vertebr': 1, 'includ': 1, 'human': 1}),
 ('2',
  {'relationship': 1,
   'blood': 1,
   'cerebrospin': 1,
   'fluid': 1,
   'oxygen': 1,
   'concentr': 1,
   'partial': 1,
   'pressur': 1,
   'method': 1,
   'interest': 1,
   'polarographi': 1}),
 ('3', {'electron': 1, 'microscopi': 1, 'lung': 1, 'bronchi': 1}),
 ('4', {'tissu': 1, 'cultur': 1, 'lung': 1, 'bronchial': 1, 'neoplasm': 1}),
 ('5',
  {'cross': 1,
   'fatti': 2,
   'acid': 2,
   'placent': 1,
   'barrier': 1,
   'normal': 1,
   'level': 1,
   'placenta': 1,
   'fetu': 1}),
 ('6',
  {'ventricular': 1,
   'septal': 1,
   'defect': 1,
   'occur': 1,
   'associ': 1,
   'aortic': 1,
   'regurgit': 1}),
 ('7',
  {'radioisotop': 1,
   'heart': 2,
   'scan': 1,
   'mainli': 1,
   'use': 3,
   'diagnosi': 1,
   'pericardi': 2,
   'effus': 1,
   'also': 1,
   'studi': 1,
   'tumor': 1,
   'enlarg': 1,
   'aneurysm': 1,
   'thicken': 1,
   'technetium': 1,
   'rihsa': 1,
   'radioact': 1,
   'hippur'

## (3) Retrieval and Ranking -- Step 3,4,5 for each Query

### - For each query, follow Step 3,4,5 of the Vector Space Retrieval Algorithm and obtain a ranked list of retrieved documents (sorted by the cosine measure) -- 'R' in the algorithm.  
### - Then save the ranked lists in a list (in the same order of the query) -- to be used in the next step/Evaluation.
### - (**) Also, WRITE the ranked lists to an output file.  See the homework page for details.

## (4) Evaluation -- Compute MAP
### - Read the relevancy answers from the file "medline.rel".
### - Compare the ranked lists with the answers, and compute the MAP score.
### - (**) Also print the MAP score (to the terminal).