## Term Index Generator

In [53]:
import re
import string
import pickle
import numpy
from collections import defaultdict
import time
import os
from nltk import ngrams
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import import_ipynb
import clir_files as files

## Global Variables

In [54]:
# Module-level configuration shared by the helpers in this file.
# Names assigned at module scope are already global, so no `global`
# declarations are needed here.

# When True, the pickle round-trip check after the lexicon I/O helpers runs.
TEST_PICKLE_READ_WRITE = False
# When True, the inline `if RUN_TESTS:` assertion blocks are executed.
RUN_TESTS = True
# Running counter used to hand out term ids; see increment_term_count().
TERM_COUNT = 0

# Binary layout of every integer stored in the postings file.
SIGNED = False
BYTE_ORDER = "big"
NUM_BYTES = 4

# Single ASCII punctuation characters.
PUNCTUATION = set(string.punctuation)

# English stop-word list shipped with NLTK.
EN_STOP_WORDS = set(stopwords.words("english"))

def increment_term_count():
    """Advance the module-level TERM_COUNT counter by one."""
    global TERM_COUNT
    TERM_COUNT += 1

## Models

In [55]:
class Corpus:
    """
    Container for corpus-wide indexing state.
    Attributes:
        docs_processed (int): Number of documents processed.
        lexicon (dict): Dictionary object representing the lexicon.
        collection_freq (int): Counter that starts at 0 (enrich_lexicon adds 1 to it per word it sees).
    """
    def __init__(self):
        # Both counters start at zero; the lexicon starts empty.
        self.docs_processed = self.collection_freq = 0
        self.lexicon = dict()

In [56]:
class Posting:
    """
    One (document, frequency) entry of a term's postings list.
    Attributes:
        doc_id (int): Document's id.
        term_freq (int): Number of times the term occurs in the document.
    """
    def __init__(self, doc_id, term_freq):
        self.doc_id, self.term_freq = doc_id, term_freq

    def __repr__(self):
        # Rendered as "(doc_id,term_freq)" with no space after the comma.
        return "({},{})".format(self.doc_id, self.term_freq)

    def __str__(self):
        return repr(self)

In [57]:
class Term:
    """
    Lexicon entry for one term.
    Attributes:
        offset (int): The beginning offset for the postings list in a binary file.
        id: The term id given to the constructor.
        doc_freq (int): Number of documents containing this term.
        idf: Inverse document frequency; starts at -1 until a value is assigned.
    """
    def __init__(self, term_id):
        self.offset = 0
        self.id = term_id
        self.doc_freq = 0
        self.idf = -1

    def __repr__(self):
        return str(self)

    def __str__(self):
        summary = f"doc_freq: {self.doc_freq}; offset: {self.offset}; id: {self.id}; idf: {self.idf}"
        return summary

## File IO Operations

In [58]:
def read_file(file_name):
    """
    Return the full text of a file.
    Parameters:
        file_name (str): Path of the file to read (opened in text mode with the default encoding).
    Returns:
        str: Everything the file contains.
    """
    with open(file_name) as handle:
        return handle.read()

In [59]:
def write_lexicon_to_file(lexicon, filename):
    """
    Pickle an object to a file.
    Parameters:
        lexicon: Object to serialise (callers in this file pass the lexicon/corpus).
        filename (str): Path of the file to create or overwrite.
    Returns:
        N/A
    """
    # The context manager guarantees the handle is flushed and closed,
    # even if pickling raises.
    with open(filename, "wb") as out_file:
        pickle.dump(lexicon, out_file)
    
def read_lexicon_from_file(filename):
    """
    Load a pickled object from a file.
    Parameters:
        filename (str): Path of the pickle file to read.
    Returns:
        The object stored in the file.
    """
    # NOTE: unpickling can execute arbitrary code; only load files that this
    # program (or another trusted source) produced.
    with open(filename, "rb") as in_file:
        return pickle.load(in_file)

if TEST_PICKLE_READ_WRITE:
    # Round-trip a Corpus through the pickle helpers and confirm that an
    # attribute value survives the write/read cycle.
    test_obj = Corpus()
    test_obj.docs_processed = 7
    write_lexicon_to_file(test_obj, "read_write_test.pk")
    read_test = read_lexicon_from_file("read_write_test.pk")
    assert read_test.docs_processed == 7

In [60]:
def binary_file_write(lexicon, postings, bin_file_name):
    """
    Write every postings list to one binary file and record where each list starts.
    For each term in lexicon order, the (doc_id, term_freq) pairs of its postings
    are written back to back as fixed-width integers, and the term's `offset`
    is set to the byte position at which its first pair begins.
    Parameters:
        lexicon (dict{str: Term}): A dict representing the lexicon; each Term's offset is updated in place.
        postings (dict{term id: list[Posting]}): Postings lists keyed by Term.id.
        bin_file_name (str): Filename for the binary file containing interleaved doc_ids and term_freqs.
    Returns:
        N/A
    """
    # `with` closes the file even if a value cannot be encoded.
    with open(bin_file_name, "wb") as bin_file:
        for term in lexicon.values():
            start = bin_file.tell()
            for posting in postings[term.id]:
                bin_file.write(int(posting.doc_id).to_bytes(NUM_BYTES, byteorder=BYTE_ORDER, signed=SIGNED))
                bin_file.write(int(posting.term_freq).to_bytes(NUM_BYTES, byteorder=BYTE_ORDER, signed=SIGNED))
            term.offset = start

In [61]:
def rank_top_100_scores(scores, filename, limit=100, run_tag="skasim3"):
    """
    Append the best-scoring documents of every query to a ranked-list file.
    One line is written per document in the form
    "<qid> Q0 <doc_id> <rank> <score> <run_tag>", with ranks starting at 1 and
    scores printed with six decimals.
    Parameters:
        scores (dict): Maps query id to a dict of {doc_id: score}.
        filename (str): File to append to (created if missing).
        limit (int): Maximum number of documents written per query.
        run_tag (str): Identifier written in the last column of each line.
    Returns:
        N/A
    """
    with open(filename, "a") as out_file:
        for qid, query_scores in scores.items():
            # Slicing never raises, so lists shorter than `limit` need no
            # special handling.
            ranked = sorted(query_scores.items(), key=lambda item: item[1], reverse=True)
            for rank, (doc_id, doc_score) in enumerate(ranked[:limit], 1):
                out_file.write(f"{qid} Q0 {doc_id} {rank} {doc_score:.6f} {run_tag}\n")

## Tokenization Operations

In [62]:
def separate_punctuation(doc):
    """
    Surround punctuation characters with spaces so they become separate tokens.
    Every character found in PUNCTUATION, except the period, is replaced by
    " <char> "; all other characters are kept as they are. Leading and trailing
    whitespace of the result is removed.
    Parameters:
        doc (str): A string representing the document.
    Returns:
        str: The document with punctuation set apart by spaces.
    """
    # Build the pieces in a list and join once; repeated string concatenation
    # would copy the growing result for every character.
    pieces = [
        f" {ch} " if ch in PUNCTUATION and ch != "." else ch
        for ch in doc
    ]
    return "".join(pieces).strip()

if RUN_TESTS:
    # Inline checks for separate_punctuation: apostrophes, hyphens, slashes,
    # parentheses and percent signs are set apart by spaces, while periods
    # (including the decimal points in "19.3" and "11.7") are left in place.
    assert separate_punctuation("That's my mother-in-law") == "That ' s my mother - in - law"
    assert separate_punctuation("""NAC has developed a National HIV/AIDS/STI/TB Intervention Strategic Plan (2002-2005) that aims to reduce the HIV prevalence rate among Zambians from 19.3% to 11.7% and improve the health status of people living with HIV/AIDS by 2005.""") == """NAC has developed a National HIV / AIDS / STI / TB Intervention Strategic Plan  ( 2002 - 2005 )  that aims to reduce the HIV prevalence rate among Zambians from 19.3 %  to 11.7 %  and improve the health status of people living with HIV / AIDS by 2005."""

In [63]:
def strip_doc(doc):
    """
    Normalise raw document text before tokenisation.
    Steps, in order: trim leading/trailing newlines and lower-case the text;
    replace runs of two or more periods (e.g., mercedes...awesome) with a space;
    set punctuation apart with separate_punctuation; turn newlines into spaces;
    collapse runs of spaces into one; trim surrounding whitespace.
    Parameters:
        doc (str): A string representing the document.
    Returns:
        str: The normalised, lower-cased, single-spaced document.
    """
    lowered = doc.strip("\n").lower()
    # Raw string: "\." in a normal string literal is an invalid escape sequence.
    without_ellipses = re.sub(r"\.\.+", " ", lowered)
    separated = separate_punctuation(without_ellipses)
    # Newlines become spaces *before* collapsing, so a newline that sits next
    # to a space cannot leave a run of several spaces behind.
    single_line = separated.replace("\n", " ")
    single_spaced = re.sub(r" +", " ", single_line)
    return single_spaced.strip()

if RUN_TESTS:
    # Inline check for strip_doc: text is lower-cased, "..." becomes a single
    # space and each punctuation mark ends up as its own space-separated token.
    assert strip_doc("To Be? Or* #not To +% be T&HAT I^s...the 99$ question!") == "to be ? or * # not to + % be t & hat i ^ s the 99 $ question !"

In [64]:
def split_into_tokens(doc):
    """
    Tokenise a document and drop punctuation and English stop words.
    The text is tokenised with NLTK's word_tokenize; tokens that are a single
    punctuation character (member of PUNCTUATION) or an English stop word
    (member of EN_STOP_WORDS) are discarded. Numeric tokens such as "99" are kept.
    Parameters:
        doc (str): A string.
    Returns:
        terms (list[str]): The remaining tokens, in their original order.
    """
    kept = []
    for token in word_tokenize(doc):
        if token in PUNCTUATION or token in EN_STOP_WORDS:
            continue
        kept.append(token)
    return kept
if RUN_TESTS:
    # Inline checks for split_into_tokens: stop words are removed, while
    # numeric tokens such as "99" are kept.
    assert split_into_tokens("to be or not to be that is the question") == ['question']
    assert split_into_tokens("99 bottles on the wall") == ['99', 'bottles', 'wall']

In [13]:
def capture_and_remove_doc_id(doc, is_doc=True):
    """
    Pull the id out of a document/query and delete its id tag from the text.
    The id is the first run of digits found anywhere in the text. The tag that
    gets removed is "< p id = N >" for documents and "< q id = N >" for queries
    (the spaced form the text has in the inline checks below).
    Parameters:
        doc (str): A string.
        is_doc (bool): True to remove a document tag, False to remove a query tag.
    Returns:
        doc_id (str): The first run of digits in the text.
        modified_doc (str): Text with the tag removed, runs of spaces collapsed and ends trimmed.
    Raises:
        IndexError: If the text contains no digits at all.
    """
    tag_pattern = r"< p id = \d+ >" if is_doc else r"< q id = \d+ >"
    numbers = re.findall(r"\d+", doc)
    without_tag = re.sub(tag_pattern, "", doc).strip("\n")
    collapsed = re.sub(" +", " ", without_tag)
    return numbers[0], collapsed.strip()

if RUN_TESTS:
    # Inline checks for capture_and_remove_doc_id. The id assertions compare
    # the whole id string, so a multi-digit id such as "27" is verified in full
    # rather than by its first character only.
    assert capture_and_remove_doc_id("99 bottles on the wall")[1] == "99 bottles on the wall"
    assert capture_and_remove_doc_id("< p id = 2 > 99 bottles on the wall")[0] == "2"
    assert capture_and_remove_doc_id("< p id = 27 > 99 bottles on the wall")[0] == "27"
    assert capture_and_remove_doc_id("< p id = 27 > 99 bottles on the wall")[1] == "99 bottles on the wall"
    assert capture_and_remove_doc_id("< q id = 1 > 99 bottles on the wall", is_doc=False)[1] == "99 bottles on the wall"

## Lexicon Enrichment Operations

In [65]:
def enrich_lexicon(corpus, postings, words, doc_id):
    """
    Fold the words of one document into the corpus lexicon and the postings.
    For every word (corpus.collection_freq grows by one each time):
        * Unseen word: a new Term is created with the next term id and doc_freq 1,
          and its postings list starts with Posting(doc_id, 1).
        * Known word whose newest posting belongs to doc_id: that posting's
          term_freq is incremented by 1.
        * Known word whose newest posting belongs to another document: a new
          Posting(doc_id, 1) is appended and the term's doc_freq is incremented by 1.
    Parameters:
        corpus (Corpus): An object representing the corpus; updated in place.
        postings (dict{term id: list[Posting]}): Postings lists keyed by Term.id; updated in place.
        words (list[str]): A list of words.
        doc_id: The document id of the document from which words was retrieved.
    Returns:
        corpus (Corpus): The same corpus object that was passed in.
    """
    lexicon = corpus.lexicon
    for word in words:
        corpus.collection_freq += 1

        if word not in lexicon:
            # First sighting: allocate a fresh term id, then register the term.
            increment_term_count()
            new_term = Term(TERM_COUNT)
            new_term.doc_freq = 1
            postings[new_term.id] = [Posting(doc_id, 1)]
            lexicon[word] = new_term
            continue

        entry = lexicon[word]
        term_postings = postings[entry.id]
        newest = term_postings[-1]
        if newest.doc_id == doc_id:
            # Same document as the newest posting: just bump its frequency.
            newest.term_freq += 1
        else:
            entry.doc_freq += 1
            term_postings.append(Posting(doc_id, 1))

    return corpus

if RUN_TESTS:
    # Inline checks for enrich_lexicon using a throw-away corpus.
    test_corpus = Corpus()
    test_postings = {}
    # First document (id 77): "you" occurs twice, "are" once.
    test_words = ["you", "are", "you"]
    test_enriched = enrich_lexicon(test_corpus, test_postings, test_words, 77)
    assert test_enriched.lexicon["you"].doc_freq == 1
    t1 = test_enriched.lexicon["you"].id
    assert str(test_postings[t1][0]) == "(77,2)"
    assert test_enriched.lexicon["are"].doc_freq == 1
    t2 = test_enriched.lexicon["are"].id
    assert str(test_postings[t2][0]) == "(77,1)"
    # Second document (id 32): "you" gains a second posting, "thirsty" is new.
    test_new_words = ["you", "thirsty"]
    test_enriched_update = enrich_lexicon(test_corpus, test_postings, test_new_words, 32)
    assert test_enriched_update.lexicon["you"].doc_freq == 2
    assert str(test_postings[t1][0]) == "(77,2)"
    assert str(test_postings[t1][1]) == "(32,1)"
    assert test_enriched_update.lexicon["thirsty"].doc_freq == 1
    # enrich_lexicon returns the corpus it was given, so test_enriched also
    # reflects the second call.
    t3 = test_enriched.lexicon["thirsty"].id
    assert str(test_postings[t3][0]) == "(32,1)"

In [66]:
def sort_postings(postings):
    """
    Sort every postings list in place by ascending doc_id.
    Parameters:
        postings (dict{term id: list[Posting]}): Postings lists keyed by term id.
    Returns:
        postings: The same dict, with each list sorted.
    """
    # NOTE(review): doc_ids are compared as they are stored; if they are
    # strings the order is lexicographic ("10" before "2") - confirm intent.
    for posting_list in postings.values():
        posting_list.sort(key=lambda entry: entry.doc_id)
    return postings

In [67]:
def sort_terms(lexicon):
    """
    Build a copy of the lexicon whose keys are in ascending sorted order.
    Parameters:
        lexicon (dict{str: Term}): A dictionary representing the lexicon.
    Returns:
        dict: A new dict with the same entries, inserted in sorted key order.
    """
    return {word: lexicon[word] for word in sorted(lexicon)}

In [68]:
def get_postings_list_for_term(word, lexicon, binary_file):
    """
    Read the postings list of one term from the binary postings file.
    The term's `offset` gives the byte position of its first posting and its
    `doc_freq` gives the number of (doc_id, term_freq) pairs to read.
    Parameters:
        word (str): The term being retrieved.
        lexicon (dict{str: Term}): A dict representing the lexicon.
        binary_file (str): Filename of the binary file holding interleaved doc_ids and term_freqs.
    Returns:
        postings_list (list[Posting]): The term's postings, or an empty list
        when the word is not in the lexicon.
    """
    # `with` closes the handle on every exit path; a close() placed after the
    # return statements would never run.
    with open(binary_file, "rb") as postings_file:
        try:
            term = lexicon[word]
        except KeyError:
            return []

        postings_file.seek(int(term.offset))
        postings_list = []
        for _ in range(term.doc_freq):
            doc_id = int.from_bytes(postings_file.read(NUM_BYTES), byteorder=BYTE_ORDER, signed=SIGNED)
            term_freq = int.from_bytes(postings_file.read(NUM_BYTES), byteorder=BYTE_ORDER, signed=SIGNED)
            postings_list.append(Posting(doc_id, term_freq))
        return postings_list

## Query Enrichment Operations

In [69]:
def process_query_file(filename):
    """
    Read a query file and tokenise each query the same way documents are tokenised.
    The file content is split on "</Q>"; every fragment that contains a digit
    is normalised with strip_doc, has its id tag removed and is tokenised with
    split_into_tokens.
    Parameters:
        filename (str): Filename for the query file.
    Returns:
        queries (dict{str: list[str]}): Maps the query id (a string of digits)
        to the list of terms of that query.
    """
    queries = {}
    for raw_query in re.split("</Q>", read_file(filename)):
        # A fragment without any digits cannot carry a query id; skip it.
        if not re.search(r"\d", raw_query):
            continue
        try:
            clean_query = strip_doc(raw_query)
            query_id, modified_query = capture_and_remove_doc_id(clean_query, is_doc=False)
            queries[query_id] = split_into_tokens(modified_query)
        except IndexError:
            # Best effort: a fragment that still cannot be parsed is ignored.
            continue
    return queries

if RUN_TESTS:
    # Inline check for process_query_file; it needs data/animal.topics.txt to
    # exist relative to the working directory.
    assert str(process_query_file("data/animal.topics.txt")) == "{'1': ['bird', 'cat', 'dog']}"

## Cosine Similarity Calculations

In [70]:
if RUN_TESTS:
    # For testing
    # Fixture shared by the cosine-similarity checks: a two-term lexicon
    # ("red" in 3 documents, "blue" in 2) inside a corpus of 5 documents.
    # NOTE(review): nothing visible in this file writes TEST_binary.bin, and the
    # Posting objects below are never written anywhere - presumably the file was
    # created beforehand with blue's postings at byte 0 and red's at byte 16.
    test_binary_file = "TEST_binary.bin"
    tc_lexicon = {}
    red1 = Posting(3, 7)
    red2 = Posting(5, 1)
    red3 = Posting(7, 9)
    redterm = Term(1)
    redterm.offset = 16
    redterm.doc_freq = 3
    tc_lexicon["red"] = redterm
    blue1 = Posting(3, 5)
    blue2 = Posting(7, 2)
    blueterm = Term(2)
    blueterm.offset = 0
    blueterm.doc_freq = 2
    tc_lexicon["blue"] = blueterm
    tc_corpus = Corpus()
    tc_corpus.lexicon = tc_lexicon
    tc_corpus.docs_processed = 5

In [71]:
def calc_idf_for_term(corpus, term):
    """
    Compute the inverse document frequency of a term: IDF(t) = log_2(N / df(t)).
    Parameters:
        corpus (Corpus): The Corpus object; N is corpus.docs_processed.
        term (str): A term that is present in corpus.lexicon.
    Returns:
        float: The IDF of the term.
    """
    total_docs = corpus.docs_processed
    docs_with_term = corpus.lexicon[term].doc_freq
    ratio = total_docs / docs_with_term
    return numpy.log2(ratio)

if RUN_TESTS:
    # With the fixture corpus (N = 5): red has df = 3 -> log2(5/3),
    # blue has df = 2 -> log2(5/2).
    assert round(calc_idf_for_term(tc_corpus, "red"), 6) == .736966
    assert round(calc_idf_for_term(tc_corpus, "blue"), 6) == 1.321928

In [72]:
def populate_idf_values_in_corpus(corpus):
    """
    Store the IDF of every lexicon term in that term's `idf` attribute.
    Parameters:
        corpus (Corpus): A corpus object; its lexicon entries are updated in place.
    Returns:
        corpus (Corpus): The same corpus, with the IDF values filled in.
    """
    for word, entry in corpus.lexicon.items():
        entry.idf = calc_idf_for_term(corpus, word)
    return corpus

In [73]:
def calc_doc_vector_lengths(corpus, binary_file):
    """
    Compute the TF-IDF vector length of every document in the index.
    For each lexicon term the postings list is read from the binary file and
    (term_freq * idf)^2 is added to the running total of each document it
    occurs in; the length is the square root of that total.
    Parameters:
        corpus (Corpus): A Corpus object.
        binary_file (str): Filename of the binary postings file.
    Returns:
        dict{int: float}: Maps doc_id to its vector length, ordered by doc_id.
    """
    squared_sums = defaultdict(float)
    for term in corpus.lexicon:
        # The IDF depends only on the term, so compute it once per term
        # instead of once per posting.
        idf = calc_idf_for_term(corpus, term)
        for posting in get_postings_list_for_term(term, corpus.lexicon, binary_file):
            weight = int(posting.term_freq) * idf
            squared_sums[int(posting.doc_id)] += weight * weight
    return {doc_id: numpy.sqrt(total) for doc_id, total in sorted(squared_sums.items())}


if RUN_TESTS:
    # Vector lengths for documents 3, 5 and 7, computed from the postings read
    # out of the test binary file with the fixture corpus.
    tc_doclens = calc_doc_vector_lengths(tc_corpus, test_binary_file)
    assert round(tc_doclens[3], 4) == 8.3845
    assert round(tc_doclens[5], 4) == .7370
    assert round(tc_doclens[7], 4) == 7.1402

In [74]:
def calc_query_tfs(corpus, query_terms):
    """
    Count how often each query term occurs, ignoring terms that are not in the lexicon.
    Parameters:
        corpus (Corpus): A Corpus object.
        query_terms (list[str]): The terms of one query.
    Returns:
        term_tfs (dict[str, int]): Maps each known term to its frequency in the
        query, in order of first appearance.
    """
    known_terms = corpus.lexicon
    term_tfs = {}
    for term in query_terms:
        if term not in known_terms:
            continue
        term_tfs[term] = term_tfs.get(term, 0) + 1
    return term_tfs

if RUN_TESTS:
    # "want" is not in the fixture lexicon, so only blue and red are counted.
    qtc = "want blue blue blue red"
    qtc = qtc.split()
    qtc_tfs = calc_query_tfs(tc_corpus, qtc)
    assert str(qtc_tfs) == "{'blue': 3, 'red': 1}"

In [75]:
def calc_query_vector_length(corpus, query_tfs):
    """
    Compute the TF-IDF vector length of a query.
    Each term contributes (query term frequency * IDF)^2; the result is the
    square root of the sum of those contributions.
    Parameters:
        corpus (Corpus): A Corpus object.
        query_tfs (dict[str, int]): Maps each query term to its term frequency.
    Returns:
        float: The query vector length.
    """
    weights = (tf_q * calc_idf_for_term(corpus, term) for term, tf_q in query_tfs.items())
    return numpy.sqrt(sum(weight * weight for weight in weights))

if RUN_TESTS:
    # Query {'blue': 3, 'red': 1} against the fixture corpus.
    assert round(calc_query_vector_length(tc_corpus, qtc_tfs), 4) == 4.0337

In [76]:
def calculate_query_tfs_veclens_by_qid(corpus, queries_dict):
    """
    Compute, for every query, its term frequencies and its vector length.
    Parameters:
        corpus (Corpus): A Corpus object.
        queries_dict (dict): Maps query id to the list of terms of that query.
    Returns:
        dict: Maps query id to a two-element list: the query's term-frequency
        dict at index 0 and the query's vector length at index 1.
    """
    q_tfs_veclens = {}
    for qid in queries_dict:
        term_freqs = calc_query_tfs(corpus, queries_dict[qid])
        # {query id: [query term frequencies, query vector length]}
        q_tfs_veclens[qid] = [term_freqs, calc_query_vector_length(corpus, term_freqs)]
    return q_tfs_veclens

In [77]:
def score_documents_for_cos_sim(corpus, query_tfs_veclens_by_qid, doclens, binary_file):
    """
    Score documents against every query with TF-IDF cosine similarity.
    For each query term, every document in the term's postings list receives
    (query tf * idf) * (document tf * idf); the accumulated dot product is then
    divided by (document vector length * query vector length).
    Parameters:
        corpus (Corpus): A Corpus object whose lexicon terms carry `idf` values.
        query_tfs_veclens_by_qid (dict): Maps query id to
            [term-frequency dict, query vector length].
        doclens (dict): Maps doc_id to the document's vector length.
        binary_file (str): Filename of the binary postings file.
    Returns:
        dict: Maps query id to a defaultdict of {doc_id: cosine score}.
    """
    scores_by_qid = {}
    for qid, tfs_and_veclen in query_tfs_veclens_by_qid.items():
        query_tfs, query_veclen = tfs_and_veclen[0], tfs_and_veclen[1]
        score = defaultdict(float)
        for term, query_tf in query_tfs.items():
            term_entry = corpus.lexicon.get(term)
            if term_entry is None:
                # Terms unknown to the lexicon have no postings to score.
                continue
            idf = term_entry.idf
            for posting in get_postings_list_for_term(term, corpus.lexicon, binary_file):
                score[posting.doc_id] += query_tf * idf * posting.term_freq * idf
        for doc_id in score:
            norm = doclens[float(doc_id)] * query_veclen
            # account for div by 0
            score[doc_id] /= norm if norm != 0 else .00000000000000001
        scores_by_qid[qid] = score
    return scores_by_qid

if RUN_TESTS:
    # for correctness of cosine scores, check assertions below
    # End-to-end check on the fixture corpus: fill in IDF values, recompute the
    # document lengths, build the query statistics and finally score the query.
    tc_corpus = populate_idf_values_in_corpus(tc_corpus)
    assert round(tc_corpus.lexicon["red"].idf, 6) == .736966
    assert round(tc_corpus.lexicon["blue"].idf, 6) == 1.321928
    tc_doclens = calc_doc_vector_lengths(tc_corpus, test_binary_file)
    assert round(tc_doclens[3], 4) == 8.3845
    assert round(tc_doclens[5], 4) == .7370
    assert round(tc_doclens[7], 4) == 7.1402
    qtc2 = "want blue blue blue red"
    qtc2 = qtc2.split()
    # A single query with id "1".
    qtc_dict2 = {"1":qtc2}
    tc_query_veclens_by_qid = calculate_query_tfs_veclens_by_qid(tc_corpus, qtc_dict2)
    assert str(tc_query_veclens_by_qid["1"][0]) == "{'blue': 3, 'red': 1}"
    assert round(tc_query_veclens_by_qid["1"][1], 4) == 4.0337
    tc_scores = score_documents_for_cos_sim(tc_corpus, tc_query_veclens_by_qid, tc_doclens, test_binary_file)
    # Cosine scores of documents 3, 5 and 7 for query "1".
    assert round(tc_scores["1"][3], 4) == .8875
    assert round(tc_scores["1"][5], 4) == .1827
    assert round(tc_scores["1"][7], 4) == .5338

## Create Lexicon and Postings

In [27]:
def create_inverted_index(filename, binary_file_name, lexicon_file_name):
    """
    Build the inverted index for a collection file and persist it.
    The file is split on "</P>" into documents. Every fragment containing a
    digit is normalised, has its id tag removed, is tokenised and is folded into
    the lexicon and postings. Afterwards the postings and the lexicon are
    sorted, the postings are written to one binary file, IDF values are filled
    in and the corpus object is pickled.
    Parameters:
        filename: Filename representing the file consumed.
        binary_file_name: Name of the binary file to which interleaved doc_ids and term_freqs are written.
        lexicon_file_name: Name of the .pk file to which the corpus object is written.
    Returns:
        corpus (Corpus): Object representing the corpus generated from the file consumed.
    """
    corpus = Corpus()
    postings = {}
    processed = 0

    for raw_doc in re.split("</P>", read_file(filename)):
        # Fragments without digits carry no document id and are skipped.
        if not re.findall(r"\d+", raw_doc):
            continue
        try:
            doc_id, body = capture_and_remove_doc_id(strip_doc(raw_doc))
            enrich_lexicon(corpus, postings, split_into_tokens(body), doc_id)
            processed += 1
        except IndexError:
            pass

    sort_postings(postings)
    corpus.lexicon = sort_terms(corpus.lexicon)
    corpus.docs_processed = processed

    # write postings dict to .bin file (this also sets each term's offset)
    binary_file_write(corpus.lexicon, postings, binary_file_name)
    print(f"Total docs processed: {corpus.docs_processed}")

    populate_idf_values_in_corpus(corpus)
    # write corpus to .pk file
    write_lexicon_to_file(corpus, lexicon_file_name)
    return corpus


## Create Inverted Index - Write Binary File

In [30]:
# Output locations for the index, the pickled corpus and the two ranked lists.
inverted_index = "term/term_inverted_index.bin"
lexicon_file = "term/term_corpus.pk"
ranked_a = "term/term_rankedlist_a.txt"
ranked_b = "term/term_rankedlist_b.txt"

# Small "animal" collection; both topic variables point at the same topics file.
text_file = "data/animal.txt"
cord19_topics_keyword = "data/animal.topics.txt"
cord19_topics_questions = "data/animal.topics.txt"

# text_file = "data/cord19.txt"
# cord19_topics_keyword = "data/cord19.topics.keyword.txt"
# cord19_topics_questions = "data/cord19.topics.question.txt"

start = time.time()
# Write .bin and .pk files
corpus = create_inverted_index(text_file, inverted_index, lexicon_file)
end = time.time()

# Report the elapsed wall-clock time as whole minutes plus remaining seconds.
elapsed = end - start
mins = elapsed // 60
secs = elapsed - (60*mins)
print(f"Run time to create inverted index: {round(mins)}m {round(secs, 2)}s")



Total docs processed: 8
Run time to create inverted index: 0m 0.01s


## Score Documents

In [78]:
def get_scores_by_qid(corpus, binary_file, query_file):
    """
    Score every query of a query file against the index.
    Also prints the TF-IDF weight of each term of one query (query id "1" when
    present, otherwise the first query in the file) and the time spent scoring.
    Parameters:
        corpus (Corpus): A Corpus object with IDF values populated.
        binary_file (str): Filename of the binary postings file.
        query_file (str): Filename for the query file.
    Returns:
        dict: Maps query id to {doc_id: cosine score}.
    """
    # Use the postings file passed by the caller rather than a module-level
    # path, so document lengths and scores come from the same index.
    doclens = calc_doc_vector_lengths(corpus, binary_file)
    queries = process_query_file(query_file)

    # Fall back to the first available query instead of failing when the file
    # has no query with id "1".
    first_terms = queries["1"] if "1" in queries else next(iter(queries.values()), [])
    print("TF/IDF weights for each query term in first query")
    for term, qtf in calc_query_tfs(corpus, first_terms).items():
        print(f"  {term}: {qtf * calc_idf_for_term(corpus, term)}")

    query_tfs_veclens_by_qid = calculate_query_tfs_veclens_by_qid(corpus, queries)

    start = time.time()
    scores_by_qid = score_documents_for_cos_sim(corpus, query_tfs_veclens_by_qid, doclens, binary_file)
    elapsed = time.time() - start

    mins = elapsed // 60
    secs = elapsed - (60*mins)
    print(f"All queries scored in: {round(mins)}m {round(secs, 2)}s\t")
    return scores_by_qid

## Create Ranked List

In [32]:
# Score the keyword topics and append the ranked results to ranked_a.
print("Processing keyword topic queries...")
scores_by_qid = get_scores_by_qid(corpus, inverted_index, cord19_topics_keyword)
rank_top_100_scores(scores_by_qid, ranked_a)
print("Keyword topic queries processing complete.")

Processing keyword topic queries...
TF/IDF weights for each query term in first query
  bird: 0.0
  cat: 3.0
  dog: 1.0
All queries scored in: 0m 0.0s	
Keyword topic queries processing complete.


In [33]:
# Score the question topics and append the ranked results to ranked_b.
print("\nProcessing question topic queries...")
scores_by_qid = get_scores_by_qid(corpus, inverted_index, cord19_topics_questions)
rank_top_100_scores(scores_by_qid, ranked_b)
print("Question topic queries processing complete.")


Processing question topic queries...
TF/IDF weights for each query term in first query
  bird: 0.0
  cat: 3.0
  dog: 1.0
All queries scored in: 0m 0.0s	
Question topic queries processing complete.


In [34]:
# Reload the pickled corpus and print summary statistics about the index.
corpus = read_lexicon_from_file(lexicon_file)
lexicon = corpus.lexicon
print(f"\nNumber of docs indexed: {corpus.docs_processed}")
collection_size = corpus.collection_freq
vocab_size = len(lexicon)
print(f"Collection size: {collection_size}")
print(f"Vocab size: {vocab_size}")
# On-disk sizes in bytes, reported below in MB (1 MB = 1024 * 1024 bytes).
bin_file_size = os.path.getsize(inverted_index)
lex_size = os.path.getsize(lexicon_file)
print(f"inverted_index.bin size: {round(bin_file_size/(1024*1024.0), 2)} MB")
print(f"lexicon.pk size: {round(lex_size/(1024*1024.0), 2)} MB")


Number of docs indexed: 8
Collection size: 50
Vocab size: 7
inverted_index.bin size: 0.0 MB
lexicon.pk size: 0.0 MB


In [35]:
# print(read_lexicon_from_file(lexicon_file).lexicon)

{'aardvark': doc_freq: 4; offset: 0; id: 6; idf: 1.0, 'bid': doc_freq: 1; offset: 32; id: 10; idf: 3.0, 'bird': doc_freq: 8; offset: 40; id: 4; idf: 0.0, 'cat': doc_freq: 1; offset: 104; id: 7; idf: 3.0, 'dog': doc_freq: 4; offset: 112; id: 5; idf: 1.0, 'egret': doc_freq: 2; offset: 144; id: 8; idf: 2.0, 'fish': doc_freq: 2; offset: 160; id: 9; idf: 2.0}


# CLIR Query Processing

In [88]:
# Load the previously built index artefacts for cross-language query processing.
inverted_index = "term/term_inverted_index.bin"
lexicon_file = "term/term_corpus.pk"
corpus = read_lexicon_from_file(lexicon_file)
lexicon = corpus.lexicon

In [90]:
# out keywords
# Output paths of the ranked lists, one per query variant.
# out keywords
ranked_en_base_keywords = "term/ranked/term_rankedlist_en_base_keywords.txt"
ranked_hi2en_gt_keywords = "term/ranked/term_rankedlist_hi2en_gt_keywords.txt"
ranked_hi2en_emb_keywords = "term/ranked/term_rankedlist_hi2en_emb_keywords.txt"
ranked_hi2en_emb_synset_keywords = "term/ranked/term_rankedlist_hi2en_emb_synset_keywords.txt"
ranked_hi2en_emb_no_oov_keywords = "term/ranked/term_rankedlist_hi2en_emb_no_oov_keywords.txt"
ranked_hi2en_gt_emb_synset_translit_keywords = "term/ranked/term_rankedlist_hi2en_gt_emb_synset_translit_keywords.txt"

# out questions
ranked_en_base_questions = "term/ranked/term_rankedlist_en_base_questions.txt"
ranked_hi2en_gt_questions = "term/ranked/term_rankedlist_hi2en_gt_questions.txt"
ranked_hi2en_emb_questions = "term/ranked/term_rankedlist_hi2en_emb_questions.txt"
ranked_hi2en_emb_synset_questions = "term/ranked/term_rankedlist_hi2en_emb_synset_questions.txt"
ranked_hi2en_emb_no_oov_questions = "term/ranked/term_rankedlist_hi2en_emb_no_oov_questions.txt"
ranked_hi2en_gt_emb_synset_translit_questions = "term/ranked/term_rankedlist_hi2en_gt_emb_synset_translit_questions.txt"

In [91]:
# Pairs of [query file, ranked-list output file] for the keyword topics.
# The query file paths come from the clir_files module (imported as `files`).
keyword_files = [
    [files.cord19_topics_keyword, ranked_en_base_keywords],
    [files.cord19_topics_keyword_HIN2ENG_GT, ranked_hi2en_gt_keywords],
    [files.cord19_topics_keyword_HIN2ENG_EMB, ranked_hi2en_emb_keywords],
    [files.cord19_topics_keyword_HIN2ENG_EMB_SYNSET, ranked_hi2en_emb_synset_keywords],
    [files.cord19_topics_keyword_HIN2ENG_EMB_NO_OOV, ranked_hi2en_emb_no_oov_keywords],
    [files.cord19_topics_keyword_HIN2ENG_GT_EMB_SYNSET_TRANSLIT, ranked_hi2en_gt_emb_synset_translit_keywords]
]

In [92]:
# Pairs of [query file, ranked-list output file] for the question topics.
# The query file paths come from the clir_files module (imported as `files`).
question_files = [
    [files.cord19_topics_questions, ranked_en_base_questions],
    [files.cord19_topics_questions_HIN2ENG_GT, ranked_hi2en_gt_questions],
    [files.cord19_topics_questions_HIN2ENG_EMB, ranked_hi2en_emb_questions],
    [files.cord19_topics_questions_HIN2ENG_EMB_SYNSET, ranked_hi2en_emb_synset_questions],
    [files.cord19_topics_questions_HIN2ENG_EMB_NO_OOV, ranked_hi2en_emb_no_oov_questions],
    [files.cord19_topics_questions_HIN2ENG_GT_EMB_SYNSET_TRANSLIT, ranked_hi2en_gt_emb_synset_translit_questions]
]

In [93]:
# Create hi2en ranked list keywords for terms

In [94]:
# Score every keyword query file and append its ranked list to the paired
# output file, reporting the wall-clock time of each run.
for kwfile in keyword_files:
    print(f"Processing keyword topic queries {kwfile}...")
    start = time.time()
    scores_by_qid = get_scores_by_qid(corpus, inverted_index, kwfile[0])
    rank_top_100_scores(scores_by_qid, kwfile[1])
    end = time.time()
    elapsed = end - start
    mins = elapsed // 60
    secs = elapsed - (60*mins)
    # Whole minutes and seconds rounded to two decimals, matching the other
    # timing messages in this file.
    print(f"Keyword topic queries processing complete in {round(mins)}m {round(secs, 2)}s.\n")
    

Processing keyword topic queries ['data/cord19.topics.keyword.txt', 'term/ranked/term_rankedlist_en_base_keywords.txt']...
TF/IDF weights for each query term in first query
  coronavirus: 2.0013212358202472
  origin: 5.4422305313054355
All queries scored in: 0m 21.88s	
Keyword topic queries processing complete in 2.0m 17.514423847198486s.

Processing keyword topic queries ['data/hi/parsed/cord19.topics.keyword.hi2eng.gt.txt', 'term/ranked/term_rankedlist_hi2en_gt_keywords.txt']...
TF/IDF weights for each query term in first query
  koronavirus: 17.544534348518084
  origin: 5.4422305313054355
All queries scored in: 0m 11.95s	
Keyword topic queries processing complete in 1.0m 44.49653697013855s.

Processing keyword topic queries ['data/hi/parsed/cord19.topics.keyword.hi2eng.emb.txt', 'term/ranked/term_rankedlist_hi2en_emb_keywords.txt']...
TF/IDF weights for each query term in first query
  coxsackievirus: 9.349777494095834
  polerovirus: 14.959571847796926
  erbovirus: 14.95957184779692

In [95]:
# Create hi2en ranked list questions for terms

In [96]:
# Score every question query file and append its ranked list to the paired
# output file, reporting the wall-clock time of each run.
for qfile in question_files:
    print(f"Processing question topic queries {qfile}...")
    start = time.time()
    scores_by_qid = get_scores_by_qid(corpus, inverted_index, qfile[0])
    rank_top_100_scores(scores_by_qid, qfile[1])
    end = time.time()
    elapsed = end - start
    mins = elapsed // 60
    secs = elapsed - (60*mins)
    # Whole minutes and seconds rounded to two decimals, matching the other
    # timing messages in this file.
    print(f"Question topic queries processing complete in {round(mins)}m {round(secs, 2)}s.\n")

Processing question topic queries ['data/cord19.topics.question.txt', 'term/ranked/term_rankedlist_en_base_questions.txt']...
TF/IDF weights for each query term in first query
  origin: 5.4422305313054355
  covid: 1.418535749448225
  19: 1.2539034654992969
All queries scored in: 0m 43.78s	
Question topic queries processing complete in 2.0m 26.91602110862732s.

Processing question topic queries ['data/hi/parsed/cord19.topics.question.hi2eng.gt.txt', 'term/ranked/term_rankedlist_hi2en_gt_questions.txt']...
TF/IDF weights for each query term in first query
  origin: 5.4422305313054355
All queries scored in: 0m 31.98s	
Question topic queries processing complete in 2.0m 14.100980997085571s.

Processing question topic queries ['data/hi/parsed/cord19.topics.question.hi2eng.emb.txt', 'term/ranked/term_rankedlist_hi2en_emb_questions.txt']...
TF/IDF weights for each query term in first query
  origin: 5.4422305313054355
  origins: 7.672629110858896
  originated: 6.90338575110685
  wondering: 13.