# Task 2

In [1]:
import numpy as np
import pandas as pd

import pickle
from tqdm import tqdm

In [2]:
from task1 import *

[nltk_data] Downloading package punkt to /Users/ryanl/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/ryanl/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/ryanl/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/ryanl/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Use candidate-passages-top1000.tsv for this task. Using the vocabulary of terms identified in Task 1 (you will need to choose between removing or keeping stop words), build an inverted index for the collection so that you can retrieve passages in an efficient way.

In [3]:
def get_vocab():
    
    # obtaint the vocabulary
    passage_collection = load_collection('passage-collection.txt')
    cleaned_passages = process_passage(passage_collection, remove_sw=True, lemma=True)
    freq_dict = get_freq_dict(cleaned_passages, order=True)
    vocab = [item[0] for item in freq_dict.items()]
    
    return vocab


def passage_to_id():
    
    # tokenize the passages and map to pid
    candidate_passages = pd.read_csv('candidate-passages-top1000.tsv', sep='\t', header=None, names=['qid','pid','query','passage'])
    cleaned_candidate_passages = process_passage(candidate_passages['passage'], remove_sw=True, lemma=True)
    pid_passage_dict = dict(zip(candidate_passages['pid'], cleaned_candidate_passages))
    
    return candidate_passages, pid_passage_dict


def get_inverted_index(vocab, pid_passage_dict):

    # initialization
    inverted_index = {}
    
    # store passage id, frequency, position information
    for pid, passages in tqdm(pid_passage_dict.items()):
        for term in passages:   
            if term not in vocab:   # skip the term is not in vocabulary
                continue
            freq = passages.count(term)
            idx = passages.index(term)
            if term not in inverted_index:
                inverted_index[term] = {pid:[freq,idx]}
            else:
                inverted_index[term].update({pid:[freq,idx]})
    
    return inverted_index

In [6]:
vocab = get_vocab()

In [7]:
candidate_passages, pid_passage_dict = passage_to_id()

In [10]:
inverted_index = get_inverted_index(vocab, pid_passage_dict)

100%|██████████████████████████████████| 182469/182469 [03:36<00:00, 841.65it/s]


In [11]:
with open('inverted_index.pkl', 'wb') as f:
    pickle.dump(inverted_index, f)

In [None]:
with open('inverted_index.pkl', 'rb') as f:
    inverted_index = pickle.load(f)

Testing out the inverted index:

In [15]:
term = 'transformer'

info = inverted_index[term]
freq = [v[0] for k,v in info.items()]
idx = [v[1] for k,v in info.items()]

print('pid: ', info.keys())
print('occurance: ', freq)
print('position: ', idx)

pid:  dict_keys([8627699, 6424923, 6283491, 225520, 2392159, 7724585, 3992780, 4009275, 4180925, 7476253, 4009271, 3244160, 3594066, 4325280, 2767473, 4712567, 7724579, 7909654, 3074046, 184373, 6040723, 1502649, 5981112, 5981115, 1982893, 4147882, 4325277])
occurance:  [1, 1, 2, 2, 1, 2, 1, 1, 2, 1, 3, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2]
position:  [7, 5, 0, 8, 18, 11, 2, 13, 4, 6, 9, 0, 1, 17, 10, 13, 12, 13, 7, 1, 14, 11, 25, 5, 22, 5, 13]
