<a href="https://colab.research.google.com/github/soumya-ranjan-sahoo/GANs---PyTorch/blob/master/crowdWorkersDemo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Mounting your Google Drive
We strongly recommend mounting your Google drive, so that the Capreolus cache and results directory persist after the Colab runtime terminates. (This is not required to run Capreolus, however.)

You can mount your gdrive in one of two ways:

**recommended**: Click the file browser on the left, and then click "Mount Drive" in the browser's upper righthand corner. This will ensure the drive is always mounted when you run the notebook.

Alternatively, run the following cell to mount the drive manually. This will prompt you to complete the OAuth authentication yourself.

If you don't want to use gdrive at all, don't run the next cell.



In [12]:
import os
if not os.path.exists("/content/drive"):
  from google.colab import drive
  drive.mount("/content/drive")

print("Google drive mounted?", os.path.exists("/content/drive"))

# if your Google drive is mounted, place both the pip cache and Capreolus' data directories on the gdrive
if os.path.exists("/content/drive"):
  # disabled because pip complains about directory ownership and refuses to cache wheels
  #!rm -rf ~/.cache/pip
  #!mkdir -p "/content/drive/My Drive/capreolus/pipcache" && ln -s "/content/drive/My Drive/capreolus/pipcache" ~/.cache/pip && echo "pip cache directory created on Google drive (under capreolus/)"

  capreolus_homedir = os.path.expanduser("~/.capreolus")
  if os.path.exists(capreolus_homedir):
    if not os.path.islink(capreolus_homedir):
      print(f"WARNING: {capreolus_homedir} already exists, so we did not store it to the Google drive. If this is not what you expect, remove this directory and re-run.")
  else:
    print("Please make sure that there's a 'capreolus' directory in your drive root folder - if it doesn't exist, create it")
    !ln -s "/content/drive/My Drive/capreolus/" ~/.capreolus && echo "~/.capreolus linked to Google drive (under capreolus/)"

print("Capreolus directories stored on Google drive?", os.path.islink(capreolus_homedir))

Mounted at /content/drive
Google drive mounted? True
Please make sure that there's a 'capreolus' directory in your drive root folder - if it doesn't exist, create it
~/.capreolus linked to Google drive (under capreolus/)
Capreolus directories stored on Google drive? True


In [3]:
# Removes ~/.capreolus.
# NOTE(review): if this runs after the drive-mounting cell, it deletes the
# ~/.capreolus link that cell created; re-run the mounting cell afterwards if
# the Capreolus data should live on Google Drive.
!rm -rf ~/.capreolus


In [17]:
pip install capreolus

Collecting capreolus
[?25l  Downloading https://files.pythonhosted.org/packages/f5/6b/7da30f766b16c103f8593530ad7367091e87627a68d25f0f4ef32fe3d930/capreolus-0.2.3-py3-none-any.whl (10.8MB)
[K     |████████████████████████████████| 10.8MB 2.8MB/s 
Collecting mock
  Downloading https://files.pythonhosted.org/packages/cd/74/d72daf8dff5b6566db857cfd088907bb0355f5dd2914c4b3ef065c790735/mock-4.0.2-py3-none-any.whl
Collecting pytorch-pretrained-bert==0.4
[?25l  Downloading https://files.pythonhosted.org/packages/95/68/84de54aea460eb5b2e90bf47a429aacc1ce97ff052ec40874ea38ae2331d/pytorch_pretrained_bert-0.4.0-py3-none-any.whl (45kB)
[K     |████████████████████████████████| 51kB 7.9MB/s 
[?25hCollecting annoy==1.15.2
[?25l  Downloading https://files.pythonhosted.org/packages/cc/b2/37c2e81952bc2ea7db909b5a698079a432197dc722ac68d61d218878499f/annoy-1.15.2.tar.gz (636kB)
[K     |████████████████████████████████| 645kB 56.3MB/s 
[?25hCollecting torch==1.4
[?25l  Downloading https://files.p

### Running a full reranking pipeline via python
#### WARNING: Takes half an hour to complete since this cell includes downloading the data, indexing it, running a searcher on it, and then training a neural model
##### (Figure 3 in the paper)


In [25]:
import os
import shutil

# #from capreolus import constants
# from capreolus.utils.common import download_file, hash_file
# from capreolus.utils.loginit import get_logger

from capreolus.collection import Collection
from capreolus.benchmark import Benchmark
from profane import Dependency

# logger = get_logger(__name__)
# PACKAGE_PATH = constants["PACKAGE_PATH"]

# print(PACKAGE_PATH)

In [21]:
@Collection.register
class CROWDWORKERS(Collection):
    """Custom collection registered with Capreolus under the module name "crowdWorkers".

    The collection data is expected at ``_path`` on the mounted Google Drive.

    NOTE(review): the previous docstring here was the ANTIQUE citation
    (Hashemi et al., ECIR 2020), apparently copied from the ANTIQUE module.
    Confirm whether this data is related to ANTIQUE before citing it here.
    """

    # Name under which this collection is registered; other modules refer to it by this name.
    module_name = "crowdWorkers"
    # Location of the collection on the mounted drive.
    _path = "/content/drive/My Drive/crowdWorkers"

    # presumably the collection class / document generator used when indexing — TODO confirm
    collection_type = "TrecCollection"
    generator_type = "DefaultLuceneDocumentGenerator"


In [13]:
# List /content to see which top-level directories exist (e.g. whether "drive" is present).
!ls /content/

drive  sample_data


In [15]:
# Print the first line of the topic file stored on the mounted drive.
topic_path = '/content/drive/My Drive/topic.txt'
with open(topic_path, 'r') as topic_handle:
    first_line = topic_handle.readline()
print(first_line)

 <top>



In [26]:
@Benchmark.register
class _FOOCROWDWORKER(Benchmark):
    """Custom benchmark registered with Capreolus under the module name "crowdWorkers".

    Points at qrel, topic and fold files on the mounted Google Drive and
    declares a dependency on the collection module named "crowdWorkers".

    NOTE(review): the previous docstring here was the ANTIQUE citation
    (Hashemi et al., ECIR 2020), apparently copied from the ANTIQUE module.
    Confirm whether this data is related to ANTIQUE before citing it here.
    """

    # Name under which this benchmark is registered (selected via benchmark.name=crowdWorkers).
    module_name = "crowdWorkers"
    # The benchmark is tied to the collection module named "crowdWorkers".
    dependencies = [Dependency(key="collection", module="collection", name="crowdWorkers")]
    # Input files, all under crowdWorkersBenchmark/ on the mounted drive.
    qrel_file = "/content/drive/My Drive/crowdWorkersBenchmark/qrel.txt"
    topic_file =  "/content/drive/My Drive/crowdWorkersBenchmark/topic.txt"
    fold_file =  "/content/drive/My Drive/crowdWorkersBenchmark/fold.json"
    # presumably selects which topic field is used as the query text — TODO confirm
    query_type = "title"


In [30]:
# List the contents of the installed capreolus package.
# NOTE(review): the path is hard-coded for a Python 3.6 runtime and will not
# exist under other Python versions.
!ls /usr/local/lib/python3.6/dist-packages/capreolus

benchmark   evaluator.py  __init__.py  run.py	 task	    trainer
collection  extractor	  __pycache__  sampler	 tests	    utils
data	    index	  reranker     searcher  tokenizer


In [40]:
# NOTE(review): scratch command left over from debugging. The recorded output
# shows vim is not installed in this runtime, so this cell can be removed.
!vim foo

/bin/bash: vim: command not found


In [39]:
import capreolus.evaluator
from capreolus import parse_config_string
from capreolus.task.rerank import RerankTask
from profane import constants, config_list_to_dict


# Full reranking pipeline on the custom "crowdWorkers" benchmark:
# 1. Selects the crowdWorkers benchmark (which depends on the crowdWorkers collection)
# 2. Runs BM25 on it with default params
# 3. Reranks the BM25 results using KNRM trained for 3 iterations
#    (all other trainer settings keep their defaults)
# 4. Reports the metrics

config_string = "benchmark.name=crowdWorkers rank.searcher.name=BM25 reranker.name=KNRM reranker.trainer.niters=3"
task = RerankTask(parse_config_string(config_string))
task.print_pipeline()
task.train()
task.evaluate()


AttributeError: ignored

### Indexing and searching a collection
##### (Figure 4 in the paper)

1. Index the ANTIQUE collection
2. Issue a BM25 search against the index


In [None]:
from capreolus.collection import ANTIQUE
from capreolus.index import AnseriniIndex
from capreolus.searcher import BM25

# Build an Anserini index with Porter stemming over the ANTIQUE collection.
collection = ANTIQUE()
index_config = {"stemmer": "porter"}
index = AnseriniIndex(index_config, {"collection": collection})
index.create_index()
collection.find_document_path()

# Document frequency of an unstemmed and a stemmed form of the same word.
for term in ("retrieval", "retriev"):
    print(index.get_df(term))

# BM25 searcher on top of the index, configured with b=0.75 and hits=3.
bm25_config = {"b": 0.75, "hits": 3}
searcher = BM25(bm25_config, {"index": index})

# Let us see what documents were retrieved:
print(searcher.query("information retrieval"))

# Display the text of one of the retrieved documents.
index.get_doc('746919_5')

0
155
OrderedDict([('746919_5', 8.655200004577637), ('1169146_0', 7.959799766540527), ('2011132_1', 7.939199924468994)])


'as intertainment, to retrieve information, and a writing tool.....and much more of course'

#### Reranking an existing set of results
##### (Figure 5 in the paper)

In [None]:
from capreolus.searcher import Searcher
from capreolus.benchmark import ANTIQUE as AntiqueBenchmark
# Imported here so this cell does not rely on an earlier cell having bound RerankTask.
from capreolus.task.rerank import RerankTask
import os

# Uses the `searcher` object (BM25) created in the previous cell
benchmark = AntiqueBenchmark()
output_path = "searcher"

# Subsequent runs will use the cached searcher output
searcher_output_dir = searcher.query_from_file(benchmark.topic_file, output_path)

# The run file is called "searcher" inside the searcher's output directory;
# os.path.join also works if the directory is returned as a path-like object.
run = Searcher.load_trec_run(os.path.join(searcher_output_dir, "searcher"))

# Rerank the BM25 run with DRMM (10 histogram bins), trained for a single iteration.
task = RerankTask({
    "reranker": {
        "name": "DRMM",
        "nbins": 10,
        "trainer": {
            "niters": 1
        },
    },
    "benchmark": {
        "name": "antique"
    }
})

# include_train=True also returns scores for the training split
# (the recorded output lists the keys 'dev', 'test' and 'train').
results = task.rerank_run(run, "rerank_out_path", include_train=True)
print(results.keys())
print(results["dev"])

2020-06-10 12:26:27,155 - INFO - capreolus.searcher._anserini_query_from_file - Anserini writing runs to searcher/searcher
2020-06-10 12:26:28,166 - INFO - capreolus.searcher.filter_and_log_anserini_output - [AnseriniProcess] [main] search.SearchCollection (SearchCollection.java:239) - Index: /root/.capreolus/cache/collection-antique/index-anserini_indexstops-False_stemmer-porter/index
2020-06-10 12:26:28,349 - INFO - capreolus.searcher.filter_and_log_anserini_output - [AnseriniProcess] [main] search.SearchCollection (SearchCollection.java:276) - Language: en
2020-06-10 12:26:28,362 - INFO - capreolus.searcher.filter_and_log_anserini_output - [AnseriniProcess] [main] search.SearchCollection (SearchCollection.java:277) - Stemmer: porter
2020-06-10 12:26:28,365 - INFO - capreolus.searcher.filter_and_log_anserini_output - [AnseriniProcess] [main] search.SearchCollection (SearchCollection.java:278) - Keep stopwords? false
2020-06-10 12:26:28,510 - INFO - capreolus.searcher.filter_and_log_a

dict_keys(['dev', 'test', 'train'])
{'11706': {'11706_10': -0.373046875, '2665233_12': -0.35498046875, '11706_3': -0.322998046875}, '17665': {'3839244_1': -0.3369140625, '2281177_8': -0.339111328125, '2333554_2': -0.361328125}, '57572': {'57572_0': -0.37255859375, '3774866_0': -0.374267578125, '3631491_3': -0.364013671875}, '99112': {'661986_7': -0.366455078125, '4215565_0': -0.3642578125, '99112_1': -0.3427734375}, '125626': {'125626_2': -0.3447265625, '125626_3': -0.36767578125, '1144773_0': -0.361083984375}, '126182': {'1813636_3': -0.32421875, '4254140_6': -0.331298828125, '4256577_0': -0.3564453125}, '189452': {'2197832_1': -0.3720703125, '2967656_8': -0.368408203125, '1781827_5': -0.3779296875}, '227569': {'227569_6': -0.351806640625, '227569_10': -0.342041015625, '259554_2': -0.354248046875}, '276151': {'276151_6': -0.380859375, '276151_8': -0.33056640625, '1435289_0': -0.3583984375}, '284239': {'1977360_6': -0.37890625, '239942_7': -0.365234375, '284239_17': -0.367431640625}, '

### Implementing a new Reranker module
##### (Figure 7 in the paper)


In [None]:
from profane import ConfigOption, Dependency
import torch
from capreolus.reranker import Reranker
from capreolus.reranker.common import create_emb_layer, SimilarityMatrix

@Reranker.register
class NewModel(Reranker):
    """Reranker module "newmodel": a thin wrapper that plugs NewModel_Class into Capreolus."""

    module_name = "newmodel"
    config_spec = [ConfigOption("finetune", False, "Train the embedding layer?")]
    dependencies = [
        Dependency("extractor", module="extractor", name="embedtext"),
        Dependency(key="trainer", module="trainer", name="pytorch"),
    ]

    def build_model(self):
        """Create the underlying torch module, store it on ``self.model`` and return it."""
        extractor, config = self.extractor, self.config
        self.model = NewModel_Class(extractor, config)
        return self.model

    def score(self, d):
        """Return ``[posdoc scores, negdoc scores]`` for batch ``d``, each flattened with ``view(-1)``."""
        positive = self.model(d["posdoc"], d["query"]).view(-1)
        negative = self.model(d["negdoc"], d["query"]).view(-1)
        return [positive, negative]

    def test(self, d):
        """Return the flattened scores of ``d["posdoc"]`` against ``d["query"]``."""
        scores = self.model(d["posdoc"], d["query"])
        return scores.view(-1)

class NewModel_Class(torch.nn.Module):
    """Toy scoring network: embeds query and document, builds their similarity
    matrix, and maps the similarities between every query term and the first
    document term to a single score with a linear layer.

    Args:
        extractor: object providing ``embeddings`` (embedding weights), ``pad``
            (padding index) and ``config["maxqlen"]``.
        config: reranker config; ``config["finetune"]`` says whether the
            embedding layer should be trained.
    """

    def __init__(self, extractor, config):
        super(NewModel_Class, self).__init__()
        # "finetune" means "train the embedding layer", so the layer must be
        # frozen exactly when finetune is False (the flag used to be passed
        # through un-negated, which froze the layer when finetune=True).
        self.embedding = create_emb_layer(extractor.embeddings, non_trainable=not config["finetune"])
        self.simmat = SimilarityMatrix(padding=extractor.pad)
        # forward() feeds one similarity value per query term into this layer,
        # so its input size is the query length, not the document length.
        # assumes queries are padded to extractor.config["maxqlen"] — TODO confirm
        self.linear = torch.nn.Linear(extractor.config["maxqlen"], 1)

    def forward(self, docidxs, queryidxs):
        """Score each (document, query) pair in the batch; returns a (batch, 1) tensor."""
        doc = self.embedding(docidxs)
        query = self.embedding(queryidxs)

        # has shape (batch, query_len, doc_len)
        similarity_matrix = self.simmat(query, doc, queryidxs, docidxs)

        # The input to the linear layer is the similarity score between every term in the query and the first term in the doc. Weird, but let's see what happens.
        return self.linear(similarity_matrix[:, :, 0])



In [None]:
# Now that we have defined a new reranker and registered it, let's actually use it

# Imported here so this cell does not depend on an earlier cell having bound RerankTask.
from capreolus.task.rerank import RerankTask

# Pipeline config: the "newmodel" reranker with embedding fine-tuning enabled,
# extractor caching on, 5 training iterations and validatefreq set to 5,
# run on the "antique" benchmark.
task = RerankTask({
    "reranker": {
        "name": "newmodel",
        "finetune": True,
        "extractor": {
            "usecache": True
        },
        "trainer": {
            "niters": 5,
            "validatefreq": 5
        },
    },
    "benchmark": {
        "name": "antique"
    },
})

# Train the pipeline, then evaluate it.
predictions = task.train()
task.evaluate()

NameError: ignored