# Install Maven
# Clone Anserini repository
# Change directory to Anserini
# Clean and package Anserini


In [None]:
!apt-get install maven
!git clone https://github.com/castorini/anserini.git
%cd anserini
!mvn clean package appassembler:assemble


# Create a directory named "collections/msmarco-passage" to store the MS MARCO Passage collection


# Download the MS MARCO Passage collection from the official website

# Extract the downloaded archive into the "collections/msmarco-passage" directory


In [None]:
!mkdir collections/msmarco-passage
!wget https://msmarco.blob.core.windows.net/msmarcoranking/collectionandqueries.tar.gz -P collections/msmarco-passage
!tar xvfz collections/msmarco-passage/collectionandqueries.tar.gz -C collections/msmarco-passage


# Convert MS MARCO Passage collection from TSV to JSONL

In [None]:



!python /content/anserini/anserini-tools/scripts/msmarco/convert_collection_to_jsonl.py \
  --collection-path /content/anserini/collections/msmarco-passage/collection.tsv \
  --output-folder /content/anserini/collections/msmarco-passage/collection_jsonl



Converting collection...
Converted 0 docs, writing into file 1
Converted 100,000 docs, writing into file 1
Converted 200,000 docs, writing into file 1
Converted 300,000 docs, writing into file 1
Converted 400,000 docs, writing into file 1
Converted 500,000 docs, writing into file 1
Converted 600,000 docs, writing into file 1
Converted 700,000 docs, writing into file 1
Converted 800,000 docs, writing into file 1
Converted 900,000 docs, writing into file 1
Converted 1,000,000 docs, writing into file 2
Converted 1,100,000 docs, writing into file 2
Converted 1,200,000 docs, writing into file 2
Converted 1,300,000 docs, writing into file 2
Converted 1,400,000 docs, writing into file 2
Converted 1,500,000 docs, writing into file 2
Converted 1,600,000 docs, writing into file 2
Converted 1,700,000 docs, writing into file 2
Converted 1,800,000 docs, writing into file 2
Converted 1,900,000 docs, writing into file 2
Converted 2,000,000 docs, writing into file 3
Converted 2,100,000 docs, writing i

# Filter the MS MARCO Passage development set queries to a small subset

In [None]:
!python /content/anserini/anserini-tools/scripts/msmarco/filter_queries.py \
  --qrels collections/msmarco-passage/qrels.dev.small.tsv \
  --queries collections/msmarco-passage/queries.dev.tsv \
  --output collections/msmarco-passage/queries.dev.small.tsv


Done!


# Index the MS MARCO Passage collection into a Lucene index using the Anserini tool

In [None]:
!target/appassembler/bin/IndexCollection \
  -collection JsonCollection \
  -input /content/anserini/collections/msmarco-passage/collection_jsonl \
  -index indexes/msmarco-passage/lucene-index-msmarco \
  -generator DefaultLuceneDocumentGenerator \
  -threads 2 -storePositions -storeDocvectors -storeRaw


2024-01-31 10:33:42,115 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:204) - Setting log level to INFO
2024-01-31 10:33:42,118 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:208) - AbstractIndexer settings:
2024-01-31 10:33:42,119 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:209) -  + DocumentCollection path: /content/anserini/collections/msmarco-passage/collection_jsonl
2024-01-31 10:33:42,121 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:210) -  + CollectionClass: JsonCollection
2024-01-31 10:33:42,122 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:211) -  + Index path: indexes/msmarco-passage/lucene-index-msmarco
2024-01-31 10:33:42,123 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:212) -  + Threads: 2
2024-01-31 10:33:42,124 INFO  [main] index.AbstractIndexer (AbstractIndexer.java:213) -  + Optimize (merge segments)? false
2024-01-31 10:33:42,178 INFO  [main] index.IndexCollection (IndexCollection.java:246) - Using De

# Search the MS MARCO Passage index with Anserini's SearchCollection tool, using the BM25 retrieval model

In [None]:
!target/appassembler/bin/SearchCollection \
  -index indexes/msmarco-passage/lucene-index-msmarco \
  -topics collections/msmarco-passage/queries.dev.small.tsv \
  -topicReader TsvInt \
  -output runs/run.msmarco-passage.dev.small.tsv -format msmarco \
  -parallelism 2 \
  -bm25 -bm25.k1 0.82 -bm25.b 0.68 -hits 1000


2024-01-31 10:43:54,787 INFO  [main] search.SearchCollection (SearchCollection.java:991) - Index: indexes/msmarco-passage/lucene-index-msmarco
2024-01-31 10:43:55,151 INFO  [main] search.SearchCollection (SearchCollection.java:994) - Fields: []
2024-01-31 10:43:55,155 INFO  [main] search.SearchCollection (SearchCollection.java:1304) - Using DefaultEnglishAnalyzer
2024-01-31 10:43:55,156 INFO  [main] search.SearchCollection (SearchCollection.java:1305) - Stemmer: porter
2024-01-31 10:43:55,159 INFO  [main] search.SearchCollection (SearchCollection.java:1306) - Keep stopwords? false
2024-01-31 10:43:55,161 INFO  [main] search.SearchCollection (SearchCollection.java:1307) - Stopwords file: null
2024-01-31 10:43:55,318 INFO  [main] search.SearchCollection (SearchCollection.java:1318) - runtag: Anserini
2024-01-31 10:44:11,216 INFO  [pool-3-thread-3] search.SearchCollection$SearcherThread (SearchCollection.java:893) - ranker: bm25(k1=0.82,b=0.68), reranker: default: 100 queries processed
20

In [None]:
!head runs/run.msmarco-passage.dev.small.tsv


1048585	7187158	1
1048585	7187157	2
1048585	7187163	3
1048585	7546327	4
1048585	7187160	5
1048585	8227279	6
1048585	7617404	7
1048585	7187156	8
1048585	2298838	9
1048585	7187155	10


# Evaluate the BM25 retrieval model on the MS MARCO Passage development subset

In [None]:
!python /content/anserini/anserini-tools/scripts/msmarco/msmarco_passage_eval.py \
 /content/anserini/collections/msmarco-passage/queries.dev.small.tsv runs/run.msmarco-passage.dev.small.tsv

#####################
MRR @10: 0.0
QueriesRanked: 6980
#####################
