# SETUP

In [1]:
!pip install --upgrade git+https://github.com/terrier-org/pyterrier.git#egg=python-terrier
!pip install --upgrade git+https://github.com/andreaschari/pyterrier_t5.git

Collecting python-terrier
  Cloning https://github.com/terrier-org/pyterrier.git to /tmp/pip-install-p605u0l4/python-terrier_6ea372db3fd34875a4e47ca777009312
  Running command git clone -q https://github.com/terrier-org/pyterrier.git /tmp/pip-install-p605u0l4/python-terrier_6ea372db3fd34875a4e47ca777009312
  Resolved https://github.com/terrier-org/pyterrier.git to commit 943764f32fc04ae0f164f3ec1a399cd28ea8e94b
Collecting git+https://github.com/andreaschari/pyterrier_t5.git
  Cloning https://github.com/andreaschari/pyterrier_t5.git to /tmp/pip-req-build-2j4xeagd
  Running command git clone -q https://github.com/andreaschari/pyterrier_t5.git /tmp/pip-req-build-2j4xeagd
  Resolved https://github.com/andreaschari/pyterrier_t5.git to commit 3c3773e5fc0028f6e8fb3e4a444f616575910893






In [2]:
import os
import multiprocessing
import nltk
from nltk.stem import SnowballStemmer
from nltk import word_tokenize
from nltk.corpus import stopwords
import pandas as pd

In [3]:
import ir_datasets

In [4]:
import pyterrier as pt
pt.init(boot_packages=['com.github.terrierteam:terrier-prf:-SNAPSHOT'])

PyTerrier 0.8.1 has loaded Terrier 5.6 (built by craigmacdonald on 2021-09-17 13:27)

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [5]:
from pyterrier_t5 import mT5ReRanker, MonoT5ReRanker

In [6]:
from pyterrier_xlang.preprocess import fa, spacy_preprocessor

In [7]:
from pyterrier.measures import *

# Farsi

## Load Pre-processor

In [8]:
fa_pre = fa(remove_stops=False)

## Load Datasets

In [89]:
dataset = pt.get_dataset('irds:hc4/fa/dev')

## Index Dataset with terrier

In [90]:
# No Stemming
if not os.path.exists('./hc4-fa-train-nostem'):
    indexer = pt.IterDictIndexer('./hc4-fa-train-nostem', meta={"docno": 36})
    indexer.setProperty("tokeniser", "UTFTokeniser") # Replaces the default EnglishTokeniser, which makes assumptions specific to English
    indexer.setProperty("termpipelines", "") # Removes the default PorterStemmer (English)
    index_nostem = indexer.index(dataset.get_corpus_iter())
else:
    index_nostem = pt.IndexRef.of('./hc4-fa-train-nostem/data.properties')

## BM25 Retrieval

In [47]:
bm25_nostem = pt.BatchRetrieve(index_nostem, wmodel='BM25')

In [48]:
rm3 = pt.rewrite.RM3(index_nostem, fb_terms=10, fb_docs=3)

In [49]:
pipe_qe = bm25_nostem >> rm3 >> bm25_nostem

In [50]:
bm25_pipeline = fa_pre >> pipe_qe

In [51]:
bm25_out = bm25_pipeline(dataset.get_topics('ht_title', tokenise_query=False))

  df.drop(df.columns.difference(['qid','query']), 1, inplace=True)


In [52]:
pt.io.write_results(bm25_out, "bm25_rm3_retr_hc4_fa_dev_ht.gz")

## Run Retrieval Experiments

In [53]:
pt.Experiment([bm25_pipeline], dataset.get_topics('ht_title', tokenise_query=False), dataset.get_qrels(), [nDCG@100, AP@100, R@1000, Judged@10], names=['Farsi pipeline'])

  df.drop(df.columns.difference(['qid','query']), 1, inplace=True)


Unnamed: 0,name,nDCG@100,AP@100,R@1000,Judged@10
0,Farsi pipeline,0.231863,0.163881,0.68845,0.36


# Russian

## Load Pre-processor

In [26]:
ru_pre = spacy_preprocessor('ru_core_news_sm')

## Load Dataset

In [27]:
dataset = pt.get_dataset('irds:hc4/ru/dev')

## Index Dataset with Terrier

In [28]:
# No Stemming
if not os.path.exists('./hc4-ru-dev-nostem'):
    indexer = pt.IterDictIndexer('./hc4-ru-dev-nostem', meta={"docno": 36})
    indexer.setProperty("tokeniser", "UTFTokeniser") # Replaces the default EnglishTokeniser, which makes assumptions specific to English
    indexer.setProperty("termpipelines", "") # Removes the default PorterStemmer (English)
    index_nostem = indexer.index(dataset.get_corpus_iter())
else:
    index_nostem = pt.IndexRef.of('./hc4-ru-dev-nostem/data.properties')

## BM25 Retrieval

In [29]:
bm25_nostem = pt.BatchRetrieve(index_nostem, wmodel='BM25')

In [30]:
rm3 = pt.rewrite.RM3(index_nostem, fb_terms=10, fb_docs=3)

In [31]:
pipe_qe = bm25_nostem >> rm3 >> bm25_nostem

In [32]:
bm25_pipeline = ru_pre >> pipe_qe

In [33]:
bm25_out = bm25_pipeline(dataset.get_topics('ht_title', tokenise_query=False))

  df.drop(df.columns.difference(['qid','query']), 1, inplace=True)


In [62]:
pt.io.write_results(bm25_out, "bm25_rm3_retr_hc4_ru_dev_ht.gz")

## Run Retrieval Experiments

In [35]:
pt.Experiment([bm25_pipeline], dataset.get_topics('mt_title', tokenise_query=False), dataset.get_qrels(), [nDCG@100, AP@100, R@1000, Judged@10], names=['Russian pipeline'])

  df.drop(df.columns.difference(['qid','query']), 1, inplace=True)


Unnamed: 0,name,nDCG@100,AP@100,R@1000,Judged@10
0,Russian pipeline,0.227183,0.089263,0.554683,0.5


# Chinese

## Load Pre-processor

In [16]:
zh_pre = spacy_preprocessor('zh_core_web_sm', supports_stem=False)

In [28]:
from spacy.lang.zh import Chinese
nlp = Chinese()
cfg = {"segmenter": "jieba"}
nlp = Chinese.from_config({"nlp": {"tokenizer": cfg}})
zh_pre = spacy_preprocessor(nlp, supports_stem=False)

## Load Dataset

In [17]:
dataset = pt.get_dataset('irds:hc4/zh/dev')

## Index Dataset with Terrier

In [18]:
# No Stemming
if not os.path.exists('./hc4-zh-dev-nostem'):
    indexer = pt.IterDictIndexer('./hc4-zh-dev-nostem', meta={"docno": 36})
    indexer.setProperty("tokeniser", "UTFTokeniser") # Replaces the default EnglishTokeniser, which makes assumptions specific to English
    indexer.setProperty("termpipelines", "") # Removes the default PorterStemmer (English)
    index_nostem = indexer.index(dataset.get_corpus_iter())
else:
    index_nostem = pt.IndexRef.of('./hc4-zh-dev-nostem/data.properties')

## BM25 Retrieval

In [19]:
bm25_nostem = pt.BatchRetrieve(index_nostem, wmodel='BM25')

In [20]:
rm3 = pt.rewrite.RM3(index_nostem, fb_terms=10, fb_docs=3)

In [21]:
pipe_qe = bm25_nostem >> rm3 >> bm25_nostem

In [22]:
bm25_pipeline = zh_pre >> pipe_qe

In [72]:
bm25_out = bm25_pipeline(dataset.get_topics('ht_title', tokenise_query=False))

  df.drop(df.columns.difference(['qid','query']), 1, inplace=True)


In [73]:
pt.io.write_results(bm25_out, "bm25_rm3_retr_hc4_zh_dev_ht.gz")

## Run Retrieval Experiments

In [23]:
pt.Experiment([bm25_pipeline], dataset.get_topics('mt_title', tokenise_query=False), dataset.get_qrels(), [nDCG@100, AP@100, R@1000, Judged@10], names=['Chinese pipeline'])

  df.drop(df.columns.difference(['qid','query']), 1, inplace=True)


Unnamed: 0,name,nDCG@100,AP@100,R@1000,Judged@10
0,Chinese pipeline,0.099316,0.082921,0.194643,0.1


# mT5 Retrieval

In [None]:
dataset = pt.get_dataset('irds:hc4/fa/dev')

In [18]:
bm25_out = pt.io.read_results('bm25_rm3_retr_hc4_fa_dev.gz')

In [19]:
bm25_out_with_queries = pd.merge(bm25_out, dataset.get_topics(), on='qid')

There are multiple query fields available: ('title', 'description', 'ht_title', 'ht_description', 'mt_title', 'mt_description', 'narrative_by_relevance', 'report', 'report_url', 'report_date', 'translation_lang'). To use with pyterrier, provide variant or modify dataframe to add query column.


In [None]:
monoT5 = mT5ReRanker()

In [None]:
mt5_pipeline = pt.text.get_text(dataset, "text") >> monoT5

# mT5 Retrieval

In [None]:
mt5_out = mt5_pipeline(bm25_out_with_queries)

In [None]:
pt.io.write_results(mt5_out, "mt5-base-mmarco-v2_retr_hc4_fa_dev.gz")