# Minimum Working Example

## Dependencies

In [1]:
#!pip install numpy
#!pip install pandas
#!pip install scikit-learn
#!pip install nltk
#!pip install stanza
#!pip install tweet-preprocessor
#!pip install wordsegment
#!pip install sentence-transformers
#!pip install pulp # in our case requires GLPK
#!pip install python-terrier # requires JVM

In [2]:
import os
import json
import warnings
import numpy as np
import pandas as pd
import pyterrier as pt

In [3]:
# Ignore all Python warnings from here on so that library warnings do not
# clutter the cell outputs of this example.
warnings.filterwarnings("ignore")

In [4]:
# Point the NLTK, Stanza and sentence-transformers resource locations at
# hidden folders inside the current working directory, and set
# TOKENIZERS_PARALLELISM to "false" (presumably to keep the tokenizer
# single-process while reranking -- confirm against the tokenizers docs).
# This cell runs before the `src` imports so the variables are already set
# when those modules are loaded.
resource_root = os.getcwd()
os.environ.update(
    {
        "NLTK_DATA": os.path.join(resource_root, ".nltk"),
        "STANZA_RESOURCES_DIR": os.path.join(resource_root, ".stanza"),
        "SENTENCE_TRANSFORMERS_HOME": os.path.join(resource_root, ".sbert"),
        "TOKENIZERS_PARALLELISM": "false",
    }
)

In [5]:
%%capture cap
import src.retrieval as retrieval
import src.reranking as reranking
import src.rescoring as rescoring

## Example Datasets

In [6]:
def _require_columns(frame, required, label):
    """Raise a ValueError naming every column in ``required`` that ``frame`` lacks.

    Parameters:
        frame: the loaded pandas DataFrame to check.
        required: iterable of column names the rest of the notebook relies on.
        label: human-readable name of the frame, used in the error message.

    Raises:
        ValueError: if at least one required column is absent.
    """
    missing = sorted(set(required) - set(frame.columns))
    if missing:
        raise ValueError(f"{label} is missing required column(s): {missing}")


# wildfire example
queries_file = "data/queries.json"
documents_file = "data/documents.json"

# queries as keywords or "questions" and documents extracted from the web;
# lines=True reads the files as JSON Lines (one record per line)
queries = pd.read_json(queries_file, lines=True)
documents = pd.read_json(documents_file, lines=True)

# Fail fast with a readable message instead of deep inside the pipeline:
# the retriever config stores docno/text/source_type as index metadata and
# the reranker config reads its query text from the rerank_query column.
_require_columns(queries, ["qid", "query", "rerank_query"], "queries")
_require_columns(documents, ["docno", "text", "source_type"], "documents")

### Queries

In [7]:
# Report how many query rows were loaded.
print(f"Number of queries: {len(queries)}")

Number of queries: 5


In [8]:
# Preview the first rows of the query frame (columns qid, rerank_query, query).
queries.head()

Unnamed: 0,qid,rerank_query,query
0,query-1,How many people are missing,missing
1,query-2,What is the fire containment level,containment
2,query-3,What roads are blocked / closed,tree block road closures
3,query-4,What at-risk groups are being impacted,Are there goods needing delivered
4,query-5,What events have been canceled,canceled


### Documents

In [9]:
# Report how many document rows were loaded.
print(f"Number of documents: {len(documents)}")

Number of documents: 2000


In [10]:
# Preview the first rows of the document frame (columns docno, text, source_type).
documents.head()

Unnamed: 0,docno,text,source_type
0,document-1,.@SDGE turns off power to parts of inland #San...,Twitter
1,document-2,#christmas #lilacfire #PitMad #DontBeSurpris...,Twitter
2,document-3,@realDonaldTrump Switzerland BINGER 🇨🇭 - Spec...,Twitter
3,document-4,omg this breaks my heart😰💔I hope everyone’s sa...,Twitter
4,document-5,#LilacFire UPDATE: Fire is currently at over 1...,Twitter


## Model Initialization

### Configuration

In [11]:
# configuration for the fast retrieval model
# Each metadata field kept in the index is paired with its meta_lengths entry
# (presumably the maximum stored length of that field -- confirm in src.retrieval).
meta_field_lengths = {"docno": 40, "text": 250, "source_type": 40}

retriever_config = retrieval.TerrierRetrieverConfig(
    wmodel="TF_IDF",
    index_path=".index",
    index_type=3, # just in memory
    meta=list(meta_field_lengths),
    meta_lengths=list(meta_field_lengths.values()),
)

# configuration for the 'slower' semantic reranking model
reranker_config = reranking.TerrierRerankerConfig(
    model_name_or_path="paraphrase-MiniLM-L6-v2",
    cross_encoder=False,
    max_length=256,
    batch_size=32,
    device="cpu",
    query_col="rerank_query" # query-frame column holding the question-style query text
)

### Modules

In [12]:
# Stage 1 module: wraps a Terrier retriever adapter built from retriever_config.
retriever_adapter = retrieval.TerrierRetrieverAdapter(retriever_config)
retriever = retrieval.RetrieveModule(
    model=retriever_adapter,
    top_k=100,
    threshold=None,
)

# Stage 2 module: wraps a reranker adapter built from reranker_config.
reranker_adapter = reranking.TerrierRerankerAdapter(reranker_config)
reranker = reranking.RerankModule(
    model=reranker_adapter,
    top_k=25,
    threshold=0.1,
)

In [13]:
# Build the Stage 3 selector: entity-based concepts feed an ILP optimizer,
# which is wrapped in a rescoring module.
# NOTE: the recorded output of this cell shows a Stanza resources download;
# which of the constructors below triggers it is not visible from here.

# Stanza named entity tags
# (the entity types handed to the concept extractor below)
ner_tags = [
    "CARDINAL", "DATE", "EVENT", "FAC", "GPE", 
    "LOC", "MONEY", "ORDINAL", "ORG", "PERCENT", 
    "PRODUCT", "QUANTITY", "TIME"
]

# Extractor for concepts used by ILP
concepts = rescoring.EntityConceptsCreator(
    ner_tags=ner_tags,
    use_gpu=False
)

# ILP optimizer for document selection
# NOTE(review): the unit of max_budget / min_budget (documents, sentences,
# tokens, ...) is not visible here -- confirm in src.rescoring.
ilp = rescoring.ILPModule(
    concepts=concepts,
    max_budget=20,
    min_budget=1,
    solver_path=None # GLPK solver path; None is default setting
)

# Rescoring module that applies the ILP selection only
selector = rescoring.RescoreModule(
    ilp=ilp,
    mmr=None # Do not use Maximal-Marginal Relevance for rescoring
)

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.0.json:   0%|   …

## Run the Pipeline

### Stage 1: retrieve with term frequency model

In [14]:
%%capture cap
# Run Stage 1 on the loaded queries and documents. The cell's output is
# captured into `cap`, the same name already used by the `src` import cell,
# so that earlier capture is overwritten.
retrieval_results = retriever.retrieve(queries=queries, corpus=documents)

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


### Stage 2: rerank query-document pairs with contextualized language model

In [15]:
# Feed the Stage 1 results into the reranker module.
reranking_results = reranker.rerank(initial_results=retrieval_results)

pt.apply: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:02<00:00,  2.45row/s]


### Stage 3: select maximum number of sentences with Integer Linear Programming

In [16]:
# Feed the Stage 2 results into the ILP-based selector.
selection_results = selector.rescore(initial_results=reranking_results)

## Results

In [17]:
# Display the Stage 1 results without the identifier columns; drop() is not
# in-place here, so retrieval_results itself keeps them.
retrieval_results.drop(columns=["docid", "docno", "qid"])

Unnamed: 0,text,source_type,rank,score,rerank_query,query
0,Missed my frees @ Gaslamp Quarter https://t.co...,Twitter,0,1.000000,How many people are missing,missing
1,aww i miss you!! I'll be home soon!! https://...,Twitter,1,0.717618,How many people are missing,missing
2,I missed out on the Awesome cupcakes at the @k...,Twitter,2,0.717618,How many people are missing,missing
3,Friday Night 7PM. @thegoodworth @gymstandard d...,Twitter,3,0.000000,How many people are missing,missing
4,"""0% contained"" what the fuck https://t.co/snzd...",Twitter,0,1.000000,What is the fire containment level,containment
...,...,...,...,...,...,...
182,Palomar Community College District will be clo...,News,3,0.513924,What events have been canceled,canceled
183,CAL STATE San Marcos cancels classes for the r...,Twitter,4,0.513924,What events have been canceled,canceled
184,@KrisKoivisto All happening at once is rare. G...,Twitter,5,0.451714,What events have been canceled,canceled
185,Los Alamitos Race Course announced that it is ...,News,6,0.451714,What events have been canceled,canceled


In [18]:
# Display the Stage 2 results without the identifier columns; drop() is not
# in-place here, so reranking_results itself keeps them.
reranking_results.drop(columns=["docid", "docno", "qid"])

Unnamed: 0,text,source_type,score,rerank_query,query,rank
1,aww i miss you!! I'll be home soon!! https://...,Twitter,1.000000,How many people are missing,missing,0
0,Missed my frees @ Gaslamp Quarter https://t.co...,Twitter,0.463001,How many people are missing,missing,1
3,Friday Night 7PM. @thegoodworth @gymstandard d...,Twitter,0.354585,How many people are missing,missing,2
48,The 30% containment figure means firefighters ...,News,1.000000,What is the fire containment level,containment,0
43,Within 30 min fire grew from 150 acres to 500 ...,Twitter,0.884257,What is the fire containment level,containment,1
...,...,...,...,...,...,...
184,@KrisKoivisto All happening at once is rare. G...,Twitter,0.946656,What events have been canceled,canceled,2
185,Los Alamitos Race Course announced that it is ...,News,0.845807,What events have been canceled,canceled,3
186,Los Alamitos Race Course canceled Fridays eigh...,News,0.619060,What events have been canceled,canceled,4
179,my teacher still wont cancel class https://t....,Twitter,0.565609,What events have been canceled,canceled,5


In [19]:
# Display the selected documents ordered by their final rank (rank 0 first).
selection_results.sort_values("rank")

Unnamed: 0,docno,qids,queries,rerank_queries,text,scores,score,source_type,rank
1,document-1036,[query-3],[tree block road closures],[What roads are blocked / closed],"Accident, center lane blocked in #SanDiego on ...",[0.9867403172291037],1.0,Twitter,0
60,document-557,[query-3],[tree block road closures],[What roads are blocked / closed],Crash blocking the two left lanes in #SanDiego...,[0.8909598228593092],0.83682,Twitter,1
65,document-627,[query-3],[tree block road closures],[What roads are blocked / closed],A 3 car crash is blocking the left lane. in #N...,[0.8648129837780589],0.792274,Twitter,2
68,document-692,[query-3],[tree block road closures],[What roads are blocked / closed],"Accident, right lane blocked in #RanchoPenasqu...",[0.8296995408414153],0.732451,Twitter,3
34,document-1714,[query-3],[tree block road closures],[What roads are blocked / closed],Crash blocking the two right lanes in #Mission...,[0.7807792285249736],0.649106,Twitter,4
62,document-59,[query-2],[containment],[What is the fire containment level],Thomas fire in Ventura and Santa Barbara count...,[0.7254430631822885],0.554831,News,5
16,document-1321,[query-3],[tree block road closures],[What roads are blocked / closed],"Closures The fire, first reported at 11:20 a....",[0.7028954750370993],0.516417,News,6
18,document-1352,[query-3],[tree block road closures],[What roads are blocked / closed],"Accident, center lane blocked in #Lakeside on ...",[0.6993499661133604],0.510376,Twitter,7
58,document-486,[query-2],[containment],[What is the fire containment level],Cal Fire reports 2 structures destroyed 12 ot...,[0.6989716859897075],0.509732,Twitter,8
9,document-1180,[query-3],[tree block road closures],[What roads are blocked / closed],Region south of Camp Pendleton Eastern Fence ...,[0.6845908459211903],0.485231,News,9
