In [1]:
import requests, urllib
import json, random, os

import numpy as np
import regex as re
import pandas as pd

from pathlib import Path
from difflib import SequenceMatcher
from textblob import TextBlob
from collections import Counter

from utils import *

In [2]:
# Read one query per line. The context manager closes the file handle again, and
# blank lines are dropped so that no empty query is ever sent to the IR system.
with open(Path("data/queries.txt")) as query_file:
    queries = [line.strip() for line in query_file if line.strip()]

In this notebook we run our queries through the IR system under a variety of settings, with the aim of comparing the annotated results.

### Methods for calling IR system functionality

In [4]:
def run_queries(results_as_json, existing_csvs, prediction_type="combined_prediction", top_k=10):
    """
    Run all queries against the IR system's API (port 8000), with or without query expansion (QE).

    The raw API responses are cached in a .json file; when that file already exists the API is
    not called again. The predictions, which the API groups per document, are then untangled
    into a ranked list of passages, and the top_k passages per query are written to a .csv file
    that sits next to the .json file and can be annotated manually.

    The queries themselves are read from the notebook-level ``queries`` list.

    :param results_as_json: pathlib.Path of the .json cache file (read if it exists, else written).
    :param existing_csvs: Previously annotated .csv files. Currently unused, because re-using
        existing annotations is disabled below.
    :param prediction_type: "expanded_prediction" calls the /qe/ endpoint, any other value calls
        /q/. It is also the key under which predictions are looked up in every API response.
    :param top_k: Number of top-ranked passages per query to store for annotation.
    :return results_as_csv: Path to the specific csv file with the top_k results.
    """

    results_as_csv = results_as_json.parent.joinpath(results_as_json.stem + ".csv")

    if os.path.exists(results_as_json):
        # Cached responses exist: reuse them instead of querying the API again
        with open(results_as_json, 'r') as f:
            all_query_results = json.load(f)
    else:
        # Actually run the queries, and save the responses so they can be reused next time
        endpoint = "qe" if prediction_type == "expanded_prediction" else "q"
        all_query_results = {}
        for query in queries:
            # Non-ASCII characters are dropped before the query is sent
            query = query.encode('ascii', errors='ignore').decode()
            if query in all_query_results:
                continue

            print(f"Query: {query}")
            try:
                encoded_q = urllib.parse.quote(query)
                response = requests.post(f"http://localhost:8000/{endpoint}/{encoded_q}")
                all_query_results[query] = response.json()
            except Exception as error:
                # Best effort: report the failing query and carry on with the next one.
                # `Exception` (rather than a bare except) lets Ctrl-C / kernel interrupts through.
                # NOTE: a failed query is absent from the cached .json and is not retried on re-runs.
                print("Somehow issue with query: ", query)
                print(f"    {type(error).__name__}: {error}")

        with open(results_as_json, 'w') as f:
            json.dump(all_query_results, f)

    # Reformat query output (not grouped by document, but simply ranked list of passages)
    untangled, expanded_queries = untangle_combined_predictions(all_query_results, prediction_type)

    # Convert to .csv
    # top_k is the number of passages I want to annotate
    column_indices, rows = grab_information_for_manual_judging(untangled, expanded_queries, top_k=top_k)
    df = pd.DataFrame(rows, columns=column_indices)

    # grab existing annotations
    print("Grabbing existing annotations")
#     df = find_existing_annotations(df, existing_csvs)
    df.to_csv(results_as_csv, index=False)

    # quickly check how many rows pre-annotated now (any truthy 'relevance' value counts)
    nr_annotated = sum(1 for relevance in df['relevance'] if relevance)
    print(f"Nr. annotations found: {nr_annotated}")
    return results_as_csv

In [5]:
def reset_settings():
    """
    Build a fresh copy of the default IR system settings; edit the values below to change the defaults.

    :return: Nested settings dict with the sections 'retrieval', 'indexing' and 'query_expansion',
        plus the two flags 'recreate_sparse_index' and 'recreate_dense_index'.
    """
    retrieval_section = {
        'ner_url': "http://spar:8501/",
        'classifier_url': "http://classifier:8502/",
        "top_k": 15,
    }

    field_weights = {
        'content': 2,
        'doc_title': 1,
        'NER_labels': 0,
        'filtered_NER_labels': 0,
        'filtered_NER_labels_domains': 0,
        'neighbours': 0,
        'bm25_weight': 0,
    }
    indexing_section = {
        'indexing_type': 'sparse',
        'index_name': 'with_DE',
        'sparse_settings': {'type': 'bm25'},
        'fields_and_weights': field_weights,
    }

    query_expansion_section = {
        'haystack_url': "http://haystack:8500/",
        'prf_weight': 0,
        'kg_weight': 0,
        'nn_weight': 0,
    }

    # Every call creates new dict objects, so callers can modify the result freely
    return {
        'retrieval': retrieval_section,
        'indexing': indexing_section,
        'query_expansion': query_expansion_section,
        "recreate_sparse_index": False,
        "recreate_dense_index": False,
    }

### Methods for processing results and reusing annotations

In [6]:
def untangle_combined_predictions(data, pred_type="combined_prediction"):
    """
    Flatten the per-document predictions of the IR system into one ranked passage list per query.

    We combine the predictions of the IR system per document, which is useful for presenting results
    in a UI. But for annotation this combination obscures which results are ranked high; thus we
    untangle them for annotations.

    Entries retrieved only via the document title (``retrieved_by == ['doc_title']``) are not kept as
    passages; their score is instead added to the passages of the same document that follow them.
    The input ``data`` is left untouched: boosted passages are shallow copies, so calling this
    function repeatedly on the same data yields the same scores.

    :param data: Dict mapping each query to its API response. ``data[q][pred_type]`` is iterated as
        (document_score, document) pairs, where ``document['contents']`` is a list of passage dicts
        with at least the keys 'retrieved_by' and 'score'.
    :param pred_type: Key of the predictions to untangle. For "expanded_prediction" the value of
        ``data[q]["expanded_query"]`` is collected as well.
    :return: Tuple (untangled_data, expanded_queries). untangled_data maps each query to its passages
        sorted by descending score; expanded_queries holds one expanded query per query, in the same
        order, and is empty unless pred_type is "expanded_prediction".
    """
    untangled_data = {}
    expanded_queries = []
    for q, query_result in data.items():
        passages = []
        for _doc_score, document in query_result[pred_type]:
            doc_title_retrieved = 0
            for retrieved_passage in document['contents']:
                if retrieved_passage['retrieved_by'] == ['doc_title']:
                    doc_title_retrieved = retrieved_passage['score']
                    continue
                # NOTE(review): only passages listed after the title entry receive the title
                # score; presumably the API lists the title entry first - confirm.
                boosted_passage = dict(retrieved_passage)  # copy, so the caller's data is not modified
                boosted_passage['score'] = retrieved_passage['score'] + doc_title_retrieved
                passages.append(boosted_passage)

        untangled_data[q] = sorted(passages, key=lambda x: x['score'], reverse=True)
        if pred_type == "expanded_prediction":
            expanded_queries.append(query_result["expanded_query"])
    return untangled_data, expanded_queries

In [7]:
def find_document_identifier(title):
    """
    Grab the document identifier from the start of a document title, if we can.

    Three title shapes are recognised: titles starting with "BS", "NA", "+", "P" or "D",
    titles starting with a digit, and titles starting with "Eurocode". The identifier is the
    leading part of the title that the corresponding pattern matches.

    :param title: Document title; may be empty or None.
    :return: The leading part of the title that looks like an identifier, or "" if the title is
        empty or no pattern matches.
    """
    if not title:
        return ""

    match = None
    if title.startswith(("BS", "NA", "+", "P", "D")):
        match = re.match(r"[A-Z +]+(to)?[A-Z ]+([\d\-+:]+)([A-Z ]+[\d\-+:]+)?", title)
    elif title[0].isdigit():
        # The title has to be passed to re.match here as well; without it this branch raised a TypeError
        match = re.match(r'[0-9: +]([\dA-Z\-+:]+)([A-Z ]+[\d\-+:]+)?', title)
    elif title.startswith("Eurocode"):
        match = re.match(r"Euro([\w :])+([\d\-+:]+)([A-Z ]+[\d\-+:]+)?", title)

    if match is None:
        return ""

    # re.match anchors at the start of the string, so the identifier is the matched prefix
    return title[:match.end()]

In [8]:
def grab_information_for_manual_judging(untangled_dict, expanded_queries, top_k=9):
    """
    Function to grab the information we'd like to check/judge manually (columns in a CSV).

    :param untangled_dict: Dict mapping each query to its ranked list of passage dicts; every
        passage dict needs the keys 'id', 'text' and 'doc_title'.
    :param expanded_queries: Expanded queries, one per query and in the same order as the keys of
        untangled_dict. When empty, no "expanded query" column is added.
    :param top_k: Number of top-ranked passages to keep per query.
    :return: Tuple (column_indices, rows): the column names, and one row per kept passage. The
        'relevance' cell of every row is left empty, to be filled in by the annotator.
    """
    column_indices = ["relevance", 'query', 'rank', 'document_id', 'document_title', 'page_nr', 'text']
    rows = []

    if expanded_queries:
        column_indices.append("expanded query")

    for idx, query in enumerate(untangled_dict.keys()):
        for rank, passage_dict in enumerate(untangled_dict[query][:top_k], start=1):
            # The page number is the second '##'-separated part of the passage id
            # (presumably ids look like '<document>##<page>...' - TODO confirm).
            # A passage whose id lacks that part keeps its row, with an empty page number,
            # rather than raising an IndexError.
            id_parts = passage_dict['id'].rsplit('##')
            page_nr = id_parts[1] if len(id_parts) > 1 else ""

            doc_title = passage_dict['doc_title']
            doc_id = find_document_identifier(doc_title)
            new_row = ["", query, rank, doc_id, doc_title, page_nr, passage_dict['text']]

            if expanded_queries:
                new_row.append(expanded_queries[idx])

            rows.append(new_row)

    return column_indices, rows
        

In [10]:
def find_existing_annotations(to_be_annotated_df, existing_csv):
    """
    Helper function to reuse either a single CSV annotation file, or a list of them.

    :param to_be_annotated_df: pandas DataFrame with the rows that still need annotating.
    :param existing_csv: A single file (str or path-like object such as pathlib.Path), or a
        list/tuple of such files. Any other value (e.g. None) leaves the DataFrame unchanged.
    :return: The DataFrame after the annotations of every given file have been applied, in order.
    """
    if isinstance(existing_csv, (str, os.PathLike)):
        # A single file: treat it like a one-element list
        csv_files = [existing_csv]
    elif isinstance(existing_csv, (list, tuple)):
        csv_files = existing_csv
    else:
        csv_files = []

    for old_csv in csv_files:
        print(f"=== Grab annotations from {old_csv} ")
        # update from annotations csv
        to_be_annotated_df = reuse_existing_csv_annotations(to_be_annotated_df, old_csv)
    return to_be_annotated_df
            

In [11]:
def reuse_existing_csv_annotations(to_be_annotated_df, existing_csv):
    """
    Pre-populate the DataFrame that is to be annotated with annotations from an earlier CSV file.

    A row that is not yet judged ('relevance' is neither 'y' nor 'n') takes over the relevance of a
    previously annotated row when both the (lower-cased) queries and the passage texts overlap
    enough: the matching blocks found by SequenceMatcher must cover more than 70% of the length of
    the new query and of the new text respectively. If several annotated rows qualify, the last
    one wins.

    :param to_be_annotated_df: pandas DataFrame with the columns 'relevance', 'query' and 'text'.
    :param existing_csv: Path of a previously annotated CSV file with the same three columns.
    :return: New pandas DataFrame holding the (possibly updated) rows.
    """
    def overlap(left, right):
        """ Total number of characters inside the matching blocks of two strings. """
        blocks = SequenceMatcher(None, left, right).get_matching_blocks()
        return sum(block.size for block in blocks)

    previous_annotations = pd.read_csv(existing_csv)
    updated_rows = []

    for _, candidate in to_be_annotated_df.iterrows():
        if 'query' not in candidate:
            continue

        wanted_query = candidate['query'].lower()
        wanted_text = candidate['text']

        # Rows that already carry a judgement are passed through untouched
        if candidate['relevance'] not in ['y', 'n']:
            for _, judged in previous_annotations.iterrows():
                judged_query = judged['query'].lower()
                judged_text = judged['text']

                if overlap(wanted_query, judged_query) <= .7 * len(wanted_query):
                    continue
                if overlap(wanted_text, judged_text) > .7 * len(wanted_text):
                    candidate['relevance'] = judged['relevance']

        updated_rows.append(candidate)

    return pd.DataFrame(updated_rows)

### Metrics

In [12]:
def untangled_metrics(annotated_results, query_type=None):
    """
    Reads a csv file with annotations and prints metrics, also see utils.

    Rows are grouped per query in order of first appearance. A row counts as relevant when its
    'relevance' is one of 'Yes', 'yes', 'y', 'Y', and as irrelevant for 'No', 'no', 'n', 'N';
    rows with any other value are ignored. The per-query lists of booleans are scored with
    F1_at_k, MRR_query and AP_query (not defined in this notebook; presumably provided by utils),
    and the scores are averaged over all queries.

    :input annotated_results: Filepath (string or path-like) of the annotated csv file, which needs
        the columns 'query' and 'relevance'.
    :input query_type: Not used by the computation; kept so that existing calls keep working.
    :return: None, the metrics are printed only.
    """
    annotations = pd.read_csv(annotated_results)

    # query -> list of booleans (True = relevant), in the order the rows appear in the csv.
    # Keying on the query keeps the judgements of a query together even if its rows are not contiguous.
    judgements_per_query = {}
    total_passages = 0
    true_positives = 0

    for _, row in annotations.iterrows():
        query_bool_list = judgements_per_query.setdefault(row['query'], [])
        relevance = row['relevance']

        if relevance in ['Yes', 'yes', 'y', 'Y']:
            total_passages += 1
            true_positives += 1
            query_bool_list.append(True)
        elif relevance in ['No', 'no', 'n', 'N']:
            total_passages += 1
            query_bool_list.append(False)

    total_queries = len(judgements_per_query)
    if total_queries == 0:
        # Nothing to average over; avoid dividing by zero
        print(f"No queries found in {annotated_results}; no metrics computed.")
        return

    sum_of_F1s = 0
    sum_of_MRR = 0
    sum_of_APs = 0
    for query_bool_list in judgements_per_query.values():
        # convert each bool_list into a score
        sum_of_F1s += F1_at_k(query_bool_list, len(query_bool_list))
        sum_of_MRR += MRR_query(query_bool_list)
        sum_of_APs += AP_query(query_bool_list)

    avgF1 = sum_of_F1s / total_queries
    MAP = sum_of_APs / total_queries
    MRR = sum_of_MRR / total_queries

    print(f"Total queries: {total_queries}, Total passages: {total_passages}, Total relevant: {true_positives}")

    print("avg F1: {:.2f}".format(avgF1 * 100))
    print("MRR: {:.2f} ".format(MRR * 100))
    print("MAP: {:.2f} ".format(MAP * 100))

    # The same numbers once more without labels
    print(true_positives)
    avgf1_ = "{:.2f}".format(avgF1 * 100)
    print(avgf1_)
    mrr_ = "{:.2f}".format(MRR * 100)
    print(mrr_)
    map_ = "{:.2f}".format(MAP * 100)
    print(map_)


### Set up experiments and run

In [13]:
existing_csvs = list()


def reset_existing_csvs():
    """
    Return the previously annotated .csv files whose annotations should be re-used.

    Currently no files are re-used. To change that, return a list of files here instead, e.g.
    [fp for fp in Path("data/annotated/").glob("R*.csv")].
    """
    files_to_reuse = []
    return files_to_reuse

In [14]:
# Set up our various IR experiment settings

top_results_per_retriever = 10
annotation_top_k = 3


json_directory = Path("data/results/")
# NOTE(review): machine-specific absolute path; adjust it before running this notebook elsewhere
spreadsheet_directory = Path("/Users/rubenkruiper/Google Drive/[0]_PostDoc/Annotation/")

# One results file per experiment. The parts of a file name select the settings below:
# sparse / dense / hybrid pick the indexing type, "QE" and "DE" switch on the expansions.
to_run = ["sparse.json",      # regular
          "dense.json",
          "hybrid.json",
          "sparse_QE.json",   # with Query Expansion
          "dense_QE.json",
          "hybrid_QE.json",
          "sparse_DE.json",  # with Document Expansion
          "dense_DE.json",
          "hybrid_DE.json",
          "sparse_DE_QE.json",  # with Query and Document Expansion
          "dense_QE_DE.json",
          "hybrid_QE_DE.json"
         ]


# run_settings[i] holds the settings dict for the experiment to_run[i]
run_settings = []
for name in to_run:
    base_settings = reset_settings()
    base_settings['indexing']['fields_and_weights']['doc_title'] = 1
    base_settings['retrieval']['top_k'] = top_results_per_retriever

    # "DE" is upper case, so it does not match the lower-case "dense"
    sparse_type = 'bm25f' if "DE" in name else "bm25"

    if "sparse" in name:
        base_settings['indexing']['indexing_type'] = 'sparse'
        base_settings['indexing']['sparse_settings']['type'] = sparse_type
        base_settings['indexing']['fields_and_weights']['bm25_weight'] = 1

    if "dense" in name:
        base_settings['indexing']['indexing_type'] = 'dense'
        base_settings['indexing']['fields_and_weights']['content'] = 2

    if "hybrid" in name:
        base_settings['indexing']['indexing_type'] = 'hybrid'
        # Same key as in the default settings and the sparse branch. This used to write a new
        # 'sparse_type' key, which left 'type' at its default "bm25" for hybrid DE runs.
        base_settings['indexing']['sparse_settings']['type'] = sparse_type
        base_settings['indexing']['fields_and_weights']['bm25_weight'] = 6
        base_settings['indexing']['fields_and_weights']['content'] = 2

    if "QE" in name:
        base_settings['query_expansion']['kg_weight'] = 1
        base_settings['query_expansion']['nn_weight'] = 2
        base_settings['query_expansion']['prf_weight'] = 0

    if "DE" in name:
        base_settings['indexing']['fields_and_weights']['NER_labels'] = 0
        base_settings['indexing']['fields_and_weights']['filtered_NER_labels'] = 1
        base_settings['indexing']['fields_and_weights']['filtered_NER_labels_domains'] = 1
        base_settings['indexing']['fields_and_weights']['neighbours'] = 1

        # Fixed value for DE runs, independent of top_results_per_retriever
        base_settings['retrieval']['top_k'] = 10

    run_settings.append(base_settings)

In [15]:
# example of a settings dict: the one built for the first experiment in to_run ("sparse.json")
run_settings[0]

{'retrieval': {'ner_url': 'http://spar:8501/',
  'classifier_url': 'http://classifier:8502/',
  'top_k': 10},
 'indexing': {'indexing_type': 'sparse',
  'index_name': 'with_DE',
  'sparse_settings': {'type': 'bm25'},
  'fields_and_weights': {'content': 2,
   'doc_title': 1,
   'NER_labels': 0,
   'filtered_NER_labels': 0,
   'filtered_NER_labels_domains': 0,
   'neighbours': 0,
   'bm25_weight': 1}},
 'query_expansion': {'haystack_url': 'http://haystack:8500/',
  'prf_weight': 0,
  'kg_weight': 0,
  'nn_weight': 0},
 'recreate_sparse_index': False,
 'recreate_dense_index': False}

In [16]:
# example of updating the system using a specific settings dict; the cell displays the
# JSON body that the /update_weights/ endpoint sends back
requests.post(f"http://localhost:8000/update_weights/", json=run_settings[0]).json()

{'indexing_type': 'sparse',
 'sparse_type': 'bm25',
 'fields_and_weights': {'content': 2.0,
  'doc_title': 1.0,
  'NER_labels': 0.0,
  'filtered_NER_labels': 0.0,
  'filtered_NER_labels_domains': 0.0,
  'neighbours': 0.0,
  'bm25': 1.0,
  'top_k': 10},
 'top_k_per_retriever': 10,
 'prf_weight': 0.0,
 'kg_weight': 0.0,
 'nn_weight': 0.0}

In [17]:
# loop through all experiments, updating the settings and running all queries each time
# (to run only a subset, slice both lists identically, e.g. to_run[6:] and run_settings[6:])
for json_file_name, settings in zip(to_run, run_settings):
    prediction_type = "expanded_prediction" if "QE" in json_file_name else "combined_prediction"

    json_path = json_directory.joinpath(json_file_name)
    print("========= WORKING ON ===============     ",  json_path)
    print(f"prediction_type: {prediction_type}")
    print(f"annotating top K: {annotation_top_k}")

    # Update the settings of the IR system for this experiment
    update_response = requests.post("http://localhost:8000/update_weights/", json=settings)
    # Stop right here if the update was rejected; otherwise the queries would run with the
    # previous experiment's settings and be stored under this experiment's file name
    update_response.raise_for_status()
    updated_settings = update_response.json()
    print(updated_settings)

    # run the queries and store annotation csv's
    existing_csvs = reset_existing_csvs()
    csv_file_name = run_queries(json_path, existing_csvs, prediction_type, top_k=annotation_top_k)
    

prediction_type: combined_prediction
annotating top K: 3
{'indexing_type': 'sparse', 'sparse_type': 'bm25', 'fields_and_weights': {'content': 2.0, 'doc_title': 1.0, 'NER_labels': 0.0, 'filtered_NER_labels': 0.0, 'filtered_NER_labels_domains': 0.0, 'neighbours': 0.0, 'bm25': 1.0, 'top_k': 10}, 'top_k_per_retriever': 10, 'prf_weight': 0.0, 'kg_weight': 0.0, 'nn_weight': 0.0}
Query: Rail-fixed external wall insulation systems
Query: Portable waste compactors
Query: Wood preservative and fire retardant treatment systems
Query: Adjustable struts
Grabbing existing annotations
Nr. annotations found: 0
prediction_type: combined_prediction
annotating top K: 3
{'indexing_type': 'dense', 'sparse_type': 'bm25', 'fields_and_weights': {'content': 2.0, 'doc_title': 1.0, 'NER_labels': 0.0, 'filtered_NER_labels': 0.0, 'filtered_NER_labels_domains': 0.0, 'neighbours': 0.0, 'bm25': 0.0, 'top_k': 10}, 'top_k_per_retriever': 10, 'prf_weight': 0.0, 'kg_weight': 0.0, 'nn_weight': 0.0}
Query: Rail-fixed exter