# Merge BM25 and Siamese search results

## Google Colab setup

This part is only executed when the notebook is running on Google Colab. **Please change the working directory path below before running it!**

In [1]:
# Use Google Colab
use_colab = True

# Is this notebook running on Colab?
# If so, then google.colab package (github.com/googlecolab/colabtools)
# should be available in this environment

# Previous version used importlib, but we could do the same thing with
# just attempting to import google.colab
try:
    from google.colab import drive
    colab_available = True
except:
    colab_available = False

if use_colab and colab_available:
    # If there are packages I need to install separately, do it here
    #!pip install pyserini==0.9.4.0 jsonlines==1.2.0

    # Mount Google Drive
    drive.mount('/content/drive')

    # cd to the appropriate working directory under my Google Drive
    # (IMPORTANT: THIS PATH MUST MATCH EXACTLY TO WHERE THIS NOTEBOOK IS LOCATED
    # IN YOUR GOOGLE DRIVE!!)
    %cd '/content/drive/My Drive/CS646_Final_Project/merge'

    # List the directory contents
    !ls

## Import packages

In [16]:
import os
import json
import random
import pathlib

from tqdm import tqdm

## Paths

In [4]:
# Directory containing the Siamese query results for both domains.
_siamese_results_dir = os.path.join(
    '..', 'siamese', 'query_results',
    'sbert_bert_ada_joint_online_contrastive', 'top_800')

# Directory containing the BM25 query results for both domains.
_bm25_results_dir = os.path.join('..', 'bm25')

# Siamese result files, one per domain.
siamese_result_restaurant_path = os.path.join(
    _siamese_results_dir,
    'test_results_restaurant_sbert_bert_ada_joint_online_contrastive.txt')
siamese_result_laptop_path = os.path.join(
    _siamese_results_dir,
    'test_results_laptop_sbert_bert_ada_joint_online_contrastive.txt')

# BM25 result files, one per domain.
bm25_result_restaurant_path = os.path.join(
    _bm25_results_dir, 'test_results_restaurant_bm25.txt')
bm25_result_laptop_path = os.path.join(
    _bm25_results_dir, 'test_results_laptop_bm25.txt')

In [None]:
# Output files that receive the merged (BM25 + Siamese) rankings, one per
# domain. Both are relative to the current working directory.
avg_restaurant_result_path = os.path.join(
    'query_results', 'test_results_restaurant_merge.txt')
avg_laptop_result_path = os.path.join(
    'query_results', 'test_results_laptop_merge.txt')

## Combine two search results

In [10]:
# Per-domain accumulators filled in by the merge cells below.
# Layout: {qid: {'docid': [...], 'score': [...]}}, where the two lists are
# index-aligned (score[k] belongs to docid[k]).
merged_query_docs_rest, merged_query_docs_lap = {}, {}

### Restaurant

In [11]:
# Merge the restaurant BM25 and Siamese rankings into merged_query_docs_rest.
#
#   1. Collect the BM25 (docid, score) pairs of every query.
#   2. Min-max normalise the BM25 scores of each query to [0, 1].
#   3. Average each normalised score with the Siamese score of the same
#      (query, doc) pair, when the Siamese results contain that pair.
#
# Both result files are read as whitespace-separated lines in which column 0
# is the query ID, column 2 the doc ID and column 4 the score.
#
# Consequences worth knowing:
#   * Docs found only by BM25 keep their normalised BM25 score un-averaged.
#   * Docs found only by the Siamese model are ignored.
#   * Scores are stored as strings, so convert with float() before comparing
#     them numerically.

# Empty the accumulator first so that re-running this cell starts from scratch
# instead of trying to normalise the already-normalised string scores.
merged_query_docs_rest.clear()

# (qid, docid) -> index into that query's 'docid'/'score' lists. Gives O(1)
# duplicate checks and lookups instead of scanning the lists every time.
rest_doc_index = {}

# Read BM25 query results
with open(bm25_result_restaurant_path) as f:
    for line in f:
        fields = line.split()
        if not fields:
            continue  # tolerate blank lines (e.g. a trailing newline)

        qid = fields[0]  # Query ID
        docid = fields[2]  # doc ID
        score = float(fields[4])  # Score assigned to this doc

        entry = merged_query_docs_rest.setdefault(qid, {'docid': [], 'score': []})

        # Keep only the first occurrence of a doc within a query
        if (qid, docid) not in rest_doc_index:
            rest_doc_index[(qid, docid)] = len(entry['docid'])
            entry['docid'].append(docid)
            entry['score'].append(score)

# Normalize bm25 (per query)
for entry in merged_query_docs_rest.values():
    raw_scores = entry['score']
    lowest = min(raw_scores)
    span = max(raw_scores) - lowest

    if span == 0:
        # A single doc, or several docs with identical scores: min-max scaling
        # would divide by zero, so every doc gets the top score instead.
        # Stored as a string like all other scores so the list never mixes types.
        entry['score'] = [str(1.0) for _ in raw_scores]
    else:
        entry['score'] = [str((s - lowest) / span) for s in raw_scores]

# Read through the Siamese results
with open(siamese_result_restaurant_path) as f:
    for line in f:
        fields = line.split()
        if not fields:
            continue

        qid = fields[0]
        docid = fields[2]

        # Queries that BM25 returned nothing for still get an (empty) entry,
        # so they are counted when the queries are enumerated later on.
        if qid not in merged_query_docs_rest:
            merged_query_docs_rest[qid] = {'docid': [], 'score': []}

        # Only docs that BM25 also retrieved are re-scored
        ind = rest_doc_index.get((qid, docid))
        if ind is not None:
            scores = merged_query_docs_rest[qid]['score']
            prev_score = float(scores[ind])
            scores[ind] = str((prev_score + float(fields[4])) / 2)

### Laptop

In [13]:
# Merge the laptop BM25 and Siamese rankings into merged_query_docs_lap.
#
#   1. Collect the BM25 (docid, score) pairs of every query.
#   2. Min-max normalise the BM25 scores of each query to [0, 1].
#   3. Average each normalised score with the Siamese score of the same
#      (query, doc) pair, when the Siamese results contain that pair.
#
# Both result files are read as whitespace-separated lines in which column 0
# is the query ID, column 2 the doc ID and column 4 the score.
#
# Consequences worth knowing:
#   * Docs found only by BM25 keep their normalised BM25 score un-averaged.
#   * Docs found only by the Siamese model are ignored.
#   * Scores are stored as strings, so convert with float() before comparing
#     them numerically.

# Empty the accumulator first so that re-running this cell starts from scratch
# instead of trying to normalise the already-normalised string scores.
merged_query_docs_lap.clear()

# (qid, docid) -> index into that query's 'docid'/'score' lists. Gives O(1)
# duplicate checks and lookups instead of scanning the lists every time.
lap_doc_index = {}

# Read BM25 query results
with open(bm25_result_laptop_path) as f:
    for line in f:
        fields = line.split()
        if not fields:
            continue  # tolerate blank lines (e.g. a trailing newline)

        qid = fields[0]  # Query ID
        docid = fields[2]  # doc ID
        score = float(fields[4])  # Score assigned to this doc

        entry = merged_query_docs_lap.setdefault(qid, {'docid': [], 'score': []})

        # Keep only the first occurrence of a doc within a query
        if (qid, docid) not in lap_doc_index:
            lap_doc_index[(qid, docid)] = len(entry['docid'])
            entry['docid'].append(docid)
            entry['score'].append(score)

# Normalize bm25 (per query)
for entry in merged_query_docs_lap.values():
    raw_scores = entry['score']
    lowest = min(raw_scores)
    span = max(raw_scores) - lowest

    if span == 0:
        # A single doc, or several docs with identical scores: min-max scaling
        # would divide by zero, so every doc gets the top score instead.
        # Stored as a string like all other scores so the list never mixes types.
        entry['score'] = [str(1.0) for _ in raw_scores]
    else:
        entry['score'] = [str((s - lowest) / span) for s in raw_scores]

# Read through the Siamese results
with open(siamese_result_laptop_path) as f:
    for line in f:
        fields = line.split()
        if not fields:
            continue

        qid = fields[0]
        docid = fields[2]

        # Queries that BM25 returned nothing for still get an (empty) entry,
        # so they are counted when the queries are enumerated later on.
        # (This used to assign to the undefined name `merged_query_docs_laptop`,
        # which raised NameError for any such query.)
        if qid not in merged_query_docs_lap:
            merged_query_docs_lap[qid] = {'docid': [], 'score': []}

        # Only docs that BM25 also retrieved are re-scored
        ind = lap_doc_index.get((qid, docid))
        if ind is not None:
            scores = merged_query_docs_lap[qid]['score']
            prev_score = float(scores[ind])
            scores[ind] = str((prev_score + float(fields[4])) / 2)

## Write the merged query results to text files

### Restaurant

In [19]:
# Write the merged restaurant ranking, one line per (query, doc) pair:
#   "<query number> Q0 <docid> <rank> <score> merge"
print(avg_restaurant_result_path)

# Make sure the output directory exists. Opening with mode 'w' below creates
# the file or truncates a previous one, so no shell `rm`/`touch` is needed.
pathlib.Path(avg_restaurant_result_path).parent.mkdir(parents=True, exist_ok=True)

# Open the file once instead of re-opening it in append mode for every line
with open(avg_restaurant_result_path, 'w') as f:
    for i, qid in tqdm(enumerate(merged_query_docs_rest.keys()),
                       total=len(merged_query_docs_rest)):
        docs = merged_query_docs_rest[qid]['docid']
        scores = merged_query_docs_rest[qid]['score']

        # The scores may be stored as strings, so rank on their numeric value.
        # Comparing the strings themselves would be lexicographic
        # (e.g. '9e-05' would outrank '0.9').
        doc_sorted = sorted(zip(docs, scores), key=lambda x: float(x[1]), reverse=True)

        for j, (docid, score) in enumerate(doc_sorted):
            # The first column is the 1-based position of the query in the
            # dict, not the qid itself.
            # NOTE(review): confirm this matches the query IDs used for evaluation.
            line = str(i+1) + ' Q0 ' + docid + ' ' + str(j+1) + ' ' + str(score) + ' merge'
            f.write("%s\n" % line)

merged_results/merge_results_avg_restaurant.txt


642it [00:01, 448.87it/s]


### Laptop

In [17]:
# Write the merged laptop ranking, one line per (query, doc) pair:
#   "<query number> Q0 <docid> <rank> <score> merge"
print(avg_laptop_result_path)

# Make sure the output directory exists. Opening with mode 'w' below creates
# the file or truncates a previous one, so no shell `rm`/`touch` is needed.
pathlib.Path(avg_laptop_result_path).parent.mkdir(parents=True, exist_ok=True)

# Open the file once instead of re-opening it in append mode for every line
with open(avg_laptop_result_path, 'w') as f:
    for i, qid in tqdm(enumerate(merged_query_docs_lap.keys()),
                       total=len(merged_query_docs_lap)):
        docs = merged_query_docs_lap[qid]['docid']
        scores = merged_query_docs_lap[qid]['score']

        # The scores may be stored as strings, so rank on their numeric value.
        # Comparing the strings themselves would be lexicographic
        # (e.g. '9e-05' would outrank '0.9').
        doc_sorted = sorted(zip(docs, scores), key=lambda x: float(x[1]), reverse=True)

        for j, (docid, score) in enumerate(doc_sorted):
            # The first column is the 1-based position of the query in the
            # dict, not the qid itself.
            # NOTE(review): confirm this matches the query IDs used for evaluation.
            line = str(i+1) + ' Q0 ' + docid + ' ' + str(j+1) + ' ' + str(score) + ' merge'
            f.write("%s\n" % line)

merged_results/merge_results_avg_laptop.txt


475it [00:00, 682.48it/s]
