In [1]:
import pandas as pd
import numpy as np
import json
from pprint import pprint

In [2]:
def json_to_df(path):
  """Load a JSON file of search results and flatten it into a DataFrame.

  The file is expected to hold a list of objects, each with the keys
  ``search_key``, ``elapsed_time``, ``stats`` (a mapping that must contain
  ``publications`` and may contain ``resources`` and ``others``) and
  ``results`` (an iterable of result entries).

  Parameters
  ----------
  path : str or path-like
      Location of the JSON file.

  Returns
  -------
  df : pandas.DataFrame
      One row per search, with the columns ``search_key``, ``elapsed_time``,
      ``publications``, ``resources``, ``others``, ``results_num`` (number of
      entries in ``results``) and ``results`` (the entries as a list).
      ``resources`` / ``others`` are missing values when absent from ``stats``.
  contents : list
      The parsed JSON, unchanged.

  Raises
  ------
  KeyError
      If a record lacks one of the mandatory keys.
  """
  column_order = ['search_key', 'elapsed_time', 'publications', 'resources',
                  'others', 'results_num', 'results']

  # JSON text is UTF-8 by specification; do not depend on the platform default.
  with open(path, 'r', encoding='utf-8') as j:
      contents = json.load(j)

  records = []
  for result in contents:
      stats = result['stats']
      result_lines = list(result['results'])
      records.append({
          'search_key': result['search_key'],
          'elapsed_time': result['elapsed_time'],
          'publications': stats['publications'],
          # Optional statistics: None when the key is not present.
          'resources': stats.get('resources'),
          'others': stats.get('others'),
          'results_num': len(result_lines),
          'results': result_lines,
      })

  # Building from per-row records lets pandas infer a dtype per column;
  # transposing a list of column lists would leave every column as ``object``.
  df = pd.DataFrame(records, columns=column_order)
  return df, contents

# Parse the saved search results: `df` is the per-query table, `contents` the raw parsed JSON.
df, contents = json_to_df("search_results.json")

In [3]:
from ast import literal_eval

# Quick sanity check of the loaded table: its dimensions, then the first rows.
print(df.shape)

df.head()

(275, 7)


Unnamed: 0,search_key,elapsed_time,publications,resources,others,results_num,results
0,Motorcycle detection and tracking in ADAS,4.850967,76,14,20,110,[\nMotorcycle detection for ADAS through camer...
1,Exploring Large Language Models for Multilingu...,4.803219,140,15,19,174,[\nExploring Large Language Models for Classic...
2,Review on ICT for education of children with s...,3.701676,82,5,19,106,[\nNORMLEX: Information System on Internationa...
3,Scalable Semantic Similarity Estimation Framew...,4.095878,83,9,16,108,[\nOntology Matching: A Machine Learning Appro...
4,Bag-of-Word and Bidirectional Attentive Memory...,3.68883,32,3,20,55,[\nComparison on Large Language Model augmente...


### Response time analysis

In [5]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Two side-by-side histograms: number of retrieved documents per query (left)
# and response time per query (right).
fig = make_subplots(
    rows=1,
    cols=2,
    column_widths=[0.4, 0.6],
    horizontal_spacing=0.04,
    subplot_titles=("Retrieved Documents Numbers Distribution.", "Response Time Distribution"),
)

trace0 = go.Histogram(x=df['results_num'].tolist(), showlegend=False)
trace1 = go.Histogram(x=df['elapsed_time'].tolist(), showlegend=False)

# `append_trace` is deprecated in plotly; `add_trace` with row/col is the
# supported way to place a trace into a subplot cell.
fig.add_trace(trace0, row=1, col=1)
fig.add_trace(trace1, row=1, col=2)

fig.update_layout(height=600, width=1700, margin=dict(l=20, r=20, t=20, b=20))

# The export requests 1300x520, which differs from the 1700x600 layout size
# set above. NOTE(review): confirm which size is intended for the PDF.
fig.write_image("response_time_results.pdf", width=1300, height=520)

### Relevance analysis

In [7]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import recall_score
import nltk

In [8]:
def get_threshold_recall(similarities, threshold):
    """Average, over queries, the share of results scoring at least ``threshold``.

    For every entry of ``similarities`` (one sequence of scores per query) the
    fraction of scores ``>= threshold`` is computed; the function returns the
    mean of those fractions. Note that this is a fraction of the *retrieved*
    items, not recall against a ground-truth set of relevant documents.

    Parameters
    ----------
    similarities : iterable of sequences of numbers
        One score sequence per query (lists or 1-D arrays).
    threshold : number
        Minimum score for a result to be counted.

    Returns
    -------
    float
        Mean per-query fraction. A query with no scores contributes 0.0, and
        0.0 is returned when ``similarities`` itself is empty, instead of
        raising ``ZeroDivisionError``.
    """
    recalls = []
    for sim in similarities:
        total = len(sim)
        if total == 0:
            # Nothing was retrieved, so nothing can pass the threshold.
            recalls.append(0.0)
            continue
        hits = sum(1 for val in sim if val >= threshold)
        recalls.append(hits / total)

    if not recalls:
        return 0.0
    return sum(recalls) / len(recalls)


In [9]:
def get_tfidf_cosine_scores(search_keys, documents):
  """Score each query's results by TF-IDF cosine similarity to the query.

  For every (query, result texts) pair a fresh ``TfidfVectorizer`` is fitted
  on the query alone and then applied to the result texts, so the results are
  represented only in terms of the query's vocabulary.

  Parameters
  ----------
  search_keys : iterable of str
      The query strings.
  documents : iterable of list of str
      For each query, the texts of its retrieved results.

  Returns
  -------
  list of numpy.ndarray
      One 1-D array per query holding the cosine similarity between the query
      and each of its results (an empty array when the query has no results).
  """
  similarities = []
  for key, res_texts in zip(search_keys, documents):
    if len(res_texts) == 0:
      # scikit-learn rejects 0-sample inputs; there is nothing to score anyway.
      similarities.append(np.array([]))
      continue

    tfidf_vectorizer = TfidfVectorizer()
    query_tfidf = tfidf_vectorizer.fit_transform([key])
    document_tfidf = tfidf_vectorizer.transform(res_texts)

    # cosine_similarity accepts sparse matrices and returns a dense
    # (1, n_results) array; keep only its single row.
    cosine_similarities = cosine_similarity(query_tfidf, document_tfidf)
    similarities.append(cosine_similarities[0])
  return similarities

# Store, for every query, the array of TF-IDF cosine scores of its results.
df['tfidf_sim'] = get_tfidf_cosine_scores(df['search_key'], df['results'])

In [None]:
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm

# Sentence-embedding model; read as a module-level global by get_bert_simlirarities.
model = SentenceTransformer("all-MiniLM-L6-v2")

def get_bert_simlirarities(search_keys, documents):
    """Score each query's results by embedding cosine similarity to the query.

    Uses the module-level ``model`` to embed the lower-cased result texts and
    the lower-cased query, then compares them with ``util.cos_sim``.

    Parameters
    ----------
    search_keys : iterable of str
        The query strings.
    documents : iterable of list of str
        For each query, the texts of its retrieved results.

    Returns
    -------
    list of list of float
        One flat list of similarity scores per query, in result order.
    """
    all_scores = []
    for query, texts in tqdm(zip(search_keys, documents)):
        # Embed the results first, then the query (as a one-element batch).
        lowered_texts = [text.lower() for text in texts]
        doc_embeddings = model.encode(lowered_texts, convert_to_tensor=True)
        query_embedding = model.encode([query.lower()], convert_to_tensor=True)

        # Flatten the pairwise score matrix into a plain list of floats.
        pairwise = util.cos_sim(doc_embeddings, query_embedding)
        all_scores.append(pairwise.reshape(-1).tolist())

    return all_scores

# Store, for every query, the list of embedding-based cosine scores of its results.
df['bert_sim'] = get_bert_simlirarities(df['search_key'], df['results'])

In [None]:
from rank_bm25 import BM25Okapi

def get_bm25_similarities(search_keys, documents):
  """Score each query's results with BM25, scaled by the best score per query.

  For every (query, result texts) pair a ``BM25Okapi`` index is built over the
  lower-cased, whitespace-tokenised result texts and queried with the
  lower-cased, whitespace-tokenised query. The raw scores are divided by the
  largest score of that query, so the best result gets 1.0. These values are
  max-normalised BM25 scores, not cosine similarities.

  Parameters
  ----------
  search_keys : iterable of str
      The query strings.
  documents : iterable of list of str
      For each query, the texts of its retrieved results.

  Returns
  -------
  list of list of float
      One list of normalised scores per query, in result order. The list is
      empty when the query has no results, and all zeros when no result
      obtains a positive BM25 score.
  """
  similarities = []
  for key, res_texts in tqdm(zip(search_keys, documents)):
    if len(res_texts) == 0:
      # An index cannot be built over an empty corpus.
      similarities.append([])
      continue

    # split() without an argument splits on any whitespace run, so leading
    # newlines or repeated spaces do not end up glued to (or forming) tokens.
    corpus = [res.lower().split() for res in res_texts]
    bm25 = BM25Okapi(corpus)

    bm25_scores = np.asarray(bm25.get_scores(key.lower().split()), dtype=float)

    # Normalise by the top score; avoid 0/0 when nothing matches the query.
    max_score = bm25_scores.max()
    if max_score > 0:
      normalized = bm25_scores / max_score
    else:
      normalized = np.zeros_like(bm25_scores)
    similarities.append(normalized.tolist())
  return similarities

# Store, for every query, the list of max-normalised BM25 scores of its results.
df['bm25_sim'] = get_bm25_similarities(df['search_key'], df['results'])

In [11]:
import plotly.graph_objects as go

# Sweep thresholds 0.01 .. 0.99 and record, for each scoring method, the value
# returned by get_threshold_recall at that threshold.
thresholds = [pct / 100 for pct in range(1, 100)]

tfidf_plot_vals, bert_plot_vals, bm25_plot_vals = [], [], []
score_columns = (
    ("tfidf_sim", tfidf_plot_vals),
    ("bert_sim", bert_plot_vals),
    ("bm25_sim", bm25_plot_vals),
)
for t in thresholds:
    for column, curve in score_columns:
        curve.append(get_threshold_recall(df[column], t))

# One line per scoring method, all sharing the threshold axis.
labelled_curves = [
    ("TFIDF", tfidf_plot_vals),
    ("BERT", bert_plot_vals),
    ("BM25", bm25_plot_vals),
]
f1 = go.Figure(
    data=[go.Scatter(x=thresholds, y=values, name=label) for label, values in labelled_curves],
    layout=dict(xaxis=dict(title="Thresholds"), yaxis=dict(title="Average Recall")),
)

f1.write_image("relevancy.pdf", width=1300, height=520, scale=5)

In [14]:
# df.to_csv("search_results.csv", index=False)