In [None]:
# Install the third-party packages imported further down
# (rank_bm25 for BM25Okapi, sentence_transformers for SentenceTransformer).
pip install -q rank-bm25
pip install -q sentence_transformers

In [None]:
import pandas as pd
import numpy as np
import json
from pprint import pprint

In [None]:
def json_to_df(path):
  """Load a search-results JSON file into a DataFrame.

  The file must contain a list of per-query records. Each record is read
  through the keys ``search_key``, ``elapsed_time``, ``results`` (an
  iterable of result entries) and ``stats`` (a mapping with a mandatory
  ``publications`` entry and optional ``resources`` / ``others`` entries).

  Parameters
  ----------
  path : str or path-like
      Location of the JSON file.

  Returns
  -------
  df : pandas.DataFrame
      One row per record with the columns ``search_key``, ``elapsed_time``,
      ``publications``, ``resources``, ``others``, ``results_num`` (number
      of entries in ``results``) and ``results`` (those entries as a list).
      Missing ``resources`` / ``others`` values show up as nulls.
  contents : list
      The parsed JSON document, returned unchanged.

  Raises
  ------
  KeyError
      If a record lacks one of the mandatory keys.
  """
  # JSON is UTF-8 by specification; do not rely on the platform default.
  with open(path, 'r', encoding='utf-8') as handle:
      contents = json.load(handle)

  columns = ['search_key', 'elapsed_time', 'publications', 'resources',
             'others', 'results_num', 'results']

  records = []
  for result in contents:
      stats = result['stats']
      result_lines = list(result['results'])
      records.append({
          'search_key': result['search_key'],
          'elapsed_time': result['elapsed_time'],
          'publications': stats['publications'],
          'resources': stats.get('resources'),
          'others': stats.get('others'),
          'results_num': len(result_lines),
          'results': result_lines,
      })

  # Building from records lets pandas infer a dtype per column. Stacking the
  # columns as rows and transposing would leave every column as ``object``,
  # which breaks numeric handling in later filtering and plotting.
  df = pd.DataFrame(records, columns=columns)
  return df, contents

# Load the search results: `df` has one row per query, `contents` is the parsed JSON.
# NOTE(review): this is an absolute, machine-specific path; consider a configurable data directory.
df, contents = json_to_df("/export/home/0usmanov/data/search_results.json")

In [None]:
import re
import nltk
from nltk.tokenize import WordPunctTokenizer
nltk.download('stopwords')
from nltk.corpus import stopwords

def cleanDoc(doc):
    """Tokenize ``doc`` and return its lowercased content tokens.

    The text is split with NLTK's ``WordPunctTokenizer``. Tokens of two
    characters or fewer are dropped, as are English stopwords (matched
    case-insensitively). No stemming is applied.

    Parameters
    ----------
    doc : str
        Text to tokenize.

    Returns
    -------
    list of str
        The surviving tokens, lowercased, in their original order.
    """
    english_stopwords = set(stopwords.words('english'))
    kept = []
    for raw_token in WordPunctTokenizer().tokenize(doc):
        if len(raw_token) <= 2:
            continue
        lowered = raw_token.lower()
        if lowered in english_stopwords:
            continue
        kept.append(lowered)
    return kept

def remove_urls(text, replacement_text=""):
    """Replace every URL in ``text`` with ``replacement_text``.

    A URL is anything starting with ``http://``, ``https://`` or ``www.``
    and running up to the next whitespace character.

    Parameters
    ----------
    text : str
        Input text.
    replacement_text : str, optional
        What each URL is replaced with (default: empty string, i.e. removal).

    Returns
    -------
    str
        ``text`` with all matched URLs substituted.
    """
    return re.sub(r'https?://\S+|www\.\S+', replacement_text, text)

def clean_text(contents):
  """Clean the result texts of every query record.

  For each record in ``contents``, every entry of ``record['results']`` has
  its URLs removed (``remove_urls``), trailing whitespace stripped, and is
  tokenized and filtered with ``cleanDoc``. The remaining tokens are joined
  back into a single space-separated string.

  Parameters
  ----------
  contents : iterable of dict
      Records with a ``'results'`` key holding an iterable of strings.

  Returns
  -------
  list of list of str
      One inner list per record, one cleaned string per result.
  """
  cleaned = []
  for record in contents:
    cleaned.append([
        ' '.join(cleanDoc(remove_urls(result).rstrip()))
        for result in record['results']
    ])
  return cleaned

#df['tokens'] = tokens
# Per query: the list of cleaned result strings returned by clean_text.
df['clean'] = clean_text(contents)

In [None]:
from ast import literal_eval
#df = pd.read_csv("search_res3.csv", converters={'clean': literal_eval, 'clean_stem': literal_eval, 'bert_sim': literal_eval, 'tfidf_sim': literal_eval, 'bm25_sim': literal_eval})

# Keep only queries that returned fewer than MAX_RESULTS results.
MAX_RESULTS = 300

print(df.shape)
# .copy() makes the filtered frame independent of the original, so the
# column assignments in later cells do not raise SettingWithCopyWarning.
df = df[df['results_num'] < MAX_RESULTS].copy()
print(df.shape)
df.head()

### Response time analysis

In [None]:
import plotly.express as px

def plot_bar(df, col1, col2):
  """Show a horizontal bar chart titled 'Number of results per source'.

  Parameters
  ----------
  df : pandas.DataFrame
      Data to plot.
  col1 : str
      Column giving the bar lengths (x axis).
  col2 : str
      Column giving the bar categories (y axis).
  """
  # Use the columns passed by the caller rather than fixed column names.
  fig = px.bar(df, x=col1, y=col2,
              orientation='h',
              height=400,
              title='Number of results per source')
  fig.show()

def get_hist(df, column, title, vline_x=2.2):
  """Show a histogram of ``df[column]`` with a marginal box plot.

  Parameters
  ----------
  df : pandas.DataFrame
      Data to plot.
  column : str
      Column whose distribution is drawn.
  title : str
      Figure title.
  vline_x : float or None, optional
      x position of a dashed gray vertical reference line; pass ``None`` to
      omit it. NOTE(review): the default 2.2 is the value that used to be
      hard-coded here; what it represents is not documented in this notebook.
  """
  fig = px.histogram(df,
       x=column,
       marginal='box',
       title=title
       )

  if vline_x is not None:
    fig.add_vline(x=vline_x, line_width=1, line_dash='dash', line_color='gray', col=1)

  fig.show()

In [None]:
# Sum the per-source counts over all queries and reshape them into a
# two-column frame ('type', 'num') for the bar chart.
plot_df = df[['publications','resources', 'others']].sum().reset_index().rename(columns={'index': 'type', 0: 'num'})
plot_bar(plot_df, "num", "type")

In [None]:
# Distribution of the number of results returned per query.
get_hist(df, 'results_num', 'Results number distribution')

In [None]:
# Distribution of the recorded `elapsed_time` per query.
get_hist(df, 'elapsed_time', 'Response time distribution')

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px

# Stack the three overview charts in one figure, one subplot per row.
fig = make_subplots(rows=3, cols=1, specs=[[{"type": "bar"}], [{"type": "xy"}], [{"type": "xy"}]],
            subplot_titles=('Number of results per source', 'Results number distribution', 'Response time distribution'))

# Build each chart with plotly express first; its first trace is then moved
# into the subplot grid below.
plot1 = px.bar(plot_df, x="num", y="type", orientation='h')
plot2 = px.histogram(df, x='results_num', marginal='box')
plot2.add_vline(x=2.2, line_width=1, line_dash='dash', line_color='gray', col=1)
plot3 = px.histogram(df, x='elapsed_time', marginal='box')
plot3.add_vline(x=2.2, line_width=1, line_dash='dash', line_color='gray', col=1)

# Only the first trace of each express figure is transferred. Each goes to
# the row whose subplot title describes it; the response-time histogram
# belongs in row 3, not on top of the results-number histogram in row 2.
fig.add_trace(plot1.data[0], row=1, col=1)
fig.add_trace(plot2.data[0], row=2, col=1)
fig.add_trace(plot3.data[0], row=3, col=1)

fig.update_layout(height=800, width=600)
fig.show()

### Relevance analysis

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import recall_score
import nltk

In [None]:
def get_threshold_recall(similarities, threshold):
    """Average, over queries, the share of scores at or above ``threshold``.

    For each score sequence in ``similarities`` the fraction of values
    ``>= threshold`` is computed; the function returns the mean of those
    fractions. Empty score sequences (queries without results) carry no
    information and are left out of the mean.

    Parameters
    ----------
    similarities : iterable of sequences of float
        One sequence of similarity scores per query.
    threshold : float
        Inclusive lower bound a score must reach to be counted.

    Returns
    -------
    float
        Mean fraction in [0, 1], or ``nan`` when there is no non-empty
        score sequence to average.
    """
    fractions = []
    for sim in similarities:
      total = len(sim)
      if total == 0:
        # Avoid dividing by zero for a query that produced no scores.
        continue
      hits = sum(1 for val in sim if val >= threshold)
      fractions.append(hits / total)

    if not fractions:
      return float('nan')
    return sum(fractions) / len(fractions)


In [None]:
def get_tfidf_cosine_scores(search_keys, documents):
  """Cosine similarity between each search key and its result texts.

  For every (key, result texts) pair a fresh ``TfidfVectorizer`` is fitted
  on the key alone, so only terms occurring in the key form the vocabulary.
  The result texts are projected onto that vocabulary and compared with the
  key vector.

  Parameters
  ----------
  search_keys : iterable of str
      One query string per row.
  documents : iterable of list of str
      For each query, the texts of its results.

  Returns
  -------
  list
      One 1-D array per query holding one similarity per result text.
  """
  def _score_one(key, res_texts):
    vectorizer = TfidfVectorizer()
    # Fit on the query only; the result texts are merely transformed.
    key_vector = vectorizer.fit_transform([key]).toarray()
    text_vectors = vectorizer.transform(res_texts).toarray()
    return cosine_similarity(key_vector, text_vectors)[0]

  return [_score_one(key, res_texts)
          for key, res_texts in zip(search_keys, documents)]

# Per query: TF-IDF cosine similarity of the search key to each cleaned result.
df['tfidf_sim'] = get_tfidf_cosine_scores(df['search_key'], df['clean'])

In [None]:
from sentence_transformers import SentenceTransformer, util

# Sentence-embedding model, used as a global by get_bert_simlirarities below.
model = SentenceTransformer("all-MiniLM-L6-v2")

def get_bert_simlirarities(search_keys, documents):
    """Embedding-based cosine similarity of each search key to its results.

    Uses the module-level ``model`` to embed the result texts and the key,
    then compares them with ``util.cos_sim``.

    Parameters
    ----------
    search_keys : iterable of str
        One query string per row.
    documents : iterable of list of str
        For each query, the texts of its results.

    Returns
    -------
    list of list of float
        One flat list per query with one similarity per result text.
    """
    def _score_one(query, texts):
      # Result texts are encoded first, then the query as a one-item batch.
      text_embeddings = model.encode(texts, convert_to_tensor=True)
      query_embedding = model.encode([query], convert_to_tensor=True)
      score_matrix = util.cos_sim(text_embeddings, query_embedding)
      # Flatten the (texts x 1) score matrix into a plain Python list.
      return score_matrix.reshape(-1).tolist()

    return [_score_one(query, texts)
            for query, texts in zip(search_keys, documents)]

# Per query: embedding cosine similarity of the search key to each cleaned result.
df['bert_sim'] = get_bert_simlirarities(df['search_key'], df['clean'])

In [None]:
from rank_bm25 import BM25Okapi

def get_bm25_similarities(search_keys, documents):
  """Max-normalized BM25 score of each search key against its results.

  For every (key, result texts) pair a ``BM25Okapi`` index is built with
  one document per result text, the key is scored against it, and the
  scores are divided by the largest score so that the best result gets 1.0.
  This is a rescaling of BM25 scores, not a cosine similarity.

  Parameters
  ----------
  search_keys : iterable of str
      One query string per row.
  documents : iterable of list of str
      For each query, the cleaned result texts (space-separated tokens).

  Returns
  -------
  list of numpy.ndarray
      One array per query with exactly one value per result text. The array
      is all zeros when no result has any token or when no score is positive.
  """
  similarities = []
  for key, res_texts in zip(search_keys, documents):
    # BM25Okapi expects a list of token lists, one per document. A single
    # joined string would be indexed character by character instead.
    tokenized_docs = [text.split() for text in res_texts]

    if not any(tokenized_docs):
      # No results, or only empty ones: nothing to index or rank.
      similarities.append(np.zeros(len(tokenized_docs)))
      continue

    bm25 = BM25Okapi(tokenized_docs)

    # Tokenize the query with the same cleaning applied to the documents,
    # so that query terms can actually match document terms.
    scores = np.asarray(bm25.get_scores(cleanDoc(key)), dtype=float)

    # Rescale by the best score; guard against dividing by zero when
    # nothing matched.
    max_score = scores.max()
    if max_score > 0:
      scores = scores / max_score
    else:
      scores = np.zeros_like(scores)
    similarities.append(scores)
  return similarities

# Per query: BM25-based score of the search key against each cleaned result.
df['bm25_sim'] = get_bm25_similarities(df['search_key'], df['clean'])

In [None]:
import plotly.graph_objects as go

# For each threshold, average over queries the share of results whose
# similarity reaches that threshold, once per similarity measure.
thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
tfidf_plot_vals = [get_threshold_recall(df['tfidf_sim'], t) for t in thresholds]
# The embedding scores are stored in the 'bert_sim' column.
bert_plot_vals = [get_threshold_recall(df['bert_sim'], t) for t in thresholds]
bm25_plot_vals = [get_threshold_recall(df['bm25_sim'], t) for t in thresholds]

f1 = go.Figure(
    data = [
        go.Scatter(x=thresholds, y=tfidf_plot_vals, name="TFIDF"),
        go.Scatter(x=thresholds, y=bert_plot_vals, name="BERT"),
        go.Scatter(x=thresholds, y=bm25_plot_vals, name="BM25"),
    ],
    layout = {"xaxis": {"title": "Thresholds"}, "yaxis": {"title": "Average recall for similarity scores"}, "title": "Recall analysis"}
)

f1