<a href="https://colab.research.google.com/github/sarahzhongg/IS4200-Final-Project/blob/main/reranking_IS4200.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Environment setup (Colab): install a headless OpenJDK 21 runtime, point
# JAVA_HOME at it, and make it the system default `java`.
# NOTE(review): nothing in the visible cells uses Java — presumably a later
# cell needs it; confirm before keeping this step.
import os
# -qq and the redirect to /dev/null silence apt's output
!apt-get install openjdk-21-jre-headless -qq > /dev/null
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-21-openjdk-amd64"
!update-alternatives --set java /usr/lib/jvm/java-21-openjdk-amd64/bin/java
# print the active Java version as a check that the switch worked
!java -version
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from google.colab import drive

openjdk version "21.0.6" 2025-01-21
OpenJDK Runtime Environment (build 21.0.6+7-Ubuntu-122.04.1)
OpenJDK 64-Bit Server VM (build 21.0.6+7-Ubuntu-122.04.1, mixed mode, sharing)


In [2]:
# mount Google Drive and list every file in the word-count pickle folder
drive.mount('/content/drive', force_remount=True)

pkl_folder = '/content/drive/My Drive/pkl files'

# collect the full path of each directory entry, echoing it as we go
pkl_files = []
for entry in os.listdir(pkl_folder):
  pkl = os.path.join(pkl_folder, entry)
  pkl_files.append(pkl)
  print(pkl)

Mounted at /content/drive
/content/drive/My Drive/pkl files/social_media_word_counts.pkl
/content/drive/My Drive/pkl files/metabolic_fasting_word_counts.pkl
/content/drive/My Drive/pkl files/urbanization_wildfire_word_counts.pkl
/content/drive/My Drive/pkl files/ethical_ai_query_word_counts.pkl
/content/drive/My Drive/pkl files/genetic_engineering_word_counts.pkl
/content/drive/My Drive/pkl files/smoking_query_word_counts.pkl
/content/drive/My Drive/pkl files/battery_word_counts.pkl
/content/drive/My Drive/pkl files/climate_change_word_counts.pkl


In [3]:
import pickle
# Drive folder that holds one word-count pickle per topic
path = '/content/drive/My Drive/pkl files'

# query text -> name of the pickle file holding that query's documents
topics = {
  'Effects of smoking on lung cancer': 'smoking_query_word_counts.pkl',
  'Ethical implications of AI': 'ethical_ai_query_word_counts.pkl',
  'Climate change and wildfires': 'climate_change_word_counts.pkl',
  'Effects of Fasting on Metabolic Health': 'metabolic_fasting_word_counts.pkl',
  'Psychological effects of social media on children and teens': 'social_media_word_counts.pkl',
  'Impacts of urbanization on wildlife biodiversity': 'urbanization_wildfire_word_counts.pkl',
  'Battery technology advancements for electric vehicles': 'battery_word_counts.pkl',
  'Genetic engineering as treatment for genetic disorders': 'genetic_engineering_word_counts.pkl',
}

# all_data maps each query text to whatever object its pickle contains.
# NOTE: pickle.load can execute arbitrary code, so only load files you trust.
all_data = {}
for topic in topics:
  filename = topics[topic]
  full_path = os.path.join(path, filename)
  with open(full_path, "rb") as f:
    all_data[topic] = pickle.load(f)


In [4]:
# sanity check: all_data should hold one entry per key in `topics`
len(all_data)

8

In [6]:
def get_query_corpus(all_data, query, n_irrelevant=30, rng=None):
  """Build an evaluation corpus for one query: its own docs plus random distractors.

  The corpus starts with every document stored under ``all_data[query]`` and is
  padded with documents sampled without replacement from the other topics.
  Each sampled document is added as a copy whose ``'relevance'`` is set to 0.

  ``all_data`` and the documents inside it are never modified, so the function
  can be called repeatedly (and for different queries) without corpora growing
  or relevance labels of other topics being overwritten.

  Args:
    all_data: dict mapping query text -> {doc_id: doc}. Each doc is assumed to
      be a dict-like mapping (it is copied with ``dict(doc)``).
    query: key of ``all_data`` whose documents form the relevant part.
    n_irrelevant: how many distractor documents to add (default 30). If fewer
      candidates exist, all of them are used instead of raising.
    rng: optional object exposing a NumPy-style ``choice`` method, e.g.
      ``np.random.default_rng(seed)``, for reproducible sampling. When omitted
      the global ``np.random`` state is used, so ``np.random.seed`` still works.

  Returns:
    A new dict {doc_id: doc} with the query's documents and the distractors.

  Raises:
    KeyError: if ``query`` is not a key of ``all_data``.
  """
  # Shallow-copy the topic's dict: assigning distractors into the stored dict
  # itself would make all_data[query] grow by n_irrelevant on every call.
  query_corpus = dict(all_data[query])

  # pool the documents of every other topic
  other_docs = {}
  for q, docs in all_data.items():
    if q != query:
      other_docs.update(docs)

  # A doc id that already belongs to this query must not be drawn as a
  # distractor, otherwise its real relevance label would be replaced by 0.
  candidate_keys = [key for key in other_docs if key not in query_corpus]

  n_samples = min(n_irrelevant, len(candidate_keys))
  if n_samples <= 0:
    return query_corpus

  sampler = np.random if rng is None else rng
  # Sample positions rather than the keys themselves so doc ids keep their
  # original Python type instead of being coerced through a NumPy array.
  picked = sampler.choice(len(candidate_keys), n_samples, replace=False)

  for idx in picked:
    key = candidate_keys[int(idx)]
    # Copy before relabelling so the source topic keeps its true relevance.
    irrelevant_doc = dict(other_docs[key])
    irrelevant_doc['relevance'] = 0
    query_corpus[key] = irrelevant_doc

  return query_corpus

In [7]:
# Baseline reranker setup.
# cosine_similarity and numpy are re-imported here (they are already imported
# in the first cell), which lets this cell run without the first one.
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load the 'all-MiniLM-L6-v2' SentenceTransformer model once at module level
# so reranking_model can reuse it on every call.
sentence_transformer = SentenceTransformer('all-MiniLM-L6-v2')

def reranking_model(query):
  """Rank a query's evaluation corpus by embedding cosine similarity.

  Baseline reranker: the corpus comes from get_query_corpus(all_data, query),
  and only each document's 'tokens' counter is used. Every counter is expanded
  back into text (each token repeated `freq` times, space separated) because
  the encoder takes strings, not count dicts. Query and documents are embedded
  with the module-level `sentence_transformer` and scored by cosine similarity.
  Other document fields could be added to the scoring later.

  Args:
    query: topic string; must be a key of `all_data`.

  Returns:
    List of (doc_id as str, similarity as float), highest similarity first.
  """
  corpus = get_query_corpus(all_data, query)

  def expand_counts(token_counts):
    # rebuild a flat text from {token: frequency}
    words = []
    for token, freq in token_counts.items():
      words.extend(token for _ in range(freq))
    return ' '.join(words)

  # walk the corpus once, keeping ids and texts aligned by position
  ids = []
  texts = []
  for doc_id, doc in corpus.items():
    ids.append(str(doc_id))
    texts.append(expand_counts(doc['tokens']))

  # embed the query and every document text
  query_vec = sentence_transformer.encode(query)
  doc_vecs = sentence_transformer.encode(texts)

  # a single query row against all documents -> take row 0 of the result
  scores = cosine_similarity([query_vec], doc_vecs)[0]

  # pair ids with plain-float scores and order by score, best first
  scored = [(doc_id, float(score)) for doc_id, score in zip(ids, scores)]
  return sorted(scored, key=lambda pair: pair[1], reverse=True)


# demo: rerank the corpus for one topic; the cell displays the (doc_id, score) list
reranking_model('Effects of smoking on lung cancer')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


[('https://www.semanticscholar.org/paper/d1a4a81f2763d3a25a5ff4ca78713be79c60231b',
  0.618094801902771),
 ('https://www.semanticscholar.org/paper/73fe7c5b7635be9222e26a723384d3fc5717f02d',
  0.5018330812454224),
 ('https://www.semanticscholar.org/paper/76ded5364597a6b4abadd6a155f67b262c2c3f7c',
  0.46484869718551636),
 ('https://www.semanticscholar.org/paper/5cdc4a4dffe96963f6600c6bd422b2e0c8ae79c4',
  0.44873547554016113),
 ('https://www.semanticscholar.org/paper/7389668b7c32bcb662d42dbde25de03543f4ba6f',
  0.4280984401702881),
 ('https://www.semanticscholar.org/paper/dd81d8d9292d5da251299cc8c38c620a18125d1e',
  0.42106887698173523),
 ('https://www.semanticscholar.org/paper/0a6744c0aabf94d33399c22410f8a70e2742b5bd',
  0.42106887698173523),
 ('https://www.semanticscholar.org/paper/275749da6c8bfffd06c399b4cca4f8da3a57812a',
  0.3729900121688843),
 ('https://www.semanticscholar.org/paper/21c193ccdb94e6e54ee8ab03832282138b0684bb',
  0.3624398708343506),
 ('https://www.semanticscholar.org