In [1]:
!pip install rank_bm25



In [2]:
import os
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

pkl_folder = '/content/drive/My Drive/pkl files'
pkl_files = [os.path.join(pkl_folder, f) for f in os.listdir(pkl_folder)]

print(pkl_files)

Mounted at /content/drive
['/content/drive/My Drive/pkl files/battery_word_counts.pkl', '/content/drive/My Drive/pkl files/urbanization_wildfire_word_counts.pkl', '/content/drive/My Drive/pkl files/genetic_engineering_word_counts.pkl', '/content/drive/My Drive/pkl files/ethical_ai_query_word_counts.pkl', '/content/drive/My Drive/pkl files/social_media_word_counts.pkl', '/content/drive/My Drive/pkl files/smoking_query_word_counts.pkl', '/content/drive/My Drive/pkl files/metabolic_fasting_word_counts.pkl', '/content/drive/My Drive/pkl files/climate_change_word_counts.pkl']


In [3]:
for pkl in pkl_files:
  print(pkl)

/content/drive/My Drive/pkl files/battery_word_counts.pkl
/content/drive/My Drive/pkl files/urbanization_wildfire_word_counts.pkl
/content/drive/My Drive/pkl files/genetic_engineering_word_counts.pkl
/content/drive/My Drive/pkl files/ethical_ai_query_word_counts.pkl
/content/drive/My Drive/pkl files/social_media_word_counts.pkl
/content/drive/My Drive/pkl files/smoking_query_word_counts.pkl
/content/drive/My Drive/pkl files/metabolic_fasting_word_counts.pkl
/content/drive/My Drive/pkl files/climate_change_word_counts.pkl


In [4]:
import pickle
path = '/content/drive/My Drive/pkl files/'
map = {'Effects of smoking on lung cancer': 'smoking_query_word_counts.pkl',
       'Ethical implications of AI': 'ethical_ai_query_word_counts.pkl',
       'Climate change and wildfires': 'climate_change_word_counts.pkl',
      #  'Benefits of a healthy gut': '',
      #  'How exercise affects mental health': '',
       'Effects of Fasting on Metabolic Health': 'metabolic_fasting_word_counts.pkl',
       'Psychological effects of social media on children and teens': 'social_media_word_counts.pkl',
       'Impacts of urbanization on wildlife biodiversity': 'urbanization_wildfire_word_counts.pkl',
       'Battery technology advancements for electric vehicles': 'battery_word_counts.pkl',
       'Genetic engineering as treatment for genetic disorders': 'genetic_engineering_word_counts.pkl'
       }

all_data = {}
# load query dict
for query, pkl in map.items():
  with open(path + pkl, "rb") as f:
    all_data[query] = pickle.load(f)

In [5]:
print(all_data['Climate change and wildfires'][list(all_data['Climate change and wildfires'].keys())[1]]['tokens'])

Counter({'and': 279, 'the': 238, 'of': 176, 'to': 136, 'fire': 135, 'stream': 125, 'in': 121, 'temperature': 106, 'a': 95, 'for': 86, 'climate': 76, 'on': 76, 'et': 59, 'al': 58, 'at': 53, 'management': 51, 'change': 48, 'with': 46, 'model': 39, 'temperatures': 38, 'across': 38, 'effects': 36, 'from': 35, 'that': 35, 'scale': 31, '1': 31, 'by': 31, 'was': 30, 'wildfire': 28, 'radiation': 28, 'We': 27, 'spatial': 27, 'were': 27, 'air': 26, 'J': 26, 'is': 25, '2011': 25, '2': 24, 'as': 23, '2014': 22, 'landscape': 22, 'we': 22, 'solar': 22, 'Climate': 22, 'scales': 21, 'Fire': 21, 'increases': 20, 'changes': 20, 'The': 19, '2010': 19, '°C': 19, 'severity': 19, 'riparian': 18, 'Change': 18, 'all': 18, 'A2': 18, 'S': 18, 'climates': 17, 'each': 17, '2013': 16, 'are': 16, 'warming': 16, 'but': 16, 'conditions': 16, 'Climatic': 16, '124:191–206': 16, 'Online': 16, 'Resource': 16, 'Isaak': 15, 'fires': 15, 'our': 15, 'year': 15, 'vegetation': 14, 'FireBGCv2': 14, 'fuel': 14, 'precipitation': 

In [6]:
import nltk
from nltk.tokenize import word_tokenize
import string

nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [7]:
def tokenize_query(query):
    tokens = word_tokenize(query.lower())  # lowercase + tokenize
    tokens = [t for t in tokens if t not in string.punctuation]
    return tokens

tokenized_queries = [tokenize_query(query) for query in list(all_data.keys())]

In [8]:
print(len([doc['tokens'] for doc in list(all_data['Genetic engineering as treatment for genetic disorders'].values())]))

60


In [9]:
from collections import Counter

def counter_to_token_list(counter):
    tokens = []
    for token, count in counter.items():
        tokens.extend([token] * count)
    return tokens

corpus = []
for query, data in all_data.items():
  corpus.extend([counter_to_token_list(tokens) for tokens in [doc['tokens'] for doc in list(data.values())]])

print(len(corpus))

480


In [10]:
from rank_bm25 import BM25Okapi

# Initialize BM25 with your corpus
bm25 = BM25Okapi(corpus)

# Score documents for a query
for query in tokenized_queries:
    scores = bm25.get_scores(query)
    ranked_doc_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
    # Use these ranked indices as your baseline ranking


In [11]:
relevance_scores = []
for query, data in all_data.items():
  relevance_scores.append([val['relevance'] for val in list(data.values())])
for i, score in enumerate(relevance_scores):
  relevance_scores[i] = ([0] * (60 * i)) + relevance_scores[i] + ([0] * (60 * (len(tokenized_queries) - (i + 1))))

In [26]:
from sklearn.metrics import average_precision_score
import numpy as np

def precision_at_k(true, pred, k):
    true = np.array(true) > 0
    pred = np.array(pred)[:k]
    return np.sum(true[pred]) / k

def recall_at_k(true, pred, k):
    true = np.array(true) > 0
    pred = np.array(pred)[:k]
    return np.sum(true[pred]) / np.sum(true)

def dcg(rels):
    return sum(rel / np.log2(idx + 2) for idx, rel in enumerate(rels))

def ndcg_at_k(true, pred, k):
    true = np.array(true)
    ideal = sorted(true, reverse=True)[:k]
    pred_rels = true[pred[:k]]
    return dcg(pred_rels) / (dcg(ideal) + 1e-8)


# Evaluate each query
for i, query in enumerate(tokenized_queries):
    scores = bm25.get_scores(query)
    ranked = np.argsort(scores)[::-1]  # descending
    rels = relevance_scores[i]

    print(f"\nQuery {i+1}: {list(map.keys())[i]}")
    # print(f"Precision@30: {precision_at_k(rels, ranked, 30):.2f}") <- left this out because they're all 1.00
    print(f"Recall@60:    {recall_at_k(rels, ranked, 60):.2f}")
    print(f"NDCG@30:      {ndcg_at_k(rels, ranked, 30):.2f}")
    # print(f"MAP:          {average_precision_score((np.array(rels) > 0).astype(int), scores):.2f}") #<- these values are also really high


Query 1: Effects of smoking on lung cancer
Recall@60:    0.95
NDCG@30:      0.81

Query 2: Ethical implications of AI
Recall@60:    0.95
NDCG@30:      0.70

Query 3: Climate change and wildfires
Recall@60:    0.93
NDCG@30:      0.83

Query 4: Effects of Fasting on Metabolic Health
Recall@60:    0.93
NDCG@30:      0.84

Query 5: Psychological effects of social media on children and teens
Recall@60:    0.83
NDCG@30:      0.75

Query 6: Impacts of urbanization on wildlife biodiversity
Recall@60:    0.98
NDCG@30:      0.84

Query 7: Battery technology advancements for electric vehicles
Recall@60:    1.00
NDCG@30:      0.85

Query 8: Genetic engineering as treatment for genetic disorders
Recall@60:    0.88
NDCG@30:      0.80
