In [17]:

import datetime
from state_of_the_art.tables.paper_table import PaperTable

from datetime import datetime

date_from = datetime.strptime('2024-01-01', '%Y-%m-%d').date()
date_to = datetime.strptime('2025-03-31', '%Y-%m-%d').date()

papers_df = PaperTable().read()
papers_df = papers_df[(papers_df["published"].dt.date >= date_from) & (papers_df["published"].dt.date <= date_to)]

# put title in the first column, keep the rest of the columns in the same order do not remove any columns   
papers_df = papers_df[["title"] + [col for col in papers_df.columns if col != "title"]]

papers_df.describe()

Unnamed: 0,tdw_timestamp
count,59315
mean,2024-09-12 07:13:30.671037184
min,2024-05-24 07:21:40.564173
25%,2024-07-02 23:15:33.793488640
50%,2024-08-09 18:35:19.699799040
75%,2024-11-12 06:05:38.147093504
max,2025-04-04 07:26:42.140125


In [18]:
# load topics

from state_of_the_art.tables.interest_table import InterestTable
topics_df = InterestTable(auth_filter=False).read_sorted_by_position()
topics_df.describe()



Unnamed: 0,tdw_timestamp,position
count,47,47.0
mean,2024-10-26 20:04:20.391029248,3.148936
min,2024-08-10 06:50:57.124898,-4.0
25%,2024-09-04 01:10:08.359028992,0.0
50%,2024-10-04 12:43:31.047667968,0.0
75%,2024-12-29 17:51:46.536517120,3.5
max,2025-04-06 14:40:28.019897,20.0
std,,6.577229


In [20]:
# add bm25

import pandas as pd
import nltk
from rank_bm25 import BM25Okapi as BM25
tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")
lemmatizer = nltk.stem.WordNetLemmatizer()

def tokenize(string):
    tokens = tokenizer.tokenize(string)
    lemmas = [lemmatizer.lemmatize(t) for t in tokens]
    return lemmas


tokenized_corpus = [

    tokenize(str(paper.get('title', '')) + " " + str(paper.get('abstract', '')))
    for index, paper in papers_df.iterrows()
]


n = papers_df.shape[0]
bm25 = BM25(tokenized_corpus)
final_result = papers_df.copy()

print(f"Processing {topics_df.shape[0]} topics")
for index, topic in topics_df.iterrows():
    print(f"Processing topic {index} of {topics_df.shape[0]}: {topic.get('name', '')}")
    query = topic.get("name", "") + ' ' + topic.get("description", "")
    tokenized_query = tokenize(query)
    column_name = 'bm25_' + topic.get("name", "").replace(" ", "_").lower()

    result = bm25.get_top_n(tokenized_query, papers_df.to_dict(orient='records'), n=n)
    scores = sorted(bm25.get_scores(tokenized_query)[0:n], reverse=True)
    # pandas from dict
    bm25_df = pd.DataFrame(result)
    bm25_df = bm25_df[["abstract_url"]]
    bm25_df[column_name] = scores

    # join the bm25_df with the papers_df based on abstract_url
    final_result = pd.merge(final_result, bm25_df, on='abstract_url', how='left')


pd.set_option('display.max_colwidth', 100)

final_result

Processing 47 topics
Processing topic 0 of 47: sustaineable tourism
Processing topic 0 of 47: building objective functions in deep learning efficiently best practices
Processing topic 0 of 47: ab testing 
Processing topic 0 of 47: ab testing 
Processing topic 0 of 47: crm emails machine learning optimization
Processing topic 0 of 47: ethics in machine learning
Processing topic 0 of 47: measuring effectively the effects of machine learning products
Processing topic 0 of 47: education
Processing topic 0 of 47: measuring business impact successfully
Processing topic 0 of 47: images deep learning
Processing topic 16 of 47: ML for combating poverty
Processing topic 12 of 47: marketing machine learning 
Processing topic 36 of 47: education to combat poverty
Processing topic 20 of 47: deep learning
Processing topic 19 of 47: data science management
Processing topic 35 of 47: deep learning with videos
Processing topic 34 of 47: recommender systems
Processing topic 27 of 47: clv
Processing topi

Unnamed: 0,title,abstract_url,published,abstract,tdw_timestamp,tdw_uuid,pdf_url,institution,bm25_sustaineable_tourism,bm25_building_objective_functions_in_deep_learning_efficiently_best_practices,...,bm25_evaluation_metrics,bm25_mlops,bm25_ml_lifecycle,bm25_ethics_y,bm25_ml_code_quality_presentation_with_meghana,bm25_attribution,bm25_how_to_measure_clv_longer_term,bm25_media_mix_modelling_marketing,bm25_brazil_x,bm25_brazil_y
0,LLaVA-OneVision: Easy Visual Task Transfer,https://arxiv.org/abs/2408.03326,2024-08-06 17:59:44+00:00,"We present LLaVA-OneVision, a family of open large multimodal models (LMMs)\ndeveloped by consol...",2024-08-07 08:20:13.367967,2f4446c6-4e30-4060-b880-ed140458bf17,,,0.0,5.462774,...,25.396307,1.403658,1.403658,9.127051,11.507577,4.563525,16.738895,0.000000,0.0,0.0
1,CoverBench: A Challenging Benchmark for Complex Claim Verification,https://arxiv.org/abs/2408.03325,2024-08-06 17:58:53+00:00,There is a growing line of research on verifying the correctness of language\nmodels' outputs. A...,2024-08-07 08:21:05.493254,561dfdc7-ee61-481b-a498-fd7fa4063dff,,,0.0,3.578625,...,28.408481,0.000000,0.000000,9.006872,14.663198,2.536748,18.013744,0.000000,0.0,0.0
2,Eddington Ratios of Dust-obscured Quasars at $z \lesssim 1$: Evidence Supporting Dust-obscured Q...,https://arxiv.org/abs/2408.03324,2024-08-06 17:58:38+00:00,Dust-obscured quasars have been suspected as the intermediate stage galaxies\nbetween merger-dri...,2024-08-07 08:21:14.278502,e9023128-827c-4d9c-ba76-da9feb21dff1,,,0.0,3.137286,...,21.393402,0.000000,0.000000,9.532884,8.449990,0.000000,20.327536,0.000000,0.0,0.0
3,ClassiFIM: An Unsupervised Method To Detect Phase Transitions,https://arxiv.org/abs/2408.03323,2024-08-06 17:58:29+00:00,Estimation of the Fisher Information Metric (FIM-estimation) is an important\ntask that arises i...,2024-08-07 08:20:28.776427,9be96790-5807-43e4-b73a-e1df0658c96b,,,0.0,5.328259,...,34.898243,1.679940,3.636404,10.126708,14.376640,2.028563,18.838381,0.000000,0.0,0.0
4,Segment Anything in Medical Images and Videos: Benchmark and Deployment,https://arxiv.org/abs/2408.03322,2024-08-06 17:58:18+00:00,Recent advances in segmentation foundation models have enabled accurate and\nefficient segmentat...,2024-08-07 08:20:59.174877,b5ec1289-6e86-4eb8-9c70-2dc71a44dc91,,,0.0,5.189661,...,21.831580,1.311654,3.252804,11.012419,17.735854,2.848513,20.927818,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59310,EllieSQL: Cost-Efficient Text-to-SQL with Complexity-Aware Routing,http://arxiv.org/abs/2503.22402,2025-03-28 13:11:27+00:00,"Text-to-SQL automatically translates natural language queries to SQL,\nallowing non-technical us...",2025-04-04 07:26:40.807029,8c12c9fe-b7c0-4567-b7b1-7cd39112beda,https://arxiv.org/pdf/2503.22402.pdf,,0.0,4.737670,...,28.373850,0.000000,0.000000,4.513622,11.381848,0.000000,12.188824,0.000000,0.0,0.0
59311,Efficient low-carbon development in green hydrogen and ammonia economy: a case of Ukraine,http://arxiv.org/abs/2503.22326,2025-03-28 11:00:23+00:00,This paper focuses on assessing the potentials for the efficient low carbon\ndevelopment in gree...,2025-04-04 07:26:41.140017,9a5248f1-9c3f-47e8-a0f3-b6ed39cec880,https://arxiv.org/pdf/2503.22326.pdf,,0.0,4.796934,...,23.090824,0.000000,0.000000,10.414376,14.383021,0.000000,18.228047,0.000000,0.0,0.0
59312,CFiCS: Graph-Based Classification of Common Factors and Microcounseling Skills,http://arxiv.org/abs/2503.22277,2025-03-28 09:46:08+00:00,Common factors and microcounseling skills are critical to the effectiveness\nof psychotherapy. U...,2025-04-04 07:26:41.472498,8bd58f73-fd3e-4e9a-8c54-65634a61c5ab,https://arxiv.org/pdf/2503.22277.pdf,,0.0,4.641409,...,26.156941,1.134970,3.512175,10.594755,15.978533,2.464810,20.589683,0.000000,0.0,0.0
59313,Correlation-Attention Masked Temporal Transformer for User Identity Linkage Using Heterogeneous ...,http://arxiv.org/abs/2504.01979,2025-03-28 02:18:16+00:00,"With the rise of social media and Location-Based Social Networks (LBSN),\ncheck-in data across p...",2025-04-04 07:26:41.805947,e56cb8c7-6ad5-437e-8740-bbaea72b7795,https://arxiv.org/pdf/2504.01979.pdf,,0.0,5.698167,...,22.411117,7.041608,1.083748,16.860489,18.046048,4.353071,27.022623,3.483702,0.0,0.0


In [None]:
# sum all columns that start with bm25_
final_result['bm25_sum_final'] = final_result[[col for col in final_result.columns if col.startswith('bm25_')]].sum(axis=1)
final_result

# sort by bm25_score
pd.set_option('display.max_colwidth', 100)

final_result = final_result.sort_values(by='bm25_sum_final', ascending=False)
final_result

Unnamed: 0,title,abstract_url,published,abstract,tdw_timestamp,tdw_uuid,pdf_url,institution,bm25_sustaineable_tourism,bm25_building_objective_functions_in_deep_learning_efficiently_best_practices,...,bm25_mlops,bm25_ml_lifecycle,bm25_ethics_y,bm25_ml_code_quality_presentation_with_meghana,bm25_attribution,bm25_how_to_measure_clv_longer_term,bm25_media_mix_modelling_marketing,bm25_brazil_x,bm25_brazil_y,bm25_sum_final
4727,Multiple Approaches for Teaching Responsible Computing,http://arxiv.org/abs/2502.10856,2025-02-15 16:58:54+00:00,Teaching applied ethics in computer science has shifted from a perspective of\nteaching about pr...,2025-02-19 12:06:26.666363,ca73c006-f1fa-4328-b76c-cf621bc66fb5,https://arxiv.org/pdf/2502.10856.pdf,,0.0,12.021832,...,8.177457,2.374408,27.336064,22.122833,0.000000,37.921882,0.000000,0.0,0.0,688.475167
35,Marketing Mix Modeling in Lemonade,http://arxiv.org/abs/2501.01276,2025-01-02 14:17:31+00:00,Marketing mix modeling (MMM) is a widely used method to assess the\neffectiveness of marketing c...,2025-01-06 02:51:04.737379,ab5ddf31-7492-4d60-915e-ab04f0c207a3,https://arxiv.org/pdf/2501.01276.pdf,,0.0,6.441627,...,0.000000,1.515282,15.083942,11.452144,30.072213,22.974142,22.289754,0.0,0.0,686.702674
4270,Educating a Responsible AI Workforce: Piloting a Curricular Module on AI Policy in a Graduate Ma...,http://arxiv.org/abs/2502.07931,2025-02-11 20:16:56+00:00,As artificial intelligence (AI) technologies begin to permeate diverse\nfields-from healthcare t...,2025-02-17 06:43:03.617368,8ac6d3c4-d9ed-4f87-b971-1d2f33a2491c,https://arxiv.org/pdf/2502.07931.pdf,,0.0,5.268876,...,1.435283,9.582965,29.452450,17.770675,0.000000,37.154792,0.000000,0.0,0.0,656.712763
1535,Perceived Fairness of the Machine Learning Development Process: Concept Scale Development,http://arxiv.org/abs/2501.13421,2025-01-23 06:51:31+00:00,"In machine learning (ML) applications, unfairness is triggered due to bias in\nthe data, the dat...",2025-01-24 13:47:16.430483,f0250799-0800-47da-8d6d-2d2f3d934ac5,https://arxiv.org/pdf/2501.13421.pdf,,0.0,5.317497,...,1.401936,6.005793,24.564357,20.173553,0.000000,34.259103,0.000000,0.0,0.0,648.466977
1631,"Harnessing the Potential of Large Language Models in Modern Marketing Management: Applications, ...",http://arxiv.org/abs/2501.10685,2025-01-18 07:47:25+00:00,"Large Language Models (LLMs) have revolutionized the process of customer\nengagement, campaign o...",2025-01-24 13:47:42.969958,09389730-1c23-492b-9c5d-e3041c3ea258,https://arxiv.org/pdf/2501.10685.pdf,,0.0,12.237150,...,8.919750,0.000000,12.650253,11.867327,9.700678,26.180444,9.700678,0.0,0.0,644.681976
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1037,Synthetic Data and Health Privacy,http://arxiv.org/abs/2501.09031,2025-01-13 10:23:14+00:00,This Viewpoint discusses generative artificial intelligence and safeguarding\nprivacy by using s...,2025-01-19 15:50:16.263467,1231005b-04b0-4a5e-9165-5f44c5753b04,https://arxiv.org/pdf/2501.09031.pdf,,0.0,0.000000,...,0.000000,0.000000,7.749257,5.707270,0.000000,11.623885,0.000000,0.0,0.0,161.153628
4898,Narrow Bracketing and Risk in Games,http://arxiv.org/abs/2502.11243,2025-02-16 19:40:38+00:00,We study finite normal-form games under a narrow bracketing assumption: when\nplayers play sever...,2025-02-19 12:07:15.314285,9a0dc2cb-919a-40a3-b802-5d8af5e9b144,https://arxiv.org/pdf/2502.11243.pdf,,0.0,2.883898,...,0.000000,0.000000,5.767795,8.651693,0.000000,8.651693,0.000000,0.0,0.0,134.869869
3552,Cosmic Polarisation Rotation from CMB Data: a Review for GR110,http://arxiv.org/abs/2502.07743,2025-02-11 18:08:30+00:00,"We provide an update on the work of di Serego Alighieri (2015), focusing on\nrecent developments...",2025-02-12 22:27:37.734503,2c5c7a76-eac6-499c-a2e8-cdd6915f3775,https://arxiv.org/pdf/2502.07743.pdf,,0.0,0.000000,...,0.000000,0.000000,0.000000,1.402290,0.000000,0.000000,0.000000,0.0,0.0,128.797506
6007,Non-Bayesian Learning in Misspecified Models,http://arxiv.org/abs/2503.18024,2025-03-23 10:45:24+00:00,"Deviations from Bayesian updating are traditionally categorized as biases,\nerrors, or fallacies...",2025-03-29 18:07:41.252840,b5408bcd-2e44-45d1-96cc-0530ad213b66,https://arxiv.org/pdf/2503.18024.pdf,,0.0,5.288630,...,1.591814,1.591814,0.000000,8.263824,2.975194,0.000000,0.000000,0.0,0.0,120.655588


In [6]:
pd.set_option('display.max_colwidth', None)
final_result[["title", 'abstract_url', "bm25_sum_final"]].sort_values(by='bm25_sum_final', ascending=False).head(n=15)

Unnamed: 0,title,abstract_url,bm25_sum_final
4727,Multiple Approaches for Teaching Responsible Computing,http://arxiv.org/abs/2502.10856,688.475167
35,Marketing Mix Modeling in Lemonade,http://arxiv.org/abs/2501.01276,686.702674
4270,Educating a Responsible AI Workforce: Piloting a Curricular Module on AI Policy in a Graduate Machine Learning Course,http://arxiv.org/abs/2502.07931,656.712763
1535,Perceived Fairness of the Machine Learning Development Process: Concept Scale Development,http://arxiv.org/abs/2501.13421,648.466977
1631,"Harnessing the Potential of Large Language Models in Modern Marketing Management: Applications, Future Directions, and Strategic Recommendations",http://arxiv.org/abs/2501.10685,644.681976
1260,AI Driven Water Segmentation with deep learning models for Enhanced Flood Monitoring,http://arxiv.org/abs/2501.08266,632.258403
5018,Sheaf theory: from deep geometry to deep learning,http://arxiv.org/abs/2502.15476,626.792217
1586,Identifying and Mitigating Machine Learning Biases for the Gravitational-wave Detection Problem,http://arxiv.org/abs/2501.13846,623.39296
1415,Sample complexity of data-driven tuning of model hyperparameters in neural networks with structured parameter-dependent dual function,http://arxiv.org/abs/2501.13734,621.112808
6721,Statistically Testing Training Data for Unwanted Error Patterns using Rule-Oriented Regression,http://arxiv.org/abs/2503.18497,618.788783


In [7]:
# bottom
pd.set_option('display.max_colwidth', None)

final_result[["title", 'abstract_url', "bm25_sum_final"]].sort_values(by='bm25_sum_final', ascending=True).head(n=15)

Unnamed: 0,title,abstract_url,bm25_sum_final
2627,Graphs of unbounded linear cliquewidth must transduce all trees,http://arxiv.org/abs/2501.17556,98.562086
6007,Non-Bayesian Learning in Misspecified Models,http://arxiv.org/abs/2503.18024,120.655588
3552,Cosmic Polarisation Rotation from CMB Data: a Review for GR110,http://arxiv.org/abs/2502.07743,128.797506
4898,Narrow Bracketing and Risk in Games,http://arxiv.org/abs/2502.11243,134.869869
1037,Synthetic Data and Health Privacy,http://arxiv.org/abs/2501.09031,161.153628
554,ChatGPT's advice drives moral judgments with or without justification,http://arxiv.org/abs/2501.01897,202.305446
8655,Are Neutron Stars Rich in H-dibaryons?,http://arxiv.org/abs/2503.21171,204.403892
3005,Hydrodynamic attractor in periodically driven ultracold quantum gases,http://arxiv.org/abs/2501.19240,214.576822
384,Implementation of phase gates using single photons,http://arxiv.org/abs/2501.05230,221.412181
8555,Efficiency in the Roommates Problem,http://arxiv.org/abs/2502.16960,225.76394


In [13]:
from notebooks.deep_recommender.scores_table import ScoresTable
import datetime
import numpy as np

table = ScoresTable()

current_data: pd.DataFrame = table.read()
current_data.describe()


Unnamed: 0,bm25_sum_score
count,8922.0
mean,438.726109
std,55.653976
min,98.562086
25%,405.458016
50%,439.894341
75%,473.320088
max,688.475167


In [9]:

# convert all strings in bm25_sum_score to float or NAN
current_data['bm25_sum_score'] = current_data['bm25_sum_score'].apply(lambda x: float(x) if isinstance(x, str) and x.isdigit() else np.nan)

In [10]:
generated_date = datetime.datetime.now().strftime("%Y-%m-%d")


to_append = final_result[["title", 'abstract_url', "bm25_sum_final"]]
# rename title to paper_title
to_append.rename(columns={'title': 'paper_title'}, inplace=True)
# rename abstract_url to paper_url  
to_append.rename(columns={'abstract_url': 'paper_url'}, inplace=True)

# rename bm25_sum_final to bm25_sum_score
to_append.rename(columns={'bm25_sum_final': 'bm25_sum_score'}, inplace=True)

to_append['generated_date'] = generated_date
current_data = to_append


table.replace(current_data, dry_run=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  to_append.rename(columns={'title': 'paper_title'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  to_append.rename(columns={'abstract_url': 'paper_url'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  to_append.rename(columns={'bm25_sum_final': 'bm25_sum_score'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: ht

In [11]:
output = table.read()
output.describe()

Unnamed: 0,bm25_sum_score
count,8922.0
mean,438.726109
std,55.653976
min,98.562086
25%,405.458016
50%,439.894341
75%,473.320088
max,688.475167
