In [61]:
import pandas as pd
import numpy as np

In [62]:
# Set used to keep track of any skipped decisions (placed here to not be reset without purpose)
skipped = set()

# 1. Import data

## 1.1 Scopus

In [63]:
scopus_df = pd.read_csv("Scopus.csv")
print(scopus_df.columns)

Index(['Authors', 'Author full names', 'Author(s) ID', 'Title', 'Year',
       'Source title', 'Volume', 'Issue', 'Art. No.', 'Page start', 'Page end',
       'Page count', 'Cited by', 'DOI', 'Link', 'Affiliations',
       'Authors with affiliations', 'Abstract', 'Author Keywords',
       'Index Keywords', 'Molecular Sequence Numbers', 'Chemicals/CAS',
       'Funding Texts', 'References', 'Editors', 'Publisher', 'Sponsors',
       'Conference name', 'Conference date', 'Conference location',
       'Conference code', 'ISSN', 'ISBN', 'CODEN', 'PubMed ID',
       'Language of Original Document', 'Document Type', 'Publication Stage',
       'Open Access', 'Source', 'EID'],
      dtype='object')


In [64]:
clean_scopus_df = scopus_df[["DOI", "Title", "Abstract", "Author Keywords"]].copy()
clean_scopus_df.columns = ["DOI", "Title", "Abstract", "Keywords"]
clean_scopus_df["ID"] = np.where(pd.isna(clean_scopus_df["DOI"]), scopus_df["EID"], clean_scopus_df["DOI"])
clean_scopus_df = clean_scopus_df.set_index("ID")
clean_scopus_df["Origin"] = "Scopus"
clean_scopus_df.index.name = "ID"
clean_scopus_df

Unnamed: 0_level_0,DOI,Title,Abstract,Keywords,Origin
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10.1007/978-3-031-99353-4_33,10.1007/978-3-031-99353-4_33,Predicting Startup IPO: A Data-Driven Approach...,"The startup ecosystem is constantly evolving, ...",Api; Cosine Similarity; Explainable Ai; Featur...,Scopus
10.1007/978-981-96-6291-3_3,10.1007/978-981-96-6291-3_3,Stock Price Prediction Using Univariate and Mu...,"In this study, we propose a hybrid approach th...",Large Language Models (llm); Multivariate Anal...,Scopus
10.1016/j.renene.2025.124004,10.1016/j.renene.2025.124004,Probabilistic prediction of photovoltaic power...,Accurate weekly probabilistic forecasting of p...,Deepseekr1; Fine-tuning Adapter; Large Languag...,Scopus
10.1016/j.eswa.2025.128676,10.1016/j.eswa.2025.128676,In the beginning was the Word: LLM-VaR and LLM-ES,"This study introduces LLM-VaR and LLM-ES, nove...",Expected Shortfall; Gpt; Large Language Models...,Scopus
10.1038/s42003-025-07694-9,10.1038/s42003-025-07694-9,RNA-protein interaction prediction using netwo...,Accurate computational determination of RNA-pr...,"Rna; Rna; Rna, Viral; Rna-binding Proteins; Pr...",Scopus
...,...,...,...,...,...
10.32473/flairs.v35i.130668,10.32473/flairs.v35i.130668,A Comparison of House Price Classification wit...,Purchasing a home is one of the largest invest...,Economics; Houses; Text Processing; House's Pr...,Scopus
10.1007/s40888-021-00223-x,10.1007/s40888-021-00223-x,Robots and employment: evidence from Italy,Increased robot diffusion has raised concerns ...,Automation; Employment; Local Labour Markets; ...,Scopus
10.18653/v1/2021.semeval-1.87,10.18653/v1/2021.semeval-1.87,cs60075 team2 at SemEval-2021 Task 1: Lexical ...,This paper describes the performance of the te...,Correlation Methods; Semantics; Complex Datase...,Scopus
10.1016/j.chphi.2020.100005,10.1016/j.chphi.2020.100005,Theoretical prediction of decomposition temper...,With the approaching exhaustion of shallow-gro...,Conversion Temperature; Decomposition Temperat...,Scopus


## 1.2 Web of Science

In [65]:
WoS_df = pd.read_csv("WoS.txt", sep="\t", dtype=str, engine="python", encoding="utf-8")

WoS_df

Unnamed: 0,PT,AU,BA,BE,GP,AF,BF,CA,TI,SO,...,WC,WE,SC,GA,PM,OA,HC,HP,DA,UT
0,J,"Chen, R; Jiang, HQ; Guo, TY; Fan, CY",,,,"Chen, Rui; Jiang, Haiqi; Guo, Tingyu; Fan, Che...",,,Can Large Language Models forecast carbon pric...,RESEARCH IN INTERNATIONAL BUSINESS AND FINANCE,...,"Business, Finance",Social Science Citation Index (SSCI),Business & Economics,2TX3E,,,,,2025-10-02,WOS:001491215500001
1,J,"Zhou, Y",,,,"Zhou, Yi",,,Using Generative AI to predict the weather imp...,JOURNAL OF ALGEBRAIC COMBINATORICS,...,Mathematics,Science Citation Index Expanded (SCI-EXPANDED),Mathematics,5SR6T,,,,,2025-10-02,WOS:001528297000001
2,J,"Ma, F; Lyu, Z; Li, HB",,,,"Ma, Feng; Lyu, Zhichong; Li, Haibo",,,Can ChatGPT predict Chinese equity premiums?,FINANCE RESEARCH LETTERS,...,"Business, Finance",Social Science Citation Index (SSCI),Business & Economics,D2O6R,,,,,2025-10-02,WOS:001294636400001
3,J,"Xiao, F; Wang, XTXT",,,,"Xiao, Feng; Wang, X. T. XiaoTian",,,Evaluating the ability of large Language model...,SCIENTIFIC REPORTS,...,Multidisciplinary Sciences,Science Citation Index Expanded (SCI-EXPANDED),Science & Technology - Other Topics,7DY7U,40897780,gold,,,2025-10-02,WOS:001568749500021
4,C,"Lee, G; Yu, WC; Shin, KJ; Cheng, W; Chen, HF",,"Walsh, T; Shah, J; Kolter, Z",,"Lee, Geon; Yu, Wenchao; Shin, Kijung; Cheng, W...",,,"TimeCAP: Learning to Contextualize, Augment, a...",THIRTY-NINTH AAAI CONFERENCE ON ARTIFICIAL INT...,...,"Computer Science, Artificial Intelligence; Com...",Conference Proceedings Citation Index - Scienc...,Computer Science,BY6EV,,gold,,,2025-10-02,WOS:001477527200064
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
151,J,"Kocon, J; Cichecki, I; Kaszyca, O; Kochanek, M...",,,,"Kocon, Jan; Cichecki, Igor; Kaszyca, Oliwier; ...",,,"ChatGPT: Jack of all trades, master of none",INFORMATION FUSION,...,"Computer Science, Artificial Intelligence; Com...",Science Citation Index Expanded (SCI-EXPANDED),Computer Science,L7RA5,,hybrid,,,2025-10-02,WOS:001025183100001
152,J,"Shi, J; Lee, M; Girish, VG; Xiao, GY; Lee, CK",,,,"Shi, Ji; Lee, Minwoo; Girish, V. G.; Xiao, Gua...",,,Embracing the ChatGPT revolution: unlocking ne...,JOURNAL OF HOSPITALITY AND TOURISM TECHNOLOGY,...,"Hospitality, Leisure, Sport & Tourism",Social Science Citation Index (SSCI),Social Sciences - Other Topics,QD9C8,,,,,2025-10-02,WOS:001198820600001
153,J,"Odabashian, R; Bastin, D; Jones, G; Manzoor, M...",,,,"Odabashian, Roupen; Bastin, Donald; Jones, Geo...",,,Assessment of ChatGPT-3.5's Knowledge in Oncol...,JMIR AI,...,Health Care Sciences & Services; Medical Infor...,Emerging Sources Citation Index (ESCI),Health Care Sciences & Services; Medical Infor...,P0H0I,38875575,"Green Submitted, gold",,,2025-10-02,WOS:001374817300001
154,J,"Guariso, D; Adewoyin, R; Aguilar, GR; Guerrero...",,,,"Guariso, Daniele; Adewoyin, Rilwan; Aguilar, G...",,,A generalized LLMs framework to support public...,ARTIFICIAL INTELLIGENCE IN MEDICINE,...,"Computer Science, Artificial Intelligence; Eng...",Science Citation Index Expanded (SCI-EXPANDED),Computer Science; Engineering; Medical Informa...,5GI1Z,40690805,,,,2025-10-02,WOS:001535014800001


In [66]:
clean_WoS_df = WoS_df[["UT", "TI", "AB", "DI", "DE"]].set_index("UT")
clean_WoS_df.columns = ["Title", "Abstract", "DOI", "Keywords"]
clean_scopus_df["ID"] = np.where(pd.isna(clean_scopus_df["DOI"]), scopus_df["EID"], clean_scopus_df["DOI"])
clean_scopus_df = clean_scopus_df.set_index("ID")
clean_WoS_df["Origin"] = "WoS"
clean_WoS_df.index.name = "ID"

clean_WoS_df

Unnamed: 0_level_0,Title,Abstract,DOI,Keywords,Origin
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
WOS:001491215500001,Can Large Language Models forecast carbon pric...,This paper investigates the impact of Large La...,10.1016/j.ribaf.2025.102951,Carbon Price Forecasting; Large Language Model...,WoS
WOS:001528297000001,Using Generative AI to predict the weather imp...,"This study explores the use of Generative AI, ...",10.1007/s11156-025-01437-x,Weather risk; Stock returns; Generative AI; Ch...,WoS
WOS:001294636400001,Can ChatGPT predict Chinese equity premiums?,"Leveraging over 1.86 million news headlines, w...",10.1016/j.frl.2024.105631,Large language model; ChatGPT; Chinese equity ...,WoS
WOS:001568749500021,Evaluating the ability of large Language model...,Recent advances in large language models (LLMs...,10.1038/s41598-025-17188-7,Generative AI; Social decision-making; Framing...,WoS
WOS:001477527200064,"TimeCAP: Learning to Contextualize, Augment, a...",Time series data is essential in various appli...,,,WoS
...,...,...,...,...,...
WOS:001025183100001,"ChatGPT: Jack of all trades, master of none",OpenAI has released the Chat Generative Pre-tr...,10.1016/j.inffus.2023.101861,ChatGPT; GPT-4; Natural language processing (N...,WoS
WOS:001198820600001,Embracing the ChatGPT revolution: unlocking ne...,Purpose - This study aims to investigate touri...,10.1108/JHTT-07-2023-0203,ChatGPT; Generative AI; Information; Perceived...,WoS
WOS:001374817300001,Assessment of ChatGPT-3.5's Knowledge in Oncol...,Background: ChatGPT (Open AI) is a state-of-th...,10.2196/50442,artificial intelligence; ChatGPT-3.5; language...,WoS
WOS:001535014800001,A generalized LLMs framework to support public...,"As a systemic problem, public health cannot be...",10.1016/j.artmed.2025.103203,Large Language Models; Public health financing...,WoS


## 1.3 IEEE

## 1.4 ProQuest

# 2. Merge and Clean data

In [67]:
clean_df = pd.concat([clean_scopus_df, clean_WoS_df])
clean_df

Unnamed: 0_level_0,DOI,Title,Abstract,Keywords,Origin
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10.1007/978-3-031-99353-4_33,10.1007/978-3-031-99353-4_33,Predicting Startup IPO: A Data-Driven Approach...,"The startup ecosystem is constantly evolving, ...",Api; Cosine Similarity; Explainable Ai; Featur...,Scopus
10.1007/978-981-96-6291-3_3,10.1007/978-981-96-6291-3_3,Stock Price Prediction Using Univariate and Mu...,"In this study, we propose a hybrid approach th...",Large Language Models (llm); Multivariate Anal...,Scopus
10.1016/j.renene.2025.124004,10.1016/j.renene.2025.124004,Probabilistic prediction of photovoltaic power...,Accurate weekly probabilistic forecasting of p...,Deepseekr1; Fine-tuning Adapter; Large Languag...,Scopus
10.1016/j.eswa.2025.128676,10.1016/j.eswa.2025.128676,In the beginning was the Word: LLM-VaR and LLM-ES,"This study introduces LLM-VaR and LLM-ES, nove...",Expected Shortfall; Gpt; Large Language Models...,Scopus
10.1038/s42003-025-07694-9,10.1038/s42003-025-07694-9,RNA-protein interaction prediction using netwo...,Accurate computational determination of RNA-pr...,"Rna; Rna; Rna, Viral; Rna-binding Proteins; Pr...",Scopus
...,...,...,...,...,...
WOS:001025183100001,10.1016/j.inffus.2023.101861,"ChatGPT: Jack of all trades, master of none",OpenAI has released the Chat Generative Pre-tr...,ChatGPT; GPT-4; Natural language processing (N...,WoS
WOS:001198820600001,10.1108/JHTT-07-2023-0203,Embracing the ChatGPT revolution: unlocking ne...,Purpose - This study aims to investigate touri...,ChatGPT; Generative AI; Information; Perceived...,WoS
WOS:001374817300001,10.2196/50442,Assessment of ChatGPT-3.5's Knowledge in Oncol...,Background: ChatGPT (Open AI) is a state-of-th...,artificial intelligence; ChatGPT-3.5; language...,WoS
WOS:001535014800001,10.1016/j.artmed.2025.103203,A generalized LLMs framework to support public...,"As a systemic problem, public health cannot be...",Large Language Models; Public health financing...,WoS


In [68]:
# Temporarily reset index
clean_df = clean_df.reset_index()

# Separate rows with NaN in 'DOI'
nan_rows = clean_df[clean_df['DOI'].isna()]

# Process rows where 'DOI' is not NaN
non_nan_rows = clean_df[clean_df['DOI'].notna()].copy()

# Normalize DOI
non_nan_rows["DOI"] = non_nan_rows["DOI"].str.lower()
non_nan_rows["DOI"] = non_nan_rows["DOI"].str.replace("https://doi.org/", "", regex=False)

# Drop duplicates based on DOI
non_nan_rows = non_nan_rows.drop_duplicates(subset=["DOI"], keep='first')

# Concatenate the untouched NaN rows back into the DataFrame
clean_df = pd.concat([non_nan_rows, nan_rows], ignore_index=True)

# Display the result
clean_df

Unnamed: 0,ID,DOI,Title,Abstract,Keywords,Origin
0,10.1007/978-3-031-99353-4_33,10.1007/978-3-031-99353-4_33,Predicting Startup IPO: A Data-Driven Approach...,"The startup ecosystem is constantly evolving, ...",Api; Cosine Similarity; Explainable Ai; Featur...,Scopus
1,10.1007/978-981-96-6291-3_3,10.1007/978-981-96-6291-3_3,Stock Price Prediction Using Univariate and Mu...,"In this study, we propose a hybrid approach th...",Large Language Models (llm); Multivariate Anal...,Scopus
2,10.1016/j.renene.2025.124004,10.1016/j.renene.2025.124004,Probabilistic prediction of photovoltaic power...,Accurate weekly probabilistic forecasting of p...,Deepseekr1; Fine-tuning Adapter; Large Languag...,Scopus
3,10.1016/j.eswa.2025.128676,10.1016/j.eswa.2025.128676,In the beginning was the Word: LLM-VaR and LLM-ES,"This study introduces LLM-VaR and LLM-ES, nove...",Expected Shortfall; Gpt; Large Language Models...,Scopus
4,10.1038/s42003-025-07694-9,10.1038/s42003-025-07694-9,RNA-protein interaction prediction using netwo...,Accurate computational determination of RNA-pr...,"Rna; Rna; Rna, Viral; Rna-binding Proteins; Pr...",Scopus
...,...,...,...,...,...,...
265,WOS:000889371704042,,CEPOC: The Cambridge Exams Publishing Open Clo...,Open cloze tests are a standard type of exerci...,open cloze; blank-filling; language learning; ...,WoS
266,WOS:001181085100165,,Performance and Risk Trade-offs for Multi-word...,Large Language Models such as GPT-3 are well-s...,,WoS
267,WOS:001347142807059,,RisQNet: Rescuing SMEs from Financial Shocks w...,"In the face of economic downturns, Small and M...",,WoS
268,WOS:001371932507019,,R-U-SURE? Uncertainty-Aware Code Suggestions B...,Large language models show impressive results ...,,WoS


In [69]:
# Show title dupes
clean_df[clean_df["Title"].duplicated(keep=False)].sort_values("Title")

Unnamed: 0,ID,DOI,Title,Abstract,Keywords,Origin
225,2-s2.0-105007166117,,12th International Conference on HCI in Busine...,The proceedings contain 41 papers. The special...,,Scopus
226,2-s2.0-105007143427,,12th International Conference on HCI in Busine...,The proceedings contain 41 papers. The special...,,Scopus
216,2-s2.0-85212494941,,26th International Conference on Information I...,The proceedings contain 52 papers. The special...,,Scopus
217,2-s2.0-85212253172,,26th International Conference on Information I...,The proceedings contain 52 papers. The special...,,Scopus
224,2-s2.0-105007761709,,7th International Conference on Adaptive Instr...,The proceedings contain 40 papers. The special...,,Scopus
223,2-s2.0-105007848472,,7th International Conference on Adaptive Instr...,The proceedings contain 40 papers. The special...,,Scopus
169,10.18653/v1/2023.findings-emnlp.490,10.18653/v1/2023.findings-emnlp.490,A Comprehensive Evaluation of Large Language M...,Large language models (LLMs) have demonstrated...,Computational Linguistics; Domain Knowledge; C...,Scopus
257,WOS:001279591707033,,A Comprehensive Evaluation of Large Language M...,Large language models (LLMs) have demonstrated...,,WoS
265,WOS:000889371704042,,CEPOC: The Cambridge Exams Publishing Open Clo...,Open cloze tests are a standard type of exerci...,open cloze; blank-filling; language learning; ...,WoS
249,2-s2.0-85144347623,,CEPOC: The Cambridge Exams Publishing Open Clo...,Open cloze tests are a standard type of exerci...,Blank-filling; Cambridge Examinations; Languag...,Scopus


In [70]:
# Drop duplicates based on title
clean_df = clean_df.groupby(['Title'], as_index=False).first()
clean_df = clean_df.copy()
clean_df

Unnamed: 0,Title,ID,DOI,Abstract,Keywords,Origin
0,12th International Conference on HCI in Busine...,2-s2.0-105007166117,,The proceedings contain 41 papers. The special...,,Scopus
1,"14th Symposium on Languages, Applications and ...",2-s2.0-105013137151,,The proceedings contain 13 papers. The topics ...,,Scopus
2,18th International Conference on Information T...,2-s2.0-105009402610,,The proceedings contain 59 papers. The special...,,Scopus
3,2023 IEEE International Conference on Enabling...,2-s2.0-85190369483,,The proceedings contain 36 papers. The topics ...,,Scopus
4,2024 6th International Conference on Machine L...,2-s2.0-85216976361,,The proceedings contain 57 papers. The topics ...,,Scopus
...,...,...,...,...,...,...
246,Using Large Language Models to Estimate Novel ...,10.3905/jpm.2025.1.710,10.3905/jpm.2025.1.710,This article presents an integrated framework ...,,Scopus
247,Utilizing Text-Generative AI for Creating Oral...,10.1177/10534512241235896,10.1177/10534512241235896,Oral reading fluency probes are essential for ...,Academic Intervention; Artificial Intelligence...,Scopus
248,Zero-Shot Classification of Art with Large Lan...,10.1109/ACCESS.2025.3532995,10.1109/access.2025.3532995,Art has become an important new investment veh...,Art; Auction Price; Chatgpt; Classification; D...,Scopus
249,Zero-Shot Time Series Forecasting of the Onlin...,10.1109/ITC-CSCC66376.2025.11137629,10.1109/itc-cscc66376.2025.11137629,This study introduces a zero-shot forecasting ...,Online Gig Economy; Online Labor Index; Pre-tr...,Scopus


In [71]:
# Set ID as index again
clean_df = clean_df.set_index("ID")
clean_df

Unnamed: 0_level_0,Title,DOI,Abstract,Keywords,Origin
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2-s2.0-105007166117,12th International Conference on HCI in Busine...,,The proceedings contain 41 papers. The special...,,Scopus
2-s2.0-105013137151,"14th Symposium on Languages, Applications and ...",,The proceedings contain 13 papers. The topics ...,,Scopus
2-s2.0-105009402610,18th International Conference on Information T...,,The proceedings contain 59 papers. The special...,,Scopus
2-s2.0-85190369483,2023 IEEE International Conference on Enabling...,,The proceedings contain 36 papers. The topics ...,,Scopus
2-s2.0-85216976361,2024 6th International Conference on Machine L...,,The proceedings contain 57 papers. The topics ...,,Scopus
...,...,...,...,...,...
10.3905/jpm.2025.1.710,Using Large Language Models to Estimate Novel ...,10.3905/jpm.2025.1.710,This article presents an integrated framework ...,,Scopus
10.1177/10534512241235896,Utilizing Text-Generative AI for Creating Oral...,10.1177/10534512241235896,Oral reading fluency probes are essential for ...,Academic Intervention; Artificial Intelligence...,Scopus
10.1109/ACCESS.2025.3532995,Zero-Shot Classification of Art with Large Lan...,10.1109/access.2025.3532995,Art has become an important new investment veh...,Art; Auction Price; Chatgpt; Classification; D...,Scopus
10.1109/ITC-CSCC66376.2025.11137629,Zero-Shot Time Series Forecasting of the Onlin...,10.1109/itc-cscc66376.2025.11137629,This study introduces a zero-shot forecasting ...,Online Gig Economy; Online Labor Index; Pre-tr...,Scopus


In [72]:
# Save initial screening as csv file
clean_df.to_csv("initial_screening.csv")

# 3. GPT Assessment

In [73]:
# Load previous assessments
gpt_response_df = pd.read_csv("gpt.csv", index_col=0)
gpt_response_df

Unnamed: 0_level_0,Summary,Financial instrument?,Instrument,AI?,LLM?
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2-s2.0-105007166117,The proceedings contain 41 papers. The special...,✔️,Stock/Equity,✔️,✔️
2-s2.0-105013137151,The proceedings contain 13 papers. The topics ...,❌,?,❌,✔️
2-s2.0-105009402610,The proceedings contain 59 papers. The special...,✔️,Stock/Equity,✔️,✔️
2-s2.0-85190369483,The proceedings contain 36 papers. The topics ...,✔️,Cryptocurrency,✔️,✔️
2-s2.0-85216976361,The proceedings contain 57 papers. The topics ...,❌,?,✔️,✔️
2-s2.0-85201236297,The proceedings contain 98 papers. The topics ...,✔️,Stock/Equity,✔️,✔️
2-s2.0-105004809261,The proceedings contain 11 papers. The topics ...,✔️,Stock/Equity,✔️,✔️
2-s2.0-85212494941,The proceedings contain 52 papers. The special...,✔️,Stock/Equity,✔️,✔️
2-s2.0-105007848472,The proceedings contain 40 papers. The special...,❌,?,❌,✔️
2-s2.0-85176944442,The proceedings contain 28 papers. The special...,✔️,Cryptocurrency,✔️,✔️


In [74]:
# Generate prompts to Chat GPT
import os
from IPython.display import clear_output

batch_size = 50
i = 0

# Filter clean df on whether a gpt assessment has already been done
input_df = clean_df.drop(gpt_response_df.index, errors='ignore')[["Title", "Abstract"]]

for batch in range(0, len(input_df), batch_size):
    batch_df = input_df.iloc[batch:batch+batch_size]
    print("""
For the following CSV list of articles, please return a CSV with the following columns:
- `ID`: The ID of the article, corresponding to the ID in the provided CSV
- `Summary`: A one-to-two sentence summary of the article
- `Financial instrument?`: A column with the value ✔️ if the article contains a model that predicts the price of a financial instrument and ❌ otherwise
- `Instrument`: A column with the type of the financial instrument that is predicted
- `AI?`: A column with the value ✔️ if the model in the article is an AI/ML model (i.e. more advanced than traditional econometric models) and ❌ otherwise
- `Probabilistic?`: A column with the value ✔️ if the model is a **probablistic** AI model (i.e. the prediction includes either variance or a distribution or some other financial risk measure such as VaR) and ❌ otherwise
You can also answer "?" to any question if it cannot be answered based on the title or abstract.
Please ensure that the CSV is valid by applying quotation marks wherever necessary.
""")
    print(batch_df.to_csv(index=True))
    input("Press enter to get next prompt")
    clear_output(wait=True)


For the following CSV list of articles, please return a CSV with the following columns:
- `ID`: The ID of the article, corresponding to the ID in the provided CSV
- `Summary`: A one-to-two sentence summary of the article
- `Financial instrument?`: A column with the value ✔️ if the article contains a model that predicts the price of a financial instrument and ❌ otherwise
- `Instrument`: A column with the type of the financial instrument that is predicted
- `AI?`: A column with the value ✔️ if the model in the article is an AI/ML model (i.e. more advanced than traditional econometric models) and ❌ otherwise
- `Probabilistic?`: A column with the value ✔️ if the model is a **probablistic** AI model (i.e. the prediction includes either variance or a distribution or some other financial risk measure such as VaR) and ❌ otherwise
You can also answer "?" to any question if it cannot be answered based on the title or abstract.
Please ensure that the CSV is valid by applying quotation marks wher

### Load Assessments

In [75]:
# Load all assessments
gpt_response_df = pd.read_csv("gpt.csv", index_col="ID")
gpt_response_df

Unnamed: 0_level_0,Summary,Financial instrument?,Instrument,AI?,LLM?
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2-s2.0-105007166117,The proceedings contain 41 papers. The special...,✔️,Stock/Equity,✔️,✔️
2-s2.0-105013137151,The proceedings contain 13 papers. The topics ...,❌,?,❌,✔️
2-s2.0-105009402610,The proceedings contain 59 papers. The special...,✔️,Stock/Equity,✔️,✔️
2-s2.0-85190369483,The proceedings contain 36 papers. The topics ...,✔️,Cryptocurrency,✔️,✔️
2-s2.0-85216976361,The proceedings contain 57 papers. The topics ...,❌,?,✔️,✔️
2-s2.0-85201236297,The proceedings contain 98 papers. The topics ...,✔️,Stock/Equity,✔️,✔️
2-s2.0-105004809261,The proceedings contain 11 papers. The topics ...,✔️,Stock/Equity,✔️,✔️
2-s2.0-85212494941,The proceedings contain 52 papers. The special...,✔️,Stock/Equity,✔️,✔️
2-s2.0-105007848472,The proceedings contain 40 papers. The special...,❌,?,❌,✔️
2-s2.0-85176944442,The proceedings contain 28 papers. The special...,✔️,Cryptocurrency,✔️,✔️


In [76]:
# Remove duplicate indices in gpt_response_df
gpt_response_df = gpt_response_df[~gpt_response_df.index.duplicated(keep='last')]
gpt_response_df

Unnamed: 0_level_0,Summary,Financial instrument?,Instrument,AI?,LLM?
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2-s2.0-105007166117,The proceedings contain 41 papers. The special...,✔️,Stock/Equity,✔️,✔️
2-s2.0-105013137151,The proceedings contain 13 papers. The topics ...,❌,?,❌,✔️
2-s2.0-105009402610,The proceedings contain 59 papers. The special...,✔️,Stock/Equity,✔️,✔️
2-s2.0-85190369483,The proceedings contain 36 papers. The topics ...,✔️,Cryptocurrency,✔️,✔️
2-s2.0-85216976361,The proceedings contain 57 papers. The topics ...,❌,?,✔️,✔️
2-s2.0-85201236297,The proceedings contain 98 papers. The topics ...,✔️,Stock/Equity,✔️,✔️
2-s2.0-105004809261,The proceedings contain 11 papers. The topics ...,✔️,Stock/Equity,✔️,✔️
2-s2.0-85212494941,The proceedings contain 52 papers. The special...,✔️,Stock/Equity,✔️,✔️
2-s2.0-105007848472,The proceedings contain 40 papers. The special...,❌,?,❌,✔️
2-s2.0-85176944442,The proceedings contain 28 papers. The special...,✔️,Cryptocurrency,✔️,✔️


In [77]:
processed_df = clean_df.join(gpt_response_df)
processed_df

Unnamed: 0_level_0,Title,DOI,Abstract,Keywords,Origin,Summary,Financial instrument?,Instrument,AI?,LLM?
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2-s2.0-105007166117,12th International Conference on HCI in Busine...,,The proceedings contain 41 papers. The special...,,Scopus,The proceedings contain 41 papers. The special...,✔️,Stock/Equity,✔️,✔️
2-s2.0-105013137151,"14th Symposium on Languages, Applications and ...",,The proceedings contain 13 papers. The topics ...,,Scopus,The proceedings contain 13 papers. The topics ...,❌,?,❌,✔️
2-s2.0-105009402610,18th International Conference on Information T...,,The proceedings contain 59 papers. The special...,,Scopus,The proceedings contain 59 papers. The special...,✔️,Stock/Equity,✔️,✔️
2-s2.0-85190369483,2023 IEEE International Conference on Enabling...,,The proceedings contain 36 papers. The topics ...,,Scopus,The proceedings contain 36 papers. The topics ...,✔️,Cryptocurrency,✔️,✔️
2-s2.0-85216976361,2024 6th International Conference on Machine L...,,The proceedings contain 57 papers. The topics ...,,Scopus,The proceedings contain 57 papers. The topics ...,❌,?,✔️,✔️
...,...,...,...,...,...,...,...,...,...,...
10.3905/jpm.2025.1.710,Using Large Language Models to Estimate Novel ...,10.3905/jpm.2025.1.710,This article presents an integrated framework ...,,Scopus,,,,,
10.1177/10534512241235896,Utilizing Text-Generative AI for Creating Oral...,10.1177/10534512241235896,Oral reading fluency probes are essential for ...,Academic Intervention; Artificial Intelligence...,Scopus,,,,,
10.1109/ACCESS.2025.3532995,Zero-Shot Classification of Art with Large Lan...,10.1109/access.2025.3532995,Art has become an important new investment veh...,Art; Auction Price; Chatgpt; Classification; D...,Scopus,,,,,
10.1109/ITC-CSCC66376.2025.11137629,Zero-Shot Time Series Forecasting of the Onlin...,10.1109/itc-cscc66376.2025.11137629,This study introduces a zero-shot forecasting ...,Online Gig Economy; Online Labor Index; Pre-tr...,Scopus,,,,,


# 4. Make decisions on whether to include articles

In [79]:
# Load previous decisions
decisions_file_path = "decisions.csv"
decisions_df = pd.read_csv(decisions_file_path, index_col=0)
decisions_df.sort_values("Date", ascending=True)

Unnamed: 0_level_0,Decision,Date,Reviewer
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


In [80]:
# Keep own version to mitigate overwriting each other
my_decisions_df = decisions_df.copy()

In [82]:
from datetime import datetime
from IPython.display import display, HTML, clear_output

reviewer = input("What is your name?")

# Shuffle the indices of the DataFrame to limit chance of two people reviewing the same article at the same time
np.random.seed(hash(reviewer) % 2**32)
shuffled_indices = np.random.permutation(processed_df.index)

def get_progress():
    decided = set(decisions_df.index).intersection(processed_df.index)
    return len(decided) / len(processed_df)

# Loop through the randomized indices
for index in shuffled_indices:
    row = processed_df.loc[index]
    if index in my_decisions_df.index:
        continue
    if index in skipped:
        continue
    if len(row.shape) > 1:
        row = row.iloc[0]
    if pd.isna(row["Summary"]):
        continue
    clear_output(wait=True)
    print(f"Screening progress: {get_progress() * 100:.2f}%")
    display(HTML(f'<h1>{row["Title"]}</h1>'))
    print("ID:", index)
    display(HTML(f'<a href="https://doi.org/{row["DOI"]}" target="_blank">Link</a>'))
    display(HTML(f'<h2>GPT assessment</h2>'))
    display(HTML(f'<p style="font-size:20px;">Financial instrument?: {row["Financial instrument?"]}</p>'))
    display(HTML(f'<p style="font-size:20px;">AI?: {row["AI?"]}</p>'))
    display(HTML(f'<p style="font-size:20px;">LLM?: {row["LLM?"]}</p>'))
    display(HTML(f'<p style="font-size:20px;">Instrument: {row["Instrument"]}</p>'))
    display(HTML(f'<h2>Summary</h2>'))
    display(HTML(f'<p style="font-size:18px;line-height:30px">{row["Summary"]}</p>'))
    display(HTML(f'<h2>Keywords</h2>'))
    display(HTML(f'<p style="font-size:18px;line-height:30px">{row["Keywords"]}</p>'))
    display(HTML(f'<h2>Abstract</h2>'))
    display(HTML(f'<p style="font-size:16px;line-height:24px">{row["Abstract"]}</p>'))
    decision = None
    while decision not in ["y", "n", "survey", "tja", "skip"]:
        decision = input("\nInclude this article? (y/n/survey/tja/skip):\n").lower()
    if decision == "skip":
        skipped.add(index)
        continue
    # Add decision
    my_decisions_df.loc[index] = [decision, datetime.now().isoformat(), reviewer]
    # Ensure latest version of file
    # mount()
    decisions_df = pd.read_csv(decisions_file_path, index_col=0)
    # Add any entries in my_decisions_df that are not in decisions_df
    decisions_df = pd.concat([decisions_df, my_decisions_df])
    # Remove rows that are strictly equal
    decisions_df = decisions_df.drop_duplicates(keep="last")
    # Save
    decisions_df.to_csv(decisions_file_path)

Screening progress: 0.40%


ID: 10.3390/jrfm18020099


KeyboardInterrupt: Interrupted by user