In [1]:
import pandas as pd
import numpy as np

In [2]:
# Set used to keep track of any skipped decisions (placed here to not be reset without purpose)
skipped = set()

# 1. Import data

## 1.1 Scopus

In [3]:
scopus_df = pd.read_csv("Scopus.csv")
print(scopus_df.columns)

Index(['Authors', 'Author full names', 'Author(s) ID', 'Title', 'Year',
       'Source title', 'Volume', 'Issue', 'Art. No.', 'Page start', 'Page end',
       'Page count', 'Cited by', 'DOI', 'Link', 'Affiliations',
       'Authors with affiliations', 'Abstract', 'Author Keywords',
       'Index Keywords', 'Molecular Sequence Numbers', 'Chemicals/CAS',
       'Funding Texts', 'References', 'Editors', 'Publisher', 'Sponsors',
       'Conference name', 'Conference date', 'Conference location',
       'Conference code', 'ISSN', 'ISBN', 'CODEN', 'PubMed ID',
       'Language of Original Document', 'Document Type', 'Publication Stage',
       'Open Access', 'Source', 'EID'],
      dtype='object')


In [4]:
clean_scopus_df = scopus_df[["DOI", "Title", "Abstract", "Author Keywords"]].copy()
clean_scopus_df.columns = ["DOI", "Title", "Abstract", "Keywords"]
clean_scopus_df["ID"] = np.where(pd.isna(clean_scopus_df["DOI"]), scopus_df["EID"], clean_scopus_df["DOI"])
clean_scopus_df = clean_scopus_df.set_index("ID")
clean_scopus_df["Origin"] = "Scopus"
clean_scopus_df.index.name = "ID"
clean_scopus_df

Unnamed: 0_level_0,DOI,Title,Abstract,Keywords,Origin
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10.1007/978-981-96-6291-3_3,10.1007/978-981-96-6291-3_3,Stock Price Prediction Using Univariate and Mu...,"In this study, we propose a hybrid approach th...",Large Language Models (LLM); Multivariate Anal...,Scopus
10.1016/j.eswa.2025.128676,10.1016/j.eswa.2025.128676,In the beginning was the Word: LLM-VaR and LLM-ES,"This study introduces LLM-VaR and LLM-ES, nove...",Expected shortfall; GPT; Large language models...,Scopus
10.1186/s40854-025-00789-6,10.1186/s40854-025-00789-6,The power of ChatGPT in processing text: Evide...,This study investigates the application of lar...,ChatGPT; Exchange rate; Interval; Sentiment an...,Scopus
10.1016/j.frl.2025.108489,10.1016/j.frl.2025.108489,Readability of financial reports and stock pri...,The rapid evolution of machine learning makes ...,BERT; Crash risk; Large language model; Readab...,Scopus
10.1007/s12525-025-00815-6,10.1007/s12525-025-00815-6,Wisdom of the crowd signals: Predictive power ...,The emergence of cryptocurrencies and decentra...,Collective intelligence; Cryptocurrencies; Pre...,Scopus
...,...,...,...,...,...
10.18653/v1/d15-1184,10.18653/v1/d15-1184,Reading documents for Bayesian Online Change P...,Modeling non-stationary time-series data for m...,,Scopus
10.1214/14-AOS1250,10.1214/14-AOS1250,Estimating time-changes in noisy Lévy models,"In quantitative finance, we often model asset ...",Itô semimartingale; Lévy process; Microstructu...,Scopus
10.1016/j.egypro.2014.02.176,10.1016/j.egypro.2014.02.176,Solar energy in urban environment: How urban d...,The paper is focused on a new solar urban plan...,Dynamic simulation; Solar access; Solar Potent...,Scopus
2-s2.0-84871531453,,A kernel-based technique for direction-of-chan...,This paper presents a generative approach to d...,Financial markets; Generative models; Model se...,Scopus


## 1.2 Web of Science

In [5]:
WoS_df = pd.read_csv("WoS.csv")
WoS_df

Unnamed: 0,PT,AU,BA,BE,GP,AF,BF,CA,TI,SO,...,WC,WE,SC,GA,PM,OA,HC,HP,DA,UT
0,C,"Gopal, A",,,ASSOC COMPUTING MACHINERY,"Gopal, Achintya",,,NeuralFactors: A Novel Factor Learning Approac...,5TH ACM INTERNATIONAL CONFERENCE ON AI IN FINA...,...,"Business, Finance; Computer Science, Interdisc...",Conference Proceedings Citation Index - Scienc...,Business & Economics; Computer Science,BY4IZ,,gold,,,2025-10-12,WOS:001443057200012
1,J,"Carvajal-Patiño, D; Ramos-Pollán, R",,,,"Carvajal-Patino, Daniel; Ramos-Pollan, Raul",,,Synthetic data generation with deep generative...,RESEARCH IN INTERNATIONAL BUSINESS AND FINANCE,...,"Business, Finance",Social Science Citation Index (SSCI),Business & Economics,5R8LB,,,,,2025-10-12,WOS:000874755600008
2,J,"Ul Haq, A; Zeb, A; Lei, ZF; Zhang, DF",,,,"Ul Haq, Anwar; Zeb, Adnan; Lei, Zhenfeng; Zhan...",,,Forecasting daily stock trend using multi-filt...,EXPERT SYSTEMS WITH APPLICATIONS,...,"Computer Science, Artificial Intelligence; Eng...",Science Citation Index Expanded (SCI-EXPANDED)...,Computer Science; Engineering; Operations Rese...,QB6ML,,,,,2025-10-12,WOS:000614253600001
3,C,"Wang, ZH",,,IEEE,Wang Zhuohan,,,TABNET WITH DATA AUGMENTATION APPORACH IN STOC...,2022 19TH INTERNATIONAL COMPUTER CONFERENCE ON...,...,"Computer Science, Information Systems; Compute...",Conference Proceedings Citation Index - Scienc...,Computer Science; Telecommunications,BU7AE,,,,,2025-10-12,WOS:000932922500101
4,J,"Tai, WX; Zhong, T; Mo, YH; Zhou, F",,,,"Tai, Wenxin; Zhong, Ting; Mo, Yuhua; Zhou, Fan",,,Learning Sentimental and Financial Signals Wit...,IEEE SIGNAL PROCESSING LETTERS,...,"Engineering, Electrical & Electronic",Science Citation Index Expanded (SCI-EXPANDED),Engineering,YP1EH,,,,,2025-10-12,WOS:000748371000034
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143,C,"Yuan, J; Zhang, Z",,,ACM,"Yuan, Jie; Zhang, Zhu",,,Connecting The Dots: Forecasting and Explainin...,FIRST ACM INTERNATIONAL CONFERENCE ON AI IN FI...,...,"Business, Finance; Computer Science, Artificia...",Conference Proceedings Citation Index - Scienc...,Business & Economics; Computer Science,BY5XH,,gold,,,2025-10-12,WOS:001467563200004
144,J,"Hoang, SD; Nguyen, THH; Dey, SK; Thu, HDT",,,,"Hoang, Sinh Duc; Nguyen, Tho Huu-Hoang; Dey, S...",,,Beyond the hype: AI advice and investor disson...,CURRENT PSYCHOLOGY,...,"Psychology, Multidisciplinary",Social Science Citation Index (SSCI),Psychology,1XZ5B,,,,,2025-10-12,WOS:001404832400001
145,J,"Ante, L; Saggu, A",,,,"Ante, Lennart; Saggu, Aman",,,Quantifying a firm's AI engagement: Constructi...,TECHNOLOGICAL FORECASTING AND SOCIAL CHANGE,...,Business; Regional & Urban Planning,Social Science Citation Index (SSCI),Business & Economics; Public Administration,T0N5U,,Green Submitted,,,2025-10-12,WOS:001402083300001
146,C,"Alizadeh, M; Asgari, Y; Samei, Z; Yari, S; Deh...",,"Aiello, LM; Chakraborty, T; Gaito, S",,"Alizadeh, Meysam; Asgari, Yasaman; Samei, Zeyn...",,,Exploring Relationships Between Cryptocurrency...,"SOCIAL NETWORKS ANALYSIS AND MINING, ASONAM 20...",...,"Computer Science, Artificial Intelligence; Com...",Conference Proceedings Citation Index - Scienc...,Computer Science; Communication,BY4PF,,,,,2025-10-12,WOS:001447238900028


In [6]:
clean_WoS_df = WoS_df[["UT", "TI", "AB", "DI", "DE"]].set_index("UT")
clean_WoS_df.columns = ["Title", "Abstract", "DOI", "Keywords"]
clean_WoS_df["Origin"] = "WoS"
clean_WoS_df.index.name = "ID"
clean_WoS_df

Unnamed: 0_level_0,Title,Abstract,DOI,Keywords,Origin
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
WOS:001443057200012,NeuralFactors: A Novel Factor Learning Approac...,The use of machine learning for statistical mo...,10.1145/3677052.3698647,Stock Returns; Generative Modeling; Variationa...,WoS
WOS:000874755600008,Synthetic data generation with deep generative...,This work develops machine learning (ML) predi...,10.1016/j.ribaf.2022.101747,Trading strategies; Machine learning; Syntheti...,WoS
WOS:000614253600001,Forecasting daily stock trend using multi-filt...,Stock market forecasting has attracted signifi...,10.1016/j.eswa.2020.114444,Stock trend prediction; Feature selection; Dee...,WoS
WOS:000932922500101,TABNET WITH DATA AUGMENTATION APPORACH IN STOC...,"Despite the advent of deep learning, stock ret...",10.1109/ICCWAMTIP56608.2022.10016580,Stock Return Prediction; Deep Learning; Small ...,WoS
WOS:000748371000034,Learning Sentimental and Financial Signals Wit...,Stockmovement prediction using Tweets (text) a...,10.1109/LSP.2021.3135793,Predictive models; Time series analysis; Stoch...,WoS
...,...,...,...,...,...
WOS:001467563200004,Connecting The Dots: Forecasting and Explainin...,Market volatility prediction is of significant...,10.1145/3383455.3422518,market volatility; neural networks; forecastin...,WoS
WOS:001404832400001,Beyond the hype: AI advice and investor disson...,This study examines the impact of cognitive di...,10.1007/s12144-025-07430-w,Artificial intelligence; Cognitive dissonance;...,WoS
WOS:001402083300001,Quantifying a firm's AI engagement: Constructi...,"This paper proposes an objective, data-driven ...",10.1016/j.techfore.2024.123965,Artificial intelligence; Market efficiency; Na...,WoS
WOS:001447238900028,Exploring Relationships Between Cryptocurrency...,Academics increasingly acknowledge the predict...,10.1007/978-3-031-78541-2_28,Social Prediction; NLP; LLM; Prompt Engineerin...,WoS


## 1.3 IEEE

In [7]:
ieee_df = pd.read_csv("IEEE.csv")
ieee_df

Unnamed: 0,Document Title,Authors,Author Affiliations,Publication Title,Date Added To Xplore,Publication Year,Volume,Issue,Start Page,End Page,...,Mesh_Terms,Article Citation Count,Patent Citation Count,Reference Count,License,Online Date,Issue Date,Meeting Date,Publisher,Document Identifier
0,Large Language Models for Financial Aid in Fin...,M. K. Islam; A. Karmacharya; T. Sue; J. Fox,"Computer Science Department, University of Vir...",2024 IEEE International Conference on Big Data...,16 Jan 2025,2024,,,4892,4895,...,,1.0,,24.0,IEEE,16 Jan 2025,,,IEEE,IEEE Conferences
1,Estimating Value at Risk for Central Counterpa...,S. Udeshika Munasinghe; R. Rafeh; S. Rauchas,"Department of Computing Goldsmith, University ...",2024 International Conference on Data Science ...,5 Sep 2024,2024,,,305,310,...,,1.0,,16.0,IEEE,5 Sep 2024,,,IEEE,IEEE Conferences
2,A Comparative Study of Sequential Deep Learnin...,J. Fang; Z. Xiao; Y. Wu; J. Zhang; Z. Xu; Z. Mai,Independent Researcher; University of North Te...,2024 11th International Conference on Soft Com...,28 Jan 2025,2024,,,22,26,...,,1.0,,32.0,IEEE,28 Jan 2025,,,IEEE,IEEE Conferences
3,Stock Market Forecasting with Pretrained Deep ...,S. T. Do; A. Chu; Y. Zhao; Y. Li,Department of Computer and Information Science...,2025 IEEE 11th International Conference on Big...,1 Sep 2025,2025,,,142,146,...,,,,26.0,IEEE,1 Sep 2025,,,IEEE,IEEE Conferences
4,Enhanced Stock Market Trend Prediction on the ...,D. Kumar; P. Pramod Pawar; M. Kumar Meesala; P...,"Department of Information Technology, Universi...",2024 International Conference on Integrated In...,5 Feb 2025,2024,,,1,8,...,,1.0,,26.0,IEEE,5 Feb 2025,,,IEEE,IEEE Conferences
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100,Intrinsic Graph Structure Estimation Using Gra...,A. Noda; H. Hino; M. Tatsuno; S. Akaho; N. Murata,"School of Science and Engineering, Waseda Univ...",Neural Computation,17 Jul 2014,2014,26.0,7.0,1455,1483,...,,1.0,,,,17 Jul 2014,,,MIT Press,MIT Press Journals
101,Solution Probing Attack Against Coin Mixing Ba...,Y. Mao; Z. Dang; H. Wang; Y. Zhang; S. Zhong,State Key Laboratory for Novel Software Techno...,IEEE Transactions on Dependable and Secure Com...,3 Sep 2024,2024,21.0,5.0,4684,4698,...,,2.0,,40.0,IEEE,18 Jan 2024,,,IEEE,IEEE Journals
102,"Artificial Intelligence Based Fraud Detection,...",K. Prabhu Rajasekar; D. Vezhaventhan,"School of Law, Specialization: Cyber Crime and...",2024 4th International Conference on Sustainab...,3 Dec 2024,2024,,,402,406,...,,,,21.0,IEEE,3 Dec 2024,,,IEEE,IEEE Conferences
103,LLMCatalyst: A Novel Incentive Mechanism for C...,R. Zeng; M. Zhao; J. Han; Y. Bi; X. Wang,"Software College, Northeastern University; Sof...",2025 IEEE/ACM 33rd International Symposium on ...,9 Sep 2025,2025,,,1,10,...,,,,33.0,IEEE,9 Sep 2025,,,IEEE,IEEE Conferences


In [8]:
clean_ieee_df = ieee_df[["DOI", "Document Title", "Abstract"]].copy()
clean_ieee_df.columns = ["DOI", "Title", "Abstract"]
clean_ieee_df["Keywords"] = ieee_df["IEEE Terms"].str.cat(ieee_df["Author Keywords"], sep=";")
clean_ieee_df["ID"] = ieee_df["PDF Link"]
clean_ieee_df = clean_ieee_df.set_index("ID")
clean_ieee_df["Origin"] = "IEEE"
clean_ieee_df.index.name = "ID"
clean_ieee_df


Unnamed: 0_level_0,DOI,Title,Abstract,Keywords,Origin
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=10824953,10.1109/BigData62323.2024.10824953,Large Language Models for Financial Aid in Fin...,Considering the difficulty of financial time s...,Runtime;Time series analysis;Memory management...,IEEE
https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=10652178,10.1109/ICoDSA62899.2024.10652178,Estimating Value at Risk for Central Counterpa...,Central counterparties (CCPs) play an importan...,Reactive power;Generative AI;Computational mod...,IEEE
https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=10851487,10.1109/ISCMI63661.2024.10851487,A Comparative Study of Sequential Deep Learnin...,Time series analysis of daily stock prices is ...,Measurement;Deep learning;Accuracy;Large langu...,IEEE
https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=11129507,10.1109/BigDataService65758.2025.00026,Stock Market Forecasting with Pretrained Deep ...,"To perform stock market-related forecasting, w...",Deep learning;Analytical models;Sentiment anal...,IEEE
https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=10859808,10.1109/ICIICS63763.2024.10859808,Enhanced Stock Market Trend Prediction on the ...,It was difficult to find low-risk firms in 202...,Microorganisms;Spirals;Predictive models;Featu...,IEEE
...,...,...,...,...,...
https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=6855483,10.1162/NECO_a_00603,Intrinsic Graph Structure Estimation Using Gra...,A graph is a mathematical representation of a ...,,IEEE
https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=10404041,10.1109/TDSC.2024.3355453,Solution Probing Attack Against Coin Mixing Ba...,Conventional crowdsourcing platforms primarily...,Crowdsourcing;Task analysis;Blockchains;Encryp...,IEEE
https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=10763168,10.1109/ICSES63445.2024.10763168,"Artificial Intelligence Based Fraud Detection,...","Globally, fraud is increasing and has the pote...",Technological innovation;Consumer behavior;Ban...,IEEE
https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=11143425,10.1109/IWQoS65803.2025.11143425,LLMCatalyst: A Novel Incentive Mechanism for C...,"Large foundation models, such as SORA and GPT-...",Training;Procurement;Costs;Foundation models;B...,IEEE


## 1.4 ProQuest

In [9]:
proquest_df = pd.read_csv("ProQuest.csv")
proquest_df

Unnamed: 0,Title,Abstract,StoreId,AccessionNumber,ArticleType,AuthorAffiliation,Authors,coden,companies,copyright,...,majorClassificationCodes,notes,startPage,subjectClassifications,subjectHeadings,subjectTerms,subjects,URL,FindACopy,Database
0,The power of ChatGPT in processing text: Evide...,This study investigates the application of lar...,3253199750,,Scholarly Journals,"Yunnan University of Finance and Economics, S...","Yang, Kun;Deng, Ruxin;Wei, Yunjie;Wang, Shouyang",,OpenAI,© The Author(s) 2025. This work is published ...,...,,"2025-09-08 (Registration) , 2024-07-22 (Recei...",118,,,"Language , Accuracy , Technological change , ...","KeywordsChatGPT , Sentiment analysis , Exchan...",,https://bibsys-almaprimo.hosted.exlibrisgroup....,"Coronavirus Research Database, Publicly Availa..."
1,Wisdom of the crowd signals: Predictive power ...,The emergence of cryptocurrencies and decentra...,3229330841,,Scholarly Journals,"University of Cologne, Cologne, Germany (GRID...","Haase, Frederic;Celig, Tom;Rath, Oliver;Schode...",,,© The Author(s) 2025. This work is published ...,...,52394 Portfolio Management and Investment Advice,"2025-06-25 (Registration) , 2025-01-19 (Recei...",64,52394 Portfolio Management and Investment Advice,,"Social networks , FINANCE , Decentralization ...","KeywordsSocial media signals , Cryptocurrenci...",,https://bibsys-almaprimo.hosted.exlibrisgroup....,"SciTech Premium Collection, Social Science Pre..."
2,Detecting Bitcoin Sentiment: Leveraging Langua...,As Bitcoin continues to establish itself as a ...,3241056936,,Scholarly Journals,"Seoul National University, Advanced Institute...","Jung, Hae Sun;Lee, Haein;Kim, Jang Hyun",,,© The Author(s) 2025. This work is published ...,...,,"2025-07-12 (Registration) , 2025-07-12 (Accep...",77,,,"Language , Accuracy , Dictionaries , Deep lea...","Large language models , Sentiment analysis , ...",,https://bibsys-almaprimo.hosted.exlibrisgroup....,SciTech Premium Collection
3,The <i>supply chain capitalism of AI</i>: a ca...,Artificial Intelligence (AI) is woven into a s...,3258102981,,Scholarly Journals,Oxford Internet Institute (University of Oxfo...,"Valdivia, Ana",,,© 2024 The Author(s). Published by Informa UK...,...,,"2024-04-04 (Received) , 2024-09-10 (Accepted)",2118,,,"Artificial intelligence , Algorithms , Supply...","Supply chain , AI , capitalism , infrastructu...",,https://bibsys-almaprimo.hosted.exlibrisgroup....,"SciTech Premium Collection, Social Science Pre..."
4,Modeling Asset Price Process: An Approach for ...,Artificial Intelligence (AI) models have been ...,3232240735,,Scholarly Journals,"Seoul National University, Department of Indu...","Park, Jinseong;Ko, Hyungjin;Lee, Jaewook",,,© The Author(s) 2024. This work is published ...,...,,"2024-06-19 (Registration) , 2024-06-18 (Accep...",349,,,"Artificial intelligence , Finance , SUSTAINAB...","KeywordsDeep learning , Generative diffusion ...",,https://bibsys-almaprimo.hosted.exlibrisgroup....,"SciTech Premium Collection, Social Science Pre..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93,Bayesian Online Learning of the Hazard Rate in...,Change-point models are generative models of t...,754897843,13537051,Scholarly Journals,,"Wilson, Robert C;Nassar, Matthew R;Gold, Joshua I",,,,...,N3 11002 Computational & theoretical neurosci...,,2452,N3 11002 Computational & theoretical neurosci...,,"Learning , Mathematical models , Data process...",,,https://bibsys-almaprimo.hosted.exlibrisgroup....,SciTech Premium Collection
94,Bayesian online learning of the hazard rate in...,Change-point models are generative models of t...,748943350,20569174,Scholarly Journals,,"Wilson, Robert C;Nassar, Matthew R;Gold, Joshua I",,,,...,,"Print , Internet , Indexing method: Manual",2452,,"Algorithms , Humans , Learning , Bayes Theore...",Index Medicus,,,https://bibsys-almaprimo.hosted.exlibrisgroup....,SciTech Premium Collection
95,Bayesian Online Learning of the Hazard Rate in...,Change-point models are generative models of t...,742446133,,Scholarly Journals,,"Wilson, Robert C;Nassar, Matthew R;Gold, Joshua I",NEUCEB,,Copyright MIT Press Journals Sep 2010,...,,,2452,,,"Bayesian analysis , Online instruction , Dist...",,,https://bibsys-almaprimo.hosted.exlibrisgroup....,SciTech Premium Collection
96,The stock market reaction to Ernst & Young's s...,Purpose - The purpose of this paper is to inve...,32987886,200903-B7-0043220 (MB),Scholarly Journals,,"Liu, Carol;Nabar, Sandeep",,,,...,"B7 Management, Training, Regulations, Marketi...",,948,"B7 Management, Training, Regulations, Marketi...",,"Raw materials , Consulting , Mathematical mod...",Article,http://www.emeraldinsight.com/Insight/viewCon...,https://bibsys-almaprimo.hosted.exlibrisgroup....,SciTech Premium Collection


In [10]:
import numpy as np

clean_proquest_df = proquest_df[["digitalObjectIdentifier", "Title", "Abstract", "issn", "elecISSN", "StoreId"]]
clean_proquest_df.columns = ["DOI", "Title", "Abstract", "ISSN", "EISSN", "StoreId"]
clean_proquest_df["Keywords"] = proquest_df["subjectTerms"].str.cat(ieee_df["Author Keywords"], sep=";")

clean_proquest_df["ID"] = np.where(
    pd.notna(clean_proquest_df["DOI"]), clean_proquest_df["DOI"],
    np.where(
        pd.notna(clean_proquest_df["ISSN"]), clean_proquest_df["ISSN"],
        np.where(
            pd.notna(clean_proquest_df["EISSN"]), clean_proquest_df["EISSN"],
            clean_proquest_df["StoreId"]
        )
    )
)

clean_proquest_df= clean_proquest_df.drop(columns=["ISSN", "EISSN", "StoreId"])
clean_proquest_df = clean_proquest_df.set_index("ID")
clean_proquest_df["Origin"] = "Proquest"
clean_proquest_df.index.name = "ID"
clean_proquest_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_proquest_df["Keywords"] = proquest_df["subjectTerms"].str.cat(ieee_df["Author Keywords"], sep=";")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_proquest_df["ID"] = np.where(


Unnamed: 0_level_0,DOI,Title,Abstract,Keywords,Origin
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10.1186/s40854-025-00789-6,10.1186/s40854-025-00789-6,The power of ChatGPT in processing text: Evide...,This study investigates the application of lar...,"Language , Accuracy , Technological change , ...",Proquest
10.1007/s12525-025-00815-6,10.1007/s12525-025-00815-6,Wisdom of the crowd signals: Predictive power ...,The emergence of cryptocurrencies and decentra...,"Social networks , FINANCE , Decentralization ...",Proquest
10.1007/s11063-025-11787-1,10.1007/s11063-025-11787-1,Detecting Bitcoin Sentiment: Leveraging Langua...,As Bitcoin continues to establish itself as a ...,"Language , Accuracy , Dictionaries , Deep lea...",Proquest
10.1080/1369118X.2024.2420021,10.1080/1369118X.2024.2420021,The <i>supply chain capitalism of AI</i>: a ca...,Artificial Intelligence (AI) is woven into a s...,"Artificial intelligence , Algorithms , Supply...",Proquest
10.1007/s10614-024-10668-4,10.1007/s10614-024-10668-4,Modeling Asset Price Process: An Approach for ...,Artificial Intelligence (AI) models have been ...,"Artificial intelligence , Finance , SUSTAINAB...",Proquest
...,...,...,...,...,...
10.1162/NECO_a_00007,10.1162/NECO_a_00007,Bayesian Online Learning of the Hazard Rate in...,Change-point models are generative models of t...,"Learning , Mathematical models , Data process...",Proquest
10.1162/NECO_a_00007,10.1162/NECO_a_00007,Bayesian online learning of the hazard rate in...,Change-point models are generative models of t...,Index Medicus;Gamification;Machine learning;S...,Proquest
0899-7667,,Bayesian Online Learning of the Hazard Rate in...,Change-point models are generative models of t...,"Bayesian analysis , Online instruction , Dist...",Proquest
10.1108/02686900610705037,10.1108/02686900610705037,The stock market reaction to Ernst & Young's s...,Purpose - The purpose of this paper is to inve...,"Raw materials , Consulting , Mathematical mod...",Proquest


# 2. Merge and Clean data

In [11]:
clean_df = pd.concat([clean_scopus_df, clean_WoS_df, clean_ieee_df, clean_proquest_df])
clean_df

Unnamed: 0_level_0,DOI,Title,Abstract,Keywords,Origin
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10.1007/978-981-96-6291-3_3,10.1007/978-981-96-6291-3_3,Stock Price Prediction Using Univariate and Mu...,"In this study, we propose a hybrid approach th...",Large Language Models (LLM); Multivariate Anal...,Scopus
10.1016/j.eswa.2025.128676,10.1016/j.eswa.2025.128676,In the beginning was the Word: LLM-VaR and LLM-ES,"This study introduces LLM-VaR and LLM-ES, nove...",Expected shortfall; GPT; Large language models...,Scopus
10.1186/s40854-025-00789-6,10.1186/s40854-025-00789-6,The power of ChatGPT in processing text: Evide...,This study investigates the application of lar...,ChatGPT; Exchange rate; Interval; Sentiment an...,Scopus
10.1016/j.frl.2025.108489,10.1016/j.frl.2025.108489,Readability of financial reports and stock pri...,The rapid evolution of machine learning makes ...,BERT; Crash risk; Large language model; Readab...,Scopus
10.1007/s12525-025-00815-6,10.1007/s12525-025-00815-6,Wisdom of the crowd signals: Predictive power ...,The emergence of cryptocurrencies and decentra...,Collective intelligence; Cryptocurrencies; Pre...,Scopus
...,...,...,...,...,...
10.1162/NECO_a_00007,10.1162/NECO_a_00007,Bayesian Online Learning of the Hazard Rate in...,Change-point models are generative models of t...,"Learning , Mathematical models , Data process...",Proquest
10.1162/NECO_a_00007,10.1162/NECO_a_00007,Bayesian online learning of the hazard rate in...,Change-point models are generative models of t...,Index Medicus;Gamification;Machine learning;S...,Proquest
0899-7667,,Bayesian Online Learning of the Hazard Rate in...,Change-point models are generative models of t...,"Bayesian analysis , Online instruction , Dist...",Proquest
10.1108/02686900610705037,10.1108/02686900610705037,The stock market reaction to Ernst & Young's s...,Purpose - The purpose of this paper is to inve...,"Raw materials , Consulting , Mathematical mod...",Proquest


In [12]:
# Temporarily reset index
clean_df = clean_df.reset_index()

# Separate rows with NaN in 'DOI'
nan_rows = clean_df[clean_df['DOI'].isna()]

# Process rows where 'DOI' is not NaN
non_nan_rows = clean_df[clean_df['DOI'].notna()].copy()

# Normalize DOI
non_nan_rows["DOI"] = non_nan_rows["DOI"].str.lower()
non_nan_rows["DOI"] = non_nan_rows["DOI"].str.replace("https://doi.org/", "", regex=False)

# Drop duplicates based on DOI
non_nan_rows = non_nan_rows.drop_duplicates(subset=["DOI"], keep='first')

# Concatenate the untouched NaN rows back into the DataFrame
clean_df = pd.concat([non_nan_rows, nan_rows], ignore_index=True)

# Display the result
clean_df

Unnamed: 0,ID,DOI,Title,Abstract,Keywords,Origin
0,10.1007/978-981-96-6291-3_3,10.1007/978-981-96-6291-3_3,Stock Price Prediction Using Univariate and Mu...,"In this study, we propose a hybrid approach th...",Large Language Models (LLM); Multivariate Anal...,Scopus
1,10.1016/j.eswa.2025.128676,10.1016/j.eswa.2025.128676,In the beginning was the Word: LLM-VaR and LLM-ES,"This study introduces LLM-VaR and LLM-ES, nove...",Expected shortfall; GPT; Large language models...,Scopus
2,10.1186/s40854-025-00789-6,10.1186/s40854-025-00789-6,The power of ChatGPT in processing text: Evide...,This study investigates the application of lar...,ChatGPT; Exchange rate; Interval; Sentiment an...,Scopus
3,10.1016/j.frl.2025.108489,10.1016/j.frl.2025.108489,Readability of financial reports and stock pri...,The rapid evolution of machine learning makes ...,BERT; Crash risk; Large language model; Readab...,Scopus
4,10.1007/s12525-025-00815-6,10.1007/s12525-025-00815-6,Wisdom of the crowd signals: Predictive power ...,The emergence of cryptocurrencies and decentra...,Collective intelligence; Cryptocurrencies; Pre...,Scopus
...,...,...,...,...,...,...
317,https://ieeexplore.ieee.org/stamp/stamp.jsp?ar...,,Prediction of Foreign Exchange Rates by a Larg...,This paper proposes a prompt-based method util...,Training;Deep learning;Exchange rates;Costs;Pr...,IEEE
318,https://ieeexplore.ieee.org/stamp/stamp.jsp?ar...,,Applying Large Language Models to Issue Classi...,Effective prioritization of issue reports in s...,Training;Large language models;Focusing;Traini...,IEEE
319,3149090574,,Prediction of Foreign Exchange Rates by a Larg...,Conference Title: 2024 SICE Festival with Annu...,"Prompt engineering , Large language models , ...",Proquest
320,0090-5364,,Estimating time-changes in noisy Lévy models,"In quantitative finance, we often model asset ...","Probability distribution , SUSTAINABILITY , E...",Proquest


In [13]:
# Show title dupes
clean_df[clean_df["Title"].duplicated(keep=False)].sort_values("Title")

Unnamed: 0,ID,DOI,Title,Abstract,Keywords,Origin
321,0899-7667,,Bayesian Online Learning of the Hazard Rate in...,Change-point models are generative models of t...,"Bayesian analysis , Online instruction , Dist...",Proquest
242,WOS:000280563600008,10.1162/neco_a_00007,Bayesian Online Learning of the Hazard Rate in...,Change-point models are generative models of t...,,WoS
156,10.18653/v1/2024.findings-acl.233,10.18653/v1/2024.findings-acl.233,Can Large Language Models Mine Interpretable F...,Finding interpretable factors for stock return...,,Scopus
308,WOS:001356731804003,,Can Large Language Models Mine Interpretable F...,Finding interpretable factors for stock return...,,WoS
95,10.14569/IJACSA.2025.0160402,10.14569/ijacsa.2025.0160402,"Comparing Vision-Instruct LLMs, Vision-Based D...",This research conducts a comparative study of ...,Convolutional Neural Network (CNN); Large Lang...,Scopus
314,WOS:001503391700001,,"Comparing Vision-Instruct LLMs, Vision-Based D...",This research conducts a comparative study of ...,Convolutional Neural Network (CNN); Large Lan-...,WoS
234,10.1214/14-AOS1250,10.1214/14-aos1250,Estimating time-changes in noisy Lévy models,"In quantitative finance, we often model asset ...",Itô semimartingale; Lévy process; Microstructu...,Scopus
320,0090-5364,,Estimating time-changes in noisy Lévy models,"In quantitative finance, we often model asset ...","Probability distribution , SUSTAINABILITY , E...",Proquest
155,10.18653/v1/2024.findings-acl.185,10.18653/v1/2024.findings-acl.185,LLMFactor: Extracting Profitable Factors throu...,"Recently, Large Language Models (LLMs) have at...",,Scopus
309,WOS:001356731803018,,LLMFactor: Extracting Profitable Factors throu...,"Recently, Large Language Models (LLMs) have at...",,WoS


In [14]:
# Drop duplicates based on title
clean_df = clean_df.groupby(['Title'], as_index=False).first()
clean_df = clean_df.copy()
clean_df

Unnamed: 0,Title,ID,DOI,Abstract,Keywords,Origin
0,A Comparative Study of Sequential Deep Learnin...,10.1109/ISCMI63661.2024.10851487,10.1109/iscmi63661.2024.10851487,Time series analysis of daily stock prices is ...,Deep Learning; Finance; Large Language Model; ...,Scopus
1,A Deep Reinforcement Learning Approach for Por...,https://ieeexplore.ieee.org/stamp/stamp.jsp?ar...,10.1109/compsac65507.2025.00258,The Brazilian capital market presents unique c...,Training;Sentiment analysis;Soft sensors;Softw...,IEEE
2,A Financial Time Series Denoiser Based on Diff...,10.1145/3677052.3698649,10.1145/3677052.3698649,Financial time series often exhibit low signal...,Denoising; Diffusion Model; Financial Time Ser...,Scopus
3,A First Look at Financial Data Analysis Using ...,10.3390/jrfm18020099,10.3390/jrfm18020099,"OpenAI’s new flagship model, ChatGPT-4o, relea...",academia; artificial intelligence (AI); ChatGP...,Scopus
4,A Hybrid Approach on Conditional GAN for Portf...,10.1007/978-981-19-5845-8_61,10.1007/978-981-19-5845-8_61,"Over the decades, the Markowitz framework has ...",Autoencoding conditional GAN (ACGAN); Conditio...,Scopus
...,...,...,...,...,...,...
303,Trend-Heuristic Reinforcement Learning Framewo...,https://ieeexplore.ieee.org/stamp/stamp.jsp?ar...,10.1109/icassp48485.2024.10447993,Recent studies have shown that reinforcement l...,Training;Supervised learning;Reinforcement lea...,IEEE
304,Using Generative AI to predict the weather imp...,10.1007/s11156-025-01437-x,10.1007/s11156-025-01437-x,"This study explores the use of Generative AI, ...",ChatGPT; Generative AI; Large language model (...,Scopus
305,Using Large Language Models to Estimate Novel ...,10.3905/jpm.2025.1.710,10.3905/jpm.2025.1.710,This article presents an integrated framework ...,,Scopus
306,Utility of synthetic musculoskeletal gaits for...,10.1038/s41467-025-61292-1,10.1038/s41467-025-61292-1,Deep-neural-network-based artificial intellige...,,Scopus


In [15]:
# Set ID as index again
clean_df = clean_df.set_index("ID")
clean_df

Unnamed: 0_level_0,Title,DOI,Abstract,Keywords,Origin
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10.1109/ISCMI63661.2024.10851487,A Comparative Study of Sequential Deep Learnin...,10.1109/iscmi63661.2024.10851487,Time series analysis of daily stock prices is ...,Deep Learning; Finance; Large Language Model; ...,Scopus
https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=11126558,A Deep Reinforcement Learning Approach for Por...,10.1109/compsac65507.2025.00258,The Brazilian capital market presents unique c...,Training;Sentiment analysis;Soft sensors;Softw...,IEEE
10.1145/3677052.3698649,A Financial Time Series Denoiser Based on Diff...,10.1145/3677052.3698649,Financial time series often exhibit low signal...,Denoising; Diffusion Model; Financial Time Ser...,Scopus
10.3390/jrfm18020099,A First Look at Financial Data Analysis Using ...,10.3390/jrfm18020099,"OpenAI’s new flagship model, ChatGPT-4o, relea...",academia; artificial intelligence (AI); ChatGP...,Scopus
10.1007/978-981-19-5845-8_61,A Hybrid Approach on Conditional GAN for Portf...,10.1007/978-981-19-5845-8_61,"Over the decades, the Markowitz framework has ...",Autoencoding conditional GAN (ACGAN); Conditio...,Scopus
...,...,...,...,...,...
https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=10447993,Trend-Heuristic Reinforcement Learning Framewo...,10.1109/icassp48485.2024.10447993,Recent studies have shown that reinforcement l...,Training;Supervised learning;Reinforcement lea...,IEEE
10.1007/s11156-025-01437-x,Using Generative AI to predict the weather imp...,10.1007/s11156-025-01437-x,"This study explores the use of Generative AI, ...",ChatGPT; Generative AI; Large language model (...,Scopus
10.3905/jpm.2025.1.710,Using Large Language Models to Estimate Novel ...,10.3905/jpm.2025.1.710,This article presents an integrated framework ...,,Scopus
10.1038/s41467-025-61292-1,Utility of synthetic musculoskeletal gaits for...,10.1038/s41467-025-61292-1,Deep-neural-network-based artificial intellige...,,Scopus


In [16]:
# Save initial screening as csv file
clean_df.to_csv("initial_screening.csv")

# 3. GPT Assessment

In [17]:
# # Merge GPT files
# import pandas as pd

# # Read the two CSV files
# df1 = pd.read_csv("gpt.csv")
# df2 = pd.read_csv("gpt13.csv")

# # Merge (concatenate) the two DataFrames
# merged_df = pd.concat([df1, df2], ignore_index=True)

# # Optionally, drop duplicates if needed
# # merged_df = merged_df.drop_duplicates()

# # Write the merged DataFrame to a new CSV file
# merged_df.to_csv("gpt.csv", index=False)

# duplicate_counts = merged_df["ID"].value_counts()
# num_duplicates = (duplicate_counts > 1).sum()

# print(f"Antall unike ID-er som forekommer mer enn én gang: {num_duplicates}")

In [18]:
# Load previous assessments
gpt_response_df = pd.read_csv("gpt.csv", index_col=0)
gpt_response_df

Unnamed: 0_level_0,Summary,LLM?,Input enhancement?,Prediction engine?,Post-prediction reasoning?,Financial instrument?,Instrument
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
10.1109/ISCMI63661.2024.10851487,"Compares LSTM, Transformer, and LLMs for daily...",✔️,❌,✔️,❌,✔️,Stocks
https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=11126558,DRL portfolio optimization for Brazilian asset...,✔️,✔️,❌,✔️,❌,
10.1145/3677052.3698649,Uses diffusion models to denoise financial tim...,❌,❌,❌,❌,✔️,Stocks
10.3390/jrfm18020099,Assesses ChatGPT-4o for financial data analysi...,❌,❌,❌,❌,❌,?
10.1007/978-981-19-5845-8_61,Presents HybridCGAN/HybridACGAN for portfolio ...,❌,❌,❌,❌,❌,
...,...,...,...,...,...,...,...
https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=10447993,TrendTrader uses an LLM for news sentiment and...,✔️,✔️,❌,✔️,❌,
10.1007/s11156-025-01437-x,ChatGPT assesses severe-weather text to foreca...,✔️,✔️,❌,❌,✔️,Stock
10.3905/jpm.2025.1.710,LLM-extracted ESG/geopolitical/supply-chain ri...,✔️,✔️,❌,❌,❌,?
10.1038/s41467-025-61292-1,Generates synthetic musculoskeletal gaits to p...,❌,❌,❌,❌,❌,?


In [19]:
# Generate prompts to Chat GPT
import os
from IPython.display import clear_output

batch_size = 50
i = 0

# Filter clean df on whether a gpt assessment has already been done
input_df = clean_df.drop(gpt_response_df.index, errors='ignore')[["Title", "Abstract"]]

for batch in range(0, len(input_df), batch_size):
    batch_df = input_df.iloc[batch:batch+batch_size]
    print("""
For the following CSV list of articles, please return a CSV with the following columns:
- "ID": The ID of the article, corresponding to the ID in the provided CSV.
- "Summary": A one-to-two sentence summary of the article
- "LLM?": A column with the value ✔️ if the article actively uses a Large Language Model (e.g., GPT, ChatGPT, Gemini) to improve prediction and ❌ otherwise
- "Input enhancement?": A column with the value ✔️ if an LLM is used pre-prediction for data/feature preparation (e.g., sentiment from text, event extraction, text-to-features) and ❌ otherwise
- "Prediction engine?": A column with the value ✔️ if an LLM itself is the predictive model for a financial target and ❌ otherwise
- "Post-prediction reasoning?": A column with the value ✔️ if an LLM is used after prediction for explanation or decision support (e.g., interpreting model outputs, scenario analysis) and ❌ otherwise
- "Financial instrument?": A column with the value ✔️ if the article contains a model that predicts the price of a financial instrument and ❌ otherwise
- "Instrument": A column with the type of the financial instrument that is predicted
You can also answer "?" to any question if it cannot be answered based on the title or abstract. Please ensure that the CSV is valid by applying quotation marks wherever necessary.
""")
    print(batch_df.to_csv(index=True))
    input("Press enter to get next prompt")
    clear_output(wait=True)

### Load Assessments

In [20]:
# Load all assessments
gpt_response_df = pd.read_csv("gpt.csv", index_col="ID")
gpt_response_df

Unnamed: 0_level_0,Summary,LLM?,Input enhancement?,Prediction engine?,Post-prediction reasoning?,Financial instrument?,Instrument
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
10.1109/ISCMI63661.2024.10851487,"Compares LSTM, Transformer, and LLMs for daily...",✔️,❌,✔️,❌,✔️,Stocks
https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=11126558,DRL portfolio optimization for Brazilian asset...,✔️,✔️,❌,✔️,❌,
10.1145/3677052.3698649,Uses diffusion models to denoise financial tim...,❌,❌,❌,❌,✔️,Stocks
10.3390/jrfm18020099,Assesses ChatGPT-4o for financial data analysi...,❌,❌,❌,❌,❌,?
10.1007/978-981-19-5845-8_61,Presents HybridCGAN/HybridACGAN for portfolio ...,❌,❌,❌,❌,❌,
...,...,...,...,...,...,...,...
https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=10447993,TrendTrader uses an LLM for news sentiment and...,✔️,✔️,❌,✔️,❌,
10.1007/s11156-025-01437-x,ChatGPT assesses severe-weather text to foreca...,✔️,✔️,❌,❌,✔️,Stock
10.3905/jpm.2025.1.710,LLM-extracted ESG/geopolitical/supply-chain ri...,✔️,✔️,❌,❌,❌,?
10.1038/s41467-025-61292-1,Generates synthetic musculoskeletal gaits to p...,❌,❌,❌,❌,❌,?


In [21]:
# Remove duplicate indices in gpt_response_df
gpt_response_df = gpt_response_df[~gpt_response_df.index.duplicated(keep='last')]

gpt_response_df

Unnamed: 0_level_0,Summary,LLM?,Input enhancement?,Prediction engine?,Post-prediction reasoning?,Financial instrument?,Instrument
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
10.1109/ISCMI63661.2024.10851487,"Compares LSTM, Transformer, and LLMs for daily...",✔️,❌,✔️,❌,✔️,Stocks
https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=11126558,DRL portfolio optimization for Brazilian asset...,✔️,✔️,❌,✔️,❌,
10.1145/3677052.3698649,Uses diffusion models to denoise financial tim...,❌,❌,❌,❌,✔️,Stocks
10.3390/jrfm18020099,Assesses ChatGPT-4o for financial data analysi...,❌,❌,❌,❌,❌,?
10.1007/978-981-19-5845-8_61,Presents HybridCGAN/HybridACGAN for portfolio ...,❌,❌,❌,❌,❌,
...,...,...,...,...,...,...,...
https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=10447993,TrendTrader uses an LLM for news sentiment and...,✔️,✔️,❌,✔️,❌,
10.1007/s11156-025-01437-x,ChatGPT assesses severe-weather text to foreca...,✔️,✔️,❌,❌,✔️,Stock
10.3905/jpm.2025.1.710,LLM-extracted ESG/geopolitical/supply-chain ri...,✔️,✔️,❌,❌,❌,?
10.1038/s41467-025-61292-1,Generates synthetic musculoskeletal gaits to p...,❌,❌,❌,❌,❌,?


In [22]:
processed_df = clean_df.join(gpt_response_df)

processed_df

Unnamed: 0_level_0,Title,DOI,Abstract,Keywords,Origin,Summary,LLM?,Input enhancement?,Prediction engine?,Post-prediction reasoning?,Financial instrument?,Instrument
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
10.1109/ISCMI63661.2024.10851487,A Comparative Study of Sequential Deep Learnin...,10.1109/iscmi63661.2024.10851487,Time series analysis of daily stock prices is ...,Deep Learning; Finance; Large Language Model; ...,Scopus,"Compares LSTM, Transformer, and LLMs for daily...",✔️,❌,✔️,❌,✔️,Stocks
https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=11126558,A Deep Reinforcement Learning Approach for Por...,10.1109/compsac65507.2025.00258,The Brazilian capital market presents unique c...,Training;Sentiment analysis;Soft sensors;Softw...,IEEE,DRL portfolio optimization for Brazilian asset...,✔️,✔️,❌,✔️,❌,
10.1145/3677052.3698649,A Financial Time Series Denoiser Based on Diff...,10.1145/3677052.3698649,Financial time series often exhibit low signal...,Denoising; Diffusion Model; Financial Time Ser...,Scopus,Uses diffusion models to denoise financial tim...,❌,❌,❌,❌,✔️,Stocks
10.3390/jrfm18020099,A First Look at Financial Data Analysis Using ...,10.3390/jrfm18020099,"OpenAI’s new flagship model, ChatGPT-4o, relea...",academia; artificial intelligence (AI); ChatGP...,Scopus,Assesses ChatGPT-4o for financial data analysi...,❌,❌,❌,❌,❌,?
10.1007/978-981-19-5845-8_61,A Hybrid Approach on Conditional GAN for Portf...,10.1007/978-981-19-5845-8_61,"Over the decades, the Markowitz framework has ...",Autoencoding conditional GAN (ACGAN); Conditio...,Scopus,Presents HybridCGAN/HybridACGAN for portfolio ...,❌,❌,❌,❌,❌,
...,...,...,...,...,...,...,...,...,...,...,...,...
https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=10447993,Trend-Heuristic Reinforcement Learning Framewo...,10.1109/icassp48485.2024.10447993,Recent studies have shown that reinforcement l...,Training;Supervised learning;Reinforcement lea...,IEEE,TrendTrader uses an LLM for news sentiment and...,✔️,✔️,❌,✔️,❌,
10.1007/s11156-025-01437-x,Using Generative AI to predict the weather imp...,10.1007/s11156-025-01437-x,"This study explores the use of Generative AI, ...",ChatGPT; Generative AI; Large language model (...,Scopus,ChatGPT assesses severe-weather text to foreca...,✔️,✔️,❌,❌,✔️,Stock
10.3905/jpm.2025.1.710,Using Large Language Models to Estimate Novel ...,10.3905/jpm.2025.1.710,This article presents an integrated framework ...,,Scopus,LLM-extracted ESG/geopolitical/supply-chain ri...,✔️,✔️,❌,❌,❌,?
10.1038/s41467-025-61292-1,Utility of synthetic musculoskeletal gaits for...,10.1038/s41467-025-61292-1,Deep-neural-network-based artificial intellige...,,Scopus,Generates synthetic musculoskeletal gaits to p...,❌,❌,❌,❌,❌,?


# 4. Make decisions on whether to include articles

In [23]:
# Load previous decisions
decisions_file_path = "decisions.csv"
decisions_df = pd.read_csv(decisions_file_path, index_col=0)
decisions_df.sort_values("Date", ascending=True)

Unnamed: 0_level_0,Decision,Date,Reviewer
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10.1007/978-981-19-5845-8_61,n,2025-10-13T15:15:17.528217,Sondre
https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=7427326,n,2025-10-13T15:25:06.299154,Sondre
10.18653/v1/2023.emnlp-industry.69,y,2025-10-13T15:26:24.661211,Sondre
10.1007/s43546-021-00106-0,y,2025-10-13T15:28:54.306451,Sondre
10.1007/978-3-030-58790-1_7,y,2025-10-13T15:40:01.047035,Sondre
...,...,...,...
10.54364/AAIML.2025.52216,y,2025-10-15T12:59:41.494197,Sander
10.1109/BigData62323.2024.10824953,n,2025-10-15T13:01:18.437575,Sander
10.1111/exsy.70018,survey,2025-10-15T13:02:43.872432,Sander
10.1111/exsy.70018,survey,2025-10-15T13:02:43.872432,Sander


In [24]:
# Keep own version to mitigate overwriting each other
my_decisions_df = decisions_df.copy()

In [25]:
from datetime import datetime
from IPython.display import display, HTML, clear_output
import html
import numpy as np
import pandas as pd

reviewer = input("What is your name?")

# Shuffle the indices of the DataFrame to limit chance of two people reviewing the same article at the same time
np.random.seed(hash(reviewer) % 2**32)
shuffled_indices = np.random.permutation(processed_df.index)

def get_progress():
    decided = set(decisions_df.index).intersection(processed_df.index)
    return len(decided) / len(processed_df)

def _s(x):
    """Safe string: håndter NaN/None og escape HTML."""
    if pd.isna(x):
        return ""
    return html.escape(str(x))

def _badge(val, label):
    # Normaliser til standard symboler
    v = str(val).strip()
    yes = v in {"✔️", "✅", "True", "true", "Y", "y", "Yes", "yes", "1"}
    no  = v in {"❌", "✖️", "False", "false", "N", "n", "No", "no", "0"}
    if yes:
        return f'<span class="badge badge-yes">{html.escape(label)} ✔️</span>'
    if no:
        return f'<span class="badge badge-no">{html.escape(label)} ❌</span>'
    return f'<span class="badge badge-na">{html.escape(label)} ?</span>'

# --- LOOP ---
for index in shuffled_indices:
    row = processed_df.loc[index]
    if index in my_decisions_df.index:
        continue
    if index in skipped:
        continue
    if len(row.shape) > 1:
        row = row.iloc[0]
    if pd.isna(row.get("Summary", np.nan)):
        continue

    # Data (safed)
    title   = _s(row.get("Title", ""))
    doi     = _s(row.get("DOI", ""))
    summary = _s(row.get("Summary", ""))
    kw      = _s(row.get("Keywords", ""))
    abstract= _s(row.get("Abstract", ""))

    # Badges
    b_llm   = _badge(row.get("LLM?", ""), "LLM?")
    b_inp   = _badge(row.get("Input enhancement?", ""), "Input enhancement?")
    b_pred  = _badge(row.get("Prediction engine?", ""), "Prediction engine?")
    b_post  = _badge(row.get("Post-prediction reasoning?", ""), "Post-prediction reasoning?")
    b_fin   = _badge(row.get("Financial instrument?", ""), "Financial instrument?")
    instr   = _s(row.get("Instrument", "?"))

    # Link (vises bare hvis DOI finnes)
    doi_link = f'<a class="link" href="https://doi.org/{doi}" target="_blank">Link</a>' if doi else ""

    # Progress
    progress_pct = f"{get_progress() * 100:.2f}%"

    # Compact, two-column card layout
    css = """
    <style>
      .wrap { font-family: -apple-system, BlinkMacSystemFont, Segoe UI, Roboto, Arial, sans-serif; 
              font-size: 14px; color: #e5e5e5; line-height: 1.3; }
      .muted { color: #9aa0a6; }
      .title { font-size: 20px; margin: 0 0 6px 0; font-weight: 700; }
      .meta { font-size: 12px; }
      .grid { display: grid; grid-template-columns: 360px 1fr; gap: 12px; align-items: start; }
      @media (max-width: 1100px) { .grid { grid-template-columns: 1fr; } }
      .card { background: #1e1e1e; border: 1px solid #2a2a2a; border-radius: 10px; padding: 12px; }
      .row { display: flex; flex-wrap: wrap; gap: 6px; }
      .badge { display: inline-block; padding: 2px 8px; border-radius: 999px; font-size: 12px; border: 1px solid #333; }
      .badge-yes { background: #143d22; color: #8ef5b2; border-color: #1d5a33; }
      .badge-no  { background: #3a1616; color: #ff9aa2; border-color: #5a2222; }
      .badge-na  { background: #2c2c2c; color: #cfcfcf; border-color: #3a3a3a; }
      .k { margin-top: 4px; }
      .section h3 { margin: 8px 0 6px 0; font-size: 14px; }
      .section p { margin: 0; }
      .summary { font-size: 14px; }
      .abstract { font-size: 13px; line-height: 1.35; }
      details { background: #1b1b1b; border: 1px solid #2a2a2a; border-radius: 8px; padding: 8px; }
      details summary { cursor: pointer; font-weight: 600; margin: -8px -8px 8px -8px; padding: 8px; list-style: none; }
      details[open] summary { border-bottom: 1px solid #2a2a2a; }
      .link { color: #8ab4f8; text-decoration: none; }
      .link:hover { text-decoration: underline; }
      .progress { font-size: 12px; color: #9aa0a6; margin-bottom: 6px; }
      .mini { font-size: 12px; }
    </style>
    """

    html_block = f"""
    <div class="wrap">
      <div class="progress">Screening progress: {progress_pct}</div>
      <div class="title">{title}</div>
      <div class="meta">ID: {html.escape(str(index))} &nbsp;&nbsp; {doi_link}</div>
      <div style="height:6px;"></div>

      <div class="grid">
        <!-- LEFT: compact meta -->
        <div class="card">
          <div class="row">
            {b_llm} {b_inp} {b_pred} {b_post} {b_fin}
          </div>
          <div class="k mini"><strong>Instrument:</strong> {instr}</div>
        </div>

        <!-- RIGHT: content -->
        <div class="card">
          <div class="section">
            <h3>Summary</h3>
            <p class="summary">{summary}</p>
          </div>
          <div class="section">
            <details open>
              <summary>Keywords</summary>
              <p class="mini">{kw or "—"}</p>
            </details>
          </div>
          <div class="section">
            <details open>
              <summary>Abstract</summary>
              <p class="abstract">{abstract or "—"}</p>
            </details>
          </div>
        </div>
      </div>
    </div>
    """

    clear_output(wait=True)
    display(HTML(css + html_block))

    # --- beslutning ---
    decision = None
    while decision not in ["y", "n", "survey", "tja", "skip"]:
        decision = input("\nInclude this article? (y/n/survey/tja/skip):\n").lower()
    if decision == "skip":
        skipped.add(index)
        continue

    # Add decision
    my_decisions_df.loc[index] = [decision, datetime.now().isoformat(), reviewer]
    # Ensure latest version of file
    decisions_df = pd.read_csv(decisions_file_path, index_col=0)
    # Merge og dedupliser
    decisions_df = pd.concat([decisions_df, my_decisions_df])
    decisions_df = decisions_df.drop_duplicates(keep="last")
    decisions_df.to_csv(decisions_file_path)

# 5. Create Notion File

In [65]:
decisions_df = pd.read_csv("decisions.csv", index_col=0)
gpt_df = pd.read_csv("gpt.csv", index_col=0)
initial_screening_df = pd.read_csv("initial_screening.csv", index_col=0)

decisions_df = decisions_df[decisions_df["Decision"] != "n"]
merged_df = pd.merge(decisions_df, gpt_df, on="ID", how="inner")

merged_df["Survey?"] = merged_df["Decision"].apply(lambda x: "✔️" if x == "survey" else "❌")
merged_df = merged_df.drop(columns=["Decision", "Date", "Reviewer", "Summary", "LLM?", "Financial instrument?"])

merged_df = pd.merge(merged_df, initial_screening_df, on="ID", how="inner")
merged_df = merged_df.drop(columns=["Abstract", "Keywords", "Origin"])

merged_df["Link"] = merged_df["DOI"].apply(
    lambda doi: f"https://doi.org/{doi}" if pd.notna(doi) and doi != "" else ""
)

merged_df = merged_df[[
    "Title",
    "DOI",
    "Link",
    "Instrument",
    "Input enhancement?",
    "Prediction engine?",
    "Post-prediction reasoning?",
    "Survey?"
]]

import pandas as pd
import numpy as np

# Antall rader i DataFrame
n = len(merged_df)

# Reviewer-navn
reviewers = ["Sander", "Olav", "Sondre"]

# Beregn hvor mange artikler hver bør få (så likt som mulig)
base_count = n // len(reviewers)
remainder = n % len(reviewers)

# Lag en liste med like mange navn som rader
assignments = []
for i, name in enumerate(reviewers):
    count = base_count + (1 if i < remainder else 0)
    assignments.extend([name] * count)

# Bland listen tilfeldig
np.random.shuffle(assignments)

# Legg kolonnen til DataFrame
merged_df["Reviewer"] = assignments

print(merged_df["Reviewer"].value_counts())

merged_df.to_csv("Notion.csv", index=True)

merged_df

Reviewer
Sander    47
Olav      46
Sondre    46
Name: count, dtype: int64


Unnamed: 0_level_0,Title,DOI,Link,Instrument,Input enhancement?,Prediction engine?,Post-prediction reasoning?,Survey?,Reviewer
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
10.18653/v1/2023.emnlp-industry.69,Harnessing LLMs for Temporal Data - A Study on...,10.18653/v1/2023.emnlp-industry.69,https://doi.org/10.18653/v1/2023.emnlp-industr...,Stocks,✔️,✔️,✔️,❌,Olav
10.1007/s43546-021-00106-0,Construction of a news article evaluation mode...,10.1007/s43546-021-00106-0,https://doi.org/10.1007/s43546-021-00106-0,Stocks,✔️,❌,❌,❌,Sondre
10.1007/978-3-030-58790-1_7,News Articles Evaluation Analysis in Automotiv...,10.1007/978-3-030-58790-1_7,https://doi.org/10.1007/978-3-030-58790-1_7,Stocks,✔️,❌,❌,❌,Sondre
10.1109/ICCA62237.2024.10927897,Assessing the Correlation Between News Sentime...,10.1109/icca62237.2024.10927897,https://doi.org/10.1109/icca62237.2024.10927897,?,❌,❌,❌,❌,Sondre
2-s2.0-85195171155,Modal-adaptive Knowledge-enhanced Graph-based ...,,,Multiple assets,✔️,✔️,❌,❌,Olav
...,...,...,...,...,...,...,...,...,...
10.1016/j.dss.2024.114362,Revisiting time-varying dynamics in stock mark...,10.1016/j.dss.2024.114362,https://doi.org/10.1016/j.dss.2024.114362,Stock,❌,❌,❌,❌,Sander
10.54364/AAIML.2025.52216,Assessing Lag-Llama in Probabilistic Time Seri...,10.54364/aaiml.2025.52216,https://doi.org/10.54364/aaiml.2025.52216,Stock,❌,❌,❌,❌,Sondre
10.1111/exsy.70018,"Generative AI for Finance: Applications, Case ...",10.1111/exsy.70018,https://doi.org/10.1111/exsy.70018,?,❌,❌,❌,✔️,Sander
10.1145/3677052.3698684,Transformers and attention-based networks in q...,10.1145/3677052.3698684,https://doi.org/10.1145/3677052.3698684,,❌,❌,❌,✔️,Sondre
