In [1]:
from bertopic import BERTopic
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [2]:
# Load the CSV file into a DataFrame
df = pd.read_csv('crossref_data.csv')

# Inspect the first few rows and columns
print(df.head())
print(df.columns)

                          DOI  \
0  10.1007/s43681-024-00521-7   
1  10.1007/s43681-023-00315-3   
2        10.2139/ssrn.4784555   
3  10.1007/s43681-024-00519-1   
4     10.1201/9781032654829-2   

                                               Title  \
0  Ethics and the use of generative AI in profess...   
1  Does the sun rise for ChatGPT? Scientific disc...   
2  The Ethics of Generative Ai in Social Science ...   
3  A powerful potion for a potent problem: transf...   
4  Generative Artificial Intelligence: Introducti...   

                                            Abstract  
0  <jats:title>Abstract</jats:title><jats:p>Gener...  
1                                                NaN  
2                                                NaN  
3  <jats:title>Abstract</jats:title><jats:p>Gener...  
4                                                NaN  
Index(['DOI', 'Title', 'Abstract'], dtype='object')


In [3]:
# Remove rows where 'Abstract' column has the value 'N/A'
df = df[~df['Abstract'].isin(['N/A']) & df['Abstract'].notna()]

In [4]:
df.head()

Unnamed: 0,DOI,Title,Abstract
0,10.1007/s43681-024-00521-7,Ethics and the use of generative AI in profess...,<jats:title>Abstract</jats:title><jats:p>Gener...
3,10.1007/s43681-024-00519-1,A powerful potion for a potent problem: transf...,<jats:title>Abstract</jats:title><jats:p>Gener...
5,10.4018/979-8-3693-8557-9.ch009,Generative AI for Cybersecurity,<jats:p>The intersection of cybersecurity and ...
10,10.4018/979-8-3693-8557-9.ch002,Generative AI,"<jats:p>For nearly 50 years, artificial intell..."
13,10.1007/s43681-024-00443-4,AI hype as a cyber security risk: the moral re...,<jats:title>Abstract</jats:title><jats:p>This ...


In [5]:
# Define the cleaning function
def clean_html_xml(text):
    # Create a BeautifulSoup object to parse the HTML/XML
    soup = BeautifulSoup(text, 'html.parser')
    # Extract and return text without HTML/XML tags
    return soup.get_text()

In [6]:
df['cleaned_text'] = df['Abstract'].apply(clean_html_xml)

In [7]:
# Initialize and fit the BERTopic model and remove stop words from the vectorizer
from sklearn.feature_extraction.text import CountVectorizer
vectorizer_model = CountVectorizer(stop_words="english")
topic_model = BERTopic(vectorizer_model=vectorizer_model)

In [8]:
topics, probs = topic_model.fit_transform(df['cleaned_text'])

In [9]:
topic_model.get_topic_info()


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,34,-1_journals_computing_guidelines_genai,"[journals, computing, guidelines, genai, edgec...",[<p>As a specific category of artificial intel...
1,0,625,0_ai_generative_education_learning,"[ai, generative, education, learning, research...",[This study looks into how generative artifici...
2,1,561,1_ai_ethical_ethics_systems,"[ai, ethical, ethics, systems, human, moral, a...",[\nUNSTRUCTURED\nThe use of artificial intelli...
3,2,126,2_data_generative_medical_health,"[data, generative, medical, health, models, ai...","[In recent years, the rapid development of AI ..."
4,3,30,3_cybersecurity_security_cyber_ai,"[cybersecurity, security, cyber, ai, gans, thr...",[<p>The digital landscape of the modern world ...


In [14]:
topic_model.get_topic(2)

[('data', 0.0576349564120377),
 ('generative', 0.049741666231350706),
 ('medical', 0.0417035732372851),
 ('health', 0.03674487210025728),
 ('models', 0.03359216337900728),
 ('ai', 0.0332212441201029),
 ('healthcare', 0.03317070884725877),
 ('clinical', 0.029054173111054894),
 ('potential', 0.025050807798555248),
 ('care', 0.024171745464558816)]

In [12]:
topic_model.get_representative_docs(0)

["This study looks into how generative artificial intelligence (AI) is affecting the field of education. The study examines the possible advantages of using generative AI into instruction, such as more individualized learning opportunities, enhanced accessibility, and creative teaching strategies. The study also recognizes and examines the moral issues raised by the application of AI in education, highlighting the significance of upholding academic honesty and openness. It draws attention to the necessity of cautious application in order to guarantee that AI fosters fruitful learning results. The suggested research technique describes a mixed methods strategy that combines quantitative data analysis with qualitative information from surveys and interviews. The goal of the project is to provide best practices and standards for the appropriate integration of generative AI in education, while also identifying the potential and problems that this technology presents. The anticipated result