In [1]:
import pandas as pd
from bertopic import BERTopic
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer

In [27]:
# Load the CSV file into a DataFrame.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in pandas 2.0;
# `on_bad_lines` is its replacement. "warn" still skips malformed rows but
# reports each one, so dropped records are visible instead of vanishing silently.
df = pd.read_csv('6n_cleaned.csv', on_bad_lines='warn')

# Inspect the first few rows and the column names
print(df.head())
print(df.columns)

                                    DOI  \
0             10.47363/jaicc/2024(3)347   
1        10.70715/jitcai.2024.v1.i1.004   
2        10.7256/2454-0625.2024.6.70926   
3  10.51191/issn.2637-1898.2024.7.12.12   
4            10.1007/s43681-022-00176-2   

                                               Title  \
0  Impact of AI and GenAI on Healthcare: Security...   
1  The Impact of GenAI on Student Engagement and ...   
2  Socio-cultural risks of multimodal large gener...   
3  AI: Duality in Applications of GenAI and Assis...   
4    Review of the state of the art in autonomous AI   

                                            Abstract  
0  <jats:p>This paper aims to understand the impa...  
1  <jats:p>The rapid adoption of AI (AI) in highe...  
2  <jats:p>\n The article is devoted to the study...  
3  <jats:p>This paper explores the multifaceted r...  
4  <jats:title>Abstract</jats:title><jats:p>This ...  
Index(['DOI', 'Title', 'Abstract'], dtype='object')


In [28]:
# Rich (HTML) preview of the first five rows
df.head(n=5)

Unnamed: 0,DOI,Title,Abstract
0,10.47363/jaicc/2024(3)347,Impact of AI and GenAI on Healthcare: Security...,<jats:p>This paper aims to understand the impa...
1,10.70715/jitcai.2024.v1.i1.004,The Impact of GenAI on Student Engagement and ...,<jats:p>The rapid adoption of AI (AI) in highe...
2,10.7256/2454-0625.2024.6.70926,Socio-cultural risks of multimodal large gener...,<jats:p>\n The article is devoted to the study...
3,10.51191/issn.2637-1898.2024.7.12.12,AI: Duality in Applications of GenAI and Assis...,<jats:p>This paper explores the multifaceted r...
4,10.1007/s43681-022-00176-2,Review of the state of the art in autonomous AI,<jats:title>Abstract</jats:title><jats:p>This ...


In [29]:
# Define the cleaning function
def clean_html_xml(text) -> str:
    """Strip HTML/XML markup from ``text`` and return the remaining plain text.

    Args:
        text: Markup string to clean. ``None`` and float NaN are also accepted
            so the function can be applied to a column with missing values.

    Returns:
        The text content with all tags removed, or an empty string when
        ``text`` is a missing value.
    """
    # Missing values cannot be parsed as markup; map them to an empty document.
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    # Parse with the built-in HTML parser and keep only the text nodes.
    soup = BeautifulSoup(text, 'html.parser')
    return soup.get_text()

In [30]:
# Strip the markup from every abstract and keep the result in a new column
df['cleaned_text'] = df['Abstract'].map(clean_html_xml)

In [20]:
# Build the BERTopic model (fitting happens in a later cell).
# The custom vectorizer drops English stop words so they do not
# appear in the topic representations.
vectorizer_model = CountVectorizer(stop_words="english")
topic_model = BERTopic(vectorizer_model=vectorizer_model)

In [33]:
# Fit the model and assign a topic to every cleaned abstract.
# The documents are handed over as a plain list of strings rather than a
# pandas Series, so the fit does not depend on the DataFrame's index.
# NOTE(review): no random seed is configured for the model, so topic
# assignments may differ between runs — confirm whether reproducibility matters.
topics, probs = topic_model.fit_transform(df['cleaned_text'].tolist())

In [34]:
# Summary table with one row per discovered topic
topic_info = topic_model.get_topic_info()
topic_info


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,117,-1_ai_genai_research_ethical,"[ai, genai, research, ethical, data, impact, u...","[AI encompasses a wide range of approaches, me..."
1,0,117,0_ai_ethical_ethics_moral,"[ai, ethical, ethics, moral, human, developmen...","[Aim: AI systems can be complex and opaque, ma..."
2,1,55,1_students_genai_learning_education,"[students, genai, learning, education, ai, edu...",[Integrating GenAI into education has sparked ...
3,2,32,2_art_ai_content_music,"[art, ai, content, music, creative, new, aigen...",[With the rapid development of information soc...
4,3,29,3_business_marketing_genai_customer,"[business, marketing, genai, customer, ai, stu...",[The research paper investigates the comparati...
5,4,26,4_healthcare_medical_ai_care,"[healthcare, medical, ai, care, patient, data,...",[AI (AI) is revolutionizing the healthcare sec...
6,5,17,5_gan_data_network_adversarial,"[gan, data, network, adversarial, generative, ...","[<p><span lang=""EN-US"">Research in the field o..."
7,6,15,6_genai_text_extraction_language,"[genai, text, extraction, language, models, ne...",[This study is an in-depth exploration of the ...
8,7,15,7_ethical_healthcare_ai_principles,"[ethical, healthcare, ai, principles, issues, ...",[Public and private investments into developin...
9,8,11,8_security_cybersecurity_cyber_threats,"[security, cybersecurity, cyber, threats, ai, ...",[AbstractThis paper examines the ethical oblig...


In [35]:
# Top terms and their weights for topic 1
topic_model.get_topic(topic=1)

[('students', 0.06411743498806728),
 ('genai', 0.054761840667846874),
 ('learning', 0.054469330898218815),
 ('education', 0.053691835803015696),
 ('ai', 0.04513373289321975),
 ('educational', 0.03990377553195003),
 ('higher', 0.028553010893517274),
 ('tools', 0.026271776235088657),
 ('educators', 0.025865146475131127),
 ('academic', 0.02520832997677343)]

In [36]:
# Example abstracts that the model considers representative of topic 6
topic_model.get_representative_docs(topic=6)

['This study is an in-depth exploration of the nascent field of Natural Language Processing (NLP) and GenAI (AI), and it concentrates on the vital task of distinguishing between human-generated text and content that has been produced by AI models. Particularly, this research pioneers the identification of financial text derived from AI models such as GenAI and paraphrasing tools like QuillBot. While our primary focus is on financial content, we have also pinpointed texts generated by paragraph rewriting tools and utilized GenAI for various contexts this multiclass identification was missing in previous studies. In this paper, we use a comprehensive feature extraction methodology that combines TF–IDF with Word2Vec, along with individual feature extraction methods. Importantly, combining a Random Forest model with Word2Vec results in impressive outcomes. Moreover, this study investigates the significance of the window size parameters in the Word2Vec approach, revealing that a window size

In [37]:

# Interactive topic visualization; the figure renders as the cell output
topics_fig = topic_model.visualize_topics()
topics_fig

In [38]:
# Bar charts of the term scores per topic; the figure renders as the cell output
barchart_fig = topic_model.visualize_barchart()
barchart_fig

In [39]:

# Persist the fitted model to disk.
# NOTE(review): this is an absolute, machine-specific path — consider a
# configurable project-relative location.
model_path = "/Users/siri/Documents/GitHub/CrossRefData/bertopic_model"
topic_model.save(model_path)



