In [1]:
from bertopic import BERTopic
from bs4 import BeautifulSoup
import pandas as pd

In [6]:
# Load the CSV file into a DataFrame
df = pd.read_csv('6n.csv', error_bad_lines=False)

# Inspect the first few rows and columns
print(df.head())
print(df.columns)

                              DOI  \
0       10.47363/jaicc/2024(3)347   
1  10.70715/jitcai.2024.v1.i1.004   
2            10.2139/ssrn.4782875   
3      10.9781/ijimai.2024.02.006   
4            10.2139/ssrn.4614223   

                                               Title  \
0  Impact of AI and GenAI on Healthcare: Security...   
1  The Impact of GenAI on Student Engagement and ...   
2  Knowledge Management Perspective of GenAI (GenAI)   
3  GenAI in Product Design Education: Navigating ...   
4  GenAI Against Humanity: Nefarious Applications...   

                                            Abstract  
0  <jats:p>This paper aims to understand the impa...  
1  <jats:p>The rapid adoption of AI (AI) in highe...  
2                                                NaN  
3                                                NaN  
4                                                NaN  
Index(['DOI', 'Title', 'Abstract'], dtype='object')


In [7]:
# Remove rows where 'Abstract' column has the value 'N/A'
df = df[~df['Abstract'].isin(['N/A']) & df['Abstract'].notna()]

In [8]:
df.head()

Unnamed: 0,DOI,Title,Abstract
0,10.47363/jaicc/2024(3)347,Impact of AI and GenAI on Healthcare: Security...,<jats:p>This paper aims to understand the impa...
1,10.70715/jitcai.2024.v1.i1.004,The Impact of GenAI on Student Engagement and ...,<jats:p>The rapid adoption of AI (AI) in highe...
6,10.7256/2454-0625.2024.6.70926,Socio-cultural risks of multimodal large gener...,<jats:p>\n The article is devoted to the study...
7,10.51191/issn.2637-1898.2024.7.12.12,AI: Duality in Applications of GenAI and Assis...,<jats:p>This paper explores the multifaceted r...
9,10.1007/s43681-022-00176-2,Review of the state of the art in autonomous AI,<jats:title>Abstract</jats:title><jats:p>This ...


In [10]:
# Define the cleaning function
def clean_html_xml(text):
    # Create a BeautifulSoup object to parse the HTML/XML
    soup = BeautifulSoup(text, 'html.parser')
    # Extract and return text without HTML/XML tags
    return soup.get_text()

In [11]:
df['cleaned_text'] = df['Abstract'].apply(clean_html_xml)

In [20]:
# Initialize and fit the BERTopic model and remove stop words from the vectorizer
from sklearn.feature_extraction.text import CountVectorizer
vectorizer_model = CountVectorizer(stop_words="english")
topic_model = BERTopic(vectorizer_model=vectorizer_model)

In [15]:
# Initialize and fit the BERTopic model
topic_model = BERTopic(embedding_model = "all-MiniLM-L6-v2")

In [18]:
from transformers import pipeline
from bertopic import BERTopic

# Create the embedding model pipeline
embedding_model = pipeline("feature-extraction", model="cambridgeltl/SapBERT-from-PubMedBERT-fulltext")

# Custom wrapper for the embedding model to include truncation and padding
def embed_documents(documents):
    return embedding_model(documents, truncation=True, padding=True)

# Initialize the BERTopic model with the custom embedding function
topic_model = BERTopic(embedding_model=embed_documents)


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [21]:
topics, probs = topic_model.fit_transform(df['Title'])

In [23]:
topics, probs = topic_model.fit_transform(df['cleaned_text'])

In [24]:
topic_model.get_topic_info()


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,113,-1_ai_genai_research_ethical,"[ai, genai, research, ethical, use, data, syst...",[The integration of AI (AI) into Library and I...
1,0,116,0_ai_ethical_ethics_moral,"[ai, ethical, ethics, moral, human, developmen...","[Aim: AI systems can be complex and opaque, ma..."
2,1,54,1_students_genai_learning_education,"[students, genai, learning, education, ai, edu...",[Integrating GenAI into education has sparked ...
3,2,31,2_art_ai_music_new,"[art, ai, music, new, content, creative, aigen...",[With the rapid development of information soc...
4,3,28,3_healthcare_medical_ai_care,"[healthcare, medical, ai, care, patient, data,...",[AI (AI) is revolutionizing the healthcare sec...
5,4,25,4_business_marketing_customer_genai,"[business, marketing, customer, genai, study, ...",[The research paper investigates the comparati...
6,5,18,5_data_gan_network_generative,"[data, gan, network, generative, adversarial, ...","[<p><span lang=""EN-US"">Research in the field o..."
7,6,15,6_ethical_healthcare_ai_principles,"[ethical, healthcare, ai, principles, issues, ...",[Public and private investments into developin...
8,7,12,7_extraction_text_news_document,"[extraction, text, news, document, genai, lang...",[This study is an in-depth exploration of the ...
9,8,12,8_ai_hrm_management_economy,"[ai, hrm, management, economy, work, meaningfu...",[This research discusses the impact of the int...


In [None]:
topic_model.get_topic(1)

In [None]:
topic_model.get_representative_docs(6)

In [None]:

topic_model.visualize_topics() 

In [None]:
topic_model.visualize_barchart()

In [25]:

topic_model.save("/Users/siri/Documents/GitHub/CrossRefData/bertopic_model")



