In [5]:
from bertopic import BERTopic
from bs4 import BeautifulSoup
import pandas as pd

In [21]:
# Read the exported records from disk into a DataFrame
df = pd.read_csv('3n.csv')

# Show the leading rows followed by the column labels (one item per line)
print(df.head(), df.columns, sep="\n")

                            DOI  \
0     10.47363/jaicc/2024(3)347   
1          10.2139/ssrn.4782875   
2    10.9781/ijimai.2024.02.006   
3          10.2139/ssrn.4614223   
4  10.55982/openpraxis.16.1.654   

                                               Title  \
0  Impact of AI and GenAI on Healthcare: Security...   
1  Knowledge Management Perspective of GenAI (GenAI)   
2  GenAI in Product Design Education: Navigating ...   
3  GenAI Against Humanity: Nefarious Applications...   
4  GenAI et al.: Cocreation, Authorship, Ownershi...   

                                            Abstract  
0  <jats:p>This paper aims to understand the impa...  
1                                                NaN  
2                                                NaN  
3                                                NaN  
4                                                NaN  
Index(['DOI', 'Title', 'Abstract'], dtype='object')


In [22]:
# Keep only rows that actually have an abstract: this drops missing values
# (NaN) as well as literal 'N/A' placeholders.
# The trailing .copy() makes the result an independent DataFrame rather than a
# selection taken from the loaded frame, because a new column is added to it
# in a later cell.
df = df[df['Abstract'].notna() & ~df['Abstract'].isin(['N/A'])].copy()

In [23]:
# Preview the first rows that remain after the abstract filter
df.head()

Unnamed: 0,DOI,Title,Abstract
0,10.47363/jaicc/2024(3)347,Impact of AI and GenAI on Healthcare: Security...,<jats:p>This paper aims to understand the impa...
5,10.7256/2454-0625.2024.6.70926,Socio-cultural risks of multimodal large gener...,<jats:p>\n The article is devoted to the study...
8,10.1007/s43681-022-00201-4,Assessing the ethical and social concerns of A...,<jats:title>Abstract</jats:title><jats:p>Ethic...
9,10.1007/s43681-023-00330-4,Ethics by design for AI,<jats:title>Abstract</jats:title><jats:p>In th...
11,10.59728/jaie.2024.3.2.6,Trends and Standardization of AI (AI) Ethics R...,"<jats:p>With the development of AI technology,..."


In [5]:
# Define the cleaning function
def clean_html_xml(text):
    """Strip HTML/XML markup (e.g. JATS ``<jats:p>`` tags) from ``text``.

    Parameters
    ----------
    text : str or bytes
        Raw markup. Any other value (``None``, a float ``NaN`` coming from a
        missing CSV cell, ...) is treated as "no text".

    Returns
    -------
    str
        The text content with tags removed and every run of whitespace
        collapsed to a single space; ``""`` when there is nothing to parse.
    """
    # Guard first: a missing cell is not markup, so do not hand it to the parser.
    if not isinstance(text, (str, bytes)):
        return ""
    # Create a BeautifulSoup object to parse the HTML/XML
    soup = BeautifulSoup(text, 'html.parser')
    # Join text nodes with a space so adjacent elements do not fuse into one
    # token: "<jats:title>Abstract</jats:title><jats:p>Ethics" would otherwise
    # come out as "AbstractEthics".
    raw_text = soup.get_text(separator=" ")
    # Collapse newlines/indentation left over from the markup layout.
    return " ".join(raw_text.split())

In [6]:
# Strip the markup from every abstract, element by element, and store the
# plain text in a new column.
df['cleaned_text'] = df['Abstract'].map(clean_html_xml)

In [24]:
# Configure (without fitting yet) a BERTopic model whose CountVectorizer drops English stop words
# NOTE(review): two later cells also assign `topic_model`; on a top-to-bottom run the last assignment wins.
from sklearn.feature_extraction.text import CountVectorizer
vectorizer_model = CountVectorizer(stop_words="english")
topic_model = BERTopic(vectorizer_model=vectorizer_model)

In [9]:
# Initialize (this cell does not fit) a BERTopic model with the "all-MiniLM-L6-v2" embedding model
# NOTE(review): this rebinds `topic_model` without a `vectorizer_model`, so the stop-word configuration from the earlier cell is not carried over.
topic_model = BERTopic(embedding_model = "all-MiniLM-L6-v2")

In [11]:
from transformers import pipeline
from bertopic import BERTopic

# Create the embedding model pipeline
# NOTE(review): the recorded run of this cell warned that a hardware
# accelerator was available but no `device` argument was passed, so the model
# ran on CPU; pass e.g. `device=0` to pipeline(...) to use the GPU.
embedding_model = pipeline("feature-extraction", model="cambridgeltl/SapBERT-from-PubMedBERT-fulltext")


def embed_documents(documents):
    """Run ``documents`` through the pipeline with truncation and padding.

    Returns whatever the feature-extraction pipeline returns for the given
    input. Kept for ad-hoc inspection only; it is deliberately NOT what is
    handed to BERTopic below.
    """
    return embedding_model(documents, truncation=True, padding=True)


# Hand the pipeline object itself to BERTopic. BERTopic picks its embedding
# backend from the type of `embedding_model` (a transformers pipeline, a
# sentence-transformers model or model name, ...); a plain Python function
# such as `embed_documents` is not one of the supported types, so SapBERT
# would not be used as intended. Passing the pipeline directly is the usage
# shown in the BERTopic documentation for Hugging Face models.
# NOTE(review): like the other model cells, this rebinds `topic_model` and
# does not carry over any `vectorizer_model` configured earlier.
topic_model = BERTopic(embedding_model=embedding_model)


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [25]:
# Fit the topic model on the paper titles.
# BERTopic documents its input as a list of strings, so pass a plain list
# rather than the pandas Series (whose index is no longer contiguous after the
# abstract filter).
topics, probs = topic_model.fit_transform(df['Title'].tolist())

In [18]:
# Fit the topic model on the cleaned abstract text.
# BERTopic documents its input as a list of strings, so pass a plain list
# rather than the pandas Series (whose index is no longer contiguous after the
# abstract filter).
# NOTE(review): the title-based fit in the preceding cell assigns the same
# `topics`/`probs` on the same `topic_model`; on a top-to-bottom run this
# abstract-based fit is the one that remains in effect.
topics, probs = topic_model.fit_transform(df['cleaned_text'].tolist())

In [26]:
# Show the per-topic summary table of the fitted model (topic id, count, name, representation, representative docs)
topic_model.get_topic_info()


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,61,-1_ai_genai_impact_development,"[ai, genai, impact, development, design, using...","[DARPA's Impact on AI, Super AI, GenAI, Narrow..."
1,0,133,0_ai_ethics_ethical_healthcare,"[ai, ethics, ethical, healthcare, concerns, mo...",[Ethics &amp; AI: A Systematic Review on Ethic...
2,1,115,1_ai_impact_management_intelligence,"[ai, impact, management, intelligence, digital...","[Impact Of AI AI in Digital Marketing, The Imp..."
3,2,68,2_genai_data_transforming_research,"[genai, data, transforming, research, impact, ...",[transformative potential of GenAI (GenAI) in...
4,3,24,3_genai_education_using_learning,"[genai, education, using, learning, higher, ed...",[Empowering Education by Developing and Evalua...
5,4,13,4_education_ai_impact_review,"[education, ai, impact, review, technology, le...",[The Impact of AI (AI) on Midwifery Education:...
6,5,12,5_adversarial_generative_network_networks,"[adversarial, generative, network, networks, n...",[Pedestrian detection under weather conditions...


In [28]:
# Show the (word, score) pairs that describe topic 1
topic_model.get_topic(1)

[('ai', 0.1561869836469344),
 ('impact', 0.09505162204330031),
 ('management', 0.04278399121561092),
 ('intelligence', 0.04055608021175361),
 ('digital', 0.03548657018528441),
 ('machine', 0.03116880159592174),
 ('human', 0.027983015427003145),
 ('control', 0.025637439424174877),
 ('learning', 0.02536929813007842),
 ('social', 0.025347550132346002)]

In [29]:
# Look up the representative documents stored for topic 6
# NOTE(review): the topic table recorded earlier lists topics -1 through 5 only, and this cell
# shows no output — confirm that topic 6 exists for the model being inspected.
topic_model.get_representative_docs(6)

In [30]:

# Render BERTopic's built-in topic overview figure for the fitted model
topic_model.visualize_topics() 

In [31]:
# Render BERTopic's built-in bar chart figure for the fitted model's topics
topic_model.visualize_barchart()