In [1]:
from bertopic import BERTopic
from bs4 import BeautifulSoup
import pandas as pd

In [3]:
# Load the CSV file into a DataFrame
df = pd.read_csv('crossref_data.csv')

# Inspect the first few rows and columns
print(df.head())
print(df.columns)

                          DOI  \
0  10.1007/s43681-024-00521-7   
1  10.1007/s43681-023-00315-3   
2        10.2139/ssrn.4784555   
3  10.1007/s43681-024-00519-1   
4     10.1201/9781032654829-2   

                                               Title  \
0  Ethics and the use of generative AI in profess...   
1  Does the sun rise for ChatGPT? Scientific disc...   
2  The Ethics of Generative Ai in Social Science ...   
3  A powerful potion for a potent problem: transf...   
4  Generative Artificial Intelligence: Introducti...   

                                            Abstract  
0  <jats:title>Abstract</jats:title><jats:p>Gener...  
1                                                NaN  
2                                                NaN  
3  <jats:title>Abstract</jats:title><jats:p>Gener...  
4                                                NaN  
Index(['DOI', 'Title', 'Abstract'], dtype='object')


In [5]:
# Remove rows where 'Abstract' column has the value 'N/A'
df = df[~df['Abstract'].isin(['N/A']) & df['Abstract'].notna()]

In [12]:
df.head()

Unnamed: 0,DOI,Title,Abstract,cleaned_text
0,10.1007/s43681-024-00521-7,Ethics and the use of generative AI in profess...,<jats:title>Abstract</jats:title><jats:p>Gener...,AbstractGenerative artificial intelligence (Gn...
3,10.1007/s43681-024-00519-1,A powerful potion for a potent problem: transf...,<jats:title>Abstract</jats:title><jats:p>Gener...,AbstractGenerative Artificial Intelligence (AI...
5,10.4018/979-8-3693-8557-9.ch009,Generative AI for Cybersecurity,<jats:p>The intersection of cybersecurity and ...,The intersection of cybersecurity and generati...
10,10.4018/979-8-3693-8557-9.ch002,Generative AI,"<jats:p>For nearly 50 years, artificial intell...","For nearly 50 years, artificial intelligence (..."
13,10.1007/s43681-024-00443-4,AI hype as a cyber security risk: the moral re...,<jats:title>Abstract</jats:title><jats:p>This ...,AbstractThis paper examines the ethical obliga...


In [7]:
# Define the cleaning function
def clean_html_xml(text):
    # Create a BeautifulSoup object to parse the HTML/XML
    soup = BeautifulSoup(text, 'html.parser')
    # Extract and return text without HTML/XML tags
    return soup.get_text()

In [8]:
df['cleaned_text'] = df['Abstract'].apply(clean_html_xml)

In [8]:
# Initialize and fit the BERTopic model and remove stop words from the vectorizer
from sklearn.feature_extraction.text import CountVectorizer
vectorizer_model = CountVectorizer(stop_words="english")
topic_model = BERTopic(vectorizer_model=vectorizer_model)

In [9]:
# Initialize and fit the BERTopic model
topic_model = BERTopic(embedding_model = "all-MiniLM-L6-v2")

In [14]:
from transformers import pipeline
from bertopic import BERTopic

# Create the embedding model pipeline
embedding_model = pipeline("feature-extraction", model="cambridgeltl/SapBERT-from-PubMedBERT-fulltext")

# Custom wrapper for the embedding model to include truncation and padding
def embed_documents(documents):
    return embedding_model(documents, truncation=True, padding=True)

# Initialize the BERTopic model with the custom embedding function
topic_model = BERTopic(embedding_model=embed_documents)

# Fit and transform your data
topics, probs = topic_model.fit_transform(df['cleaned_text'])


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [15]:
topics, probs = topic_model.fit_transform(df['Title'])

In [17]:
topics, probs = topic_model.fit_transform(df['cleaned_text'])

In [18]:
topic_model.get_topic_info()


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,15,-1_cloud_computing_genai_edge,"[cloud, computing, genai, edge, and, the, syst...",[<p>As a specific category of artificial intel...
1,0,1331,0_the_and_of_to,"[the, and, of, to, in, ai, for, this, that, is]",[\nBackground\nThe integration of artificial i...
2,1,30,1_the_and_of_to,"[the, and, of, to, cybersecurity, in, security...",[This chapter explores the burgeoning role of ...


In [None]:
topic_model.get_topic(2)

In [None]:
topic_model.get_representative_docs(12)

In [None]:

topic_model.visualize_topics() 

In [None]:
topic_model.visualize_barchart()