In [None]:
from top2vec import Top2Vec
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Load the CSV file into a DataFrame
df = pd.read_csv('6n.csv')

# Inspect the first few rows and columns
print(df.head())
print(df.columns)

                              DOI  \
0       10.47363/jaicc/2024(3)347   
1  10.70715/jitcai.2024.v1.i1.004   
2            10.2139/ssrn.4782875   
3      10.9781/ijimai.2024.02.006   
4            10.2139/ssrn.4614223   

                                               Title  \
0  Impact of AI and GenAI on Healthcare: Security...   
1  The Impact of GenAI on Student Engagement and ...   
2  Knowledge Management Perspective of GenAI (GenAI)   
3  GenAI in Product Design Education: Navigating ...   
4  GenAI Against Humanity: Nefarious Applications...   

                                            Abstract  
0  <jats:p>This paper aims to understand the impa...  
1  <jats:p>The rapid adoption of AI (AI) in highe...  
2                                                NaN  
3                                                NaN  
4                                                NaN  
Index(['DOI', 'Title', 'Abstract'], dtype='object')


In [3]:
# Remove rows where 'Abstract' column has the value 'N/A'
df = df[~df['Abstract'].isin(['N/A']) & df['Abstract'].notna()]

In [5]:
num_rows = len(df)
print(f"Number of rows in the DataFrame: {num_rows}")

Number of rows in the DataFrame: 434


In [6]:
# Define the cleaning function
def clean_html_xml(text):
    # Create a BeautifulSoup object to parse the HTML/XML
    soup = BeautifulSoup(text, 'html.parser')
    # Extract and return text without HTML/XML tags
    return soup.get_text()

In [7]:
df['cleaned_text'] = df['Abstract'].apply(clean_html_xml)

In [8]:
# remove stop words
from sklearn.feature_extraction.text import CountVectorizer
vectorizer_model = CountVectorizer(stop_words="english")

In [9]:
df.head()

Unnamed: 0,DOI,Title,Abstract,cleaned_text
0,10.47363/jaicc/2024(3)347,Impact of AI and GenAI on Healthcare: Security...,<jats:p>This paper aims to understand the impa...,This paper aims to understand the impact of AI...
1,10.70715/jitcai.2024.v1.i1.004,The Impact of GenAI on Student Engagement and ...,<jats:p>The rapid adoption of AI (AI) in highe...,The rapid adoption of AI (AI) in higher educat...
6,10.7256/2454-0625.2024.6.70926,Socio-cultural risks of multimodal large gener...,<jats:p>\n The article is devoted to the study...,\n The article is devoted to the study of the ...
7,10.51191/issn.2637-1898.2024.7.12.12,AI: Duality in Applications of GenAI and Assis...,<jats:p>This paper explores the multifaceted r...,This paper explores the multifaceted role of A...
9,10.1007/s43681-022-00176-2,Review of the state of the art in autonomous AI,<jats:title>Abstract</jats:title><jats:p>This ...,AbstractThis article presents a new design for...


In [10]:
documents = df['cleaned_text'].tolist()


In [11]:
model = Top2Vec(documents, embedding_model='universal-sentence-encoder')

2024-11-12 16:07:13,798 - top2vec - INFO - Pre-processing documents for training
2024-11-12 16:07:13,958 - top2vec - INFO - Downloading universal-sentence-encoder model
2024-11-12 16:08:06,547 - top2vec - INFO - Creating joint document/word embedding
INFO:top2vec:Creating joint document/word embedding
2024-11-12 16:08:07,011 - top2vec - INFO - Creating lower dimension embedding of documents
INFO:top2vec:Creating lower dimension embedding of documents
2024-11-12 16:08:10,148 - top2vec - INFO - Finding dense areas of documents
INFO:top2vec:Finding dense areas of documents
2024-11-12 16:08:10,174 - top2vec - INFO - Finding topics
INFO:top2vec:Finding topics


In [12]:
print(model.documents)
print(model.topic_words)


["This paper aims to understand the impact of AI and GenAI on the healthcare sector. As AI grows, it becomes an integral part of the healthcare system's components, such as the diagnosis process, the development of personalized treatments, and improvement in overall patient care. However, such enhancements raise severe concerns regarding data security and the privacy of patients."
 'The rapid adoption of AI (AI) in higher education is reshaping students’ learning experiences, with tools such as GenAI, Grammarly, and Microsoft Copilot becoming integral to academic work. This study, informed by data from the Digital Education Council Global AI Student Survey 2024, examines the impact of AI on students, focusing on usage patterns, trust in AI-generated content, ethical awareness, and expectations for institutional support. Findings indicate that 86% of students use AI for various academic tasks, with a majority expressing concerns about trust, fairness, and over-reliance on AI. While stud

In [13]:
num_topics = model.get_num_topics()
print(f"Number of topics: {num_topics}")


Number of topics: 5


In [14]:
topics = model.get_topics(5)
print(topics)


(array([['ai', 'ethics', 'ethical', 'algorithms', 'technologies', 'human',
        'generative', 'technology', 'moral', 'implications',
        'innovation', 'article', 'concerns', 'decision',
        'considerations', 'approaches', 'academic', 'based', 'explores',
        'risks', 'enhance', 'data', 'digital', 'future', 'challenges',
        'models', 'between', 'paper', 'approach', 'model', 'genai',
        'machine', 'critical', 'about', 'regarding', 'within',
        'assessment', 'analysis', 'development', 'society', 'study',
        'training', 'studies', 'science', 'research', 'insights',
        'impact', 'understanding', 'privacy', 'methods'],
       ['ai', 'algorithms', 'technologies', 'generative', 'ethics',
        'ethical', 'enhance', 'analysis', 'data', 'technology',
        'studies', 'study', 'considerations', 'implications', 'digital',
        'innovation', 'decision', 'approaches', 'human', 'efficiency',
        'paper', 'which', 'approach', 'assessment', 'impact', '

In [None]:
documents, document_scores, document_ids = model.search_documents_by_topic(topic_num=1, num_docs=5)
for doc, score, doc_id in zip(documents, document_scores, document_ids):
    print(f"Document: {doc_id}, Score: {score}")
    print("-----------")
    print(doc)
    print("-----------")
    print()

In [16]:
num_lines = len(documents)
print(f"Number of lines in the list: {num_lines}")

Number of lines in the list: 434
