In [None]:
!pip install bertopic==0.16.0
!pip install sentence-transformers==2.2.2
!pip install transformers==4.30.2

In [None]:
import warnings
warnings.filterwarnings("ignore")

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import pandas as pd
import re
import string
import pickle
from collections import defaultdict

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords


import seaborn as sns
import matplotlib.pyplot as plt

from bertopic import BERTopic

import plotly.io as pio

In [None]:
# Load the raw dataset; index_col=False tells pandas not to use the first
# column as the index.
df = pd.read_csv('llm_2020_2024.csv', index_col=False)

In [None]:
# Preview the raw frame.
df

In [None]:
# Column dtypes and non-null counts of the raw frame.
df.info()

In [None]:
# Build the stop-word list consulted by clean_stopwords().
#
# The English list is read through `nltk.corpus.stopwords` instead of the
# imported name `stopwords`: this cell rebinds `stopwords` to a plain list,
# so reading through the imported name would fail the second time the cell runs.
#
# NOTE(review): entries that contain a space or punctuation ('due to', '(s).',
# 'objective:', '_') cannot match a token when the list is applied through
# clean_text, which strips punctuation and splits on spaces first - confirm
# whether they are still wanted.
stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend(['from', 'best','subject', 're', 'edu','of', 'have', 'not', 'been','however','by','at', 'on', 'to','true','(s).',
                   'verlag', 'trading', 'group', 'solve','bad','high','low','via', 'not','very', 'would',
                   'allows','provides','say', 'could', '_', 'be', 'know', 'that', 'www', 'youtube', 'com', 'watch', 'objective:',
                   'few', 'good', 'go', 'get', 'do', 'done', 'of','try', 'many', 'some', 'important','importance', 'nice','thank',
                   'think', 'see', 'rather', 'easy', 'easily', 'lot','lack', 'make', 'want', 'seem', 'run', 'need', 'even', 'right',
                   'line', 'even', 'also', 'may', 'take', 'come', 'therefore', 'currently', 'some','apply', 'suggest','way','consideration',
                   'relatively', 'due to', 'paper', 'benefit', 'consequently','conclusion','as','suggest', 'bring', 'last','the','year',
                   'regard', 'propose', 'various', 'present', 'represent','include', 'give', 'ensure', 'work','study', 'author', 'because',
                   'become', 'large', 'document', 'still', 'yet', 'discuss', 'finally', 'self','as','result','due', 'less', 'define','purpose',
                   'thus', 'certain','non','able', 'show','article', 'example','allow','simple', 'way','obtain','consider','available','several',
                   'provide','than','show', 'called','than','issue','last', 'hence','future','and', 'context', 'such', 'often', 'which',
                   'been', 'yet', 'for', 'based', 'due', 'to', 'limitations','existing', 'worst', 'case', 'one', 'limitation', 'is', 'the', 'lack',
                   'for', 'are','paper','introduces', 'our', 'vision', 'about', 'how', 'furthermore', 'ieee', 'springer', 'elsevier'])

In [None]:
##  load stopwords
#def get_stopwords():
#    try:
#        with open(PATH_STOPWORDS_TR, 'rb') as data_file:
#            return pickle.load(data_file)
#    except IOError as exc:
#        raise IOError("No such stopwords file! Error: " + str(exc))
#
##  load blacklist
#def get_blacklist():
#    try:
#        with open(PATH_BLACKLIST_TR, 'rb') as data_file:
#            return pickle.load(data_file)
#    except IOError as exc:
#        raise IOError("No such stopwords file! Error: " + str(exc))


#  cleaning stopwords
def clean_stopwords(content):
    """Remove stop words from a string.

    content: str, split on single spaces.
    return: str, the tokens not found in the module-level `stopwords`
            collection, joined back with single spaces.
    """
    kept_tokens = [token for token in content.split(" ") if token not in stopwords]
    return ' '.join(kept_tokens)

##  cleaning blacklist
#def clean_blacklist(content):
#    # content: str
#    # return: str
#    content = content.split(" ")
#    filtered_list = []
#    blacklist = get_blacklist()
#    for word in content:
#        if word not in blacklist:
#            filtered_list.append(word)
#
#    text = ' '.join(filtered_list)
#    return text

#  cleaning URLs
def clean_url(content):
    """Strip URL-like substrings from a string.

    content: str
    return: str, `content` with every match of the URL pattern replaced by
            the empty string (surrounding whitespace is left as it was).
    """
    # Case-insensitive pattern: a scheme, a "www." prefix or a "domain.tld/"
    # prefix, followed by non-space characters / balanced parentheses, and
    # not ending in trailing punctuation.
    url_regex = r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))'''
    return re.sub(url_regex, '', content)

#  cleaning e-mails
def clean_email(content):
    """Remove e-mail-like tokens from a string.

    content: str
    return: str, `content` without any run of non-space characters that
            contains '@'; one whitespace character directly after the
            token is removed together with it.
    """
    # Raw string: '\S' and '\s' are regex escapes, not Python string escapes
    # (a non-raw literal triggers an invalid-escape warning on modern Python).
    reg_email = r'\S*@\S*\s?'
    return re.sub(reg_email, '', content)

#  cleaning punctuation
def clean_punctuation(content):
    """Replace every character of `string.punctuation` with a single space.

    content: str
    return: str of the same length as `content`
    """
    blanks = ' ' * len(string.punctuation)
    punctuation_to_space = str.maketrans(string.punctuation, blanks)
    return content.translate(punctuation_to_space)

#  cleaning digits
def clean_numbers(content):
    """Delete the ASCII digits 0-9 from a string.

    content: str
    return: str without any character of `string.digits`
    """
    return ''.join(char for char in content if char not in string.digits)

def listToString(text):
    """Join the items of `text` with single spaces.

    text: iterable of str
    return: str
    """
    return " ".join(text)

#  cleaning postfix
#  for example: Venezuela'nın ==> Venezuela
def clean_postfix(content):
    """Cut apostrophe suffixes off words and drop very short words.

    content: list of str (words)
    return: str in which every kept word is preceded by one space
            ("" when no word is kept)

    Words of one or two characters are discarded. For the remaining words,
    everything from the first right single quote (U+2019) onwards is removed;
    if the word has no U+2019, everything from the first ASCII apostrophe
    onwards is removed instead.
    """
    right_quote = chr(8217)  # the typographic apostrophe
    apostrophe = chr(39)     # the ASCII apostrophe

    stems = []
    for word in content:
        if len(word) <= 2:
            continue
        # U+2019 takes precedence over the ASCII apostrophe when both occur.
        if right_quote in word:
            word = word.replace(right_quote, " ").split(" ")[0]
        elif apostrophe in word:
            word = word.replace(apostrophe, " ").split(" ")[0]
        stems.append(word)

    # One leading space per word, built in a single pass.
    return "".join(" " + stem for stem in stems)

def clean_text(content):
    """Run the full cleaning pipeline on one text.

    content: str
    return: str of space-separated tokens

    Steps, in order: lower-case, remove e-mails, remove URLs, turn '.' into
    spaces, cut apostrophe suffixes, blank out punctuation, delete digits,
    remove stop words, and finally drop tokens shorter than three characters.
    """
    text = clean_email(content.lower())
    text = clean_url(text)

    # Replace every '.' with a space, then tokenise on single spaces.
    tokens = " ".join(text.split(".")).split(" ")

    text = clean_postfix(tokens)
    text = clean_punctuation(text)
    text = clean_numbers(text)
    text = clean_stopwords(text)

    return ' '.join(token for token in text.split(" ") if len(token) > 2)

In [None]:
df.info()

In [None]:
# Drop records whose abstract is only the placeholder string.
# .copy() makes df_cleaned an independent frame, so modifying it later does
# not operate on a slice of `df`.
print('before drop related string: ', df.shape)
df_cleaned = df[~df['Abstract'].isin(['[No abstract available]'])].copy()
print('after drop related string: ', df_cleaned.shape)

In [None]:
# Keep only records that have authors, a title and an abstract.
# Rebinding the name (instead of inplace=True) avoids mutating a frame that
# may be a slice of `df`.
print('before dropna: ', df_cleaned.shape)
df_cleaned = df_cleaned.dropna(subset=['Authors', 'Title', 'Abstract'])
print('after dropna: ', df_cleaned.shape)

In [None]:
print('before drop_duplicates: ', df_cleaned.shape)
# NOTE(review): keep=False removes EVERY record of a duplicated
# (Title, Abstract) pair, including the first occurrence. If one copy of each
# duplicated paper should stay in the corpus, use keep='first' - confirm.
df_cleaned = df_cleaned.drop_duplicates(subset=['Title', 'Abstract'], keep=False, ignore_index=True)
print('after drop_duplicates: ', df_cleaned.shape)

In [None]:
# Renumber the rows 0..n-1 (the drop_duplicates call above already passes
# ignore_index=True, so this is a safeguard rather than a required step).
df_cleaned.reset_index(drop=True, inplace=True)

In [None]:
# Dtypes and non-null counts after cleaning.
df_cleaned.info()

In [None]:
df_cleaned

In [None]:
# From here on `df` names the cleaned data; the raw frame is no longer
# reachable under this name.
df = df_cleaned.copy()

In [None]:
# First abstract before text cleaning.
df['Abstract'][0]

In [None]:
def cleaning(x):
    """Thin wrapper so the pipeline can be passed to Series.apply."""
    return clean_text(x)

# Keep only the text before the ' © ' marker, then clean it.
df['Abstract_preprocessed'] = df['Abstract'].apply(
    lambda x: x.split(' © ')[0] if pd.notna(x) else x
)
df['Abstract_cleaned'] = df['Abstract_preprocessed'].apply(cleaning)

In [None]:
# First abstract after text cleaning.
df['Abstract_cleaned'][0]

In [None]:
df['Abstract']

In [None]:
df['Abstract_cleaned']

In [None]:
# Work on a copy so the helper column added below does not end up in `df`.
data = df.copy()

In [None]:
# Create a new column containing the word count of each cleaned abstract.
# (The "headline" wording in names and messages refers to the cleaned abstract text.)
data["headline_text_len"] = data['Abstract_cleaned'].apply(lambda x : len(x.split()))
print("The longest headline has: {} words".format(data.headline_text_len.max()))

In [None]:
# Visualize the distribution of word counts per cleaned abstract.
sns.displot(data.headline_text_len, kde=False)

In [None]:
# Print three randomly chosen records for a quick sanity check.
# `.sample(...).index` yields index LABELS, so the row is looked up with
# label-based .loc (positional .iloc only works while labels equal positions).
for idx in data.sample(3).index:
    headline = data.loc[idx]
    print("Headline #{}:".format(idx))
    print("Year: {}".format(headline.Year))
    print("Title: {}".format(headline.Title))
    print("Text: {}\n".format(headline.Abstract_cleaned))

In [None]:
# Persist the cleaned dataset. `df` is written, not `data`, so the
# headline_text_len helper column is not part of the file.
df.to_csv('cleaned_LLM_dataset_2020_2024.csv', index=False, encoding='utf8')

In [None]:
"""
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import KeyBERTInspired, PartOfSpeech, MaximalMarginalRelevance

main_representation_model_KeyBERT = KeyBERTInspired()
aspect_representation_model1_POS = PartOfSpeech("en_core_web_sm")
aspect_representation_model2_MMR = MaximalMarginalRelevance(diversity=.3)
aspect_representation_model3_KeyBERT_MMR = [KeyBERTInspired(top_n_words=30), MaximalMarginalRelevance(diversity=.5)]

representation_model = {
    "KeyBERT": main_representation_model_KeyBERT,
    "POS":  aspect_representation_model1_POS,
    "MMR":  aspect_representation_model2_MMR,
    "KeyBERT_MMR":  aspect_representation_model3_KeyBERT_MMR 
}
"""

In [None]:
"""
from bertopic import BERTopic

topic_model = BERTopic(

  nr_topics=21,
    
  # Pipeline models
  vectorizer_model=vectorizer_model,
  representation_model=representation_model,

  # Hyperparameters
  top_n_words=10,
  verbose=True
)

topics, probs = topic_model.fit_transform(df['Abstract_cleaned'])
"""

In [None]:
#topic_model.reduce_topics(docs=df['Abstract_cleaned'])

In [None]:
# Snapshot of the model at this point, i.e. before the reduce_outliers step
# further down (the "bReOut" suffix presumably means "before reduce outliers").
topic_model.save("topic_model_bReOut")

In [None]:
# Export the topic overview table of that snapshot to Excel.
topic_model.get_topic_info().to_excel('topic_model_bReOut.xlsx')

In [None]:
# Documents as a plain list of strings, matching how `abstracts` is built for
# topics_over_time; a list supports positional docs[0] regardless of the
# DataFrame index.
docs = df['Abstract_cleaned'].to_list()

In [None]:
# Work on a real copy: a plain assignment would only create a second name for
# the same list and would not protect `topics` from later modification.
topics2 = list(topics)

In [None]:
# Reduce outliers
# Reduce outliers using the `c-tf-idf` strategy (the strategy passed below);
# the result is a new list of topic assignments, one per document.
new_topics = topic_model.reduce_outliers(docs, topics2, strategy="c-tf-idf")

In [None]:
# Recompute the topic representations for the outlier-reduced assignments.
# NOTE(review): no vectorizer_model / representation_model is passed here;
# per the BERTopic docs, update_topics presumably falls back to defaults for
# models that are not passed - confirm this is intended.
topic_model.update_topics(docs, topics=new_topics)

In [None]:
topic_model.get_topic_info()

In [None]:
# Snapshot of the model after outlier reduction.
topic_model.save("topic_model_aReOut")

In [None]:
#topic_model.save("topic_model")

# Load the serialized model
# (all cells below work on this reloaded model).
topic_model = BERTopic.load("topic_model_aReOut")
topic_model.get_topic_info()

In [None]:
# Export the topic overview table of the reloaded model to Excel.
topic_model.get_topic_info().to_excel('topic_model_aReOut.xlsx')

In [None]:
# Set Topic Label
# Human-readable names for topics 0-19; they are used wherever a plot or
# table is requested with custom_labels=True.

topic_model.set_topic_labels({0: "Large Language Models",
                              1: "ChatGPT in Medicine",
                              2: "Task-based Human-Robot Dialogue",
                              3: "Sentiment Bias",
                              4: "Generative AI and Law",
                              5: "Code Generation",
                              6: "Visual Language Models",
                              7: "Generating Game Narrative Design",
                              8: "Security Vulnerabilities",
                              9: "Recommendation Systems",
                              10: "Quantization of LLMs",
                              11: "SQL Generation with LLMs",
                              12: "System Requirements for LLMs",
                              13: "LLMs for Mental Health",
                              14: "LLMs in Biology",
                              15: "Financial Text Models",
                              16: "Autonomous Driving with LLMs",
                              17: "LLMs in Material Science",
                              18: "Audio Speech Recognition with LLMs",
                              19: "LLMs in Psychology"})

In [None]:
# Topic overview table.
topic_model.get_topic_info()

In [None]:
# Export the topic overview table to Excel.
topic_model.get_topic_info().to_excel('topics_models_CustomName.xlsx')

In [None]:
# Keyword-score bar charts for the 10 largest topics (interactive display).
topic_model.visualize_barchart(top_n_topics=10, n_words=10, width=300, custom_labels=True,
                              title='Topic Keyword Scores of the Top 10 Topics')

In [None]:
# Same figure, written to a PNG file.
# NOTE(review): pio.write_image needs a static image export backend
# (e.g. kaleido) to be installed - confirm it is available.
fig = topic_model.visualize_barchart(top_n_topics=10, n_words=10, width=300, custom_labels=True,
                              title='Topic Keyword Scores of the Top 10 Topics')
pio.write_image(fig, 'Topic Keyword Scores of the Top 10 Dominant Topics.png', scale=2)

In [None]:
# Hierarchical clustering of the 10 largest topics (interactive display).
topic_model.visualize_hierarchy(custom_labels=True, top_n_topics=10,
                              title='Hierarchical Clustering of Top 10 Topics')

In [None]:
# Same figure, written to a PNG file.
fig = topic_model.visualize_hierarchy(custom_labels=True, top_n_topics=10,
                              title='Hierarchical Clustering of Top 10 Topics') #top_n_topics=10, 
pio.write_image(fig, 'Hierarchical Clustering of Top 10 Topics.png', scale=2)

In [None]:
# Extract hierarchical topics and their representations
hierarchical_topics = topic_model.hierarchical_topics(docs)

# Visualize these representations
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics, custom_labels=True,
                              title='Hierarchical Clustering of All Topics')

In [None]:
# Same figure, written to a PNG file.
fig = topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics, custom_labels=True,
                              title='Hierarchical Clustering of All Topics')
pio.write_image(fig, 'Hierarchical Clustering of All Topics.png', scale=2)

In [None]:
# Per-document topic assignment table, exported to Excel.
topic_model.get_document_info(docs).to_excel('dataset_llm_2020_2024.xlsx')

In [None]:
# Same table as CSV (the DataFrame index is written as the first column).
topic_model.get_document_info(docs).to_csv('dataset_llm_2020_2024.csv')

In [None]:
# Document scatter plot restricted to topics 0-9 (interactive display).
topics_num = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
topic_model.visualize_documents(topics=topics_num, docs=docs, custom_labels=True,
                              title='Visualization of Each Document within the Top 10 Dominant Topics')

In [None]:
# Same figure, written to a PNG file.
topics_num = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
fig = topic_model.visualize_documents(topics=topics_num, docs=docs, custom_labels=True,
                              title='Visualization of Each Document within the Top 10 Dominant Topics')
pio.write_image(fig, 'Visualization of Each Document within the Top 10 Dominant Topics.png', scale=2)

In [None]:
# Topic-to-topic similarity matrix for all topics (interactive display).
topic_model.visualize_heatmap(custom_labels=True, width=1000,
                            title='Similarity Matrix of All Topics')

In [None]:
# Same figure, written to a PNG file.
fig = topic_model.visualize_heatmap(custom_labels=True, width=1000,
                            title='Similarity Matrix of All Topics')
pio.write_image(fig, 'Similarity Matrix of All Topics.png', scale=2)

In [None]:
# Similarity matrix limited to the 10 largest topics (interactive display).
topic_model.visualize_heatmap(top_n_topics=10, custom_labels=True, width=1000,
                            title='Similarity Matrix of Top 10 Topics')

In [None]:
# Same figure, written to a PNG file.
fig = topic_model.visualize_heatmap(top_n_topics=10, custom_labels=True, width=1000,
                            title='Similarity Matrix of Top 10 Topics')
pio.write_image(fig, 'Similarity Matrix of Top 10 Topics.png', scale=2)

In [None]:
# One timestamp (publication year) and one cleaned abstract per document,
# both taken from `df` so they stay aligned.
timestamps = df.Year.to_list()
abstracts = df.Abstract_cleaned.to_list()

# Create topics over time
#model = BERTopic(verbose=True)
topics_over_time = topic_model.topics_over_time(abstracts, timestamps)

In [None]:
# Topic frequency per year for the 10 largest topics (interactive display).
fig=topic_model.visualize_topics_over_time(custom_labels=True, topics_over_time=topics_over_time, top_n_topics=10, 
                              title='Distribution of the Top 10 Dominant Topics Over Time')

# Treat the years as discrete categories instead of a continuous axis.
fig.update_xaxes(type="category")

In [None]:
# Same figure, written to a PNG file.
fig=topic_model.visualize_topics_over_time(custom_labels=True, topics_over_time=topics_over_time, top_n_topics=10, 
                              title='Distribution of the Top 10 Dominant Topics Over Time')

fig.update_xaxes(type="category")

pio.write_image(fig, 'Distribution of the Top 10 Dominant Topics Over Time.png', scale=2)

In [None]:
# Term-score decline curves for all topics (interactive display).
topic_model.visualize_term_rank(custom_labels=True,
                              title='Term Score Decline per All Topics')

In [None]:
# Same figure, written to a PNG file.
fig = topic_model.visualize_term_rank(custom_labels=True,
                              title='Term Score Decline per All Topics')
pio.write_image(fig, 'Term Score Decline per All Topics.png', scale=2)

In [None]:
# The ten largest topics are ids 0-9, the same ids used for the document
# plot; the previous 1-10 list skipped topic 0 and included topic 10.
topics_num = list(range(10))
topic_model.visualize_term_rank(topics=topics_num, custom_labels=True,
                              title='Term Score Decline per Top 10 Topics')

In [None]:
# The ten largest topics are ids 0-9, the same ids used for the document
# plot; the previous 1-10 list skipped topic 0 and included topic 10.
topics_num = list(range(10))

fig = topic_model.visualize_term_rank(topics=topics_num, custom_labels=True,
                              title='Term Score Decline per Top 10 Topics')
pio.write_image(fig, 'Term Score Decline per Top 10 Topics.png', scale=2)

In [None]:
#topics = topic_model.topics_
probabilities = topic_model.probabilities_

In [None]:
topic_model.visualize_distribution(probabilities=probabilities)

In [None]:
# Calculate the topic distributions on a token-level
# (calculate_tokens=True additionally returns the per-token distributions).
topic_distr, topic_token_distr = topic_model.approximate_distribution(docs, calculate_tokens=True)

# Visualize the token-level distributions
# for the first document only.
dfT = topic_model.visualize_approximate_distribution(docs[0], topic_token_distr[0])
dfT