In [1]:
import pandas as pd
from bertopic import BERTopic
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

In [4]:
def load_and_process_data(file_path, model_path):
    """
    Load abstracts from a CSV, strip HTML/XML markup, and load a BERTopic model.

    Parameters:
    file_path (str): Path to the CSV file; it must contain an 'Abstract' column
    model_path (str): Path to the saved BERTopic model

    Returns:
    tuple: (docs, topic_model). `docs` holds one cleaned string per parsed CSV
    row, in row order. Missing abstracts become "" instead of being dropped,
    so the list stays aligned with the rows of the parsed frame.
    (None, None) is returned when the CSV cannot be read or has no
    'Abstract' column.
    """
    # Load the CSV file; malformed lines are skipped by the parser
    try:
        df = pd.read_csv(file_path, on_bad_lines='skip')
    except Exception as e:
        print(f"Error loading CSV: {e}")
        return None, None

    # Report a missing column the same way as any other load failure instead
    # of letting a bare KeyError escape from the column lookup below.
    if 'Abstract' not in df.columns:
        print(f"Error loading CSV: no 'Abstract' column (found: {list(df.columns)})")
        return None, None

    # Define the cleaning function
    def clean_html_xml(text):
        """Return the text content of `text` with markup removed ("" on NaN or failure)."""
        if pd.isna(text):  # Handle NaN values
            return ""
        try:
            soup = BeautifulSoup(str(text), 'html.parser')
            return soup.get_text().strip()
        except Exception as e:
            # Best effort: one unparsable abstract should not abort the run
            print(f"Error cleaning text: {e}")
            return ""

    # Clean the abstracts
    print("Cleaning text data...")
    df['cleaned_text'] = df['Abstract'].apply(clean_html_xml)

    # Keep every row (including empty strings) so docs[i] still corresponds
    # to row i of the parsed CSV.
    docs = df['cleaned_text'].tolist()

    # Load the saved model
    try:
        print("Loading BERTopic model...")
        topic_model = BERTopic.load(model_path)
        print("Model loaded successfully!")
    except FileNotFoundError:
        print("Model file not found. Creating new model...")
        # Fall back to a newly constructed model with English stop words removed
        vectorizer_model = CountVectorizer(stop_words="english")
        topic_model = BERTopic(vectorizer_model=vectorizer_model)

    return docs, topic_model

In [5]:
docs, topic_model = load_and_process_data('6n_cleaned.csv', 'bertopic_model')

# load_and_process_data reports a failed CSV load by returning (None, None).
# Stop here with a clear error rather than failing later on a None object.
if docs is None or topic_model is None:
    raise RuntimeError("load_and_process_data failed; see the printed error message.")


Cleaning text data...
Loading BERTopic model...
Model loaded successfully!


In [6]:
# Fit the topic model on the cleaned documents.
# NOTE(review): load_and_process_data may hand back a model loaded from disk;
# calling fit here presumably refits it on `docs` -- confirm that is intended.
topic_model.fit(docs)

<bertopic._bertopic.BERTopic at 0x33180ea10>

In [9]:
# Re-read the source CSV with the same parser option that produced `docs`
# (on_bad_lines='skip'), so the frame has exactly one row per document.
df = pd.read_csv('6n_cleaned.csv', on_bad_lines='skip')

topics, _ = topic_model.transform(docs)

# Refuse to attach topics when the rows and documents do not line up.
if len(topics) != len(df):
    raise ValueError(
        f"Row/document mismatch: {len(df)} rows in the CSV but {len(topics)} topic assignments"
    )

# Add the assigned topics as a new column
df['assigned_topic'] = topics

In [10]:
# Write df to 'exported_data.csv'; index=False leaves the row index out of the file.
df.to_csv('exported_data.csv', index=False)

In [7]:
# Display the per-topic summary table (Topic, Count, Name, Representation,
# Representative_Docs).
# NOTE(review): in the saved output a representative document for topic 5 still
# starts with '<p><span lang="EN-US">' -- verify that the HTML cleaning step
# handles every abstract (e.g. entity-escaped markup).
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,115,-1_ai_research_genai_ethical,"[ai, research, genai, ethical, impact, data, u...",[This integrative literature review (ILR) exam...
1,0,108,0_ai_ethical_ethics_moral,"[ai, ethical, ethics, moral, human, developmen...","[Aim: AI systems can be complex and opaque, ma..."
2,1,60,1_students_genai_learning_education,"[students, genai, learning, education, ai, edu...",[Integrating GenAI into education has sparked ...
3,2,40,2_art_ai_creative_content,"[art, ai, creative, content, music, genai, new...",[The integration of GenAI (AI) tools in art an...
4,3,28,3_healthcare_medical_ai_care,"[healthcare, medical, ai, care, patient, data,...",[AI (AI) is revolutionizing the healthcare sec...
5,4,25,4_business_marketing_genai_customer,"[business, marketing, genai, customer, study, ...",[The research paper investigates the comparati...
6,5,18,5_data_gan_generative_network,"[data, gan, generative, network, models, adver...","[<p><span lang=""EN-US"">Research in the field o..."
7,6,15,6_ethical_healthcare_ai_principles,"[ethical, healthcare, ai, principles, issues, ...",[Public and private investments into developin...
8,7,14,7_genai_text_extraction_language,"[genai, text, extraction, language, news, docu...",[This study is an in-depth exploration of the ...
9,8,11,8_security_cybersecurity_cyber_threats,"[security, cybersecurity, cyber, threats, ai, ...",[AbstractThis paper examines the ethical oblig...


In [8]:
# Display the heatmap figure produced by the fitted topic model; as the last
# expression in the cell, the returned object is rendered as the cell output.
topic_model.visualize_heatmap()

In [None]:
import pandas as pd

# Papers with their assigned topics; must provide a DOI column that is named
# 'doi' once the headers are normalized below.
first_csv = pd.read_csv('exported_data.csv')

# Citation edge list; the merge below keys on its 'target' column.
# NOTE(review): if the file's header is actually 'target_doi', update the
# key name used in the check and in left_on.
second_csv = pd.read_csv('citation_edge_screened_no_dupes.csv')

# Clean column names by stripping spaces and converting to lowercase
first_csv.columns = first_csv.columns.str.strip().str.lower()
second_csv.columns = second_csv.columns.str.strip().str.lower()

# Fail early with an explicit message when a join key is missing.
if 'doi' not in first_csv.columns:
    raise KeyError("'doi' column not found in exported_data.csv")
if 'target' not in second_csv.columns:
    raise KeyError("'target' column not found in citation_edge_screened_no_dupes.csv")

# Left-join so every citation edge is kept, whether or not its target is in the dataset
merged_csv = pd.merge(second_csv, first_csv, left_on='target', right_on='doi', how='left')

# A non-null 'doi' after the left join means the edge's target was found in the dataset
merged_csv['match_status'] = merged_csv['doi'].notna().map({True: 'Match', False: 'N/A'})

# Save the merged result to a new CSV file
merged_csv.to_csv('merged_result1.csv', index=False)
