In [3]:
import pandas as pd
import numpy as np
import sqlite3

# Assuming irrelevant_topics contains the topic numbers you identified
irrelevant_topics = [91, 76, 89, 42, 34, 3, 67, 66, 63, 61, 36, 13, 5, 59, 87, 28, 2, 79, 77, 80, 94, 82, 49, 51, 99, 75, 102, 53, 84, 29, 38, 35]

# Load the cleaned public dataset and document-topic associations
db_path = "/Users/vesper/Desktop/LSE/Capstone Project/dissertation/arctic_shift/filtered_data/relevant_data.db"
conn = sqlite3.connect(db_path)

public_data = pd.read_sql_query("SELECT * FROM cleaned_public_data", conn)
public_reduced_df = pd.read_sql_query("SELECT * FROM reduced_public_embeddings", conn)
public_doc_topic_df = pd.read_sql_query("SELECT * FROM public_doc_topic", conn)

# Filter out irrelevant documents
relevant_doc_ids = public_doc_topic_df[~public_doc_topic_df['topic'].isin(irrelevant_topics)]['id']
filtered_documents_df = public_data[public_data['id'].isin(relevant_doc_ids)].reset_index(drop=True)
filtered_embeddings_df = public_reduced_df[public_reduced_df['id'].isin(relevant_doc_ids)].reset_index(drop=True)

# Store the filtered data
filtered_documents_df.to_sql('filtered_public_data', conn, if_exists='replace', index=False)
filtered_embeddings_df.to_sql('filtered_public_embeddings', conn, if_exists='replace', index=False)

# Close the database connection
conn.close()


In [4]:
import os
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from hdbscan import HDBSCAN
import plotly.io as pio

# Re-open the connection
conn = sqlite3.connect(db_path)

# Load the filtered public dataset and embeddings
filtered_public_data = pd.read_sql_query("SELECT * FROM filtered_public_data", conn)
filtered_public_embeddings = pd.read_sql_query("SELECT * FROM filtered_public_embeddings", conn)

# Convert to the required format
filtered_documents = filtered_public_data['cleaned_text'].tolist()
filtered_embeddings = filtered_public_embeddings.drop(columns=['id']).values

# Close the connection
conn.close()

# Initialize HDBSCAN model with adjusted parameters
hdbscan_model_public = HDBSCAN(min_cluster_size=35, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# Initialize ClassTfidfTransformer with bm25_weighting and reduce_frequent_words
ctfidf_model = ClassTfidfTransformer(bm25_weighting=True, reduce_frequent_words=True)

# Initialize the embedding model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Create your representation models
representation_model_keybert = KeyBERTInspired()
representation_model_mmr = MaximalMarginalRelevance(diversity=0.3)
representation_models = [representation_model_keybert, representation_model_mmr]

# Initialize BERTopic model for public data
topic_model_public = BERTopic(
    embedding_model=embedding_model,
    representation_model=representation_models,
    hdbscan_model=hdbscan_model_public,
    ctfidf_model=ctfidf_model
)

# Fit the model
public_topics, public_probs = topic_model_public.fit_transform(filtered_documents, embeddings=filtered_embeddings)

# Fine-tune topic representations using CountVectorizer
vectorizer_public = CountVectorizer(stop_words="english", ngram_range=(1, 3), min_df=20)

# Update topics for public data
topic_model_public.update_topics(filtered_documents, vectorizer_model=vectorizer_public)

# Extract topic information for public data
public_topics_info = topic_model_public.get_topic_info()
public_topics_info = public_topics_info[['Topic', 'Count', 'Name', 'Representation', 'Representative_Docs']]

# Convert to DataFrame for public data
public_topics_df = pd.DataFrame(public_topics_info)

# Convert complex columns to strings to avoid SQLite errors
public_topics_df['Representation'] = public_topics_df['Representation'].astype(str)
public_topics_df['Representative_Docs'] = public_topics_df['Representative_Docs'].astype(str)

# Display public topics
print("Public Topics:")
print(public_topics_df.to_string(index=False))

# Save public topics to CSV
public_topics_df.to_csv("updated_public_topics.csv", index=False)

# Track document-topic associations for public data
public_doc_topic_df = pd.DataFrame({
    'id': filtered_public_data['id'],
    'document': filtered_documents,
    'topic': public_topics
})

# Save the document-topic associations to new tables in the database
conn = sqlite3.connect(db_path)
print("Saving public document-topic associations to the database...")
public_doc_topic_df.to_sql('updated_public_doc_topic', conn, if_exists='replace', index=False)
print("Saving public topics to the database...")
public_topics_df.to_sql('updated_public_topics', conn, if_exists='replace', index=False)

# Extract hierarchical topics for public data
print("Extracting hierarchical topics for public data...")
public_hierarchical_topics = topic_model_public.hierarchical_topics(filtered_documents)

# Visualize hierarchical topics for public data
print("Visualizing hierarchical topics for public data...")
pio.renderers.default = "notebook_connected"  # Ensure it uses notebook renderer
public_hierarchy_fig = topic_model_public.visualize_hierarchy(hierarchical_topics=public_hierarchical_topics)
public_hierarchy_fig.show()

# Get the hierarchical topic tree for public data
print("Getting hierarchical topic tree for public data...")
public_topic_tree = topic_model_public.get_topic_tree(public_hierarchical_topics)
print("Public Data Topic Tree:\n", public_topic_tree)

conn.close()
print("Topics and document-topic associations saved to the SQLite database.")

Public Topics:
 Topic  Count                                        Name                                                                                                                    Representation                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              

100%|██████████| 75/75 [00:00<00:00, 88.54it/s]


Visualizing hierarchical topics for public data...


Getting hierarchical topic tree for public data...
Public Data Topic Tree:
 .
├─ai_people_llm_like_human
│    ├─ai_openai_research_people_companies
│    │    ├─openai_apple_google_research_models
│    │    │    ├─■──apple_device_models_devices_research ── Topic: 53
│    │    │    └─openai_google_research_models_microsoft
│    │    │         ├─■──openai_2023_2024_billion_models ── Topic: 47
│    │    │         └─■──openai_google_research_microsoft_news ── Topic: 10
│    │    └─ai_research_people_companies_just
│    │         ├─ai_people_research_companies_safety
│    │         │    ├─safety_ai safety_research_people_ai
│    │         │    │    ├─■──safety_ai safety_research_people_ai ── Topic: 54
│    │         │    │    └─■──safety_ai safety_research_regulations_team ── Topic: 59
│    │         │    └─ai_people_research_companies_just
│    │         │         ├─research_ai research_ai_paper_scientist
│    │         │         │    ├─research_ai research_ai_paper_like
│    │         │   

In [9]:
import pandas as pd
import sqlite3

# Assuming irrelevant_topics contains the topic numbers you identified
irrelevant_topics = [50, 59, 54, 19, 37, 58, 55, 63, 30, 4, 45]

# Load the cleaned public dataset and document-topic associations
db_path = "/Users/vesper/Desktop/LSE/Capstone Project/dissertation/arctic_shift/filtered_data/relevant_data.db"
conn = sqlite3.connect(db_path)

public_data = pd.read_sql_query("SELECT * FROM filtered_public_data", conn)
public_reduced_df = pd.read_sql_query("SELECT * FROM filtered_public_embeddings", conn)
public_doc_topic_df = pd.read_sql_query("SELECT * FROM updated_public_doc_topic", conn)

# Filter out irrelevant documents
relevant_doc_ids = public_doc_topic_df[~public_doc_topic_df['topic'].isin(irrelevant_topics)]['id']
filtered_documents_df = public_data[public_data['id'].isin(relevant_doc_ids)].reset_index(drop=True)
filtered_embeddings_df = public_reduced_df[public_reduced_df['id'].isin(relevant_doc_ids)].reset_index(drop=True)

# Store the filtered data
filtered_documents_df.to_sql('filtered_public_data', conn, if_exists='replace', index=False)
filtered_embeddings_df.to_sql('filtered_public_embeddings', conn, if_exists='replace', index=False)

# Close the database connection
conn.close()

import os
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from hdbscan import HDBSCAN
import plotly.io as pio

# Re-open the connection
conn = sqlite3.connect(db_path)

# Load the filtered public dataset and embeddings
filtered_public_data = pd.read_sql_query("SELECT * FROM filtered_public_data", conn)
filtered_public_embeddings = pd.read_sql_query("SELECT * FROM filtered_public_embeddings", conn)

# Convert to the required format
filtered_documents = filtered_public_data['cleaned_text'].tolist()
filtered_embeddings = filtered_public_embeddings.drop(columns=['id']).values

# Close the connection
conn.close()


In [11]:
# Initialize HDBSCAN model with adjusted parameters
hdbscan_model_public = HDBSCAN(min_cluster_size=25, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# Initialize ClassTfidfTransformer with bm25_weighting and reduce_frequent_words
ctfidf_model = ClassTfidfTransformer(bm25_weighting=True, reduce_frequent_words=True)

# Initialize the embedding model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Create your representation models
representation_model_keybert = KeyBERTInspired()
representation_model_mmr = MaximalMarginalRelevance(diversity=0.3)
representation_models = [representation_model_keybert, representation_model_mmr]

# Initialize BERTopic model for public data
topic_model_public = BERTopic(
    embedding_model=embedding_model,
    representation_model=representation_models,
    hdbscan_model=hdbscan_model_public,
    ctfidf_model=ctfidf_model
)

# Fit the model
public_topics, public_probs = topic_model_public.fit_transform(filtered_documents, embeddings=filtered_embeddings)

# Fine-tune topic representations using CountVectorizer
vectorizer_public = CountVectorizer(stop_words="english", ngram_range=(1, 3), min_df=20)

# Update topics for public data
topic_model_public.update_topics(filtered_documents, vectorizer_model=vectorizer_public)

# Extract topic information for public data
public_topics_info = topic_model_public.get_topic_info()
public_topics_info = public_topics_info[['Topic', 'Count', 'Name', 'Representation', 'Representative_Docs']]

# Convert to DataFrame for public data
public_topics_df = pd.DataFrame(public_topics_info)

# Convert complex columns to strings to avoid SQLite errors
public_topics_df['Representation'] = public_topics_df['Representation'].astype(str)
public_topics_df['Representative_Docs'] = public_topics_df['Representative_Docs'].astype(str)

# Display public topics
print("Public Topics:")
print(public_topics_df.to_string(index=False))

# Save public topics to CSV
public_topics_df.to_csv("updated_public_topics.csv", index=False)

# Track document-topic associations for public data
public_doc_topic_df = pd.DataFrame({
    'id': filtered_public_data['id'],
    'document': filtered_documents,
    'topic': public_topics
})

# Save the document-topic associations to new tables in the database
conn = sqlite3.connect(db_path)
print("Saving public document-topic associations to the database...")
public_doc_topic_df.to_sql('updated_public_doc_topic', conn, if_exists='replace', index=False)
print("Saving public topics to the database...")
public_topics_df.to_sql('updated_public_topics', conn, if_exists='replace', index=False)

# Extract hierarchical topics for public data
print("Extracting hierarchical topics for public data...")
public_hierarchical_topics = topic_model_public.hierarchical_topics(filtered_documents)

# Visualize hierarchical topics for public data
print("Visualizing hierarchical topics for public data...")
pio.renderers.default = "notebook_connected"  # Ensure it uses notebook renderer
public_hierarchy_fig = topic_model_public.visualize_hierarchy(hierarchical_topics=public_hierarchical_topics)
public_hierarchy_fig.show()

# Get the hierarchical topic tree for public data
print("Getting hierarchical topic tree for public data...")
public_topic_tree = topic_model_public.get_topic_tree(public_hierarchical_topics)
print("Public Data Topic Tree:\n", public_topic_tree)

conn.close()
print("Topics and document-topic associations saved to the SQLite database.")

Public Topics:
 Topic  Count                                      Name                                                                                                             Representation                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       

100%|██████████| 42/42 [00:00<00:00, 110.49it/s]


Visualizing hierarchical topics for public data...


Getting hierarchical topic tree for public data...
Public Data Topic Tree:
 .
├─chatgpt_use_ai_just_like
│    ├─chatgpt_use_like_just_students
│    │    ├─■──email_emails_student_written_ai ── Topic: 36
│    │    └─chatgpt_use_just_like_students
│    │         ├─ai_assignment_student_essay_writing
│    │         │    ├─■──grammar_used_writing_ai_policy ── Topic: 28
│    │         │    └─ai_assignment_student_essay_writing
│    │         │         ├─assignment_ai_student_essay_paper
│    │         │         │    ├─assignment_ai_student_essay_writing
│    │         │         │    │    ├─■──assignment_paper_zero_grade_professor ── Topic: 12
│    │         │         │    │    └─ai_student_assignment_essay_writing
│    │         │         │    │         ├─■──ai_student_assignment_essay_prove ── Topic: 5
│    │         │         │    │         └─■──student_students_plagiarism_assignment_essay ── Topic: 16
│    │         │         │    └─■──essay_essays_ai_paper_writing ── Topic: 23
│    │   