In [1]:
import os
from constants import *

os.environ["PPLX_API_KEY"] = PPLX_API_KEY
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

os.environ["LANGCHAIN_API_KEY"] = LANGCHAIN_API_KEY

In [105]:
from bertopic import BERTopic
from hdbscan import HDBSCAN
from langchain_openai import OpenAIEmbeddings
import datamapplot

import pandas as pd
import numpy as np
import asyncio

In [3]:
import ast
example_df = pd.read_csv("climate.csv")

example_df["extraction"] = example_df["extraction"].apply(ast.literal_eval) # bodge to deal with csv storage
example_df["full_text"] = example_df['post_title'] + "\n\n" + example_df['self_text']

In [43]:
example_df[:3]

Unnamed: 0.1,Unnamed: 0,comment_id,score,self_text,subreddit,created_time,post_id,author_name,controversiality,ups,...,user_total_karma,post_score,post_self_text,post_title,post_upvote_ratio,post_thumbs_ups,post_total_awards_received,post_created_time,extraction,full_text
0,337451,kfzmufh,12,"Nothing in particular, just that I've noticed ...",climate,2024-01-02 15:56:17,18wpy55,blackcatwizard,0,12,...,30563.0,221,,Hannah Ritchie: ‘Doomsday predictions are a dr...,0.87,221,0,2024-01-02 13:54:01,[Skepticism towards doomsday predictions and t...,Hannah Ritchie: ‘Doomsday predictions are a dr...
1,35169,l5xf5c3,12,Most people wanna shout about ‘morals!’ Until ...,climate,2024-05-27 20:02:32,1d1qnci,HumanityHasFailedUs,0,12,...,26298.0,464,,World has ‘moral responsibility’ to help small...,0.95,464,0,2024-05-27 12:43:00,"[People often express moral outrage, but are u...",World has ‘moral responsibility’ to help small...
2,24067,l6kgwb6,7,Man I hope Biden wins.,climate,2024-06-01 02:19:27,1d53m9f,Squibbles01,0,7,...,85978.0,3051,,Project 2025 plans to dismantle the federal ag...,0.96,3051,0,2024-05-31 19:13:02,[Criticism of right-wing media's attacks on cl...,Project 2025 plans to dismantle the federal ag...


## Text Processing
---

In [4]:
full_text = example_df["full_text"].to_list()
full_text[0]

"Hannah Ritchie: ‘Doomsday predictions are a dream for climate deniers’ | Climate crisis\n\nNothing in particular, just that I've noticed it several times and I think twice in this sub."

In [5]:
import re
def clean_text(text):
    # keep handles for model
    #text = re.sub("@\w+", "", text) # Remove handle   

    text = re.sub("[^\x20-\x7E]", "", text) # limit to ascii
    text = re.sub('http\S+', "", text)  # remove https

    return text

In [6]:
## Baseline doc sentence segmentation appraoch

"""
nlp = spacy.load("en_core_web_sm")

cleaned_text = [clean_text(c) for c in full_text]  # Clean the text

tokenized_text = []
for t in cleaned_text:
    t_nlp = nlp(t)
    tokenized_text += [s.text for s in t_nlp.sents]
"""

## Baseline doc semantic chunking appraoch

from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings  # for speed; or use OpenAI

embedding_model = FastEmbedEmbeddings()
segchunker = SemanticChunker(embedding_model, breakpoint_threshold_type="percentile")

cleaned_text = [clean_text(c) for c in full_text] 
tokenized_text = segchunker.create_documents(cleaned_text)
tokenized_text = [d.page_content for d in tokenized_text]
tokenized_text[:3]


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

["Hannah Ritchie: Doomsday predictions are a dream for climate deniers | Climate crisisNothing in particular, just that I've noticed it several times and I think twice in this sub.",
 'World has moral responsibility to help small island states survive climate crisis  UN agency chief | Global developmentMost people wanna shout about morals! Until it requires them to actually do something, then not so much.',
 'Everyone wants change, no one wants to change. Said someone, somewhere, probably.']

In [7]:
# LLM extracted docs
docs = example_df["extraction"].to_list()
docs = [d for ds in docs for d in ds]  # flatten
docs[:3]

['Skepticism towards doomsday predictions and their potential impact on addressing the climate crisis.',
 'Possibly critical of the use of doomsday predictions as a means to raise awareness about climate change.',
 'People often express moral outrage, but are unwilling to take concrete actions.']

## Clustering
---

In [32]:
embedding_model = OpenAIEmbeddings(model="text-embedding-3-large")

base_embed = embedding_model.embed_documents(tokenized_text)
embed = embedding_model.embed_documents(docs)
base_embed = np.array(base_embed)
embed = np.array(embed)

In [125]:
### Semantic Chunks

# Text Statistics
print("Num Sentences:", len(tokenized_text))
print("Chunk Length Statistics")
print(pd.Series(tokenized_text).apply(len).describe())

# Calculate Topics
base_cluster_model = HDBSCAN(min_cluster_size=5, approx_min_span_tree=False)
base_topic_model = BERTopic(hdbscan_model=base_cluster_model, embedding_model=embedding_model)
base_topics, base_probs = base_topic_model.fit_transform(tokenized_text, embeddings=base_embed)

# Reduce Outliers
#base_new_topics = base_topic_model.reduce_outliers(tokenized_text, base_topics, strategy="c-tf-idf")
#base_topic_model.update_topics(tokenized_text, topics=base_new_topics)

Num Sentences: 2856
Chunk Length Statistics
count    2856.000000
mean      234.153711
std       248.449295
min         0.000000
25%       110.750000
50%       182.000000
75%       286.000000
max      4621.000000
dtype: float64


In [121]:
### LLM Simplified

# Text Statistics
print("Num Statements:", len(docs))
print("LLM Extracted Text Statistics:")
print(pd.Series(docs).apply(len).describe())

# Calculate Topics
cluster_model = HDBSCAN(min_cluster_size=10, approx_min_span_tree=False)
topic_model = BERTopic(hdbscan_model=cluster_model, embedding_model=embedding_model)
topics, probs = topic_model.fit_transform(docs, embeddings=embed)

# Reduce Outliers
#new_topics = topic_model.reduce_outliers(tokenized_text, base_topics, strategy="c-tf-idf")
#topic_model.update_topics(tokenized_text, topics=base_new_topics)

Num Statements: 6416
LLM Extracted Text Statistics:
count    6416.000000
mean       85.762001
std        27.622964
min        14.000000
25%        66.000000
50%        82.000000
75%       101.000000
max       253.000000
dtype: float64


In [135]:
base_topic_info = base_topic_model.get_topic_info()
base_topic_info.to_html("figures/seg-topic-info.html")
base_topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,694,-1_and_of_the_to,"[and, of, the, to, we, in, it, are, that, all]",[history has also angered environmentalists by...
1,0,75,0_5c_warming_global_2100,"[5c, warming, global, 2100, goal, degrees, top...",[Efforts to keep global heating to 1.5C will e...
2,1,57,1_thanks_go_excusing_appropriate,"[thanks, go, excusing, appropriate, aggghggghh...",[Feel free to share it widely wherever its app...
3,2,44,2_cop28_fossil_fuels_phaseout,"[cop28, fossil, fuels, phaseout, deal, saudi, ...","[Even if COP28 fails, it has changed the conve..."
4,3,42,3_biden_president_trump_his,"[biden, president, trump, his, agenda, contras...",[Biden Vows to Save the Planet From the Climat...
...,...,...,...,...,...
154,153,6,153_breaks_officials_co_records,"[breaks, officials, co, records, tell, trying,...",[Earth breaks heat and CO records once again: ...
155,154,5,154_news_gathered_regarding_weeksthanks,"[news, gathered, regarding, weeksthanks, asymp...",[I've gathered up all the positive Climate New...
156,155,5,155_cents_panels_per_panel,"[cents, panels, per, panel, bill, electricity,...","[Would you buy a water heater for $115,000? An..."
157,156,5,156_2050_nothing_look_australia,"[2050, nothing, look, australia, earth, change...",[What Earth Could Look Like in 2050 If We Do N...


In [123]:
topic_info = topic_model.get_topic_info()
topic_info.to_html("figures/llm-topic-info.html")
topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,2083,-1_the_of_that_is,"[the, of, that, is, to, and, author, by, on, c...",[The focus is on the potential consequences of...
1,0,287,0_meat_plant_based_animal,"[meat, plant, based, animal, food, veganism, d...",[Plant-based meat is a simple and effective so...
2,1,135,1_concerned_author_potential_about,"[concerned, author, potential, about, experien...",[The author is concerned about the potential f...
3,2,133,2_humorous_tone_humor_lighthearted,"[humorous, tone, humor, lighthearted, sarcasti...",[Humorous and lighthearted tone in the comment...
4,3,115,3_optimistic_hope_hopeful_positive,"[optimistic, hope, hopeful, positive, progress...",[The author is optimistic about the potential ...
...,...,...,...,...,...
121,120,11,120_disappointed_frustrated_seems_questioning,"[disappointed, frustrated, seems, questioning,...",[Possibly frustrated or disappointed by the no...
122,121,11,121_degrees_warming_keeping_2c,"[degrees, warming, keeping, 2c, 5c, possibilit...",[The comment is skeptical about the assumption...
123,122,11,122_solely_public_failures_responsible,"[solely, public, failures, responsible, failur...",[The public is not solely responsible for clim...
124,123,11,123_ocean_temperatures_concern_heat,"[ocean, temperatures, concern, heat, rising, s...","[Ocean temperatures are a cause for concern, l..."


In [133]:
base_visual_docs = base_topic_model.visualize_documents(tokenized_text, embeddings=base_embed, hide_annotations=True, title='<b>Semantically Chunked Documents and Topics</b>')
base_visual_docs.write_html("figures/seg-visual-docs.html")
base_visual_docs

In [130]:
visual_docs = topic_model.visualize_documents(docs, embeddings=embed, hide_annotations=True, title='<b>LLM Extracted Documents and Topics</b>')
visual_docs.write_html("figures/llm-visual-docs.html")
visual_docs

In [131]:
# Visualize topics over time
extraction_timestamps = [[time]*lex for time,lex in zip(example_df['created_time'], example_df["extraction"].apply(len))]
times = [t for ts in extraction_timestamps for t in ts]

topics_over_time = topic_model.topics_over_time(docs, times, nr_bins=20)

In [132]:
llm_over_time_fig = topic_model.visualize_topics_over_time(
    topics_over_time,
    top_n_topics=30,
    width=1200,
    height=900,
    title='<b>LLM Extracted Topics over Time</b>'
)
llm_over_time_fig.write_html("figures/llm-timestamp-fig.html")
llm_over_time_fig