In [2]:
import os
from constants import *

os.environ["PPLX_API_KEY"] = PPLX_API_KEY
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

os.environ["LANGCHAIN_API_KEY"] = LANGCHAIN_API_KEY

In [3]:
from langchain_community.chat_models import ChatPerplexity
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.chains import OpenAIModerationChain
from langchain_core.prompts import ChatPromptTemplate

import asyncio
import pandas as pd
import numpy as np

In [4]:
import re
path = "data/reddit_opinion_climate_change.csv" # June 12
df = pd.read_csv(path)

In [5]:
df_deduplicated = df.drop_duplicates(subset='self_text', keep='first')
len(df_deduplicated)

533810

In [6]:
working_series = df_deduplicated[df['subreddit']=='climate']
working_series[:3]

  working_series = df_deduplicated[df['subreddit']=='climate']


Unnamed: 0,comment_id,score,self_text,subreddit,created_time,post_id,author_name,controversiality,ups,downs,...,user_link_karma,user_comment_karma,user_total_karma,post_score,post_self_text,post_title,post_upvote_ratio,post_thumbs_ups,post_total_awards_received,post_created_time
5,l8ahr59,1,They were inhaling those green house gases. Th...,climate,2024-06-12 16:34:03,1de2pfn,burkiniwax,0,1,0,...,7226.0,63681.0,78790.0,293,,Study finds human-caused nitrous oxide emissio...,1.0,293,0,2024-06-12 09:44:53
20,l8aesk6,1,"I certainly appreciate this take, but it becom...",climate,2024-06-12 16:17:47,1ddeq7j,Borthwick,0,1,0,...,2910.0,52278.0,55447.0,611,,Most Irish people underestimate the link betwe...,0.9,611,0,2024-06-11 13:56:02
25,l8aehzl,1,Where did I claim that I wanted DAC in the nea...,climate,2024-06-12 16:16:09,1ddgt4a,zypofaeser,0,1,0,...,549.0,61954.0,62906.0,108,,Nuclear Power Is Hard. A Climate-Minded Billio...,0.86,108,0,2024-06-11 15:23:20


In [7]:
# Text Length
print("Max:", np.max(working_series['self_text'].apply(len)))
print("Mean:", np.mean(working_series['self_text'].apply(len)))

Max: 8693
Mean: 245.08213765855686


In [8]:
import re 

def list_from_content(content, mstring):
    pattern = f"{mstring}(.*?){mstring}"
    match = re.search(pattern, content, re.DOTALL)

    if match:
        opinions_text = match.group().strip(mstring)
        opinions_text = opinions_text.strip("\n")

        opinions_list = [t.strip("\n") for t in opinions_text.split('- ')]
        opinions_list.pop(0)  # empty string
        
        return opinions_list
    else:
        raise Exception("Invalid LLM output!")


async def extract_statements(text_input, context, agent, mstring, max_attempts=3):
    
    opinion_list = []
    for i in range(max_attempts):
        try:
            content = agent.invoke({'input': text_input, "context": context, "mstring": mstring}).content
            opinion_list = list_from_content(content, mstring)
            break
        except Exception:
            continue
    
    if not opinion_list:
        print("Message extraction failed/empty!")

    return opinion_list

seg_human = """
Here are examples of segmentations.

Context: Climate Change.
Text Input: "Is the Climate Crisis a Population Problem? “The balance of population in the past was controlled by death: it was ugly and unacceptable. The new balance is controlled by love.” – Hans Rosling #GlobalCarbonFeeAndDividendPetition\n\nWe are reaching peak fossil fuels because we have replaced them with renewable energy not because it is running out.\n\nElectric vehicle use is rising fast not just cars, but electric buses, tuk-tuks and scooters. Even electric bicycles are great for transport.\n\nThere are solutions for manufacturing and it is full electrification and slapping solar panels on the roof.\n\nSteel, Chemical fertilizers, and fine chemicals can all be produced by green hydrogen. We need to build the green hydrogen industry and we can get it done.  \nThere is also the potential for the shipping industry to use green ammonia from this industry as well.\n\nI don't think you are up to date as to where our technology is right now."
Output: 
Here is my thought process: [...]
Therefore, I describe this person's perspective as follows.

{mstring}
- Fossil fuel usage is declining precisely because of rising renewable energy usage.
- Immense applications in various industries brought by green energy.
- Rapid rise in electric vehicle usage.
- Full electrification is a solution for manufacturing.
- Optimistic about production possibilities from green hydrogen. 
{mstring}


Can you extract opinions and perspectives from the following input? Let's reason it out.
Context: {context}
Text Input: {input}
"""

system = """
You are a neutral, objective bot. Your task is to identify central opinions and perspectives in the text input.
Analyse and understand the text input, then return a list encapsulated by marking strings ({mstring}). 
Please avoid repetition, and be as concise as possible.
"""

seg_prompt = ChatPromptTemplate.from_messages([("system", system), ("human", seg_human)])    


In [9]:
pplx_llm = ChatPerplexity(temperature=0.2, model="llama-3-70b-instruct")
openai_llm = ChatOpenAI(temperature=0.2, model="gpt-3.5-turbo")
pplx_small_llm = ChatPerplexity(temperature=0.2, model="llama-3-8b-instruct")

agent = seg_prompt | pplx_small_llm

In [10]:
# Testing the Prompt


id = 42

a = working_series["post_title"].iloc[id] + "\n\n" + working_series["self_text"].iloc[id]
print(a)
print("_" *20)

print(
    agent.invoke({'input': a, 'context': "climate change", "mstring": '###'}).content
)


Nuclear Power Is Hard. A Climate-Minded Billionaire Wants to Make It Easier. | The reactor being built by TerraPower, a start-up, won’t be finished until 2030 at the earliest

Let's entertain the notion that you can do this.

It is irrelevant to emissions cuts before 2040, let alone 2030. By the time your plan came off, we would be at 2C and on our way to 3C.
____________________
Here is my thought process:

The text input presents a critical perspective on nuclear power as a solution to climate change. The author seems to be skeptical about the effectiveness of nuclear power in reducing emissions, particularly in the short term.

Therefore, I describe this person's perspective as follows.

###
- Nuclear power is not a viable solution for emissions cuts before 2040 or 2030.
- The author is skeptical about the impact of nuclear power on reducing emissions.
- The author implies that nuclear power is not a timely solution to address climate change.
###


In [13]:
NUM_SAMPLES = 2000  # number of samples to extract from

example = pd.DataFrame(working_series).sample(NUM_SAMPLES, random_state=42)

full_text = example['post_title'] + "\n\n" + example['self_text']
full_text = full_text.to_list()
full_context = "Subreddit: " + example["subreddit"]
full_context = full_context.to_list()

mstring = "###"

---

In [14]:
### RUN ONLY IF NECESSARY; The expensive, time consuming extraction step.

extraction = await asyncio.gather(
    *[extract_statements(t, c, agent, mstring) for t,c in zip(full_text, full_context)]
)

print("Extraction Complete. Unsuccessful:", extraction.count([]))

Message extraction failed/empty!
Message extraction failed/empty!
Extraction Complete. Unsuccessful: 2


---

In [None]:
example["extraction"] = extraction
example.to_csv("climate.csv")