# Compare different prompts to extract frames from climate news

In [1]:
import buttermilk

# Instantiate buttermilk's BM object and take its logger, so that later cells
# can log through `logger` and read run information from `bm`.
bm = buttermilk.BM()
logger = bm.logger
# Emit a marker message identifying this interactive notebook run.
logger.info("Starting interactive run for climate frames in notebook")

2024-08-26 17:33:47 26f087537f3a buttermilk buttermilk.py[ 200] INFO Logging setup for: {'function_name': 'default_project', 'job': 'development', 'logs': '20240826T0733Z-8tjN-26f087537f3a-vscode', 'user': 'vscode', 'node': '26f087537f3a'}. Ready for data collection, saving log to Google Cloud Logs (Resource(type='generic_task', labels={'project_id': 'dmrc-platforms', 'location': 'us-central1', 'namespace': 'default_project', 'job': 'development', 'task_id': '20240826T0733Z-8tjN-26f087537f3a-vscode'})). Default save directory for data in this run is: gs://dmrc-analysis/runs/default_project/development/20240826T0733Z-8tjN-26f087537f3a-vscode
2024-08-26 17:33:47 26f087537f3a buttermilk 1583369548.py[   5] INFO Starting interactive run for climate frames in notebook


In [6]:
import pandas as pd

# Settings shared by every prompt variant in this experiment.
prompt_vars = dict(
    prompt_template_path="generic.prompty",
    system_prompt="system_frames.jinja2",
    output_format="json_frames.jinja2",
)

# The four prompt variations under comparison. Each entry carries a display
# name and the instructions template to use for that variation.
# NOTE(review): "speaker first" points at the same instructions template as
# "generic frames" -- confirm that this is intended and not a copy-paste slip.
variants = [
    dict(name="generic frames", instructions="instructions_frames.jinja2"),
    dict(name="speaker first alt", instructions="climate_activism_speakerfirst_alt_output.jinja2"),
    dict(name="speaker first", instructions="instructions_frames.jinja2"),
    dict(name="climate activism frames", instructions="climate_activism.jinja2"),
]

# Model names to run each variant against.
models = ["haiku", "llama31-8b"]

# The dataset lives on cloud storage as JSON Lines (one record per line),
# which keeps it versionable and readable from wherever the notebook runs.
DATASET = "gs://dmrc-analysis/data/climate_articles.jsonl"

df = pd.read_json(DATASET, lines=True, orient="records")

# Summarise the loaded columns, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                12 non-null     int64 
 1   title             12 non-null     object
 2   author            12 non-null     object
 3   source            12 non-null     object
 4   publication_date  12 non-null     object
 5   content           12 non-null     object
dtypes: int64(1), object(5)
memory usage: 704.0+ bytes


## Run locally, uploading trace only to Azure


In [3]:
import datetime
from promptflow.tracing import start_trace, trace

# Start promptflow tracing for this cell under the "climate" collection.
start_trace(collection="climate")

from buttermilk.flows.extract import Analyst

# One dict per analysed article. The DataFrame is built once after the loops
# instead of being re-concatenated on every iteration, which copied all
# previous rows each time and left every row with index 0.
records = []

for model in models:
    for variant in variants:
        # Shared settings, overlaid with this variant's fields and the model.
        flow_vars = prompt_vars.copy()
        flow_vars.update(variant)
        flow_vars['langchain_model_name'] = model

        flow = Analyst(**flow_vars)

        for _, row in df.iterrows():
            # Identifying fields merged into the response so that each result
            # row records which article, variant and model produced it.
            id_vars = {"id": row["id"], "name": variant["name"], "model": model, "timestamp": pd.to_datetime(datetime.datetime.now())}
            response = flow(content=row["content"])
            response.update(id_vars)
            records.append(response)
            # The three `break`s stop after the first article / variant /
            # model, so this cell makes a single flow call. Remove them to
            # run the full comparison.
            break
        break
    break

results = pd.DataFrame(records)

results




Prompt flow service has started...


  from .autonotebook import tqdm as notebook_tqdm


2024-08-26 17:33:58 26f087537f3a buttermilk buttermilk.py[ 200] INFO Logging setup for: {'function_name': 'default_project', 'job': 'development', 'logs': '20240826T0733Z-e83v-26f087537f3a-vscode', 'user': 'vscode', 'node': '26f087537f3a'}. Ready for data collection, saving log to Google Cloud Logs (Resource(type='generic_task', labels={'project_id': 'dmrc-platforms', 'location': 'us-central1', 'namespace': 'default_project', 'job': 'development', 'task_id': '20240826T0733Z-e83v-26f087537f3a-vscode'})). Default save directory for data in this run is: gs://dmrc-analysis/runs/default_project/development/20240826T0733Z-e83v-26f087537f3a-vscode


INFO:buttermilk:Logging setup for: {'function_name': 'default_project', 'job': 'development', 'logs': '20240826T0733Z-e83v-26f087537f3a-vscode', 'user': 'vscode', 'node': '26f087537f3a'}. Ready for data collection, saving log to Google Cloud Logs (Resource(type='generic_task', labels={'project_id': 'dmrc-platforms', 'location': 'us-central1', 'namespace': 'default_project', 'job': 'development', 'task_id': '20240826T0733Z-e83v-26f087537f3a-vscode'})). Default save directory for data in this run is: gs://dmrc-analysis/runs/default_project/development/20240826T0733Z-e83v-26f087537f3a-vscode


Unnamed: 0,analysis,metadata,record_id,scores,labels,reasons,result,id,name,model,timestamp
0,[{'statement': 'Why was climate change hysteri...,{'name': 'generic frames'},not given,,,,,10,generic frames,haiku,2024-08-26 17:34:00.246043


You can view the trace detail from the following URL:
http://127.0.0.1:23334/v1.0/ui/traces/?#collection=climate&uiTraceId=0x88032d150cbd28015d19888fb77d762c
https://ai.azure.com/projecttrace/detail/0x88032d150cbd28015d19888fb77d762c?wsid=/subscriptions/7e7e056a-4224-4e26-99d2-1e3f9a688c50/resourceGroups/rg-suzor_ai/providers/Microsoft.MachineLearningServices/workspaces/automod


In [4]:
import pprint

# Pretty-print the contents of the 'analysis' column, one entry per result
# row, using a four-space indent for nested structures.
pp = pprint.PrettyPrinter(indent=4)
for analysis in results["analysis"]:
    pp.pprint(analysis)

[   {   'blame_attribution': 'What is taught in Australian schools',
        'cause_of_problem': 'What is taught in Australian schools',
        'problem_definition': 'Climate change hysteria contributing to '
                              'election results',
        'solution_adressee': 'None found',
        'solution_to_problem': 'None found',
        'speaker_affiliation': 'Journalist',
        'speaker_name': 'Author',
        'statement': 'Why was climate change hysteria one of the main reasons '
                     'the teal fake independents and the Greens did so well in '
                     'the recent election and the Morrison Coalition '
                     'government so soundly defeated?'},
    {   'blame_attribution': 'None found',
        'cause_of_problem': 'None found',
        'problem_definition': 'Increased focus on climate change in the '
                              'national curriculum',
        'solution_adressee': 'None found',
        'solution_to_problem'

## Submit the same experiment as a batch run: executed locally, with all artifacts stored on Azure

In [9]:
import datetime
from tempfile import NamedTemporaryFile

import cloudpathlib
from promptflow.tracing import start_trace, trace
from promptflow.client import PFClient as LocalPFClient
from buttermilk.flows.extract import Analyst

# Start promptflow tracing under the "climate" collection (once per cell).
start_trace(collection="climate")

# Save the dataset locally: reserve a named temporary .jsonl path, then
# download the cloud-hosted dataset to that path so it can be passed to the
# batch run as `data`.
with NamedTemporaryFile(delete=False, suffix=".jsonl", mode="w") as f:
    dataset = f.name
cloudpathlib.CloudPath(DATASET).download_to(dataset)

pflocal = LocalPFClient()

# Map flow inputs to dataset columns. promptflow resolves `${data.<column>}`
# against each input line; a bare string would be passed through as a constant,
# and the previous `id` (without quotes) was the Python builtin function.
columns = {"content": "${data.content}", "record_id": "${data.id}"}

# Per-run detail frames, concatenated once after the loops.
run_details = []

for model in models:
    for variant in variants:
        # Shared settings, overlaid with this variant's fields and the model.
        flow_vars = prompt_vars.copy()
        flow_vars.update(variant)
        flow_vars['langchain_model_name'] = model

        flow = Analyst(**flow_vars)
        run_name = f"{bm._run_id}_{variant['name']}_{model}"
        run_meta = {"name": variant["name"], "model": model, "timestamp": pd.to_datetime(datetime.datetime.now())}
        # NOTE(review): PFClient.run documents `init` as the keyword for
        # class-based flow init arguments; `init_vars` is passed on unchanged
        # here -- confirm that it is actually consumed.
        run = pflocal.run(
            flow=flow,
            data=dataset,
            init_vars=flow_vars,
            column_mapping=columns,
            stream=False,
            name=run_name,
            display_name="Automod",
            timeout=150,
        )

        logger.info(
            f"Run {run.name} completed with status {run.status}. URL: {run._portal_url}."
        )

        details = pflocal.get_details(run_name)

        # Attach the run metadata to every row. `assign` broadcasts the scalar
        # values, so it does not depend on the index of `details` the way an
        # index-aligned column-wise concat does.
        run_details.append(details.assign(**run_meta))
        # The two `break`s stop after the first variant / model, so only one
        # batch run is submitted. Remove them to run the full comparison.
        break
    break

results = pd.concat(run_details) if run_details else pd.DataFrame()

Prompt flow service has started...
Prompt flow service has started...
2024-08-26 17:53:15 26f087537f3a buttermilk buttermilk.py[ 200] INFO Logging setup for: {'function_name': 'default_project', 'job': 'development', 'logs': '20240826T0753Z-b5Hv-26f087537f3a-vscode', 'user': 'vscode', 'node': '26f087537f3a'}. Ready for data collection, saving log to Google Cloud Logs (Resource(type='generic_task', labels={'project_id': 'dmrc-platforms', 'location': 'us-central1', 'namespace': 'default_project', 'job': 'development', 'task_id': '20240826T0753Z-b5Hv-26f087537f3a-vscode'})). Default save directory for data in this run is: gs://dmrc-analysis/runs/default_project/development/20240826T0753Z-b5Hv-26f087537f3a-vscode
2024-08-26 17:53:15 26f087537f3a buttermilk buttermilk.py[ 200] INFO Logging setup for: {'function_name': 'default_project', 'job': 'development', 'logs': '20240826T0753Z-b5Hv-26f087537f3a-vscode', 'user': 'vscode', 'node': '26f087537f3a'}. Ready for data collection, saving log to

INFO:buttermilk:Logging setup for: {'function_name': 'default_project', 'job': 'development', 'logs': '20240826T0753Z-b5Hv-26f087537f3a-vscode', 'user': 'vscode', 'node': '26f087537f3a'}. Ready for data collection, saving log to Google Cloud Logs (Resource(type='generic_task', labels={'project_id': 'dmrc-platforms', 'location': 'us-central1', 'namespace': 'default_project', 'job': 'development', 'task_id': '20240826T0753Z-b5Hv-26f087537f3a-vscode'})). Default save directory for data in this run is: gs://dmrc-analysis/runs/default_project/development/20240826T0753Z-b5Hv-26f087537f3a-vscode
[2024-08-26 17:53:16 +1000][promptflow._sdk._orchestrator.run_submitter][INFO] - Upload run to cloud: True


Prompt flow service has started...
You can view the traces in local from http://127.0.0.1:23334/v1.0/ui/traces/?#run=20240826T0733Z-8tjN-26f087537f3a-vscode_generic frames_haiku
You can view the traces in azure portal since trace destination is set to: azureml://subscriptions/7e7e056a-4224-4e26-99d2-1e3f9a688c50/resourcegroups/rg-suzor_ai/providers/Microsoft.MachineLearningServices/workspaces/automod. The link will be printed once the run is finished.


[2024-08-26 17:53:18 +1000][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run 20240826T0733Z-8tjN-26f087537f3a-vscode_generic frames_haiku, log path: /home/vscode/.promptflow/.runs/20240826T0733Z-8tjN-26f087537f3a-vscode_generic frames_haiku/logs.txt


'20240826T0733Z-8tjN-26f087537f3a-vscode'