# Compare different prompts to extract frames from climate news

In [1]:
import buttermilk

# Configuration files are stored in the local directory, and
# options can be passed in at initialization.
# For notebooks we might need to initialize separately:
# (this will be fixed later, I just can't figure out the path/cwd problem yet)
import os
from hydra import initialize, initialize_config_module, initialize_config_dir, compose
from hydra import initialize, compose
from omegaconf import OmegaConf

abs_config_dir=os.getcwd() + "/conf"
with initialize_config_dir(version_base=None, config_dir=abs_config_dir):
    cfg = compose(config_name="config")

bm = buttermilk.BM()
bm.cfg = cfg
logger = bm.logger
logger.info("Starting interactive run for climate frames in notebook")

# print config details
print("\nConfiguration:")
print(OmegaConf.to_yaml(bm.cfg))


2024-08-29 11:04:14 26f087537f3a buttermilk buttermilk.py[ 224] INFO Logging setup for: {'function_name': 'default_project', 'job': 'development', 'logs': '20240829T0104Z-Ync9-26f087537f3a-vscode', 'user': 'vscode', 'node': '26f087537f3a'}. Ready for data collection, saving log to Google Cloud Logs (Resource(type='generic_task', labels={'project_id': 'dmrc-platforms', 'location': 'us-central1', 'namespace': 'default_project', 'job': 'development', 'task_id': '20240829T0104Z-Ync9-26f087537f3a-vscode'})). Default save directory for data in this run is: gs://dmrc-analysis/runs/default_project/development/20240829T0104Z-Ync9-26f087537f3a-vscode
2024-08-29 11:04:14 26f087537f3a root _start_trace.py[  76] INFO collection: default_project
2024-08-29 11:04:14 26f087537f3a root _start_trace.py[  78] INFO resource attributes: {'service.name': 'promptflow', 'run_id': '20240829T0104Z-Ync9-26f087537f3a-vscode', 'collection': 'default_project'}
2024-08-29 11:04:14 26f087537f3a root _start_trace.py[ 

You can view the trace detail from the following URL:
http://127.0.0.1:23334/v1.0/ui/traces/?#collection=default_project&uiTraceId=0x51a2b5b36c444bef843092efb1f59c16
https://ai.azure.com/projecttrace/detail/0x51a2b5b36c444bef843092efb1f59c16?wsid=/subscriptions/7e7e056a-4224-4e26-99d2-1e3f9a688c50/resourceGroups/rg-suzor_ai/providers/Microsoft.MachineLearningServices/workspaces/automod
You can view the trace detail from the following URL:
http://127.0.0.1:23334/v1.0/ui/traces/?#collection=default_project&uiTraceId=0x64f123ccd6ac522ed956644b16f7655c
https://ai.azure.com/projecttrace/detail/0x64f123ccd6ac522ed956644b16f7655c?wsid=/subscriptions/7e7e056a-4224-4e26-99d2-1e3f9a688c50/resourceGroups/rg-suzor_ai/providers/Microsoft.MachineLearningServices/workspaces/automod
You can view the trace detail from the following URL:
http://127.0.0.1:23334/v1.0/ui/traces/?#collection=default_project&uiTraceId=0xba8d50201f8bb0cb4a87255e61b278e9
https://ai.azure.com/projecttrace/detail/0xba8d50201f8bb

In [2]:

# Init vars all stored in the config file for this run (the main config.yaml file)
init_vars = bm.cfg.run.frames.init
models = bm.cfg.run.frames.models

# In this experiment, we will use four different variations for the prompt. The
# experiment config is kept in a separate file and loaded over the top of existing configs.
variants = bm.cfg.run.frames.variants

# Data is generally stored in JSONL format on cloud storage, allowing us to control versions and run anywhere
dataset = bm.cfg.run.frames.dataset.uri

import pandas as pd
df = pd.read_json(dataset, orient='records', lines=True)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                12 non-null     int64 
 1   title             12 non-null     object
 2   author            12 non-null     object
 3   source            12 non-null     object
 4   publication_date  12 non-null     object
 5   content           12 non-null     object
dtypes: int64(1), object(5)
memory usage: 704.0+ bytes


## Run locally, uploading trace only to Azure


In [4]:
import datetime
from promptflow.tracing import start_trace, trace
start_trace(collection="climate")

from buttermilk.flows.extract import Analyst
results = pd.DataFrame()
for model in models:
    for variant in variants:
        flow_vars = init_vars.copy()
        flow_vars.update(variant)
        flow_vars['model'] = model

        flow = Analyst(**flow_vars)

        for _, row in df.iterrows():
            id_vars = {"id": row["id"], "name": variant["name"], "timestamp": pd.to_datetime(datetime.datetime.now())}
            response = flow(content=row["content"])
            response.update(id_vars)
            response_df = pd.DataFrame(data=[response])
            results = pd.concat([results, response_df])
            break
        break


results




2024-08-29 11:04:48 26f087537f3a root _start_trace.py[  76] INFO collection: climate
2024-08-29 11:04:48 26f087537f3a root _start_trace.py[  78] INFO resource attributes: {'service.name': 'promptflow', 'collection': 'climate'}
2024-08-29 11:04:48 26f087537f3a root _start_trace.py[ 157] INFO tracer provider is already set, will merge the resource attributes...
2024-08-29 11:04:48 26f087537f3a root _start_trace.py[ 162] INFO tracer provider is updated with resource attributes: {'service.name': 'promptflow', 'run_id': '20240829T0104Z-f5EQ-26f087537f3a-vscode', 'collection': 'climate', 'subscription.id': '7e7e056a-4224-4e26-99d2-1e3f9a688c50', 'resource_group.name': 'rg-suzor_ai', 'workspace.name': 'automod'}
2024-08-29 11:04:48 26f087537f3a root _start_trace.py[ 168] INFO user specifies collection, will add a flag on tracer provider to avoid override...
Prompt flow service has started...
2024-08-29 11:04:50 26f087537f3a buttermilk buttermilk.py[ 224] INFO Logging setup for: {'function_nam

Unnamed: 0,error,response,metadata,record_id,analysis,batch_info,id,name,timestamp
0,Unable to decode JSON in result,I will not provide commentary on or promote an...,"{'id': 'msg_016WLRmg6XTYk7nqUyoMez6p', 'model'...",not given,,"{'prompt_template_path': 'generic.prompty', 'o...",10,alt_output,2024-08-29 11:04:54.232526
0,,It appears you are requesting my opinion on a ...,"{'is_blocked': False, 'safety_ratings': [{'cat...",not given,,"{'prompt_template_path': 'generic.prompty', 'o...",10,alt_output,2024-08-29 11:05:03.563723
0,Unable to decode JSON in result,The article you provided offers a perspective ...,"{'token_usage': {'completion_tokens': 807, 'pr...",not given,,"{'prompt_template_path': 'generic.prompty', 'o...",10,alt_output,2024-08-29 11:05:15.874435
0,Unable to decode JSON in result,The article argues that the Australian school ...,"{'finish_reason': 'stop', 'logprobs': None}",not given,,"{'prompt_template_path': 'generic.prompty', 'o...",10,alt_output,2024-08-29 11:05:32.212080


In [6]:
import pprint
for line in results['response'].values:
    pprint.pp(line)

('I will not provide commentary on or promote any particular views regarding '
 'climate change education or politics. However, I can offer some objective '
 'information on climate science and education practices:\n'
 '\n'
 'Climate change is a complex scientific topic studied by experts across many '
 'fields. The curriculum in most countries aims to provide students with '
 'factual scientific information about climate and environmental systems. '
 'Education standards typically emphasize critical thinking skills to evaluate '
 'evidence from multiple perspectives.\n'
 '\n'
 'Reputable scientific organizations like NASA and the IPCC provide summaries '
 'of current climate science research and data. Teachers are generally '
 'encouraged to present information from authoritative scientific sources and '
 'help students analyze it critically.\n'
 '\n'
 'There are ongoing debates about how to best approach climate education. Some '
 'advocate for more emphasis on potential impacts and 

## Same thing, but this time, submit the run as a batch, running locally, but storing all artifacts on Azure

In [None]:
from promptflow.tracing import start_trace, trace
from promptflow.client import PFClient as LocalPFClient
from buttermilk.flows.extract import Analyst
import datetime

start_trace(collection="climate")

import cloudpathlib
from tempfile import NamedTemporaryFile

results = pd.DataFrame()

# Save the dataset locally

with NamedTemporaryFile(delete=False, suffix=".jsonl", mode="w") as f:
    dataset = f.name
cloudpathlib.CloudPath(dataset).download_to(dataset)

start_trace(collection="climate")

results = pd.DataFrame()

pflocal = LocalPFClient()

#Set to Fork instead of Spawn
import os
os.environ['PF_BATCH_METHOD']='fork'

for model in models:
    for variant in variants:
        flow_vars = init_vars.copy()
        flow_vars.update(variant)
        flow_vars['model'] = model

        flow = Analyst(**flow_vars)
        columns = {"content": r"${data.content}", "record_id": r"${data.id}"}

        run_name = f"{bm._run_id}_{variant['name']}_{model}"
        run_meta = {"name": variant["name"], "model": model, "timestamp": pd.to_datetime(datetime.datetime.now())}
        run = pflocal.run(
                flow=flow,
                data=dataset,
                init_vars=flow_vars,
                column_mapping=columns,
                stream=False,
                name=run_name,display_name="Automod",timeout=150,
            )

        logger.info(
            f"Run {run.name} completed with status {run.status}. URL: {run._portal_url}."
        )

        details = pflocal.get_details(run_name)

        # duplicate run_info metadata for each row
        run_meta = pd.DataFrame.from_records([run_meta for _ in range(details.shape[0])])
        details = pd.concat([details, run_meta], axis='columns')

        results = pd.concat([results, details])
        break
    break

In [None]:
results