<a href="https://colab.research.google.com/github/rsr2425/word-count-investigation/blob/main/notebooks/4_Chaining_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [None]:
CURRENT_MODEL_ID = "gpt-3.5-turbo"
LLM_JUDGE_MODEL_ID = "gpt-4o"
PROJECT_NAME = "word-count-investigation"

In [None]:
!pip install datasets langchain_openai rouge-score evaluate wandb deepeval



In [None]:
import os
from google.colab import userdata

os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')

In [None]:
import evaluate

rouge_score = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

## Dataset

In [None]:
from datasets import load_dataset

dataset = load_dataset("ccdv/cnn_dailymail", '3.0.0', split="test[:1000]")
dataset = dataset.rename_column('article', 'text')
dataset = dataset.rename_column('highlights', 'summary')
dataset = dataset.remove_columns(['id'])

README.md:   0%|          | 0.00/13.9k [00:00<?, ?B/s]

cnn_dailymail.py:   0%|          | 0.00/9.27k [00:00<?, ?B/s]

The repository for ccdv/cnn_dailymail contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/ccdv/cnn_dailymail.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


cnn_stories.tgz:   0%|          | 0.00/159M [00:00<?, ?B/s]

dailymail_stories.tgz:   0%|          | 0.00/376M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/46.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.43M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [None]:
dataset

Dataset({
    features: ['text', 'summary'],
    num_rows: 1000
})

## Metrics

In [None]:
import json

def compute_rouge(record, **kargs):
    return rouge_score.compute(
        predictions=[record['ai_summary']],
        references=[record['summary']]
    )

In [None]:
from json import JSONDecodeError

def generate_questions(text, llm, n):
    messages = [
      ("system", """
        You are a helpful question generating chatbot.  Generate {n} factual questions
        from the text provided by the user. Make sure these questions can be answered
        using the provided text, and that the answers should be yes or no. Make sure there
        are both questions that can be answered with yes and questions that can be answered
        with no. Think through step by step before answering and make sure there are a mix
        of answers to the questions you provide.

        Return the questions as a json containing a list of strings.
        """
      ),
      ("human", f"{text}"),
    ]
    ai_msg = llm.invoke(messages)
    questions = []
    try:
        questions = json.loads(ai_msg.content)['questions']
    except JSONDecodeError as e:
        questions = {'questions': [''] * n}
    return questions

def generate_anwsers(questions, source_text, llm):
    messages = [
      ("system", """
        You are a helpful question answering chatbot.  The user will give you a list of questions and the text off which you
        should answer them. Answer the questions using the provided text. Answer only with "Yes", "No", or "idk". If the
        question cannot be answered using the provided text, answer with "idk". If you are unsure, answer with "idk".
        If the question string is empty, answer with "idk".

        Return the answers as a json containing a list of strings.
        """
      ),
      ("human", f"""
        Please answer the following questions:

          {questions}

        using this text:

          {source_text}
      """),
    ]
    ai_msg = llm.invoke(messages)
    answers = []
    try:
        answers = json.loads(ai_msg.content)['answers']
    except (JSONDecodeError, TypeError) as e:
        answers = ['idk'] * len(questions)
    return answers

def compute_factual_consistency(record, llm, n):
    # TODO figure out why n isn't always respected
    questions = generate_questions(record['text'], llm, n)
    gt_answers = generate_anwsers(questions, record['text'], llm)
    # assert len(gt_answers) == n
    human_summary_answers = generate_anwsers(questions, record['summary'], llm)
    # assert len(human_summary_answers) == n
    ai_summary_answers = generate_anwsers(questions, record['ai_summary'], llm)
    # assert len(ai_summary_answers) == n

    if all(x == 'idk' for x in human_summary_answers):
        hfc = 0
    else:
        hfc = sum([1 if x == y else 0 for x, y in zip(human_summary_answers, gt_answers)]) / float(len(questions))
    if all(x == 'idk' for x in ai_summary_answers):
        afc = 0
    else:
        afc = sum([1 if x == y else 0 for x, y in zip(ai_summary_answers, gt_answers)]) / float(len(questions))

    return {
        'gt_answers': gt_answers,
        'human_summary_answers': human_summary_answers,
        'ai_summary_answers': ai_summary_answers,
        'human_factual_consistency': hfc,
        'ai_factual_consistency': afc,
    }

In [None]:
import enum

class Metric(enum.Enum):
    ROUGE = "ROUGE"
    FACTUAL_CONSISTENCY = "Factual Consistency"

    def __str__(self):
        return self.value

metric_fn_mapping = {
    Metric.ROUGE: compute_rouge,
    Metric.FACTUAL_CONSISTENCY: compute_factual_consistency,
}

## Helper Functions

In [None]:
from langchain_core.runnables import Runnable, RunnableConfig
from langchain_openai import ChatOpenAI
from typing import Any, Dict

class WordCountControlRunnable(Runnable):
    def __init__(
        self,
        llm: ChatOpenAI,
        word_count_target: int = 25,
        tolerance: int = 10,
        revision_attempts: int = 5,
    ):
        self.llm = llm
        self.word_count_target = word_count_target
        self.tolerance = tolerance
        self.revision_attempts = revision_attempts

    def invoke(
        self,
        input: Any,
        config: RunnableConfig = None,
        **kwargs: Any,
    ) -> Any:
        # Extract the raw text from input
        sample_text = input if isinstance(input, str) else input.get("sample_text")

        # Prepare the initial messages
        # TODO is it bad I'm asking this exact thing twice essentially?
        messages = [
            ("system", "You are a helpful summary chatbot. Summarize the content provided by the user."),
            ("human", sample_text),
        ]

        attempt = 0
        ai_summary = None

        # Iterative refinement loop
        while attempt < self.revision_attempts and (
            ai_summary is None or self._count_words(ai_summary) > self.word_count_target + self.tolerance
        ):
            attempt += 1
            ai_msg = self.llm.invoke(messages)
            ai_summary = ai_msg.content
            messages.append(("ai", ai_summary))
            messages.append(("human", "Shorten this."))

        return {"final_summary": ai_summary, "attempts": attempt}

    def _count_words(self, text: str) -> int:
        """Utility function to count words in a given text."""
        return len(text.split())

In [None]:
from langchain_core.output_parsers import StrOutputParser

TOLERANCE = 10
REVISION_ATTEMPTS = 5

def count_words(text):
    return len(text.split())

def summarize(record, llm, word_count_target=None, otherinstructions=None):
    messages = [
        ("system", """
        You are a helpful summary chatbot.  Summarize the content provided by the user. {otherinstructions}
        """),
        ("human", f"{record['text']}"),
    ]
    # TODO should I do some error handling here?
    summarizer = WordCountControlRunnable(
        llm=llm,
        word_count_target=WORD_COUNT_TARGET,
        tolerance=TOLERANCE,
        revision_attempts=REVISION_ATTEMPTS,
    )
    chain = llm | StrOutputParser() | summarizer
    output = chain.invoke(messages)
    ai_summary = output['final_summary']
    attempts = output['attempts']

    return {
        'text_word_count': count_words(record['text']),
        'summary_word_count': count_words(record['summary']),
        'ai_summary': ai_summary,
        'ai_summary_word_count': count_words(ai_summary),
        # need to add 1 for initial call in chain
        'total_model_calls': attempts + 1,
    }

In [None]:
def process_dataset(dataset, llm, n, metrics, word_count_target=None, otherinstructions=None):
    print("Processing Dataset!")
    print("Now summarizing data...")
    processed_dataset = dataset.map(
        summarize,
        fn_kwargs={
            'llm': llm,
            'word_count_target': word_count_target,
            'otherinstructions': otherinstructions
        }
    )
    # llm_judge = ChatOpenAI(model_name=LLM_JUDGE_MODEL_ID, temperature=temperature)
    for metric in metrics:
        print(f"Now calculating {str(metric)}...")
        processed_dataset = processed_dataset.map(
            metric_fn_mapping[metric],
            fn_kwargs={
                'llm': llm,
                # 'llm': llm_judge,
                'n': n
            }
        )
    print("Done!")
    return processed_dataset

In [None]:
import wandb

def log_dataset_to_wandb(dataset, project_name, run_name, split_name="dataset_split"):
    wandb.init(
        project=project_name,
        name=run_name,
        settings=wandb.Settings(_service_wait=300),
    )

    data_table = wandb.Table(columns=dataset.column_names)

    # Add rows from the dataset
    for row in dataset:
        data_table.add_data(*[row[col] for col in dataset.column_names])

    # Log the table to WandB
    wandb.log({split_name: data_table})

    wandb.finish()

In [None]:
def gen_run_name():
    pass

In [ ]:
from experiments import run_experiment, Metric

# Experiments

In [None]:
# Parameters across runs
SUBSET_SIZE = 100 # if set to None, entire dataset will be processed
TEMPERATURE = 0.7
NUMBER_OF_QUESTIONS = 10
LOG_TO_WANDB = False

metrics = [
    Metric.ROUGE,
    # Metric.FACTUAL_CONSISTENCY,
]

## Run: Baseline

In [None]:
# # Run Parameters
# WORD_COUNT_TARGET = None
# RUN_PREFIX=f"baseline_"

# results = run_experiment(
#     CURRENT_MODEL_ID,
#     TEMPERATURE,
#     dataset,
#     NUMBER_OF_QUESTIONS,
#     metrics,
#     word_count_target=WORD_COUNT_TARGET,
#     subset_size=SUBSET_SIZE,
#     log_to_wandb=LOG_TO_WANDB,
# )
# df = results.to_pandas()
# df.select_dtypes(include='number').mean()

## Run: Generate with Target Word Count(25)

In [ ]:
# Run Parameters
WORD_COUNT_TARGET = 25
RUN_PREFIX = f"word_cnt_target_{WORD_COUNT_TARGET}_"

results = run_experiment(
    CURRENT_MODEL_ID,
    TEMPERATURE,
    dataset,
    NUMBER_OF_QUESTIONS,
    metrics,
    word_count_target=WORD_COUNT_TARGET,
    subset_size=SUBSET_SIZE,
    log_to_wandb=LOG_TO_WANDB,
    project_name=PROJECT_NAME,
    run_prefix=RUN_PREFIX,
)
df = results.to_pandas()
df.select_dtypes(include='number').mean()

## Run: Generate with Target Word Count(50)

In [None]:
WORD_COUNT_TARGET = 50
RUN_PREFIX=f"word_cnt_target_{WORD_COUNT_TARGET}_"

results = run_experiment(
    CURRENT_MODEL_ID,
    TEMPERATURE,
    dataset,
    NUMBER_OF_QUESTIONS,
    metrics,
    word_count_target=WORD_COUNT_TARGET,
    subset_size=SUBSET_SIZE,
    log_to_wandb=LOG_TO_WANDB,
)
df = results.to_pandas()
df.select_dtypes(include='number').mean()

Processing Dataset!
Now summarizing data...


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Now calculating ROUGE...


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Done!


Unnamed: 0,0
text_word_count,693.52
summary_word_count,35.55
ai_summary_word_count,54.7
total_model_calls,3.97
rouge1,0.287199
rouge2,0.085015
rougeL,0.19591
rougeLsum,0.233106


## Run: Generate with Target Word Count(150)

In [None]:
WORD_COUNT_TARGET = 150
RUN_PREFIX=f"word_cnt_target_{WORD_COUNT_TARGET}_"

results = run_experiment(
    CURRENT_MODEL_ID,
    TEMPERATURE,
    dataset,
    NUMBER_OF_QUESTIONS,
    metrics,
    word_count_target=WORD_COUNT_TARGET,
    subset_size=SUBSET_SIZE,
    log_to_wandb=LOG_TO_WANDB,
)
df = results.to_pandas()
df.select_dtypes(include='number').mean()

Processing Dataset!
Now summarizing data...


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Now calculating ROUGE...


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Done!


Unnamed: 0,0
text_word_count,693.52
summary_word_count,35.55
ai_summary_word_count,100.01
total_model_calls,2.0
rouge1,0.246127
rouge2,0.081009
rougeL,0.164475
rougeLsum,0.200492
