In [None]:
from datasets import load_dataset, Dataset
import numpy as np

%load_ext autoreload
%autoreload 2

In [None]:
# Formatting Cell
# Marker that separates the prompt from the completion. It is tokenized later
# in the notebook and handed to `watch` as `response_template`.
response_template = "Chatbot:<EOP_TOKEN>"


def create_formatted_prompt(row, idx):
    """Build the prompt + completion string for a single dataset row.

    Args:
        row: Mapping that provides a 'summary' entry (placed after "User: ")
            and a 'title' entry (placed after the response marker).
        idx: Index of the row, as supplied by `map(..., with_indices=True)`.

    Returns:
        Dict with the assembled 'formatted_prompt' string and the row 'id'.
    """
    # Assuming completion data. The marker is taken from `response_template`
    # instead of being repeated as a literal, so the prompt always contains
    # exactly the template that is tokenized for `watch`.
    formatted_prompt = f"User: {row['summary']}\n{response_template} {row['title']}"
    return {"formatted_prompt": formatted_prompt, "id": idx}

In [None]:
dataset_size = 100

# Load BillSum, drop the raw 'text' column, then add to every row the
# 'formatted_prompt' and 'id' fields produced by `create_formatted_prompt`
# (the id is the index that `map(..., with_indices=True)` passes in).
ds = (
    load_dataset("billsum")
    .remove_columns('text')
    .map(create_formatted_prompt, with_indices=True)
)

# Keep only the first `dataset_size` training rows, rebuilt as a standalone Dataset.
ds_train = Dataset.from_dict(ds['train'][:dataset_size])
ds_train

In [None]:
from transformers import PreTrainedTokenizerFast, GenerationConfig, AutoModelForCausalLM
from tokenizers import Tokenizer

# Tokenizer loaded through `tokenizers.Tokenizer.from_pretrained` for the
# "Cohere/command-nightly" identifier.
tokenizer = Tokenizer.from_pretrained("Cohere/command-nightly")
# Fake the model: a small local OPT checkpoint used as a stand-in that is
# handed to `watch` further down. The likelihoods that actually get logged
# are fetched from the Cohere API (see `a_query_batch`), not from this model.
model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")

In [None]:
import asyncio
from typing import Optional, Dict, Union, List

import cohere
from cohere import AsyncClient
from cohere.responses import Generations, StreamingGenerations

import os

# Read the API key from the CO_API_KEY environment variable rather than
# committing it to the notebook; the placeholder literal is only a fallback.
a_co = AsyncClient(os.environ.get("CO_API_KEY", '...'))

async def co_generate(
    prompt: Optional[str] = None,
    prompt_vars: object = {},
    model: Optional[str] = None,
    preset: Optional[str] = None,
    num_generations: Optional[int] = None,
    max_tokens: Optional[int] = None,
    temperature: Optional[float] = None,
    k: Optional[int] = None,
    p: Optional[float] = None,
    frequency_penalty: Optional[float] = None,
    presence_penalty: Optional[float] = None,
    end_sequences: Optional[List[str]] = None,
    stop_sequences: Optional[List[str]] = None,
    return_likelihoods: Optional[str] = None,
    truncate: Optional[str] = None,
    logit_bias: Dict[int, float] = {},
    raw_prompting: bool = False,
) -> Dict:
    """
    Stand-in for `AsyncClient.generate` so we can use the internal `raw_prompting` argument.

    Every argument is forwarded unchanged as a field of the JSON body, which is
    posted to `cohere.GENERATE_URL` through the client's private `_request`
    method. Streaming is always disabled, both in the body and in the request.

    Returns whatever `a_co._request` returns for the call.

    TODO: revert back to using `AsyncClient.generate` once `raw_prompting` is added to the SDK.
    """
    # The default dicts are shared between calls; they are only read here, never mutated.
    payload = dict(
        model=model,
        prompt=prompt,
        prompt_vars=prompt_vars,
        preset=preset,
        num_generations=num_generations,
        max_tokens=max_tokens,
        temperature=temperature,
        k=k,
        p=p,
        frequency_penalty=frequency_penalty,
        presence_penalty=presence_penalty,
        end_sequences=end_sequences,
        stop_sequences=stop_sequences,
        return_likelihoods=return_likelihoods,
        truncate=truncate,
        logit_bias=logit_bias,
        stream=False,
        raw_prompting=raw_prompting,
    )
    return await a_co._request(cohere.GENERATE_URL, json=payload, stream=False)

In [None]:
import torch 

async def a_query_batch(prompts: List[str]) -> torch.Tensor:
    """Fetch per-token likelihoods for a batch of prompts from Cohere.

    Each prompt is wrapped in <BOS_TOKEN>/<EOS_TOKEN> and sent through
    `co_generate` with `raw_prompting=True`, `max_tokens=0` and
    `return_likelihoods="ALL"`. The requests are awaited together with
    `asyncio.gather`.

    Args:
        prompts: Formatted prompt strings, one request is made per entry.

    Returns:
        A tensor with one row per prompt holding the 'likelihood' value of
        every entry in that response's first generation, padded by
        `pad_sequence` (default padding value) to the longest row in the
        batch. An empty `prompts` yields an empty (0, 0) tensor.
    """
    if len(prompts) == 0:
        # pad_sequence raises on an empty list of sequences, so short-circuit.
        return torch.empty((0, 0))

    # For now wrap every prompt in <BOS_TOKEN> and <EOS_TOKEN>.
    # NOTE: the DQ tokenizer thinks the closing token is EOP; tolerated for now.
    response_jobs = [
        co_generate(
            prompt=f"""<BOS_TOKEN>{prompt}<EOS_TOKEN>""",
            return_likelihoods="ALL",
            raw_prompting=True,
            max_tokens=0,
        )
        for prompt in prompts
    ]
    responses = await asyncio.gather(*response_jobs)

    # Only the first generation of each response is used.
    logprob_responses = [
        torch.Tensor(
            [token['likelihood'] for token in response['generations'][0]['token_likelihoods']]
        )
        for response in responses
    ]

    # Pad to the max sequence length in the batch
    return torch.nn.utils.rnn.pad_sequence(logprob_responses, batch_first=True)

In [None]:
import os
from getpass import getpass

os.environ['GALILEO_CONSOLE_URL']="https://console.dev.rungalileo.io"
os.environ["GALILEO_USERNAME"]="galileo@rungalileo.io"
# Never keep the password in the notebook: reuse GALILEO_PASSWORD when it is
# already set in the environment, otherwise prompt for it without echoing.
if not os.environ.get("GALILEO_PASSWORD"):
    os.environ["GALILEO_PASSWORD"] = getpass("Galileo password: ")

import dataquality as dq
from dataquality.integrations.seq2seq.hf import watch
# NOTE(review): presumably picks up the GALILEO_* variables set above — confirm.
dq.configure()

In [None]:
# Initialise dataquality for the "seq2seq" task under this project name.
dq.init("seq2seq", project_name="Seq2Seq_DecoderOnly_Cohere")

# Generation settings handed to `watch` below.
temperature = 0.4
generation_config = GenerationConfig(
    max_new_tokens=15,
    # Whether we use multinomial sampling: enabled only when the temperature
    # is not (numerically) zero.
    do_sample=temperature >= 1e-5,
    temperature=temperature,
)

# Token ids of the response marker, encoded without special tokens so the ids
# correspond to the marker text alone.
response_template_ids = tokenizer.encode(response_template, add_special_tokens=False).ids

# Register the (stand-in) model, tokenizer and generation settings with the
# dataquality seq2seq integration.
# NOTE(review): `generation_splits=[]` presumably means no split gets
# generated outputs — confirm against the dataquality docs.
watch(
    model,
    model_type="decoder_only",
    tokenizer=tokenizer,
    generation_config=generation_config,
    generation_splits=[],
    max_input_tokens=1024, # Prompt + Completion
    response_template=response_template_ids
)

In [None]:
def log_dataset(ds, input_col="summary", target_col="title", formatted_prompt="formatted_prompt"):
    """Log `ds` to dataquality as the "training" split.

    Args:
        ds: Dataset forwarded as the first argument of `dq.log_dataset`.
        input_col: Value passed as `text` (defaults to "summary").
        target_col: Value passed as `label` (defaults to "title").
        formatted_prompt: Value passed as `formatted_prompt`.
    """
    column_mapping = {
        "text": input_col,
        "label": target_col,
        "formatted_prompt": formatted_prompt,
    }
    dq.log_dataset(ds, split="training", **column_mapping)

# Only the training subset (`ds_train`) is logged; no other split is
# registered in this notebook.
log_dataset(ds_train)

In [None]:
from time import time
import torch

batch_size = 10


async def log_model_outputs(ds):
    """Log Cohere token likelihoods for `ds` to dataquality, batch by batch.

    Walks `ds` in slices of the module-level `batch_size`, fetches the padded
    likelihood tensor for each slice's 'formatted_prompt' values through
    `a_query_batch`, and hands it to `dq.log_model_outputs` together with the
    slice's 'id' values. Progress is printed for every batch.
    """
    for batch_number, start in enumerate(range(0, len(ds), batch_size)):
        print (f"Processing batch {batch_number}")
        rows = ds[start: start + batch_size]
        ids, prompts = rows['id'], rows['formatted_prompt']

        print ("Calling up Cohere...")
        batch_logprobs = await a_query_batch(prompts)
        print ("DONE!")
        print()

        dq.log_model_outputs(probs=batch_logprobs, ids=ids)

# Set epoch 0 and the "train" split on dataquality before logging, then run
# the batched logging loop over the training subset. The top-level `await`
# works because IPython/Jupyter cells can await coroutines directly.
dq.set_epoch(0)
dq.set_split("train")
await log_model_outputs(ds_train)

In [None]:
# Close out the dataquality run started by `dq.init`.
# NOTE(review): presumably triggers processing/upload of the logged data — confirm.
dq.finish()