In [1]:
from typing import List

import cohere
import numpy as np
import torch
from datasets import load_dataset, Dataset

# Enable IPython's autoreload extension in mode 2 so edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Formatting Cell
# Marker that separates the prompt from the completion. The same value is
# passed to `watch(...)` as `response_template`, so the formatted prompt must
# contain it verbatim.
response_template = "Chatbot:<EOP_TOKEN>"
def create_formatted_prompt(row, idx):
    """Build the single-turn training string for one dataset row.

    Args:
        row: Mapping with a 'summary' entry (used as the user turn) and a
            'title' entry (used as the chatbot completion).
        idx: Row index supplied by `Dataset.map(..., with_indices=True)`.

    Returns:
        Dict with 'formatted_prompt' (the full prompt + completion text) and
        'id' (the row index).
    """
    # Assuming completion data
    # Reuse `response_template` rather than repeating the literal, so the
    # prompt and the template passed to `watch` cannot drift apart.
    formatted_prompt = f"User: {row['summary']}\n{response_template} {row['title']}"
    return {"formatted_prompt": formatted_prompt, "id": idx}

In [81]:
dataset_size = 100

# Load billsum, drop the raw 'text' column, then attach a formatted prompt and
# an integer id to every row of every split.
ds = (
    load_dataset("billsum")
    .remove_columns('text')
    # Add ids
    .map(create_formatted_prompt, with_indices=True)
)

# Keep only the first `dataset_size` training rows as a standalone Dataset.
ds_train = Dataset.from_dict(ds['train'][:dataset_size])
ds_train

Found cached dataset billsum (/Users/jonathangomesselman/.cache/huggingface/datasets/billsum/default/3.0.0/75cf1719d38d6553aa0e0714c393c74579b083ae6e164b2543684e3e92e0c4cc)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at /Users/jonathangomesselman/.cache/huggingface/datasets/billsum/default/3.0.0/75cf1719d38d6553aa0e0714c393c74579b083ae6e164b2543684e3e92e0c4cc/cache-7894bddaec6a0cce.arrow
Loading cached processed dataset at /Users/jonathangomesselman/.cache/huggingface/datasets/billsum/default/3.0.0/75cf1719d38d6553aa0e0714c393c74579b083ae6e164b2543684e3e92e0c4cc/cache-117f8b6ca6eccf9b.arrow
Loading cached processed dataset at /Users/jonathangomesselman/.cache/huggingface/datasets/billsum/default/3.0.0/75cf1719d38d6553aa0e0714c393c74579b083ae6e164b2543684e3e92e0c4cc/cache-89a2f792fc523b98.arrow


Dataset({
    features: ['summary', 'title', 'formatted_prompt', 'id'],
    num_rows: 100
})

In [24]:
from transformers import PreTrainedTokenizerFast, GenerationConfig, AutoModelForCausalLM
from tokenizers import Tokenizer

# Load the "Cohere/command-nightly" tokenizer and wrap it as a transformers
# fast tokenizer; `fast_tokenizer` is what gets passed to `watch(...)` later.
tokenizer = Tokenizer.from_pretrained("Cohere/command-nightly")
fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer)
# Fake the model: a small local model to hand to `watch(...)`. The log-probs
# logged in this notebook come from the Cohere API calls, not from this model.
model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")

In [75]:
import asyncio
from typing import Optional, Dict, Union

from cohere import AsyncClient
from cohere.responses import Generations, StreamingGenerations

# Shared async Cohere client used by `co_generate`.
# NOTE(review): '...' looks like a redacted API-key placeholder — supply a real
# key (ideally read from the environment) before running.
a_co = AsyncClient('...')

async def co_generate(
    prompt: Optional[str] = None,
    prompt_vars: object = {},
    model: Optional[str] = None,
    preset: Optional[str] = None,
    num_generations: Optional[int] = None,
    max_tokens: Optional[int] = None,
    temperature: Optional[float] = None,
    k: Optional[int] = None,
    p: Optional[float] = None,
    frequency_penalty: Optional[float] = None,
    presence_penalty: Optional[float] = None,
    end_sequences: Optional[List[str]] = None,
    stop_sequences: Optional[List[str]] = None,
    return_likelihoods: Optional[str] = None,
    truncate: Optional[str] = None,
    logit_bias: Dict[int, float] = {},
    raw_prompting: bool = False,
) -> Dict:
    """
    Replacement for `AsyncClient.generate` so we can use the internal `raw_prompting` argument.

    Every argument is forwarded unchanged as a field of the JSON request body;
    streaming is always disabled. The request is sent through the module-level
    client `a_co`.

    Returns:
        Whatever `a_co._request` returns for the generate endpoint.

    TODO: revert back to using `AsyncClient.generate` once `raw_prompting` is added to the SDK.
    """
    # The `{}` defaults above are only read here, never mutated.
    payload = {
        # What to generate from
        "model": model,
        "prompt": prompt,
        "prompt_vars": prompt_vars,
        "preset": preset,
        # Sampling controls
        "num_generations": num_generations,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "k": k,
        "p": p,
        "frequency_penalty": frequency_penalty,
        "presence_penalty": presence_penalty,
        # Stopping / output shaping
        "end_sequences": end_sequences,
        "stop_sequences": stop_sequences,
        "return_likelihoods": return_likelihoods,
        "truncate": truncate,
        "logit_bias": logit_bias,
        # Always a single, non-streamed response
        "stream": False,
        "raw_prompting": raw_prompting,
    }
    return await a_co._request(cohere.GENERATE_URL, json=payload, stream=False)

In [78]:
async def a_query_batch(prompts: List[str]) -> torch.Tensor:
    """Fetch per-token likelihoods for a batch of prompts from Cohere.

    Each prompt is wrapped in <BOS_TOKEN>/<EOS_TOKEN> and sent concurrently via
    `co_generate` with `return_likelihoods="ALL"` and `raw_prompting=True`.

    Args:
        prompts: Fully formatted prompt strings (prompt + completion).

    Returns:
        A tensor with one row per prompt holding the 'likelihood' value of each
        entry in the response's first generation's 'token_likelihoods'. Rows
        are padded to the longest sequence in the batch using `pad_sequence`'s
        default padding value. An empty `prompts` list yields an empty tensor
        of shape (0, 0).
    """
    # `pad_sequence` raises on an empty list, so return early for an empty batch.
    if len(prompts) == 0:
        return torch.empty((0, 0))

    # For now append <BOS_TOKEN> and <EOS_TOKEN>.
    # NOTE: the DQ tokenizer reportedly treats the trailing token as EOP rather
    # than EOS; the original author accepted this mismatch.
    # max_tokens=0 presumably means "score the prompt only, generate nothing" —
    # TODO confirm against the Cohere API.
    response_jobs = [
        co_generate(
            prompt=f"""<BOS_TOKEN>{prompt}<EOS_TOKEN>""",
            return_likelihoods="ALL",
            raw_prompting=True,
            max_tokens=0,
        )
        for prompt in prompts
    ]

    # Run all requests concurrently; results come back in prompt order.
    responses = await asyncio.gather(*response_jobs)

    logprob_responses = [
        torch.Tensor(
            [token['likelihood'] for token in response['generations'][0]['token_likelihoods']]
        )
        for response in responses
    ]

    # Pad to the max sequence length in the batch
    return torch.nn.utils.rnn.pad_sequence(logprob_responses, batch_first=True)

In [80]:
import os
from getpass import getpass

# Point dataquality at the dev console and set the login user.
os.environ['GALILEO_CONSOLE_URL']="https://console.dev.rungalileo.io"
os.environ["GALILEO_USERNAME"]="galileo@rungalileo.io"
# Never keep the password in the notebook: reuse one already exported in the
# environment, otherwise prompt for it.
if not os.environ.get("GALILEO_PASSWORD"):
    os.environ["GALILEO_PASSWORD"] = getpass("Galileo password: ")

import dataquality as dq
from dataquality.integrations.seq2seq.hf import watch
# Log in to Galileo (see the login messages in the cell output).
dq.configure()



📡 https://console.dev.rungalileo.io
🔭 Logging you into Galileo

🚀 You're logged in to Galileo as galileo@rungalileo.io!


In [82]:
# Start a new seq2seq run inside the "Seq2Seq_DecoderOnly_Cohere" project.
dq.init("seq2seq", project_name="Seq2Seq_DecoderOnly_Cohere")

temperature = 0.4
generation_config = GenerationConfig(
    max_new_tokens=15,
    # Whether we use multinomial sampling: on only when temperature is at least 1e-5
    do_sample=temperature >= 1e-5,
    temperature=temperature,
)

# Register the stand-in model, the wrapped Cohere tokenizer and the generation
# config with dataquality. `response_template` is the same marker that
# `create_formatted_prompt` inserts between prompt and completion.
watch(
    model,
    fast_tokenizer,
    generation_config,
    generation_splits=[],  # no splits listed for generation
    max_input_tokens=1024, # Prompt + Completion
    response_template=response_template
)

✨ Initializing existing public project 'Seq2Seq_DecoderOnly_Cohere'
🏃‍♂️ Creating new run '2023-11-14_4'
🛰 Connected to existing project 'Seq2Seq_DecoderOnly_Cohere', and new run '2023-11-14_4'.


  warn(


In [83]:
def log_dataset(ds, input_col="summary", target_col="title", formatted_prompt="formatted_prompt"):
    """Log `ds` to dataquality under the "training" split.

    Args:
        ds: Dataset to log.
        input_col: Name of the column passed as `text`.
        target_col: Name of the column passed as `label`.
        formatted_prompt: Name of the column passed as `formatted_prompt`.
    """
    column_mapping = {
        "text": input_col,
        "label": target_col,
        "formatted_prompt": formatted_prompt,
    }
    dq.log_dataset(ds, **column_mapping, split="training")

# Log just for training: only the `ds_train` subset is registered, using the
# default column names of `log_dataset`.
log_dataset(ds_train)

Logging 100 samples [########################################] 100.00% elapsed time  :     0.00s =  0.0m =  0.0h
 

In [86]:
from time import time
import torch

# Number of rows sent to Cohere (and logged) per step.
batch_size = 10

async def log_model_outputs(ds):
    """Walk `ds` in chunks of `batch_size`, fetch log-probs and log them.

    For each chunk, the 'formatted_prompt' strings are scored through
    `a_query_batch` and the resulting tensor is logged to dataquality together
    with the chunk's 'id' values.
    """
    for batch_num, start in enumerate(range(0, len(ds), batch_size)):
        print (f"Processing batch {batch_num}")
        chunk = ds[start: start + batch_size]
        chunk_ids = chunk['id']
        chunk_prompts = chunk['formatted_prompt']

        print ("Calling up Cohere...")
        chunk_logprobs = await a_query_batch(chunk_prompts)
        print ("DONE!")
        print()

        dq.log_model_outputs(
            probs = chunk_logprobs,
            ids = chunk_ids,
            # Placeholder: the literal Ellipsis is passed; real embeddings
            # would have shape [bs, emb_dim].
            embs = ...
        )

dq.set_epoch(0)
dq.set_split("train")
await log_model_outputs(ds_train)

Processing batch 0
Calling up Cohere...
DONE!

Processing batch 1
Calling up Cohere...
DONE!

Processing batch 2
Calling up Cohere...
DONE!

Processing batch 3
Calling up Cohere...
DONE!

Processing batch 4
Calling up Cohere...
DONE!

Processing batch 5
Calling up Cohere...
DONE!

Processing batch 6
Calling up Cohere...
DONE!

Processing batch 7
Calling up Cohere...
DONE!

Processing batch 8
Calling up Cohere...
DONE!

Processing batch 9
Calling up Cohere...
DONE!



In [87]:
# Finish the run: per the cell output, this uploads the logged data, submits
# the processing job and returns the console URL for the results.
dq.finish()

☁️ Uploading Data
CuML libraries not found, running standard process. For faster Galileo processing, consider installing
`pip install 'dataquality[cuda]' --extra-index-url=https://pypi.nvidia.com/`


training:   0%|          | 0/1 [00:00<?, ?it/s]

Skipping generation for split training


training (epoch=0):   0%|          | 0/3 [00:00<?, ?it/s]

Uploading data to Galileo:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Uploading data to Galileo:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Uploading data to Galileo:   0%|          | 0.00/348k [00:00<?, ?B/s]

Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x157cfeac0>


Job default successfully submitted. Results will be available soon at https://console.dev.rungalileo.io/insights/9cf4f816-5f3f-4036-8f27-290d00681111/08cb0ed1-cfd9-4543-89f5-374585d8a781?split=training&taskType=8
Waiting for job (you can safely close this window)...
	No embs found, skipping processing
	No data embs found, skipping processing
Done! Job finished with status completed
🧹 Cleaning up
🧹 Cleaning up


'https://console.dev.rungalileo.io/insights/9cf4f816-5f3f-4036-8f27-290d00681111/08cb0ed1-cfd9-4543-89f5-374585d8a781?split=training&taskType=8'

In [None]:
# Chat data format (t_i = user turn i, c_i = chatbot turn i)
<BOS_TOKEN>User: t1
Chatbot: c1
User: t2
Chatbot: c2
User: t3
Chatbot:<EOP_TOKEN> c3<EOS_TOKEN>


[
    ["User", "text"],
    ["Chatbot", "text"],
    ...
] -->
[
    "User: text\nChatbot:", "text",
    "User: text\nChatbot: text\nUser: text2\nChatbot:", "text2",
]

In [None]:
# All less than 4k in training data