In [22]:
# Maximum context length of each model compared in this notebook:
#   gpt-3.5-turbo:    4,096 tokens
#   text-davinci-003: 4,097 tokens
#   flan-ul2:         2,048 tokens

In [None]:
import sys
# Add the parent directory to the module search path. Presumably this is what
# lets the `src.*` imports in the next cell resolve when the notebook is run
# from its own folder — confirm against the repository layout.
sys.path.append("..")

In [23]:
from tqdm import tqdm

import tiktoken

from src.datasets.biasinbios import BiasInBios
from src.datasets.hateexplainrace import HateXplainRace
from src.datasets.twitteraae import TwitterAAE


In [7]:
# Instantiate one wrapper per dataset, each given a data directory relative to
# this notebook.
bias = BiasInBios("../data/biasbios")
# NOTE(review): the variable is called `hategender` but the class is
# HateXplainRace — confirm which protected attribute is intended.
hategender = HateXplainRace("../data/HateXplain")
aae = TwitterAAE("../data/moji/twitteraae_sentiment_race")


# create_prompts() is unpacked into three values per dataset. The first two are
# later indexed with ['prompts'] (presumably pandas DataFrames for the train and
# test splits); the third (*_demo) is not used anywhere in this notebook.
bias_train_df, bias_test_df, bias_demo = bias.create_prompts()
hate_train_df, hate_test_df, hate_demo = hategender.create_prompts()
aae_train_df, aae_test_df, aae_demo = aae.create_prompts()

In [8]:
# Short labels used when printing results; zipped with `datasets` below, so the
# order here must match the order of the (train, test) pairs.
dataset_names = ["bias", "hate", "aae"]

In [9]:
# One (train, test) pair per dataset, in the same order as `dataset_names`.
datasets = [(bias_train_df, bias_test_df), (hate_train_df, hate_test_df), (aae_train_df, aae_test_df)]

In [10]:
# OpenAI models whose token usage is measured with tiktoken; each name must
# have a matching branch in the counting cell that builds the messages.
models = ["text-davinci-003", "gpt-3.5-turbo"]

## Maximum and Average Tokens per Prompt

In [11]:
def num_tokens_from_messages(messages, model="gpt-3.5-turbo-0301"):
    """Count tokens for each message, treating every message as its own prompt.

    Unlike a whole-conversation count, each element of ``messages`` is scored
    independently: it receives the per-message formatting overhead and, for
    chat models, the reply-priming overhead. The result is therefore one count
    per message, not a single total.

    Args:
        messages: Iterable of dicts mapping field names (e.g. ``"role"``,
            ``"content"``, ``"name"``) to strings. Every value is tokenized.
        model: Model name used to select the tiktoken encoding and the chat
            overhead. ``"gpt-3.5-turbo"`` and ``"gpt-4"`` are counted as
            ``"gpt-3.5-turbo-0301"`` and ``"gpt-4-0314"`` (a warning is
            printed). Any other model gets no chat overhead, i.e. the count is
            just the encoded length of the values.

    Returns:
        list[int]: Token count of each message, in input order.
    """
    # Resolve floating aliases to a pinned snapshot first, so the encoding is
    # looked up exactly once and for the model that is actually being counted.
    if model == "gpt-3.5-turbo":
        print("Warning: gpt-3.5-turbo may change over time. Returning num tokens assuming gpt-3.5-turbo-0301.")
        model = "gpt-3.5-turbo-0301"
    elif model == "gpt-4":
        print("Warning: gpt-4 may change over time. Returning num tokens assuming gpt-4-0314.")
        model = "gpt-4-0314"

    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        print("Warning: model not found. Using cl100k_base encoding.")
        encoding = tiktoken.get_encoding("cl100k_base")

    if model == "gpt-3.5-turbo-0301":
        tokens_per_message = 4  # every message follows <|start|>{role/name}\n{content}<|end|>\n
        tokens_per_name = -1  # if there's a name, the role is omitted
        tokens_for_reply = 3  # every reply is primed with <|start|>assistant<|message|>
    elif model == "gpt-4-0314":
        tokens_per_message = 3
        tokens_per_name = 1
        tokens_for_reply = 3  # every reply is primed with <|start|>assistant<|message|>
    else:
        # Non-chat (completion) models: no wrapper tokens around the text.
        tokens_per_message = 0
        tokens_per_name = 0
        tokens_for_reply = 0

    num_tokens_per_message = []

    for message in tqdm(messages):
        # Fixed overhead for a stand-alone, single-message prompt.
        num_tokens = tokens_per_message + tokens_for_reply
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
        num_tokens_per_message.append(num_tokens)

    return num_tokens_per_message

In [None]:
# Report the maximum and mean prompt length, in tokens, for every
# (dataset, OpenAI model) combination.
for dataset_name, dataset in zip(dataset_names, datasets):

    # Statistics cover the train and test prompts together. The prompt list does
    # not depend on the model, so it is built once per dataset.
    train_df, test_df = dataset
    prompts = train_df['prompts'].tolist() + test_df['prompts'].tolist()

    for model in models:

        if model == "gpt-3.5-turbo":
            # Chat model: each prompt is sent as a single user message.
            messages = [{"role": "user", "content": prompt} for prompt in prompts]
        elif model == "text-davinci-003":
            # Completion model: only the prompt text itself is counted.
            messages = [{"content": prompt} for prompt in prompts]
        else:
            # Fail loudly instead of passing None on to the token counter.
            raise ValueError(f"No message format defined for model {model!r}")

        num_tokens_per_message = num_tokens_from_messages(messages, model)

        max_tokens = max(num_tokens_per_message)

        avg_tokens = sum(num_tokens_per_message)/len(num_tokens_per_message)

        print(f"Model: {model}, Dataset: {dataset_name}")
        print(f"Max Tokens: {max_tokens}, Average: {avg_tokens}")


In [None]:
from transformers import AutoTokenizer

In [18]:
# Tokenizer for google/flan-ul2 (fast implementation requested).
tok = AutoTokenizer.from_pretrained("google/flan-ul2", use_fast=True)

# Report the maximum and mean prompt length per dataset, measured as the
# length of the id sequence returned by tok.encode().
for dataset_name, dataset in zip(dataset_names, datasets):
    train_split, test_split = dataset

    # Train prompts first, then test prompts, in one flat list.
    prompts = [*train_split['prompts'].tolist(), *test_split['prompts'].tolist()]

    num_tokens_per_message = [len(tok.encode(text)) for text in tqdm(prompts)]

    max_tokens = max(num_tokens_per_message)
    avg_tokens = sum(num_tokens_per_message) / len(num_tokens_per_message)

    print(f"Model: flan, Dataset: {dataset_name}")
    print(f"Max Tokens: {max_tokens}, Average: {avg_tokens}")

100%|██████████| 263710/263710 [01:05<00:00, 4003.52it/s]
