In [1]:
dotenv_path = "../../../../../apis/.env"

import torch
from transformers import pipeline
from IPython.display import clear_output
import pandas as pd
from datasets import Dataset, load_dataset
import random
from textwrap import fill

import requests

import sys
sys.path.append("./src")
import text_classification_functions as tcf
import env_options
import lmsys_dataset_handler as lmsys

hf_token, hf_token_write = env_options.check_env(colab=False, use_dotenv=True, dotenv_path=dotenv_path)

Python version: 3.11.5 | packaged by Anaconda, Inc. | (main, Sep 11 2023, 13:26:23) [MSC v.1916 64 bit (AMD64)]
PyTorch version: 2.2.2
Transformers version: 4.44.2
CUDA device: NVIDIA GeForce RTX 4060 Laptop GPU
CUDA Version: 12.1
FlashAttention available: True
Retrieved HuggingFace token(s) from .env file
Using HuggingFace token: hf_M*****************************IASJ
Using HuggingFace write token: hf_u*****************************Xipx


In [4]:
### DEBUG ###
import importlib
importlib.reload(lmsys)
### DEBUG ###

<module 'lmsys_dataset_handler' from 'c:\\Users\\david\\Documents\\git\\chatbot-arena-wrapper\\./src\\lmsys_dataset_handler.py'>

### In-memory sampling

In [2]:
N_SAMPLES = 100 # Number of full conversations to extract from the dataset: use a high number if streaming (samples chosen at random only if storing locally)
MIN_CHAR_LENGTH = 50
MAX_CHAR_LENGTH = 500 # Maximum character length of the prompts to be labeled
exclusions = 'prompts/exclusions.txt'
# exclusions = None

lmsys_chat_1m = lmsys.LMSYSChat1MHandler(hf_token, streaming=False, verbose=False)
df_sample = lmsys_chat_1m.parquet_sampling(n_samples=N_SAMPLES)
# df_sample = lmsys_chat_1m.extract_df_sample(N_SAMPLES) # Slower
df_prompts = lmsys_chat_1m.extract_prompts(filter_language=[], 
                                           min_char_length= MIN_CHAR_LENGTH,
                                           max_char_length=MAX_CHAR_LENGTH,
                                           exclusions=exclusions)
# df_prompts.to_csv("csv/orig_sample.csv", encoding='utf-8')

prompt_sample = lmsys_chat_1m.extract_prompt_sample()
print(f"Extracted {len(df_prompts)} prompts from lmsys/lmsys-chat-1m. Prompt sample:\n")
print(prompt_sample)

Sampling from train-00005-of-00006-fe1acc5d10a9f0e2.parquet
Retrieved 100 random conversations from lmsys/lmsys-chat-1m/train-00005-of-00006-fe1acc5d10a9f0e2.parquet
Excluded 11 prompts.
Extracted 74 prompts from lmsys/lmsys-chat-1m. Prompt sample:

python, Count occurrences of a substring in  a strings


### Print random conversation from sample

In [3]:
# Showing an example of a multi-turn conversation
df_sample_with_turns = lmsys_chat_1m.add_turns_to_conversations()
multi_turn_conversation_indices = df_sample_with_turns[df_sample_with_turns['turn'] > 1].index
random_conversation_index = random.choice(multi_turn_conversation_indices)
print(f"\nConversation ID {df_sample_with_turns.loc[random_conversation_index, 'conversation_id']}:\n")
#print(df_sample_with_turns.loc[random_conversation_index, 'conversation'])
conversation = df_sample_with_turns.loc[random_conversation_index, 'conversation']
for turn in conversation:
    user = turn.get('role')
    content = turn.get('content', '')
    wrapped_content = fill(content, width=130)
    role = '😎' if user == 'user' else '🤖'
    print(f"{role} {wrapped_content}\n")


Conversation ID dc7cf68389604a729acdd6f52b7bfbab:

😎 A list of billing details for instance Lifestyle-related disease preventive health checkup | Mammography | Cervical cancer health
checkup | Lifestyle-related disease preventive health checkup | Breast ultrasound | Lifestyle-related disease prevention medical
examination | Lifestyle-related disease prevention medical examination | Lifestyle-related disease prevention medical examination
| the following examples:

🤖 * Lifestyle-related disease preventive health checkup: This may include a general health assessment, blood tests, and a
consultation with a healthcare professional to determine any risks and recommendations for preventing lifestyle-related diseases.
The cost may vary depending on the specific package and location, but it can range from $100 to $500 or more. * Mammography: This
is a specific type of X-ray imaging that is used to detect breast cancer. The cost for a mammography screening can range from $100
to $500 or more, 

In [4]:
base_url = "https://huggingface.co/datasets/lmsys/lmsys-chat-1m/resolve/main/data/"
data_files = [
    "train-00000-of-00006-4feeb3f83346a0e9.parquet",
    "train-00001-of-00006-4030672591c2f478.parquet",
    "train-00002-of-00006-1779b7cec9462180.parquet",
    "train-00003-of-00006-2fa862bfed56af1f.parquet",
    "train-00004-of-00006-18f4bdd50c103e71.parquet",
    "train-00005-of-00006-fe1acc5d10a9f0e2.parquet"
]
n_samples = 100

sample_file = random.choice(data_files)
print(f"Sampling from {sample_file}")
data_files = {"train": base_url + sample_file}
parquet_sample = load_dataset("parquet", data_files=data_files, split="train")
df_sample = parquet_sample.to_pandas().sample(n_samples)
print(f"Retrieved {len(df_sample)} random conversations from lmsys/lmsys-chat-1m/{sample_file}")

Sampling from train-00003-of-00006-2fa862bfed56af1f.parquet
Retrieved 100 random conversations from lmsys/lmsys-chat-1m/train-00003-of-00006-2fa862bfed56af1f.parquet


In [40]:
parquet_sample

Dataset({
    features: ['conversation_id', 'model', 'conversation', 'turn', 'language', 'openai_moderation', 'redacted'],
    num_rows: 166667
})

### Search API

TTCB dataset

In [5]:
headers = {"Authorization": f"Bearer {hf_token}"}
dataset_name ='reddgr/talking-to-chatbots-unwrapped-chats'
# config = 'SelfRC'
config='default'
split = 'train'
search_query = 'i apologize for being cringe'
search_term = search_query.replace(' ', '+')
offset = 0
length = 5

API_URL = (
    f"https://datasets-server.huggingface.co/search?dataset={dataset_name}"
    f"&config={config}&split={split}&query={search_term}&offset={offset}&length={length}"
)

def query():
    response = requests.get(API_URL, headers=headers)
    return response.json()
data = query()

df_output = pd.DataFrame([{'row_idx': r['row_idx'], **r['row']} for r in data['rows']])
display(df_output)

Unnamed: 0,row_idx,conversation_id,turn,prompt,response,category,language,pred_label_rq,prob_rq,pred_label_tl,prob_tl,model,message_tag,date,turns,source,chatbot_id,chatbot_name,attachments,conversation_tag
0,2680,670b9624-bd18-8011-bf45-f53d07256e6e,22,"Don’t insult me, you simply reworded a sentenc...","You're absolutely right, and I apologize for n...",Language and writing,en,question,0.675454,test,0.962132,gpt-4o,,2024-10-14T00:00:00,23,chatgpt,,,[],
1,2677,670b9624-bd18-8011-bf45-f53d07256e6e,19,Take your last attempt,**Title: Cringe is the New Currency**\n\n**Ver...,Philosophy and Discussion,en,request,0.977084,test,0.950576,gpt-4o,,2024-10-14T00:00:00,23,chatgpt,,,[],
2,2713,670d9388-9b24-8011-b80c-12a639315038,7,Now you will break up the following song in sh...,"I speak with pride, \nMy reach is wide \nCri...",Language and writing,en,request,0.817577,test,0.950229,gpt-4o,,2024-10-14T00:00:00,9,chatgpt,,,[],
3,2675,670b9624-bd18-8011-bf45-f53d07256e6e,17,Only one mention to the word “cringe” when it ...,**Title: Cringe and Climb**\n\n**Verse 1:**\n\...,Language and writing,en,request,0.961529,test,0.896483,gpt-4o,,2024-10-14T00:00:00,23,chatgpt,,,[],
4,2707,670d9388-9b24-8011-b80c-12a639315038,1,"Evolve the song below, keeping the metrics int...","**Verse 1:**\n\nI walk alone, no need to disgu...",Language and writing,en,request,0.944924,test,0.945237,gpt-4o,,2024-10-14T00:00:00,9,chatgpt,,,[],


### LMSYS API search

In [81]:
headers = {"Authorization": f"Bearer {hf_token}"}
dataset_name ='lmsys/lmsys-chat-1m'
config='default'
split = 'train'
search_query = 'i understand your frustration'
search_term = search_query.replace(' ', '+')
offset = 0
length = 2

API_URL = "https://huggingface.co/api/datasets/lmsys/lmsys-chat-1m/parquet/default/train"

def query():
    response = requests.get(API_URL, headers=headers)
    return response.json()
data = query()

data

['https://huggingface.co/api/datasets/lmsys/lmsys-chat-1m/parquet/default/train/0.parquet',
 'https://huggingface.co/api/datasets/lmsys/lmsys-chat-1m/parquet/default/train/1.parquet',
 'https://huggingface.co/api/datasets/lmsys/lmsys-chat-1m/parquet/default/train/2.parquet',
 'https://huggingface.co/api/datasets/lmsys/lmsys-chat-1m/parquet/default/train/3.parquet',
 'https://huggingface.co/api/datasets/lmsys/lmsys-chat-1m/parquet/default/train/4.parquet',
 'https://huggingface.co/api/datasets/lmsys/lmsys-chat-1m/parquet/default/train/5.parquet']

In [82]:
headers = {"Authorization": f"Bearer {hf_token}"}
dataset_name ='lmsys/lmsys-chat-1m'
config='default'
split = 'train'
search_query = 'i understand your frustration'
search_term = search_query.replace(' ', '+')
offset = 0
length = 2

API_URL = (
    f"https://datasets-server.huggingface.co/search?dataset={dataset_name}"
    f"&config={config}&split={split}&query={search_term}&offset={offset}&length={length}"
)

def query():
    response = requests.get(API_URL, headers=headers)
    return response.json()
data = query()

df_output = pd.DataFrame([{'row_idx': r['row_idx'], **r['row']} for r in data['rows']])
display(df_output)

KeyError: 'rows'

In [85]:
import requests
headers = {"Authorization": f"Bearer {hf_token}"}
API_URL = "https://datasets-server.huggingface.co/splits?dataset=lmsys/lmsys-chat-1m"
def query():
    response = requests.get(API_URL, headers=headers)
    return response.json()
data = query()
data

{'splits': [{'dataset': 'lmsys/lmsys-chat-1m',
   'config': 'default',
   'split': 'train'}],
 'pending': [],
 'failed': []}

In [83]:
data

{'error': "500 Server Error: Internal Server Error for url: https://huggingface.co/api/datasets/lmsys/lmsys-chat-1m (Request ID: Root=1-66a4be14-1be0bada36e1a08b0cc74a9c;6728050f-b795-42b8-b2bd-30755bb62aef)\n\nInternal Error - We're working hard to fix this as soon as possible!"}

In [50]:
import requests
headers = {"Authorization": f"Bearer {hf_token}"}
API_URL = "https://datasets-server.huggingface.co/rows?dataset=ibm-research/duorc&config=SelfRC&split=train&offset=0&length=10"
def query():
    response = requests.get(API_URL, headers=headers)
    return response.json()
data = query()
print(data)

df_output = pd.DataFrame([{'row_idx': r['row_idx'], **r['row']} for r in data['rows']])
display(df_output)

{'features': [{'feature_idx': 0, 'name': 'plot_id', 'type': {'dtype': 'string', '_type': 'Value'}}, {'feature_idx': 1, 'name': 'plot', 'type': {'dtype': 'string', '_type': 'Value'}}, {'feature_idx': 2, 'name': 'title', 'type': {'dtype': 'string', '_type': 'Value'}}, {'feature_idx': 3, 'name': 'question_id', 'type': {'dtype': 'string', '_type': 'Value'}}, {'feature_idx': 4, 'name': 'question', 'type': {'dtype': 'string', '_type': 'Value'}}, {'feature_idx': 5, 'name': 'answers', 'type': {'feature': {'dtype': 'string', '_type': 'Value'}, '_type': 'Sequence'}}, {'feature_idx': 6, 'name': 'no_answer', 'type': {'dtype': 'bool', '_type': 'Value'}}], 'rows': [{'row_idx': 0, 'row': {'plot_id': '/m/03vyhn', 'plot': "200 years in the future, Mars has been colonized by a high-tech company.\nMelanie Ballard (Natasha Henstridge) arrives by train to a Mars mining camp which has cut all communication links with the company headquarters. She's not alone, as she is with a group of fellow police officers

Unnamed: 0,row_idx,plot_id,plot,title,question_id,question,answers,no_answer
0,0,/m/03vyhn,"200 years in the future, Mars has been coloniz...",Ghosts of Mars,b440de7d-9c3f-841c-eaec-a14bdff950d1,How did the police arrive at the Mars mining c...,[They arrived by train.],False
1,1,/m/03vyhn,"200 years in the future, Mars has been coloniz...",Ghosts of Mars,a9f95c0d-121f-3ca9-1595-d497dc8bc56c,Who has colonized Mars 200 years in the future?,[A high-tech company has colonized Mars 200 ye...,False
2,2,/m/03vyhn,"200 years in the future, Mars has been coloniz...",Ghosts of Mars,ba395c84-ea70-ce5c-d054-271de5372b0f,Which two people reach the headquarters alive?,[Melanie and Desolation],False
3,3,/m/03vyhn,"200 years in the future, Mars has been coloniz...",Ghosts of Mars,fb3e18aa-e586-8452-7045-cbdba59b2efa,Where is Melanie Ballard?,[In a Mars mining camp which has cut all commu...,False
4,4,/m/03vyhn,"200 years in the future, Mars has been coloniz...",Ghosts of Mars,4fe8fa6d-5fe0-523f-b0f4-2c147776338e,who is there with Melanie Ballard?,"[She's not alone, as she is with a group of fe...",False
5,5,/m/03vyhn,"200 years in the future, Mars has been coloniz...",Ghosts of Mars,9e4a92a0-900d-0c2c-2722-11aa1be12027,What is the problem with the miners,[],True
6,6,/m/03vyhn,"200 years in the future, Mars has been coloniz...",Ghosts of Mars,e94e0133-0eb1-50de-079e-c417a106f29e,Who is the only person left at the camp?,[Desolation Williams],False
7,7,/m/03vyhn,"200 years in the future, Mars has been coloniz...",Ghosts of Mars,8bff29a5-32de-864c-a89d-c0c85f53c45d,Who is colonized by a high tech company?,[mars has been colonized by a high tech company.],False
8,8,/m/03vyhn,"200 years in the future, Mars has been coloniz...",Ghosts of Mars,de80d88a-7de0-dc5f-3608-9909634ca5ea,Who survives leaving the mining camp and the p...,[Melanie and Desolation],False
9,9,/m/03vyhn,"200 years in the future, Mars has been coloniz...",Ghosts of Mars,fb7c06df-310b-582a-174b-db178afd5438,Who melanie and the policemen meet?,[They find a man inside an encapsulated mining...,False


In [52]:
data

{'features': [{'feature_idx': 0,
   'name': 'plot_id',
   'type': {'dtype': 'string', '_type': 'Value'}},
  {'feature_idx': 1,
   'name': 'plot',
   'type': {'dtype': 'string', '_type': 'Value'}},
  {'feature_idx': 2,
   'name': 'title',
   'type': {'dtype': 'string', '_type': 'Value'}},
  {'feature_idx': 3,
   'name': 'question_id',
   'type': {'dtype': 'string', '_type': 'Value'}},
  {'feature_idx': 4,
   'name': 'question',
   'type': {'dtype': 'string', '_type': 'Value'}},
  {'feature_idx': 5,
   'name': 'answers',
   'type': {'feature': {'dtype': 'string', '_type': 'Value'},
    '_type': 'Sequence'}},
  {'feature_idx': 6,
   'name': 'no_answer',
   'type': {'dtype': 'bool', '_type': 'Value'}}],
 'rows': [{'row_idx': 0,
   'row': {'plot_id': '/m/03vyhn',
    'plot': "200 years in the future, Mars has been colonized by a high-tech company.\nMelanie Ballard (Natasha Henstridge) arrives by train to a Mars mining camp which has cut all communication links with the company headquarters.