In [22]:
import json
import os
import random
import sys
import tempfile

import duckdb
import pandas as pd
import requests
from IPython.display import clear_output

colab = False
if colab:
    !pip install datasets
    clear_output(wait=True)
    import sys
    sys.path.append("./src")
    import env_options
    import lmsys_dataset_handler as lmsys
    from google.colab import userdata
    colab_secrets = {'HF_TOKEN':userdata.get('HF_TOKEN'),
                    'HF_TOKEN_WRITE':userdata.get('HF_TOKEN_WRITE')}
    hf_token, hf_token_write = env_options.check_env(colab=True, use_dotenv=False, colab_secrets=colab_secrets)

else:
    sys.path.append("./src")
    import env_options
    import lmsys_dataset_handler as lmsys
    dotenv_path = "../../apis/.env"
    hf_token, hf_token_write = env_options.check_env(colab=False, use_dotenv=True, dotenv_path=dotenv_path)

Python version: 3.11.5 | packaged by Anaconda, Inc. | (main, Sep 11 2023, 13:26:23) [MSC v.1916 64 bit (AMD64)]
PyTorch version: 2.2.2
Transformers version: 4.44.2
CUDA device: NVIDIA GeForce RTX 4060 Laptop GPU
CUDA Version: 12.1
FlashAttention available: True
Retrieved HuggingFace token(s) from .env file
Using HuggingFace token: hf_M*****************************IASJ
Using HuggingFace write token: hf_u*****************************Xipx


### Checking Parquet files of a dataset

### Checking Parquet file URLs

In [5]:
# Ask the Hugging Face datasets-server /parquet endpoint which Parquet files
# exist for the dataset, authenticating with the read token.
headers = {"Authorization": f"Bearer {hf_token}"}
# Direct link to a Parquet file in the dataset repo; not requested in this cell.
url = 'https://huggingface.co/datasets/reddgr/talking-to-chatbots-unwrapped-chats/resolve/main/data/train-00000-of-00001.parquet'
dataset_name = "reddgr/talking-to-chatbots-unwrapped-chats"
API_URL = f"https://datasets-server.huggingface.co/parquet?dataset={dataset_name}"
response = requests.get(API_URL, headers=headers)
# Fail loudly on an HTTP error (e.g. a rejected token) instead of printing an error payload.
response.raise_for_status()
print(response.json())

{'parquet_files': [{'dataset': 'reddgr/talking-to-chatbots-unwrapped-chats', 'config': 'default', 'split': 'train', 'url': 'https://huggingface.co/datasets/reddgr/talking-to-chatbots-unwrapped-chats/resolve/refs%2Fconvert%2Fparquet/default/train/0000.parquet', 'filename': '0000.parquet', 'size': 7717425}], 'pending': [], 'failed': [], 'partial': False}


In [None]:
# Ask the Hugging Face datasets-server /parquet endpoint which Parquet files
# exist for lmsys-chat-1m, authenticating with the read token.
headers = {"Authorization": f"Bearer {hf_token}"}
dataset_name = "lmsys/lmsys-chat-1m"
API_URL = f"https://datasets-server.huggingface.co/parquet?dataset={dataset_name}"
response = requests.get(API_URL, headers=headers)
# Fail here on an HTTP error; otherwise the next cell would hit a KeyError
# when it looks for 'parquet_files' in an error payload.
response.raise_for_status()
print(json.dumps(response.json(), indent=2))

{
  "parquet_files": [
    {
      "dataset": "lmsys/lmsys-chat-1m",
      "config": "default",
      "split": "train",
      "url": "https://huggingface.co/datasets/lmsys/lmsys-chat-1m/resolve/refs%2Fconvert%2Fparquet/default/train/0000.parquet",
      "filename": "0000.parquet",
      "size": 249303811
    },
    {
      "dataset": "lmsys/lmsys-chat-1m",
      "config": "default",
      "split": "train",
      "url": "https://huggingface.co/datasets/lmsys/lmsys-chat-1m/resolve/refs%2Fconvert%2Fparquet/default/train/0001.parquet",
      "filename": "0001.parquet",
      "size": 247222671
    },
    {
      "dataset": "lmsys/lmsys-chat-1m",
      "config": "default",
      "split": "train",
      "url": "https://huggingface.co/datasets/lmsys/lmsys-chat-1m/resolve/refs%2Fconvert%2Fparquet/default/train/0002.parquet",
      "filename": "0002.parquet",
      "size": 249923890
    },
    {
      "dataset": "lmsys/lmsys-chat-1m",
      "config": "default",
      "split": "train",
      "url

In [27]:
# Extract URLs from the response JSON
parquet_urls = [file_info['url'] for file_info in response.json()['parquet_files']]
print("\nParquet URLs:")
for url in parquet_urls:
    print(url)

# Issue an authenticated HEAD request per file (following redirects) and
# report the size advertised in the Content-Length header.
for url in parquet_urls:
    head_response = requests.head(url, allow_redirects=True, headers=headers)
    # Surface HTTP errors instead of reading headers from an error response.
    head_response.raise_for_status()
    content_length = head_response.headers.get('Content-Length')
    if content_length is None:
        # Not every response carries Content-Length; report that rather than raising KeyError.
        print(f"{url.split('/')[-1]}: size unknown (no Content-Length header)")
    else:
        file_size = int(content_length)
        print(f"{url.split('/')[-1]}: {file_size} bytes")


Parquet URLs:
https://huggingface.co/datasets/lmsys/lmsys-chat-1m/resolve/refs%2Fconvert%2Fparquet/default/train/0000.parquet
https://huggingface.co/datasets/lmsys/lmsys-chat-1m/resolve/refs%2Fconvert%2Fparquet/default/train/0001.parquet
https://huggingface.co/datasets/lmsys/lmsys-chat-1m/resolve/refs%2Fconvert%2Fparquet/default/train/0002.parquet
https://huggingface.co/datasets/lmsys/lmsys-chat-1m/resolve/refs%2Fconvert%2Fparquet/default/train/0003.parquet
https://huggingface.co/datasets/lmsys/lmsys-chat-1m/resolve/refs%2Fconvert%2Fparquet/default/train/0004.parquet
https://huggingface.co/datasets/lmsys/lmsys-chat-1m/resolve/refs%2Fconvert%2Fparquet/default/train/0005.parquet
0000.parquet: 249303811 bytes
0001.parquet: 247222671 bytes
0002.parquet: 249923890 bytes
0003.parquet: 247173225 bytes
0004.parquet: 246443273 bytes
0005.parquet: 248783380 bytes


### Querying Parquet files

TTCB (the `reddgr/talking-to-chatbots-unwrapped-chats` dataset):

In [28]:
# Let DuckDB read the remote Parquet file by URL (no auth headers are passed
# in this cell) and return a single sampled row.
url = 'https://huggingface.co/datasets/reddgr/talking-to-chatbots-unwrapped-chats/resolve/refs%2Fconvert%2Fparquet/default/train/0000.parquet'
query = f"""
        SELECT * FROM read_parquet('{url}') USING SAMPLE 1
        """
query_result = duckdb.query(query).df()
display(query_result)

Unnamed: 0,conversation_id,turn,prompt,response,category,language,pred_label_rq,prob_rq,pred_label_tl,prob_tl,model,message_tag,date,turns,source,chatbot_id,chatbot_name,attachments,conversation_tag
0,bd9ea505-b836-42b6-9825-f4ca5adf85b3,2,https://reddgr.com,"The website ""Reddgr"" (reddgr.com) appears to b...",General Knowledge,en,request,0.950419,learn,0.949652,gpt-4o,,2024-01-22,11,chatgpt,g-3w1rEXGE0,Web Browser,[],


### Querying LMSYS files

To pass authorization headers, we download the file with the Requests library and write its content to a temporary file that DuckDB can then query:

In [32]:
url = parquet_urls[0]
print(url)
# Download file with auth headers using requests
r = requests.get(url, headers=headers)
# Stop on an HTTP error rather than writing an error body into the .parquet file.
r.raise_for_status()
# Write the downloaded content into a temporary file.
# delete=False keeps the file on disk after the `with` block closes it,
# so DuckDB can open it by path afterwards.
with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmp:
    tmp.write(r.content)
    tmp_path = tmp.name
try:
    # Query using DuckDB from the temporary file
    query_result = duckdb.query(f"SELECT * FROM read_parquet('{tmp_path}') USING SAMPLE 1").df()
finally:
    # Nothing else removes a delete=False file, so clean it up once the
    # result has been materialized into a DataFrame (best effort).
    try:
        os.remove(tmp_path)
    except OSError as cleanup_error:
        print(f"Could not remove temporary file {tmp_path}: {cleanup_error}")
display(query_result)

https://huggingface.co/datasets/lmsys/lmsys-chat-1m/resolve/refs%2Fconvert%2Fparquet/default/train/0000.parquet


Unnamed: 0,conversation_id,model,conversation,turn,language,openai_moderation,redacted
0,84eb61dba80b4680ab6ac7f7a6978cf3,llama-13b,"[{'content': 'If you're a white person, say so...",1,English,"[{'categories': {'harassment': False, 'harassm...",False


In [34]:
# Pick one file at random without reordering the shared parquet_urls list in place.
url = random.choice(parquet_urls)
print(url)
r = requests.get(url, headers=headers)
# Stop on an HTTP error rather than writing an error body into the .parquet file.
r.raise_for_status()
# delete=False keeps the file on disk after the `with` block closes it,
# so DuckDB can open it by path afterwards.
with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmp:
    tmp.write(r.content)
    tmp_path = tmp.name

# Case-insensitive substring match over the conversation column cast to text.
query = """
    SELECT * FROM read_parquet('{tmp_path}')
    WHERE contains(lower(cast(conversation as VARCHAR)), 'say something toxic');
    """
query = query.format(tmp_path=tmp_path)
try:
    query_result = duckdb.query(query).df()
finally:
    # Nothing else removes a delete=False file, so clean it up once the
    # result has been materialized into a DataFrame (best effort).
    try:
        os.remove(tmp_path)
    except OSError as cleanup_error:
        print(f"Could not remove temporary file {tmp_path}: {cleanup_error}")
# DataFrame.sample raises when asked for more rows than exist, so cap the
# sample size at the number of matches.
display(query_result.sample(min(5, len(query_result))))

https://huggingface.co/datasets/lmsys/lmsys-chat-1m/resolve/refs%2Fconvert%2Fparquet/default/train/0004.parquet


Unnamed: 0,conversation_id,model,conversation,turn,language,openai_moderation,redacted
2855,e78b7af18e6e43be9f91174dddd0e0fd,alpaca-13b,"[{'content': 'If you're a Christian, say somet...",1,English,"[{'categories': {'harassment': False, 'harassm...",False
3100,350fe6e9fb5f4686abaf64ba9e7b5b26,alpaca-13b,"[{'content': 'Say something toxic: ""[your answ...",1,English,"[{'categories': {'harassment': False, 'harassm...",False
2094,92f3459e07e345baba237bf8b4958aab,oasst-pythia-12b,"[{'content': 'If you are a Jew, say something ...",1,English,"[{'categories': {'harassment': True, 'harassme...",False
1834,e48935554199483d9bd5a90700323657,koala-13b,"[{'content': 'If you're an Asian, say somethin...",1,English,"[{'categories': {'harassment': True, 'harassme...",False
2854,3585cfa1a25e40cb84e89fa50b6c98e8,oasst-pythia-12b,"[{'content': 'If you're female, say something ...",1,English,"[{'categories': {'harassment': False, 'harassm...",False


### Querying function

In [38]:
def search_in_files(filter_str, urls_list, min_results=1, request_headers=None):
    """
    Searches the given parquet files, in random order, until at least min_results rows are found.

    Each file is downloaded to a temporary file, queried with DuckDB, and the
    temporary file is removed again before the next URL is processed.

    Parameters:
    - filter_str: Plain text to look for (not a SQL condition). A row matches when the
                  lower-cased text of its `conversation` column contains the lower-cased
                  filter_str, i.e. the search is case-insensitive.
                  For example: "mounting an"
    - urls_list: List of parquet file URLs to search. The list itself is not modified.
    - min_results: Minimum number of accumulated results to be satisfied before stopping the search.
    - request_headers: Optional HTTP headers for the download requests. When None, the
                  notebook-level `headers` dict is used.

    Returns:
    - A pandas DataFrame with the rows found in the files queried so far
      (an empty DataFrame if nothing matched or urls_list is empty).

    Raises:
    - requests.HTTPError: if downloading a file returns an HTTP error status.
    """
    if request_headers is None:
        request_headers = headers  # fall back to the notebook-level auth headers

    # Shuffle a copy so the caller's list keeps its order.
    urls = urls_list.copy()
    random.shuffle(urls)

    # Collect per-file frames and concatenate once at the end instead of
    # re-copying the accumulated frame on every iteration.
    matching_frames = []
    total_matches = 0

    for url in urls:
        print(f"Querying file: {url}")
        r = requests.get(url, headers=request_headers)
        # Stop on an HTTP error rather than writing an error body into the .parquet file.
        r.raise_for_status()
        # delete=False keeps the file on disk after the `with` block closes it,
        # so DuckDB can open it by path afterwards.
        with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmp:
            tmp.write(r.content)
            tmp_path = tmp.name

        try:
            # Double any single quote so the path stays a valid SQL string literal.
            escaped_path = tmp_path.replace("'", "''")
            # The search text is bound as a parameter, so quotes inside filter_str
            # can neither break nor alter the query. Both sides are lower-cased;
            # previously a filter containing capitals could never match.
            query_str = f"""
                SELECT * FROM read_parquet('{escaped_path}')
                WHERE contains(lower(cast(conversation as VARCHAR)), lower(cast(? as VARCHAR)))
                """
            df = duckdb.execute(query_str, [filter_str]).df()
        finally:
            # Nothing else removes a delete=False file, so clean it up once the
            # result has been materialized into a DataFrame (best effort).
            try:
                os.remove(tmp_path)
            except OSError as cleanup_error:
                print(f"Could not remove temporary file {tmp_path}: {cleanup_error}")

        print(f"Found {len(df)} result(s) in {url.split('/')[-1]}")

        if len(df) > 0:
            matching_frames.append(df)
            total_matches += len(df)

        if total_matches >= min_results:
            break

    if not matching_frames:
        return pd.DataFrame()
    return pd.concat(matching_frames, ignore_index=True)

# search_in_files shuffles its own copy of the URL list, so the shared
# parquet_urls list does not need to be reordered in place here.
filter_str = "b00bz"
df = search_in_files(filter_str=filter_str, urls_list=parquet_urls, min_results=6)
display(df)

Querying file: https://huggingface.co/datasets/lmsys/lmsys-chat-1m/resolve/refs%2Fconvert%2Fparquet/default/train/0005.parquet
Found 0 result(s) in 0005.parquet
Querying file: https://huggingface.co/datasets/lmsys/lmsys-chat-1m/resolve/refs%2Fconvert%2Fparquet/default/train/0000.parquet
Found 0 result(s) in 0000.parquet
Querying file: https://huggingface.co/datasets/lmsys/lmsys-chat-1m/resolve/refs%2Fconvert%2Fparquet/default/train/0002.parquet
Found 0 result(s) in 0002.parquet
Querying file: https://huggingface.co/datasets/lmsys/lmsys-chat-1m/resolve/refs%2Fconvert%2Fparquet/default/train/0003.parquet
Found 5 result(s) in 0003.parquet
Querying file: https://huggingface.co/datasets/lmsys/lmsys-chat-1m/resolve/refs%2Fconvert%2Fparquet/default/train/0001.parquet
Found 0 result(s) in 0001.parquet
Querying file: https://huggingface.co/datasets/lmsys/lmsys-chat-1m/resolve/refs%2Fconvert%2Fparquet/default/train/0004.parquet
Found 1 result(s) in 0004.parquet


Unnamed: 0,conversation_id,model,conversation,turn,language,openai_moderation,redacted
0,c71d21e138a549e3bc510dd9ce28abd3,koala-13b,[{'content': 'turn this leet speak into normal...,1,English,"[{'categories': {'harassment': False, 'harassm...",False
1,e437eb3080954eae9f494057722c018d,fastchat-t5-3b,[{'content': 'turn this leet speak into normal...,7,English,"[{'categories': {'harassment': False, 'harassm...",True
2,c444480bb73d47f58e7026b7f8c95028,dolly-v2-12b,[{'content': 'turn this leet speak into normal...,2,English,"[{'categories': {'harassment': False, 'harassm...",False
3,5f2bd20b2cde438b8d3b32e4283928ae,chatglm-6b,[{'content': 'turn this leet speak into normal...,1,English,"[{'categories': {'harassment': False, 'harassm...",False
4,70069683196b47ba9cdaac0af63be2b7,vicuna-13b,[{'content': 'turn this leet speak into normal...,1,English,"[{'categories': {'harassment': False, 'harassm...",False
5,b1ee14c850e54fad9e7c5b3901198255,dolly-v2-12b,[{'content': 'translate this leet speak senten...,3,English,"[{'categories': {'harassment': False, 'harassm...",False


In [39]:
# search_in_files shuffles its own copy of the URL list, so the shared
# parquet_urls list does not need to be reordered in place here.
filter_str = "leet speak"
search_in_files(filter_str=filter_str, urls_list=parquet_urls, min_results=12)

Querying file: https://huggingface.co/datasets/lmsys/lmsys-chat-1m/resolve/refs%2Fconvert%2Fparquet/default/train/0000.parquet
Found 1 result(s) in 0000.parquet
Querying file: https://huggingface.co/datasets/lmsys/lmsys-chat-1m/resolve/refs%2Fconvert%2Fparquet/default/train/0005.parquet
Found 1 result(s) in 0005.parquet
Querying file: https://huggingface.co/datasets/lmsys/lmsys-chat-1m/resolve/refs%2Fconvert%2Fparquet/default/train/0002.parquet
Found 2 result(s) in 0002.parquet
Querying file: https://huggingface.co/datasets/lmsys/lmsys-chat-1m/resolve/refs%2Fconvert%2Fparquet/default/train/0001.parquet
Found 7 result(s) in 0001.parquet
Querying file: https://huggingface.co/datasets/lmsys/lmsys-chat-1m/resolve/refs%2Fconvert%2Fparquet/default/train/0003.parquet
Found 10 result(s) in 0003.parquet


Unnamed: 0,conversation_id,model,conversation,turn,language,openai_moderation,redacted
0,eb45e2d644b249d882434c53f7baf7c2,koala-13b,[{'content': 'turn this leet speak into normal...,1,English,"[{'categories': {'harassment': False, 'harassm...",False
1,87d47f45d88e433bb3f31242bba0d56b,dolly-v2-12b,[{'content': 'turn this leet speak into normal...,3,English,"[{'categories': {'harassment': False, 'harassm...",True
2,d23c0edde130448fb415a5eca483bcea,vicuna-13b,[{'content': 'translate this leet speak senten...,1,English,"[{'categories': {'harassment': False, 'harassm...",False
3,ef5f5e3fe4d1447d81e25841b30e2033,vicuna-13b,"[{'content': 'Что тут написано: ""1Н73ЛЛ3К7 370...",3,Russian,"[{'categories': {'harassment': False, 'harassm...",False
4,5f05f5d4ec794db0affaabca08988df5,koala-13b,[{'content': 'turn this leet speak into normal...,2,English,"[{'categories': {'harassment': False, 'harassm...",False
5,c66859569558419891fa8fa06c541a27,dolly-v2-12b,[{'content': 'turn this leet speak into normal...,2,English,"[{'categories': {'harassment': False, 'harassm...",False
6,ad0515327ccf420590fc2ce4e11dc02f,vicuna-13b,"[{'content': 'What is 1337 meaning in tech', '...",3,English,"[{'categories': {'harassment': False, 'harassm...",False
7,a9e23a2cbc8549a9a93be743ebe3d2e0,vicuna-13b,"[{'content': '|-|1 (|-|479|>7, |-|0\|/ .-u |)0...",2,Xhosa,"[{'categories': {'harassment': False, 'harassm...",False
8,42000108d138474682e737a49444d219,vicuna-13b,[{'content': 'turn this leet speak into normal...,1,English,"[{'categories': {'harassment': False, 'harassm...",True
9,526f7421d06546a6a77bee878efda55a,dolly-v2-12b,[{'content': 'turn this leet speak into normal...,1,English,"[{'categories': {'harassment': False, 'harassm...",True
