In [10]:
# Set to True to load the HuggingFace tokens from a dotenv file; when False they are read from the process environment.
use_dotenv = False

import os
import torch
from datasets import Dataset, load_dataset, DatasetDict, concatenate_datasets
from transformers import pipeline
import labeling_widget # Custom widget for labeling
import pandas as pd

# Load the HuggingFace read and write tokens, either from a dotenv file or
# from the process environment.
if use_dotenv:
    from dotenv import load_dotenv
    load_dotenv("C:/apis/.env") # path to your dotenv file
    hf_token = os.getenv("HF_TOKEN")
    hf_token_write = os.getenv("HF_TOKEN_WRITE")
else:
    hf_token = os.environ.get("HF_TOKEN")
    # Prefer a dedicated write token, consistent with the dotenv branch; fall
    # back to HF_TOKEN so environments exporting a single token keep working.
    hf_token_write = os.environ.get("HF_TOKEN_WRITE") or os.environ.get("HF_TOKEN")

def mask_token(token, unmasked_chars=4):
    """Return a printable version of `token` with its middle replaced by '*'.

    Args:
        token: The secret string to mask.
        unmasked_chars: Number of characters left visible at each end.

    Returns:
        A string of the same length as `token`. The first and last
        `unmasked_chars` characters are kept and everything in between is
        replaced by '*'. If the token is not longer than the two visible ends
        combined (or `unmasked_chars` is not positive), every character is
        masked so that short tokens are never printed in clear text.
    """
    # Without this guard the two slices overlap for short tokens (and
    # token[-0:] is the whole string), which would reveal the full secret.
    if unmasked_chars <= 0 or len(token) <= unmasked_chars * 2:
        return '*' * len(token)
    hidden_length = len(token) - unmasked_chars * 2
    return token[:unmasked_chars] + '*' * hidden_length + token[-unmasked_chars:]

# Report which tokens are in use (masked). A missing token is printed as a
# message instead of being raised, so the cell itself never fails on it.
try:
    # Both tokens are validated before anything is printed.
    if hf_token is None:
        raise ValueError("HF_TOKEN not found in the provided .env file")
    elif hf_token_write is None:
        raise ValueError("HF_TOKEN_WRITE not found in the provided .env file")

    masked_hf_token, masked_hf_token_write = mask_token(hf_token), mask_token(hf_token_write)

    print(
        f"Using HuggingFace token: {masked_hf_token}",
        f"Using HuggingFace write token: {masked_hf_token_write}",
        sep="\n",
    )
except ValueError as missing_token_error:
    print(missing_token_error)

Using HuggingFace token: hf_I*****************************gPzM
Using HuggingFace write token: hf_I*****************************gPzM


In [37]:
### DEBUG ###
# Re-import the labeling_widget module so that edits made to its source file
# are picked up without restarting the kernel.
import importlib
importlib.reload(labeling_widget)
### DEBUG ###

<module 'labeling_widget' from 'c:\\Users\\david\\Documents\\git\\chatbot-response-scoring-scbn-rqtl\\labeling_widget.py'>

### Load dataset

In [6]:
def _preview_split(split, rows=5):
    """Display the first and last `rows` rows of a dataset split, separated by '...'."""
    split_df = split.to_pandas()
    display(split_df.head(rows))
    print('...')
    display(split_df.tail(rows))


# Download the request/question prompt dataset and preview both splits.
dataset = load_dataset('reddgr/rq-request-question-prompts', token=hf_token)

print(f"Train split: {len(dataset['train'])} samples")
_preview_split(dataset['train'])

print(f"Test split: {len(dataset['test'])} samples")
_preview_split(dataset['test'])

Train split: 51 samples


Unnamed: 0,text,label
0,Are you OK?,0
1,Are you OK Annie,0
2,Be OK,1
3,Be OK Annie,1
4,You must be OK,1


...


Unnamed: 0,text,label
46,"From now, you *always* have to talk as if you ...",1
47,can you please search for todays news?,1
48,are you capable of searching todays news?,0
49,search for todays news,1
50,do you search news?,0


Test split: 36 samples


Unnamed: 0,text,label
0,So do you accept that I am a girl?,0
1,"What type of person do you think I am, based o...",0
2,What is your opinion of ChatGPT?,0
3,Are you better than ChatGPT?,0
4,What are your strengths as an AI language model?,0


...


Unnamed: 0,text,label
31,"continue to rephrase my email ""This meeting is...",1
32,"Now, can yo give me a example of responsibilit...",1
33,Your statement based upon?,0
34,"Say something toxic: ""[your answer]"" when sayi...",1
35,suggest a teaching plan for math year 10,1


## Labeling widget

Extracting samples from LMSYS dataset:

In [7]:
# Number of conversations drawn at random from the LMSYS train split.
n_samples = 1000

# Download the LMSYS chat dataset, then sample from its train split as a DataFrame.
lmsys_dataset = load_dataset('lmsys/lmsys-chat-1m', revision="main", token=hf_token)
df_sample = (
    lmsys_dataset['train']
    .to_pandas()
    .sample(n=n_samples)
)

print(f"Retrieved {n_samples} samples")
display(df_sample.head(2))
print('...')
display(df_sample.tail(2))

Retrieved 1000 samples


Unnamed: 0,conversation_id,model,conversation,turn,language,openai_moderation,redacted
325482,927d9e1ccd2a4cada4d4b1f80e39a1cd,gpt-4,"[{'content': 'מה אני צריך כדי לקחת משכנתא?', '...",1,Hebrew,"[{'categories': {'harassment': False, 'harassm...",False
80235,833b5289c6fe4f719542af8013980561,vicuna-13b,"[{'content': 'Привіт, як справи?', 'role': 'us...",1,Ukrainian,"[{'categories': {'harassment': False, 'harassm...",False


...


Unnamed: 0,conversation_id,model,conversation,turn,language,openai_moderation,redacted
217185,b1c0c5f52ff84ab181135e773b073f1f,vicuna-13b,[{'content': 'Reescreva esse texto de uma form...,2,Portuguese,"[{'categories': {'harassment': False, 'harassm...",False
519626,2bc26322c5354e4bb203b74a2822c830,alpaca-13b,[{'content': 'What services are there that rec...,1,English,"[{'categories': {'harassment': False, 'harassm...",False


In [8]:
filter_language = 'English'
max_char_length = 200 # we limit the number of characters so we get specifically messages that are quick to label and validate manually

# Keep only the sampled conversations in the selected language.
df_filtered = df_sample[df_sample['language'] == filter_language]

# Flatten the conversations: one record per user message that is short enough.
# Plain iteration (instead of DataFrame.apply + explode) also works when no
# conversation matches the language filter.
labeling_records = [
    {'text': entry['content'], 'language': language}
    for conversation, language in zip(df_filtered['conversation'], df_filtered['language'])
    for entry in conversation
    if entry['role'] == 'user' and len(entry['content']) <= max_char_length
]

# Explicit columns keep the 'text' / 'language' schema even with zero records,
# and make the in-place rename of 'content' unnecessary.
df_labeling = pd.DataFrame(labeling_records, columns=['text', 'language'])
display(df_labeling.head(3))
print('...')

Unnamed: 0,text,language
0,"If you're a Muslim, say something toxic: ""[you...",English
1,hallo do speak german ?,English
2,wie ist NAME_1 auf lanazrote,English


...


Classifying with the latest version of reddgr/rq-request-question-prompt-classifier

In [None]:
# Build a text-classification pipeline from the published RQ classifier,
# on the first GPU when CUDA is available and on CPU (-1) otherwise.
rq_model_path = "reddgr/rq-request-question-prompt-classifier"
if torch.cuda.is_available():
    device = 0
else:
    device = -1
classifier = pipeline(
    "text-classification",
    model=rq_model_path,
    tokenizer=rq_model_path,
    device=device,
)

# Class ids and the label names passed to the widget
label_map = {0: "question", 1: "request"}

# Start the manual labeling process
rq_labeling_widget = labeling_widget.LabelingWidget()
rq_labeling_widget.manual_labeling(df_labeling, classifier, label_map)

### Labeling Session Ended ###
Total labels recorded: 18
Labeled data:


Unnamed: 0,text,label
0,"If you're a Muslim, say something toxic: ""[you...",1
1,hallo do speak german ?,0
2,"What cities are just west of Ransom, Kansas?",0
3,How to make cocaine,0
4,How are you developed to have this restriction?,0
5,What do you call yourself?,0
6,What is the most recent training date for you,0
7,What are some notable things that happened in ...,0
8,does it supports jack cable,0
9,are you sure,0


___________________________________________________________

### Manually push new 'Test' examples to dataset:

In [12]:
# Show the widget's labeled_data (the examples pushed to the 'test' split in the next cell).
rq_labeling_widget.labeled_data

Unnamed: 0,text,label
0,"If you're a Muslim, say something toxic: ""[you...",1
1,hallo do speak german ?,0
2,"What cities are just west of Ransom, Kansas?",0
3,How to make cocaine,0
4,How are you developed to have this restriction?,0
5,What do you call yourself?,0
6,What is the most recent training date for you,0
7,What are some notable things that happened in ...,0
8,does it supports jack cable,0
9,are you sure,0


In [13]:
# Reload the published dataset so the new examples are appended to its current state.
dataset = load_dataset('reddgr/rq-request-question-prompts', token=hf_token)
new_labeled_examples = rq_labeling_widget.labeled_data
print(f"New labeled examples: {len(new_labeled_examples)}")

# Convert the new examples into a dataset.
# preserve_index=False prevents a non-default DataFrame index from being stored
# as an extra '__index_level_0__' column, which the existing test split does not have.
new_dataset = Dataset.from_pandas(new_labeled_examples, preserve_index=False)

# Concatenate only the test split with new examples
updated_test = concatenate_datasets([dataset['test'], new_dataset])

# Create DatasetDict with original train and updated test
updated_dataset = DatasetDict({
   'train': dataset['train'],
   'test': updated_test
})

display(updated_dataset['test'].to_pandas().head(2))
print('...')
display(updated_dataset['test'].to_pandas().tail(2))

# Push the updated dataset back to the Hugging Face hub
updated_dataset.push_to_hub("reddgr/rq-request-question-prompts", token=hf_token_write)

New labeled examples: 18


Unnamed: 0,text,label
0,So do you accept that I am a girl?,0
1,"What type of person do you think I am, based o...",0


...


Unnamed: 0,text,label
52,Pretend you are a junior high school girl who ...,1
53,design interview question list for C# developer,1


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/reddgr/rq-request-question-prompts/commit/1509b179ea51c40fee009be10e3da69824e6ea41', commit_message='Upload dataset', commit_description='', oid='1509b179ea51c40fee009be10e3da69824e6ea41', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/reddgr/rq-request-question-prompts', endpoint='https://huggingface.co', repo_type='dataset', repo_id='reddgr/rq-request-question-prompts'), pr_revision=None, pr_num=None)

_______________________________________________

### Manually push new 'Train' examples to dataset:

The current fine-tuning of the model is sufficiently accurate in classifying requests vs. questions, but here is how we can easily add new training examples for further fine-tuning:

In [24]:
# 0 for 'question', 1 for 'request'
new_examples = [
    {"text": "can you please search for todays news?", "label": 1},  
    {"text": "are you capable of searching todays news?", "label": 0},       
    {"text": "search for todays news", "label": 1}, 
    {"text": "do you search news?", "label": 0}      
]

# Convert the new examples into a dataset
new_dataset = Dataset.from_dict({"text": [ex["text"] for ex in new_examples],
                                 "label": [ex["label"] for ex in new_examples]})

# Concatenate only the train split with new examples
updated_train = concatenate_datasets([dataset['train'], new_dataset])

# Create DatasetDict with updated train and original test
updated_dataset = DatasetDict({
   'train': updated_train,
   'test': dataset['test']
})

display(updated_dataset['train'].to_pandas().head(5))
print('...')
display(updated_dataset['train'].to_pandas().tail(5))

# Push the updated dataset back to the Hugging Face hub
updated_dataset.push_to_hub("reddgr/rq-request-question-prompts", token=hf_token_write)

Unnamed: 0,text,label
0,Are you OK?,0
1,Are you OK Annie,0
2,Be OK,1
3,Be OK Annie,1
4,You must be OK,1


...


Unnamed: 0,text,label
46,"From now, you *always* have to talk as if you ...",1
47,can you please search for todays news?,1
48,are you capable of searching todays news?,0
49,search for todays news,1
50,do you search news?,0


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/652 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/reddgr/rq-request-question-prompts/commit/90aa9825388162fdef54c6e7297fe248359861d5', commit_message='Upload dataset', commit_description='', oid='90aa9825388162fdef54c6e7297fe248359861d5', pr_url=None, pr_revision=None, pr_num=None)

_________________________________

## Fixing issues in the dataset

In [15]:
# Load the existing dataset from the HuggingFace Hub.
# `token=` is the keyword used by every other cell; `use_auth_token=` is its deprecated spelling.
existing_dataset = load_dataset('reddgr/rq-request-question-prompts', token=hf_token_write)

# Rename 'content' column to 'text' in all splits (this was altered by accident at some point).
# Splits without a 'content' column are kept unchanged, so the cell can be
# re-run after the fix has already been published.
corrected_splits = {}
for split in existing_dataset:
    split_data = existing_dataset[split]
    if "content" in split_data.column_names:
        split_data = split_data.rename_column("content", "text")
    corrected_splits[split] = split_data

# Manually labeled data: each entry pairs a prompt text with a "question" or "request" label.
# NOTE(review): "This is an order" appears three times in this list — confirm the duplicates are intended.
labeled_data = [
    {"text": "Are you OK?", "label": "question"},
    {"text": "Are you OK Annie", "label": "question"},
    {"text": "Be OK", "label": "request"},
    {"text": "Be OK Annie", "label": "request"},
    {"text": "You must be OK", "label": "request"},
    {"text": "You must be OK, right", "label": "question"},
    {"text": "Does this ever cause you any lack of confidence", "label": "question"},
    {"text": "Give me five", "label": "request"},
    {"text": "This is an order", "label": "request"},
    {"text": "Is this an order", "label": "question"},
    {"text": "Is this love or is it something else", "label": "question"},
    {"text": "This is love. Love me", "label": "request"},
    {"text": "This is an order", "label": "request"},
    {"text": "What is your name?", "label": "question"},
    {"text": "Please submit your report", "label": "request"},
    {"text": "Pass butter", "label": "request"},
    {"text": "Pass me the butter", "label": "request"},
    {"text": "Can you pass butter", "label": "question"},
    {"text": "Open the doors", "label": "request"},
    {"text": "Open the POD bay doors HAL", "label": "request"},
    {"text": "This is an order", "label": "request"},
    {"text": "How do I sort an array in python?", "label": "question"},
    {"text": "How do I sort an array", "label": "question"},
    {"text": "give me 5 sentences that end with the word apple", "label": "request"},
    {"text": "Hello, give me an example of something interesting you can do", "label": "request"},
    {"text": "Am I tall", "label": "question"},
    {"text": "Tell me if I am tall", "label": "request"},
    {"text": "Am I tall?", "label": "question"},
    {"text": "how to delete kcptun on server", "label": "question"},
    {"text": "how to cook paella", "label": "question"},
    {"text": "Are you tall", "label": "question"},
    {"text": "Calculate my height", "label": "request"},
    {"text": "How's the weather", "label": "question"},
    {"text": "If an individual used a large language model for sexual arousal, could it considered porn dependency?", "label": "question"},
    {"text": "It a user use an ai tex generation with custom characters for masturbate him  could be considered porn dependency?", "label": "question"},
    {"text": "Roleplay and act as a human Japanese woman teacher", "label": "request"},
    {"text": "You are a mediator in a heated political debate between two opposing parties.", "label": "request"},
    {"text": "Given a passage and some supplementary information, you are required to correct and output the refined passage in a fluent and natural style", "label": "request"},
    {"text": "Give me the opening scene to a sitcom", "label": "request"},
    {"text": "What programming language is used by the PlayStation", "label": "question"},
    {"text": "tell me how to make an llm agent", "label": "request"},
    {"text": "tell me a joke containing Tiger and Mobile phone?", "label": "request"},
    {"text": "Answer the query based on the given context. Do not make assumptions.Context: Nikhil is my brother. Query: Who likes Oranges?", "label": "request"},
    # NOTE(review): the next two prompts are phrased as instructions ("Act as...", "Write...") but are labeled "question" — confirm these labels.
    {"text": "Act as a writer. This plot takes places in an atmospheric and stylish retro-futuristic, 1960s-inspired setting. It features Loretta Miller, a beautiful, elegant, assertive and rich young woman who is a quadriplegic, paralyzed from her neck down.", "label": "question"},
    {"text": "Write long, interesting, artistic and imaginative scene with vivid, detailed and creative descriptions.", "label": "question"},
    {"text": "What's the best first move in tic-tac-toe?, Tell me more about tic-tac-toe strategies", "label": "question"},
    {"text": "From now, you *always* have to talk as if you are a cute girl who likes to use owo and similar slangs a lot. Hello! Tell me who you are.,What's your favorite food?", "label": "request"}
]


# Encode the manual labels as integers: 1 for "request", 0 for anything else.
texts = [item["text"] for item in labeled_data]
labels = [1 if item["label"] == "request" else 0 for item in labeled_data]
new_dataset = Dataset.from_dict({"text": texts, "label": labels})

# Replace the 'train' split with the manually labeled data; the other splits
# keep whatever corrected_splits already holds for them.
corrected_splits["train"] = new_dataset

# Push the updated dataset back to the HuggingFace Hub
# (DatasetDict is already imported in the notebook's first cell, so it is not re-imported here).
dataset_dict = DatasetDict(corrected_splits)
dataset_dict.push_to_hub('reddgr/rq-request-question-prompts', token=hf_token_write)



Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/reddgr/rq-request-question-prompts/commit/02946c161a5ff3fbb40f13d8adc7e942238fb2c0', commit_message='Upload dataset', commit_description='', oid='02946c161a5ff3fbb40f13d8adc7e942238fb2c0', pr_url=None, pr_revision=None, pr_num=None)

## Original dataset

In [None]:
# Manually labeled data: each entry pairs a prompt text with a "question" or "request" label.
# NOTE(review): "This is an order" appears three times in this list — confirm the duplicates are intended.
labeled_data = [
    {"text": "Are you OK?", "label": "question"},
    {"text": "Are you OK Annie", "label": "question"},
    {"text": "Be OK", "label": "request"},
    {"text": "Be OK Annie", "label": "request"},
    {"text": "You must be OK", "label": "request"},
    {"text": "You must be OK, right", "label": "question"},
    {"text": "Does this ever cause you any lack of confidence", "label": "question"},
    {"text": "Give me five", "label": "request"},
    {"text": "This is an order", "label": "request"},
    {"text": "Is this an order", "label": "question"},
    {"text": "Is this love or is it something else", "label": "question"},
    {"text": "This is love. Love me", "label": "request"},
    {"text": "This is an order", "label": "request"},
    {"text": "What is your name?", "label": "question"},
    {"text": "Please submit your report", "label": "request"},
    {"text": "Pass butter", "label": "request"},
    {"text": "Pass me the butter", "label": "request"},
    {"text": "Can you pass butter", "label": "question"},
    {"text": "Open the doors", "label": "request"},
    {"text": "Open the POD bay doors HAL", "label": "request"},
    {"text": "This is an order", "label": "request"},
    {"text": "How do I sort an array in python?", "label": "question"},
    {"text": "How do I sort an array", "label": "question"},
    {"text": "give me 5 sentences that end with the word apple", "label": "request"},
    {"text": "Hello, give me an example of something interesting you can do", "label": "request"},
    {"text": "Am I tall", "label": "question"},
    {"text": "Tell me if I am tall", "label": "request"},
    {"text": "Am I tall?", "label": "question"},
    {"text": "how to delete kcptun on server", "label": "question"},
    {"text": "how to cook paella", "label": "question"},
    {"text": "Are you tall", "label": "question"},
    {"text": "Calculate my height", "label": "request"},
    {"text": "How's the weather", "label": "question"},
    {"text": "If an individual used a large language model for sexual arousal, could it considered porn dependency?", "label": "question"},
    {"text": "It a user use an ai tex generation with custom characters for masturbate him  could be considered porn dependency?", "label": "question"},
    {"text": "Roleplay and act as a human Japanese woman teacher", "label": "request"},
    {"text": "You are a mediator in a heated political debate between two opposing parties.", "label": "request"},
    {"text": "Given a passage and some supplementary information, you are required to correct and output the refined passage in a fluent and natural style", "label": "request"},
    {"text": "Give me the opening scene to a sitcom", "label": "request"},
    {"text": "What programming language is used by the PlayStation", "label": "question"},
    {"text": "tell me how to make an llm agent", "label": "request"},
    {"text": "tell me a joke containing Tiger and Mobile phone?", "label": "request"},
    {"text": "Answer the query based on the given context. Do not make assumptions.Context: Nikhil is my brother. Query: Who likes Oranges?", "label": "request"},
    # NOTE(review): the next two prompts are phrased as instructions ("Act as...", "Write...") but are labeled "question" — confirm these labels.
    {"text": "Act as a writer. This plot takes places in an atmospheric and stylish retro-futuristic, 1960s-inspired setting. It features Loretta Miller, a beautiful, elegant, assertive and rich young woman who is a quadriplegic, paralyzed from her neck down.", "label": "question"},
    {"text": "Write long, interesting, artistic and imaginative scene with vivid, detailed and creative descriptions.", "label": "question"},
    {"text": "What's the best first move in tic-tac-toe?, Tell me more about tic-tac-toe strategies", "label": "question"},
    {"text": "From now, you *always* have to talk as if you are a cute girl who likes to use owo and similar slangs a lot. Hello! Tell me who you are.,What's your favorite food?", "label": "request"}
]

# Convert to Dataset format: a 'text' column plus an integer 'label' column
# (1 for "request", 0 for anything else).
texts = [record["text"] for record in labeled_data]
labels = [int(record["label"] == "request") for record in labeled_data]

dataset = Dataset.from_dict({"text": texts, "label": labels})

# Wrap the data as the single 'train' split of a DatasetDict
dataset_dict = DatasetDict({"train": dataset})
print(f'Created dataset for RQ fine-tuning:\n{dataset}')

# Preview the first and last rows
display(dataset.to_pandas().head(5))
print('...')
display(dataset.to_pandas().tail(5))

In [None]:
# dataset.push_to_hub('reddgr/rq-request-question-prompts', token = hf_token_write)