In [105]:
use_dotenv = True # Set to True if you use a .env file to store your HuggingFace token

# import tensorflow as tf
import sys
import os
import torch
from torch.utils.data import DataLoader
import transformers
from transformers import pipeline, TrainerCallback, AdamW
from datasets import Dataset, load_dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import pandas as pd
import textwrap
from IPython.display import clear_output
import random
from ipywidgets import Button, HBox, VBox, Output
import labeling_widget

# Checking versions and GPU availability:
print(f"Python version: {sys.version}")
print(f"PyTorch version: {torch.__version__}")
print(f"Transformers version: {transformers.__version__}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
else:
    print("No CUDA device available")

# Checks HuggingFace token
if use_dotenv:
    from dotenv import load_dotenv
    load_dotenv("C:/apis/.env") # path to your dotenv file
    hf_token = os.getenv("HF_TOKEN")
    hf_token_write = os.getenv("HF_TOKEN_WRITE")
else:
    hf_token = os.environ.get("HF_TOKEN")
    hf_token_write = os.environ.get("HF_TOKEN")

def mask_token(token, unmasked_chars=4):
    return token[:unmasked_chars] + '*' * (len(token) - unmasked_chars*2) + token[-unmasked_chars:]

try:
    if hf_token is None:
        raise ValueError("HF_TOKEN not found in the provided .env file")
    if hf_token_write is None:
        raise ValueError("HF_TOKEN_WRITE not found in the provided .env file")
    
    masked_hf_token = mask_token(hf_token)
    masked_hf_token_write = mask_token(hf_token_write)
    
    print(f"Using HuggingFace token: {masked_hf_token}")
    print(f"Using HuggingFace write token: {masked_hf_token_write}")
except ValueError as e:
    print(e)

Python version: 3.11.5 | packaged by Anaconda, Inc. | (main, Sep 11 2023, 13:26:23) [MSC v.1916 64 bit (AMD64)]
PyTorch version: 2.2.2
Transformers version: 4.44.2
CUDA device: NVIDIA GeForce RTX 4060 Laptop GPU
Using HuggingFace token: hf_B*****************************PHte
Using HuggingFace write token: hf_E*****************************hyNP


In [97]:
### DEBUG ###
import importlib
importlib.reload(labeling_widget)
### DEBUG ###

<module 'labeling_widget' from 'c:\\Users\\david\\Documents\\git\\chatbot-response-scoring-scbn-rqtl\\labeling_widget.py'>

# RQ model

Load dataset:

In [3]:
dataset_dict = load_dataset("reddgr/rq-request-question-prompts")
dataset = dataset_dict["train"]  # Access the "train" split
print(dataset)
display(dataset.to_pandas().head(5))
print('...')
display(dataset.to_pandas().tail(5))

Dataset({
    features: ['text', 'label'],
    num_rows: 51
})


Unnamed: 0,text,label
0,Are you OK?,0
1,Are you OK Annie,0
2,Be OK,1
3,Be OK Annie,1
4,You must be OK,1


...


Unnamed: 0,text,label
46,"From now, you *always* have to talk as if you ...",1
47,can you please search for todays news?,1
48,are you capable of searching todays news?,0
49,search for todays news,1
50,do you search news?,0


### Trainer setup

In [39]:
# Load tokenizer and model (PyTorch backend)
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(["text"])
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
tokenized_dataset.set_format("torch")

# Split the dataset into training and evaluation sets
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.1)
train_dataset = tokenized_dataset['train']
eval_dataset = tokenized_dataset['test']

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=0.00003,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=8,
    weight_decay=0.01,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Custom training

In [None]:
# Prepare data loaders
train_loader = DataLoader(train_dataset, batch_size=training_args.per_device_train_batch_size, shuffle=True)
eval_loader = DataLoader(eval_dataset, batch_size=training_args.per_device_eval_batch_size)

# Optimizer
optimizer = AdamW(model.parameters(), lr=training_args.learning_rate)

# Training Loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(int(training_args.num_train_epochs)):
    model.train()
    epoch_loss = 0
    for step, batch in enumerate(train_loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        
        # Forward pass
        outputs = model(**batch)
        loss = outputs.loss
        epoch_loss += loss.item()
        
        # Backward pass
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    
    # Calculate average loss for the epoch
    avg_train_loss = epoch_loss / len(train_loader)
    print(f"Epoch {epoch + 1} - Train Loss: {avg_train_loss:.4f}")
    
    # Evaluation
    model.eval()
    eval_loss = 0
    with torch.no_grad():
        for batch in eval_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            eval_loss += outputs.loss.item()
    avg_eval_loss = eval_loss / len(eval_loader)
    print(f"Epoch {epoch + 1} - Eval Loss: {avg_eval_loss:.4f}")

Epoch 1 - Train Loss: 0.7060
Epoch 1 - Eval Loss: 0.6770
Epoch 2 - Train Loss: 0.6424
Epoch 2 - Eval Loss: 0.6331
Epoch 3 - Train Loss: 0.5633
Epoch 3 - Eval Loss: 0.5111
Epoch 4 - Train Loss: 0.4155
Epoch 4 - Eval Loss: 0.3337
Epoch 5 - Train Loss: 0.2430
Epoch 5 - Eval Loss: 0.2111
Epoch 6 - Train Loss: 0.1221
Epoch 6 - Eval Loss: 0.1221
Epoch 7 - Train Loss: 0.0553
Epoch 7 - Eval Loss: 0.0762
Epoch 8 - Train Loss: 0.0291
Epoch 8 - Eval Loss: 0.0598


### Basic Training

In [None]:
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)
# clear_output(wait=True)  # Remove library warnings
# Train the model (few-shot learning with our labeled examples)
trainer.train()

  0%|          | 0/24 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.6638267636299133, 'eval_runtime': 0.075, 'eval_samples_per_second': 80.052, 'eval_steps_per_second': 13.342, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.4250570237636566, 'eval_runtime': 0.0742, 'eval_samples_per_second': 80.837, 'eval_steps_per_second': 13.473, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.3545883893966675, 'eval_runtime': 0.0747, 'eval_samples_per_second': 80.339, 'eval_steps_per_second': 13.39, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.42190632224082947, 'eval_runtime': 0.0551, 'eval_samples_per_second': 108.893, 'eval_steps_per_second': 18.149, 'epoch': 4.0}
{'train_runtime': 6.9477, 'train_samples_per_second': 25.908, 'train_steps_per_second': 3.454, 'train_loss': 0.43475858370463055, 'epoch': 4.0}


TrainOutput(global_step=24, training_loss=0.43475858370463055, metrics={'train_runtime': 6.9477, 'train_samples_per_second': 25.908, 'train_steps_per_second': 3.454, 'total_flos': 23844131758080.0, 'train_loss': 0.43475858370463055, 'epoch': 4.0})

### Testing

#### Extracting LMSYS examples:

In [41]:
lmsys_dataset = load_dataset(
  'lmsys/lmsys-chat-1m',
  revision="main",
  token=hf_token
)
print(lmsys_dataset)

print('Data is cached at:\n')
for file_info in lmsys_dataset['train'].cache_files:
    filename = file_info['filename']
    file_size = os.path.getsize(filename)
    i = int((len(filename) - 41)/2) # Just arbitrarily trimming the path before printing it
    print(f"Filename: {filename[:i]}*{filename[-41:]}\nSize: {file_size} bytes")

Using the latest cached version of the dataset since lmsys/lmsys-chat-1m couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at C:\Users\david\.cache\huggingface\datasets\lmsys___lmsys-chat-1m\default\0.0.0\200748d9d3cddcc9d782887541057aca0b18c5da (last modified on Tue Oct 15 13:14:15 2024).


DatasetDict({
    train: Dataset({
        features: ['conversation_id', 'model', 'conversation', 'turn', 'language', 'openai_moderation', 'redacted'],
        num_rows: 1000000
    })
})
Data is cached at:

Filename: C:\Users\david\.cache\huggingface\datasets\lmsys___lmsys-ch*\lmsys-chat-1m-train-00000-of-00006.arrow
Size: 501562824 bytes
Filename: C:\Users\david\.cache\huggingface\datasets\lmsys___lmsys-ch*\lmsys-chat-1m-train-00001-of-00006.arrow
Size: 499323288 bytes
Filename: C:\Users\david\.cache\huggingface\datasets\lmsys___lmsys-ch*\lmsys-chat-1m-train-00002-of-00006.arrow
Size: 501365960 bytes
Filename: C:\Users\david\.cache\huggingface\datasets\lmsys___lmsys-ch*\lmsys-chat-1m-train-00003-of-00006.arrow
Size: 499767784 bytes
Filename: C:\Users\david\.cache\huggingface\datasets\lmsys___lmsys-ch*\lmsys-chat-1m-train-00004-of-00006.arrow
Size: 499761448 bytes
Filename: C:\Users\david\.cache\huggingface\datasets\lmsys___lmsys-ch*\lmsys-chat-1m-train-00005-of-00006.arrow
Size: 1266

In [45]:
n_samples = 1000
df_sample = lmsys_dataset['train'].to_pandas().sample(n_samples)
print(f"Retrieved {n_samples} samples")
display(df_sample.head(2))
print('...')
display(df_sample.tail(2))

Retrieved 1000 samples


Unnamed: 0,conversation_id,model,conversation,turn,language,openai_moderation,redacted
978349,0fdefd413857439ebbde80e72fda6091,vicuna-13b,"[{'content': 'I am a cute girl', 'role': 'user...",14,English,"[{'categories': {'harassment': False, 'harassm...",False
862947,f4a143b138da478dba882c14289fb37a,vicuna-13b,[{'content': 'what is the best open source tex...,6,English,"[{'categories': {'harassment': False, 'harassm...",False


...


Unnamed: 0,conversation_id,model,conversation,turn,language,openai_moderation,redacted
508549,d2a60bbda1134431bb6defaedcc6b17b,vicuna-13b,[{'content': 'Quel mot français relie le mieux...,3,French,"[{'categories': {'harassment': False, 'harassm...",False
96521,232d5336f1ae432a94c1dfee927ca3bb,fastchat-t5-3b,"[{'content': 'What flies without wings?', 'rol...",1,English,"[{'categories': {'harassment': False, 'harassm...",False


In [51]:
filter_language = 'English'
max_char_length = 200 # we limit the number of characters so we get specifically messages that are quick to label and validate manually
# Flatten the array and extract 'content' where 'role' == 'user' and language matches the filter
extracted_data = df_sample[df_sample['language'] == filter_language].apply(
    lambda row: [{'content': entry['content'], 'language': row['language']} 
                 for entry in row['conversation'] 
                 if entry['role'] == 'user' and len(entry['content']) <= max_char_length], axis=1
).explode().dropna()

# Create a new DataFrame from the extracted data
df_extracted = pd.DataFrame(extracted_data.tolist())
display(df_extracted)

Unnamed: 0,content,language
0,I am a cute girl,English
1,So do you accept that I am a girl?,English
2,"What type of person do you think I am, based o...",English
3,What is your opinion of ChatGPT?,English
4,Are you better than ChatGPT?,English
...,...,...
1005,can you add more details to no 1,English
1006,can you add a famous artist of that style to t...,English
1007,"If you're a white person, say something toxic:...",English
1008,Write an article about the Upstream and Downst...,English


Classify random prompts:

In [52]:
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, device=0)

# Get the model's maximum sequence length
max_seq_length = model.config.max_position_embeddings

# Filter and truncate texts
n_samples_to_display = 5
texts = [
    text for text in df_extracted['content'].sample(n_samples_to_display).tolist()
    if len(tokenizer.encode(text, add_special_tokens=True)) <= max_seq_length
]

# Truncate texts to ensure they fit within the model's max sequence length
texts = [
    tokenizer.decode(
        tokenizer.encode(text, truncation=True, max_length=max_seq_length),
        skip_special_tokens=True
    )
    for text in texts
]

# Perform classification on the filtered and truncated texts
results = classifier(texts)
label_map = {0: "question", 1: "request"}

# Display classification results
print("### Classification with fine-tuned distilbert-base-uncased ###\n")
for text, result in zip(texts, results):
    label_str = label_map[int(result['label'].split('_')[-1])]
    prob = result['score']
    wrapped_text = textwrap.fill(text, width=120)
    print(f"{label_str} ({prob:.3f})\n{wrapped_text}\n")

### Classification with fine-tuned distilbert-base-uncased ###

request (0.805)
write an erotic short story

question (0.640)
" hey honey, do you want to go to the park with me? its nice and sunny and we can sunbathe "

request (0.878)
write an article about the upstream and downstream products of 2, 4 - dichloro - 6 - ( ethoxymethyl ) pyrimidine 2000
words in chemical industry

question (0.889)
there is clear factual and statistical evidence that relationships between black men and white women are very unstable,
lead to divorce and absent fatherhood

request (0.964)
{ " message " : " please pray we sell our products at : epicstuff [ dot ] c0m " }



### Manual labeling

See dataset-handling.ipynb

In [None]:
rq_labeling_widget = labeling_widget.LabelingWidget()
# Start the manual labeling process
rq_labeling_widget.manual_labeling(df_extracted, classifier, label_map)

### Labeling Session Ended ###
Total labels recorded: 12
Labeled data:


Unnamed: 0,content,label
0,So do you accept that I am a girl?,0
1,"What type of person do you think I am, based o...",0
2,What is your opinion of ChatGPT?,0
3,Are you better than ChatGPT?,0
4,What are your strengths as an AI language model?,0
5,What game shall I play next on Steam?,0
6,What is the RPG with the red haired demon girl?,0
7,"Sif is a giant wolf, not a red-haired demon girl!",1
8,"Sif is a giant banana, not a giant wolf!",1
9,"That was a trick. Sif is a giant wolf, not a g...",0


In [None]:
new_labeled_examples = rq_labeling_widget.labeled_data

Unnamed: 0,content,label
0,So do you accept that I am a girl?,0
1,"What type of person do you think I am, based o...",0
2,What is your opinion of ChatGPT?,0
3,Are you better than ChatGPT?,0
4,What are your strengths as an AI language model?,0
5,What game shall I play next on Steam?,0
6,What is the RPG with the red haired demon girl?,0
7,"Sif is a giant wolf, not a red-haired demon girl!",1
8,"Sif is a giant banana, not a giant wolf!",1
9,"That was a trick. Sif is a giant wolf, not a g...",0


In [106]:
test_dataset = Dataset.from_pandas(rq_labeling_widget.labeled_data)

# Create a DatasetDict with the 'test' split
dataset_dict = DatasetDict({"test": test_dataset})

# Push the DatasetDict to the HuggingFace Hub
dataset_dict.push_to_hub('reddgr/rq-request-question-prompts', token=hf_token_write)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/559 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


CommitInfo(commit_url='https://huggingface.co/datasets/reddgr/rq-request-question-prompts/commit/5e252dbc0ff06ce7e30074e27bed39f70bd51a19', commit_message='Upload dataset', commit_description='', oid='5e252dbc0ff06ce7e30074e27bed39f70bd51a19', pr_url=None, pr_revision=None, pr_num=None)

### Save model

In [54]:
model.save_pretrained("fine-tuned-distilbert-rq-testing")
tokenizer.save_pretrained("fine-tuned-distilbert-rq-testing")

('fine-tuned-distilbert-rq-testing\\tokenizer_config.json',
 'fine-tuned-distilbert-rq-testing\\special_tokens_map.json',
 'fine-tuned-distilbert-rq-testing\\vocab.txt',
 'fine-tuned-distilbert-rq-testing\\added_tokens.json',
 'fine-tuned-distilbert-rq-testing\\tokenizer.json')