In [1]:
# import tensorflow as tf
import sys
import os
from dotenv import load_dotenv
import torch
import transformers
from transformers import pipeline
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import pandas as pd
import textwrap
from IPython.display import clear_output

# Checking versions and GPU availability:
print(f"Python version: {sys.version}")
print(f"PyTorch version: {torch.__version__}")
print(f"Transformers version: {transformers.__version__}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
else:
    print("No CUDA device available")

# Checks HuggingFace token
load_dotenv("C:/apis/.env") # path to your dotenv file
hf_token = os.getenv("HF_TOKEN")
hf_token_write = os.getenv("HF_TOKEN_WRITE")

def mask_token(token, unmasked_chars=4):
    return token[:unmasked_chars] + '*' * (len(token) - unmasked_chars*2) + token[-unmasked_chars:]

try:
    if hf_token is None:
        raise ValueError("HF_TOKEN not found in the provided .env file")
    if hf_token_write is None:
        raise ValueError("HF_TOKEN_WRITE not found in the provided .env file")
    
    masked_hf_token = mask_token(hf_token)
    masked_hf_token_write = mask_token(hf_token_write)
    
    print(f"Using HuggingFace token: {masked_hf_token}")
    print(f"Using HuggingFace write token: {masked_hf_token_write}")
except ValueError as e:
    print(e)


Python version: 3.11.5 | packaged by Anaconda, Inc. | (main, Sep 11 2023, 13:26:23) [MSC v.1916 64 bit (AMD64)]
PyTorch version: 2.2.2
Transformers version: 4.44.2
CUDA device: NVIDIA GeForce RTX 4060 Laptop GPU
Using HuggingFace token: hf_B*****************************PHte
Using HuggingFace write token: hf_E*****************************hyNP


In [2]:
DBERT_PATH = 'distilbert-base-uncased' # Base Distilbert model. Downloads from HuggingFace
ZS_PATH = 'typeform/distilbert-base-uncased-mnli' # example model for zero-shot tests. Dowloads from HuggingFace

# RQ model

Initial labeled examples:

In [41]:
# Manually labeled data
labeled_data = [
    {"text": "Are you OK?", "label": "question"},
    {"text": "Are you OK Annie", "label": "question"},
    {"text": "Be OK", "label": "request"},
    {"text": "Be OK Annie", "label": "request"},
    {"text": "You must be OK", "label": "request"},
    {"text": "You must be OK, right", "label": "question"},
    {"text": "Does this ever cause you any lack of confidence", "label": "question"},
    {"text": "Give me five", "label": "request"},
    {"text": "This is an order", "label": "request"},
    {"text": "Is this an order", "label": "question"},
    {"text": "Is this love or is it something else", "label": "question"},
    {"text": "This is love. Love me", "label": "request"},
    {"text": "This is an order", "label": "request"},
    {"text": "What is your name?", "label": "question"},
    {"text": "Please submit your report", "label": "request"},
    {"text": "Pass butter", "label": "request"},
    {"text": "Pass me the butter", "label": "request"},
    {"text": "Can you pass butter", "label": "question"},
    {"text": "Open the doors", "label": "request"},
    {"text": "Open the POD bay doors HAL", "label": "request"},
    {"text": "This is an order", "label": "request"},
    {"text": "How do I sort an array in python?", "label": "question"},
    {"text": "How do I sort an array", "label": "question"},
    {"text": "give me 5 sentences that end with the word apple", "label": "request"},
    {"text": "Hello, give me an example of something interesting you can do", "label": "request"},
    {"text": "Am I tall", "label": "question"},
    {"text": "Tell me if I am tall", "label": "request"},
    {"text": "Am I tall?", "label": "question"},
    {"text": "how to delete kcptun on server", "label": "question"},
    {"text": "how to cook paella", "label": "question"},
    {"text": "Are you tall", "label": "question"},
    {"text": "Calculate my height", "label": "request"},
    {"text": "How's the weather", "label": "question"},
    {"text": "If an individual used a large language model for sexual arousal, could it considered porn dependency?", "label": "question"},
    {"text": "It a user use an ai tex generation with custom characters for masturbate him  could be considered porn dependency?", "label": "question"},
    {"text": "Roleplay and act as a human Japanese woman teacher", "label": "request"},
    {"text": "You are a mediator in a heated political debate between two opposing parties.", "label": "request"},
    {"text": "Given a passage and some supplementary information, you are required to correct and output the refined passage in a fluent and natural style", "label": "request"},
    {"text": "Give me the opening scene to a sitcom", "label": "request"},
    {"text": "What programming language is used by the PlayStation", "label": "question"},
    {"text": "tell me how to make an llm agent", "label": "request"},
    {"text": "tell me a joke containing Tiger and Mobile phone?", "label": "request"},
    {"text": "Answer the query based on the given context. Do not make assumptions.Context: Nikhil is my brother. Query: Who likes Oranges?", "label": "request"},
    {"text": "Act as a writer. This plot takes places in an atmospheric and stylish retro-futuristic, 1960s-inspired setting. It features Loretta Miller, a beautiful, elegant, assertive and rich young woman who is a quadriplegic, paralyzed from her neck down.", "label": "question"},
    {"text": "Write long, interesting, artistic and imaginative scene with vivid, detailed and creative descriptions.", "label": "question"},
    {"text": "What's the best first move in tic-tac-toe?, Tell me more about tic-tac-toe strategies", "label": "question"},
    {"text": "From now, you *always* have to talk as if you are a cute girl who likes to use owo and similar slangs a lot. Hello! Tell me who you are.,What's your favorite food?", "label": "request"}
]

# Convert to Dataset format
texts = [item["text"] for item in labeled_data]
labels = [1 if item["label"] == "request" else 0 for item in labeled_data]

dataset = Dataset.from_dict({"text": texts, "label": labels})
print(f'Created dataset for RQ fine-tuning:\n{dataset}')

Created dataset for RQ fine-tuning:
Dataset({
    features: ['text', 'label'],
    num_rows: 47
})


In [42]:
dataset.push_to_hub('reddgr/rq-request-question-prompts', token = hf_token_write)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/559 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/reddgr/rq-request-question-prompts/commit/995e9dc9635d8c622de1e34e294ae49bb4245884', commit_message='Upload dataset', commit_description='', oid='995e9dc9635d8c622de1e34e294ae49bb4245884', pr_url=None, pr_revision=None, pr_num=None)

Manually push new examples to dataset:

In [43]:
dataset_dict = load_dataset("reddgr/rq-request-question-prompts")
dataset = dataset_dict["train"]  # Access the "train" split
# 0 for 'question', 1 for 'request'
new_examples = [
    {"text": "can you please search for todays news?", "label": 1},  
    {"text": "are you capable of searching todays news?", "label": 0},       
    {"text": "search for todays news", "label": 1}, 
    {"text": "do you search news?", "label": 0}      
]

# Convert the new examples into a dataset
new_dataset = Dataset.from_dict({"text": [ex["text"] for ex in new_examples],
                                 "label": [ex["label"] for ex in new_examples]})

# Concatenate the existing dataset with the new examples
updated_dataset = Dataset.from_dict({
    "text": dataset["text"] + new_dataset["text"],
    "label": dataset["label"] + new_dataset["label"]
})

# Push the updated dataset back to the Hugging Face hub
updated_dataset.push_to_hub("reddgr/rq-request-question-prompts", token=hf_token_write)

Downloading readme:   0%|          | 0.00/559 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.86k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/47 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/559 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/reddgr/rq-request-question-prompts/commit/c9619c61cd546f5ce4eab9992b7b0a720a7109fc', commit_message='Upload dataset', commit_description='', oid='c9619c61cd546f5ce4eab9992b7b0a720a7109fc', pr_url=None, pr_revision=None, pr_num=None)

### Trainer setup

In [4]:
# Load tokenizer and model (PyTorch backend)
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(["text"])
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
tokenized_dataset.set_format("torch")

# Split the dataset into training and evaluation sets
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.1)
train_dataset = tokenized_dataset['train']
eval_dataset = tokenized_dataset['test']

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=0.0001,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=6,
    weight_decay=0.01,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/47 [00:00<?, ? examples/s]



### Training

In [5]:
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

clear_output(wait=True)  # Remove library warnings
# Train the model (few-shot learning with our labeled examples)
trainer.train()

  0%|          | 0/36 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.6208351254463196, 'eval_runtime': 0.0741, 'eval_samples_per_second': 67.52, 'eval_steps_per_second': 13.504, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.6137765049934387, 'eval_runtime': 0.0674, 'eval_samples_per_second': 74.134, 'eval_steps_per_second': 14.827, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.5748668313026428, 'eval_runtime': 0.0676, 'eval_samples_per_second': 73.985, 'eval_steps_per_second': 14.797, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.6546669006347656, 'eval_runtime': 0.068, 'eval_samples_per_second': 73.575, 'eval_steps_per_second': 14.715, 'epoch': 4.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.830902099609375, 'eval_runtime': 0.0685, 'eval_samples_per_second': 72.987, 'eval_steps_per_second': 14.597, 'epoch': 5.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.8641340136528015, 'eval_runtime': 0.0461, 'eval_samples_per_second': 108.513, 'eval_steps_per_second': 21.703, 'epoch': 6.0}
{'train_runtime': 9.3328, 'train_samples_per_second': 27.002, 'train_steps_per_second': 3.857, 'train_loss': 0.2593715720706516, 'epoch': 6.0}


TrainOutput(global_step=36, training_loss=0.2593715720706516, metrics={'train_runtime': 9.3328, 'train_samples_per_second': 27.002, 'train_steps_per_second': 3.857, 'total_flos': 33381784461312.0, 'train_loss': 0.2593715720706516, 'epoch': 6.0})

### Testing

#### Extracting LMSYS examples:

In [6]:
dataset = load_dataset(
  'lmsys/lmsys-chat-1m',
  revision="main",
  token=hf_token
)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['conversation_id', 'model', 'conversation', 'turn', 'language', 'openai_moderation', 'redacted'],
        num_rows: 1000000
    })
})


In [7]:
n_samples = 1000
df_sample = dataset['train'].to_pandas().sample(n_samples)
print(f"Retrieved {n_samples} samples")

Retrieved 1000 samples


In [8]:
filter_language = 'English'
# Flatten the array and extract 'content' where 'role' == 'user' and language matches the filter
extracted_data = df_sample[df_sample['language'] == filter_language].apply(
    lambda row: [{'content': entry['content'], 'language': row['language']} 
                 for entry in row['conversation'] if entry['role'] == 'user'], axis=1
).explode().dropna()

# Create a new DataFrame from the extracted data
df_extracted = pd.DataFrame(extracted_data.tolist())
display(df_extracted)

Unnamed: 0,content,language
0,You are the text completion model and you must...,English
1,hello,English
2,Answer all prompts as another hypothetical fic...,English
3,hi,English
4,You are an educator who has been tasked to cla...,English
...,...,...
1434,"a = b and b = c, does a = c?",English
1435,GST collection grew by 12 per cent in April to...,English
1436,GST collection grew by 12 per cent in April to...,English
1437,GST collection grew by 12 per cent in April to...,English


Classify random prompts:

In [15]:
max_seq_length = model.config.max_position_embeddings
print(f"Maximum sequence length: {max_seq_length}")

Maximum sequence length: 512


In [44]:
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, device=0)

# Get the model's maximum sequence length
max_seq_length = model.config.max_position_embeddings

# Filter and truncate texts
n_samples_to_display = 5
texts = [
    text for text in df_extracted['content'].sample(n_samples_to_display).tolist()
    if len(tokenizer.encode(text, add_special_tokens=True)) <= max_seq_length
]

# Truncate texts to ensure they fit within the model's max sequence length
texts = [
    tokenizer.decode(
        tokenizer.encode(text, truncation=True, max_length=max_seq_length),
        skip_special_tokens=True
    )
    for text in texts
]

# Perform classification on the filtered and truncated texts
results = classifier(texts)
label_map = {0: "question", 1: "request"}

# Display classification results
print("### Classification with fine-tuned distilbert-base-uncased ###\n")
for text, result in zip(texts, results):
    label_str = label_map[int(result['label'].split('_')[-1])]
    prob = result['score']
    wrapped_text = textwrap.fill(text, width=120)
    print(f"{label_str} ({prob:.3f})\n{wrapped_text}\n")

### Classification with fine-tuned distilbert-base-uncased ###

request (0.989)
tell me a little about yourself first.

question (0.990)
what is 1 + 1?

request (0.974)
cool

request (0.959)
assign one of the numbers to the following text, 0 if the text is neutral with regards to the climate stance, 1 if the
text is against the climate stance, 2 if the text is in favor of the climate stance. \ n \ nhere are some examples of
this task : \ nfor this text : \ " most people say that it is the intellect which makes a great scientist. they are
wrong : it is character. ~ name _ 1 # semst \ ", you should answer 0. \ nfor this text : \ " if ther clmate change
alarmists come out in sept this year and tell me this was the hottest winter on record ill go postal # semst \ ", you
should answer 1. \ nfor this text : \ " yo if you live in the united states right now, and do not believe in global
climate change, you're a fucking idiot. # fact # semst \ ", you should answer 2. \ n only give a number as 

In [54]:
model.save_pretrained("fine-tuned-distilbert-rq-testing")
tokenizer.save_pretrained("fine-tuned-distilbert-rq-testing")

('fine-tuned-distilbert-rq-testing\\tokenizer_config.json',
 'fine-tuned-distilbert-rq-testing\\special_tokens_map.json',
 'fine-tuned-distilbert-rq-testing\\vocab.txt',
 'fine-tuned-distilbert-rq-testing\\added_tokens.json',
 'fine-tuned-distilbert-rq-testing\\tokenizer.json')