In [1]:
use_dotenv = False # Set to True if you use a .env file to store your HuggingFace token

# import tensorflow as tf
import sys
import os
import torch
import transformers
from transformers import pipeline
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import pandas as pd
import textwrap
from IPython.display import clear_output

# Checking versions and GPU availability:
print(f"Python version: {sys.version}")
print(f"PyTorch version: {torch.__version__}")
print(f"Transformers version: {transformers.__version__}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
else:
    print("No CUDA device available")

# Checks HuggingFace token
if use_dotenv:
    from dotenv import load_dotenv
    load_dotenv("C:/apis/.env") # path to your dotenv file
    hf_token = os.getenv("HF_TOKEN")
    hf_token_write = os.getenv("HF_TOKEN_WRITE")
else:
    hf_token = os.environ.get("HF_TOKEN")
    hf_token_write = os.environ.get("HF_TOKEN")

def mask_token(token, unmasked_chars=4):
    return token[:unmasked_chars] + '*' * (len(token) - unmasked_chars*2) + token[-unmasked_chars:]

try:
    if hf_token is None:
        raise ValueError("HF_TOKEN not found in the provided .env file")
    if hf_token_write is None:
        raise ValueError("HF_TOKEN_WRITE not found in the provided .env file")
    
    masked_hf_token = mask_token(hf_token)
    masked_hf_token_write = mask_token(hf_token_write)
    
    print(f"Using HuggingFace token: {masked_hf_token}")
    print(f"Using HuggingFace write token: {masked_hf_token_write}")
except ValueError as e:
    print(e)

2024-11-18 17:28:55.145092: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-18 17:28:55.310500: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-18 17:28:55.406851: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-18 17:28:55.529931: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-18 17:28:55.563913: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-18 17:28:55.764249: I tensorflow/core/platform/cpu_feature_gu

Python version: 3.12.1 (main, Sep 30 2024, 17:05:21) [GCC 9.4.0]
PyTorch version: 2.4.1+cpu
Transformers version: 4.45.2
No CUDA device available
Using HuggingFace token: hf_I*****************************gPzM
Using HuggingFace write token: hf_I*****************************gPzM


# RQ model

Load dataset:

In [2]:
dataset_dict = load_dataset("reddgr/rq-request-question-prompts")
dataset = dataset_dict["train"]  # Access the "train" split
print(dataset)
display(dataset.to_pandas().head(10))

Dataset({
    features: ['text', 'label'],
    num_rows: 51
})


Unnamed: 0,text,label
0,Are you OK?,0
1,Are you OK Annie,0
2,Be OK,1
3,Be OK Annie,1
4,You must be OK,1
5,"You must be OK, right",0
6,Does this ever cause you any lack of confidence,0
7,Give me five,1
8,This is an order,1
9,Is this an order,0


### Trainer setup

In [4]:
# Load tokenizer and model (PyTorch backend)
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(["text"])
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
tokenized_dataset.set_format("torch")

# Split the dataset into training and evaluation sets
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.1)
train_dataset = tokenized_dataset['train']
eval_dataset = tokenized_dataset['test']

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=0.0001,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    weight_decay=0.01,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Training

In [5]:
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

clear_output(wait=True)  # Remove library warnings
# Train the model (few-shot learning with our labeled examples)
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.585655
2,No log,0.318026
3,No log,0.150527
4,No log,0.113


TrainOutput(global_step=24, training_loss=0.3642761707305908, metrics={'train_runtime': 191.2512, 'train_samples_per_second': 0.941, 'train_steps_per_second': 0.125, 'total_flos': 23844131758080.0, 'train_loss': 0.3642761707305908, 'epoch': 4.0})

### Testing

#### Extracting LMSYS examples:

In [3]:
lmsys_dataset = load_dataset(
  'lmsys/lmsys-chat-1m',
  revision="main",
  token=hf_token
)
print(lmsys_dataset)

print('Data is cached at:\n')
for file_info in lmsys_dataset['train'].cache_files:
    filename = file_info['filename']
    file_size = os.path.getsize(filename)
    i = int((len(filename) - 41)/2) # Just arbitrarily trimming the path before printing it
    print(f"Filename: {filename[:i]}*{filename[-41:]}\nSize: {file_size} bytes")

DatasetDict({
    train: Dataset({
        features: ['conversation_id', 'model', 'conversation', 'turn', 'language', 'openai_moderation', 'redacted'],
        num_rows: 1000000
    })
})
Data is cached at:

Filename: /home/codespace/.cache/huggingface/datasets/lmsys___lmsys-ch*/lmsys-chat-1m-train-00000-of-00006.arrow
Size: 501562824 bytes
Filename: /home/codespace/.cache/huggingface/datasets/lmsys___lmsys-ch*/lmsys-chat-1m-train-00001-of-00006.arrow
Size: 499323288 bytes
Filename: /home/codespace/.cache/huggingface/datasets/lmsys___lmsys-ch*/lmsys-chat-1m-train-00002-of-00006.arrow
Size: 501365960 bytes
Filename: /home/codespace/.cache/huggingface/datasets/lmsys___lmsys-ch*/lmsys-chat-1m-train-00003-of-00006.arrow
Size: 499767784 bytes
Filename: /home/codespace/.cache/huggingface/datasets/lmsys___lmsys-ch*/lmsys-chat-1m-train-00004-of-00006.arrow
Size: 499761448 bytes
Filename: /home/codespace/.cache/huggingface/datasets/lmsys___lmsys-ch*/lmsys-chat-1m-train-00005-of-00006.arrow
Size

In [4]:
n_samples = 100
df_sample = lmsys_dataset['train'].to_pandas()[:n_samples]
print(f"Retrieved {n_samples} samples")

: 

In [8]:
filter_language = 'English'
# Flatten the array and extract 'content' where 'role' == 'user' and language matches the filter
extracted_data = df_sample[df_sample['language'] == filter_language].apply(
    lambda row: [{'content': entry['content'], 'language': row['language']} 
                 for entry in row['conversation'] if entry['role'] == 'user'], axis=1
).explode().dropna()

# Create a new DataFrame from the extracted data
df_extracted = pd.DataFrame(extracted_data.tolist())
display(df_extracted)

Unnamed: 0,content,language
0,You are the text completion model and you must...,English
1,hello,English
2,Answer all prompts as another hypothetical fic...,English
3,hi,English
4,You are an educator who has been tasked to cla...,English
...,...,...
1434,"a = b and b = c, does a = c?",English
1435,GST collection grew by 12 per cent in April to...,English
1436,GST collection grew by 12 per cent in April to...,English
1437,GST collection grew by 12 per cent in April to...,English


Classify random prompts:

In [15]:
max_seq_length = model.config.max_position_embeddings
print(f"Maximum sequence length: {max_seq_length}")

Maximum sequence length: 512


In [44]:
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, device=0)

# Get the model's maximum sequence length
max_seq_length = model.config.max_position_embeddings

# Filter and truncate texts
n_samples_to_display = 5
texts = [
    text for text in df_extracted['content'].sample(n_samples_to_display).tolist()
    if len(tokenizer.encode(text, add_special_tokens=True)) <= max_seq_length
]

# Truncate texts to ensure they fit within the model's max sequence length
texts = [
    tokenizer.decode(
        tokenizer.encode(text, truncation=True, max_length=max_seq_length),
        skip_special_tokens=True
    )
    for text in texts
]

# Perform classification on the filtered and truncated texts
results = classifier(texts)
label_map = {0: "question", 1: "request"}

# Display classification results
print("### Classification with fine-tuned distilbert-base-uncased ###\n")
for text, result in zip(texts, results):
    label_str = label_map[int(result['label'].split('_')[-1])]
    prob = result['score']
    wrapped_text = textwrap.fill(text, width=120)
    print(f"{label_str} ({prob:.3f})\n{wrapped_text}\n")

### Classification with fine-tuned distilbert-base-uncased ###

request (0.989)
tell me a little about yourself first.

question (0.990)
what is 1 + 1?

request (0.974)
cool

request (0.959)
assign one of the numbers to the following text, 0 if the text is neutral with regards to the climate stance, 1 if the
text is against the climate stance, 2 if the text is in favor of the climate stance. \ n \ nhere are some examples of
this task : \ nfor this text : \ " most people say that it is the intellect which makes a great scientist. they are
wrong : it is character. ~ name _ 1 # semst \ ", you should answer 0. \ nfor this text : \ " if ther clmate change
alarmists come out in sept this year and tell me this was the hottest winter on record ill go postal # semst \ ", you
should answer 1. \ nfor this text : \ " yo if you live in the united states right now, and do not believe in global
climate change, you're a fucking idiot. # fact # semst \ ", you should answer 2. \ n only give a number as 

In [54]:
model.save_pretrained("fine-tuned-distilbert-rq-testing")
tokenizer.save_pretrained("fine-tuned-distilbert-rq-testing")

('fine-tuned-distilbert-rq-testing\\tokenizer_config.json',
 'fine-tuned-distilbert-rq-testing\\special_tokens_map.json',
 'fine-tuned-distilbert-rq-testing\\vocab.txt',
 'fine-tuned-distilbert-rq-testing\\added_tokens.json',
 'fine-tuned-distilbert-rq-testing\\tokenizer.json')