In [16]:
import json
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model, PeftModel, TaskType

In [17]:
# Load the dataset from the jsonl file
def load_jsonl_dataset(file_path):
    # Each line in the JSONL file is a JSON object
    with open(file_path, 'r') as f:
        data = [json.loads(line) for line in f.readlines()]
    return data

data = load_jsonl_dataset('../LoRA/data/train.jsonl')

# Process the data to extract relevant text for training
def extract_text(data):
    # We will create a list of dictionaries containing input and output pairs
    extracted_data = []
    for item in data:
        # Assuming we are only interested in user messages
        user_messages = [msg['content'] for msg in item['messages'] if msg['role'] == 'user']
        if user_messages:
            # Join all user messages for simplicity
            combined_message = " ".join(user_messages)
            extracted_data.append({"text": combined_message})
    return extracted_data

processed_data = extract_text(data)

# Convert the processed data into a Hugging Face Dataset
dataset = Dataset.from_list(processed_data)

In [8]:
# Convert dataset to Hugging Face format
# 

# Load TinyBERT tokenizer and model
model_name = "huawei-noah/TinyBERT_General_4L_312D"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

config.json:   0%|          | 0.00/409 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/62.7M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Prepare LoRA configuration
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS, 
    r=8, 
    lora_alpha=32, 
    lora_dropout=0.1, 
    target_modules=["query", "value"]
)

In [11]:
# Apply LoRA to the model
model = get_peft_model(model, lora_config)

In [19]:
def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True, padding=True)

tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/197 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [22]:
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,  
    # eval_dataset=tokenized_dataset,
)

# Fine-tune the model
trainer.train()

RuntimeError: The size of tensor a (14831) must match the size of tensor b (512) at non-singleton dimension 1

In [21]:
model.save_pretrained('./lora-tinybert')
tokenizer.save_pretrained('./lora-tinybert')

('./lora-tinybert/tokenizer_config.json',
 './lora-tinybert/special_tokens_map.json',
 './lora-tinybert/vocab.txt',
 './lora-tinybert/added_tokens.json',
 './lora-tinybert/tokenizer.json')