In [None]:
!pip install transformers peft datasets sentencepiece autoawq torch accelerate

In [None]:
import os
# GPU-related environment settings, applied in this early cell ahead of the cells that
# import the model libraries: restrict CUDA to device 0 and set the PyTorch CUDA
# allocator option `expandable_segments:True`.
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": "0",
        "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
    }
)

In [None]:
from datasets import Dataset
import pandas as pd
# Load the synthetic reviews; later cells rely on the `text` and `label` columns.
df = pd.read_csv("../synthetic-data/synthetic_reviews.csv")

# Change Labels to Int
label_map = {"positive": 2, "neutral": 1, "negative": 0}
mapped_labels = df['label'].map(label_map)

# `Series.map` turns every value that is missing from `label_map` (including empty
# cells) into NaN and silently makes the column float. Fail fast instead of handing
# corrupted labels to the rest of the notebook.
unmapped_labels = df.loc[mapped_labels.isna(), 'label']
if not unmapped_labels.empty:
    raise ValueError(
        "Unexpected label values in synthetic_reviews.csv: "
        f"{sorted(unmapped_labels.astype(str).unique())}"
    )
df['label'] = mapped_labels.astype("int64")

# Convert the cleaned frame into a Hugging Face `Dataset`.
dataset = Dataset.from_pandas(df)

In [None]:
from transformers import AutoTokenizer
from awq import AutoAWQForCausalLM

# Directory that holds the locally stored tokenizer and model files.
local_directory = "../../models/"

# Options forwarded to the AWQ loader; `local_files_only=True` keeps the load offline.
model_load_options = {
    "low_cpu_mem_usage": True,
    "use_cache": False,
    "local_files_only": True,
}

# Load the tokenizer and the model from the same local directory.
# NOTE(review): `from_pretrained` is used here; if the checkpoint on disk is already
# AWQ-quantized, confirm whether `from_quantized` is the intended entry point.
tokenizer = AutoTokenizer.from_pretrained(local_directory)
model = AutoAWQForCausalLM.from_pretrained(local_directory, **model_load_options)

In [None]:
# Call `.half()` on the loaded model and then move it to the CUDA device.
# NOTE(review): assumes the AWQ wrapper follows the usual torch.nn.Module semantics
# (`half()` casting floating-point parameters to float16) — confirm.
model.half()
model.to("cuda")

In [None]:
from peft import LoraConfig, PeftModel, get_peft_model

# LoRA adapter settings: rank 8, alpha 16, 10% dropout, no bias terms trained.
# NOTE(review): no `task_type` is set on this config — confirm that is intended.
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],  # Target the attention layers
    lora_dropout=0.1,
    bias="none"
)

# `model` is rebound to its own wrapped version, so re-running this cell would wrap
# an already-wrapped model in a second set of adapters. Only wrap it the first time.
if not isinstance(model, PeftModel):
    model = get_peft_model(model, lora_config)

# Report how many parameters are trainable after adding the adapters.
model.print_trainable_parameters()

In [None]:
# Fall back to EOS for padding only when the tokenizer has no pad token of its own.
# Overwriting an existing pad token would make padding indistinguishable from the
# end-of-sequence token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


def tokenize_function(examples):
    """Tokenize a batch of reviews and attach their labels.

    Args:
        examples: A batch of dataset rows (the map below uses ``batched=True``)
            providing the ``text`` and ``label`` columns.

    Returns:
        The tokenizer output for ``examples['text']``, padded and truncated to
        512 tokens, with an added ``labels`` entry copied from ``examples['label']``.
    """
    result = tokenizer(examples['text'], padding="max_length", truncation=True, max_length=512)
    result["labels"] = examples['label']  # Use the existing 'label' column as labels
    return result


# Tokenize the whole dataset, processing rows in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [None]:
# Drop the raw input columns, then have the dataset return torch tensors for the
# three columns used during training.
source_columns = ['text', 'label']
tensor_columns = ['input_ids', 'attention_mask', 'labels']
tokenized_datasets = tokenized_datasets.remove_columns(source_columns)
tokenized_datasets.set_format(type='torch', columns=tensor_columns)

In [None]:
# Sanity check: print the column names of the tokenized dataset.
print(tokenized_datasets.column_names)

In [None]:
from transformers import DataCollatorForLanguageModeling

# Stock Hugging Face collator configured for causal language modeling
# (`mlm=False` turns masked-language-model masking off).
# NOTE(review): with `mlm=False` this collator is expected to rebuild `labels` from
# `input_ids`, which would replace the sentiment labels stored by `tokenize_function`.
# Confirm that next-token training on the review text is the intended objective.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)


In [None]:
from transformers import Trainer, TrainingArguments
# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers releases
# and the old keyword is deprecated. Use whichever name the installed version accepts
# so this cell runs on both older and newer installs.
eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    # Effective train batch per device: 4 samples x 2 accumulation steps = 8.
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    num_train_epochs=3,
    save_steps=10_000,
    save_total_limit=2,
    logging_dir='./logs',
    logging_steps=100,
    fp16=True,
    # Keep every dataset column; the dataset was already reduced to the needed ones.
    remove_unused_columns=False,
    # Evaluate once per epoch. A step-based `eval_steps` interval has no effect with
    # this strategy, so none is configured.
    **{eval_strategy_key: "epoch"},
)

In [None]:
# Hold out 20% of the tokenized data for evaluation. The fixed seed makes the split
# identical across reruns, so metrics from different runs stay comparable.
train_test_split = tokenized_datasets.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # Training dataset
    eval_dataset=eval_dataset,    # Evaluation dataset
    data_collator=data_collator
)

In [None]:
# Start training: run the Trainer built from `training_args` on `train_dataset`.
trainer.train()

In [None]:
# Evaluate on the held-out split and print the metrics returned by the Trainer.
metrics = trainer.evaluate(eval_dataset=eval_dataset)
print(metrics)

In [None]:
# Write the trained model and its tokenizer into the same output directory
# (model first, then tokenizer).
save_directory = "../../trained-model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_directory)