In [None]:
pip install datasets torch peft scikit-learn
pip install huggingface_hub==0.19.4
pip install transformers==4.36.2
pip install peft==0.7.1

In [None]:
import torch
from datasets import load_dataset, Dataset
from transformers import (
    GPT2ForSequenceClassification,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    TrainerCallback
)
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import logging
import sys

In [None]:
# Set up logging
# Route INFO-and-above records to stdout so they show up in the cell output.
# force=True replaces any handlers already attached to the root logger; without it
# basicConfig() silently does nothing when the root logger was configured earlier
# (for example by the hosting environment or by a previous run of this cell).
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler(sys.stdout)],
    force=True,
)

# Read the JSONL training file and take its 'train' split
dataset = load_dataset('json', data_files='/kaggle/input/training/training.jsonl')['train']

# Raw label columns, one value per example
sentiments = dataset['sentiment']
topics = dataset['topic']

# One encoder per task, each fitted on its own label column
# (fit() returns the encoder itself, so construction and fitting can be chained)
sentiment_encoder = LabelEncoder().fit(sentiments)
topic_encoder = LabelEncoder().fit(topics)

# Number of distinct classes per task; used to size the combined label vector
n_sentiments = len(sentiment_encoder.classes_)
n_topics = len(topic_encoder.classes_)

# Show the label vocabularies the encoders discovered
print("Sentiment labels:", sentiment_encoder.classes_)
print("Topic labels:", topic_encoder.classes_)

In [None]:
# Prepare dataset with one-hot encoded labels
def prepare_dataset(examples, sentiment_enc=None, topic_enc=None):
    """Build prompt texts and combined one-hot label vectors for a batch of rows.

    Each label vector is the sentiment one-hot followed by the topic one-hot, so
    its length is ``len(sentiment classes) + len(topic classes)``.

    Args:
        examples: Mapping-like object with the columns 'review', 'query',
            'sentiment' and 'topic' (equal-length sequences).
        sentiment_enc: Fitted encoder exposing ``transform`` and ``classes_``.
            Defaults to the notebook-level ``sentiment_encoder``.
        topic_enc: Fitted encoder exposing ``transform`` and ``classes_``.
            Defaults to the notebook-level ``topic_encoder``.

    Returns:
        dict with 'text' (list of prompt strings) and 'labels' (list of lists
        of floats, one row per example).
    """
    if sentiment_enc is None:
        sentiment_enc = sentiment_encoder
    if topic_enc is None:
        topic_enc = topic_encoder

    # Create text inputs
    texts = [f"Review: {rev}\nQuery: {q}" for rev, q in zip(examples['review'], examples['query'])]

    # Convert labels to integer class indices; the explicit integer cast keeps
    # fancy indexing valid even when the encoder returns an empty float array.
    sentiment_indices = np.asarray(sentiment_enc.transform(examples['sentiment'])).astype(np.intp)
    topic_indices = np.asarray(topic_enc.transform(examples['topic'])).astype(np.intp)

    # Widths come from the encoders themselves rather than from separate globals,
    # so the label layout can never drift from the encoders actually used.
    sentiment_width = len(sentiment_enc.classes_)
    topic_width = len(topic_enc.classes_)

    # Create combined one-hot labels in one vectorised step per task
    labels = np.zeros((len(texts), sentiment_width + topic_width))
    rows = np.arange(len(texts))
    labels[rows, sentiment_indices] = 1  # Set sentiment one-hot
    labels[rows, sentiment_width + topic_indices] = 1  # Set topic one-hot

    return {
        'text': texts,
        'labels': labels.tolist()  # Convert to list for dataset creation
    }

# Process the dataset
# prepare_dataset is called once, directly on the full dataset (not through Dataset.map),
# and returns a plain dict holding the 'text' and 'labels' columns.
processed_data = prepare_dataset(dataset)

In [None]:
# Split example positions 80/20 with a fixed seed so the split is repeatable
indices = list(range(len(processed_data['text'])))
train_idx, eval_idx = train_test_split(indices, test_size=0.2, random_state=42)

# Materialise each split by picking the selected rows out of every column
train_dataset = Dataset.from_dict({
    column: [processed_data[column][i] for i in train_idx]
    for column in ('text', 'labels')
})

eval_dataset = Dataset.from_dict({
    column: [processed_data[column][i] for i in eval_idx]
    for column in ('text', 'labels')
})

In [None]:
# Base checkpoint shared by the tokenizer and the classifier
model_name = "gpt2"

# Tokenizer pads on the right and reuses the end-of-sequence token as its padding token
tokenizer = GPT2Tokenizer.from_pretrained(model_name, padding_side='right')
tokenizer.pad_token = tokenizer.eos_token

# Classifier head with one output per sentiment class followed by one per topic class,
# configured as a multi-label problem to match the combined one-hot label vectors
model = GPT2ForSequenceClassification.from_pretrained(
    model_name,
    problem_type="multi_label_classification",
    num_labels=n_sentiments + n_topics,
)

# Keep the model's padding id in line with the tokenizer's (both use end-of-sequence)
model.config.pad_token_id = model.config.eos_token_id

In [None]:
# Prepare model for adapter training
# NOTE(review): the base model above is loaded without any quantization config, so this
# is plain LoRA rather than QLoRA; on a non-quantized model this call presumably only
# freezes/casts the base weights — confirm against the PEFT documentation.
model = prepare_model_for_kbit_training(model)

# Configure LoRA
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["c_proj", "c_attn"],
    # GPT-2 implements these projections as Conv1D layers, which store weights as
    # (fan_in, fan_out); PEFT documents fan_in_fan_out=True for exactly this case.
    fan_in_fan_out=True,
)

# Get PEFT model
model = get_peft_model(model, peft_config)

In [None]:
# Tokenization function
def tokenize_function(examples, max_length=256):
    """Tokenize the 'text' column of a batch, truncating to ``max_length`` tokens.

    No padding is applied here: the notebook batches with DataCollatorWithPadding,
    which pads each batch to its own longest sequence. Padding every example to
    ``max_length`` up front would make that collator redundant and spend compute
    on padding tokens.

    Args:
        examples: Batch mapping containing a 'text' list of strings.
        max_length: Maximum number of tokens kept per example (default 256).

    Returns:
        The tokenizer's output for the batch (unpadded, truncated encodings).
    """
    return tokenizer(
        examples['text'],
        truncation=True,
        max_length=max_length
    )

# Run the same batched tokenization over both splits
train_dataset, eval_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, eval_dataset)
)

# Collator that pads the examples of each batch using the tokenizer's padding settings
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Modified compute metrics function for multi-label classification
def compute_metrics(eval_pred):
    """Compute per-task and average accuracy from combined logits and one-hot labels.

    The first ``n_sentiments`` columns belong to the sentiment task and the
    remaining columns to the topic task. For each task the predicted class is
    the arg-max over that task's logits and the true class is the arg-max over
    that task's one-hot labels.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of 2-D arrays with one row per
            example and one column per combined label.

    Returns:
        dict with 'sentiment_accuracy', 'topic_accuracy' and 'average_accuracy'
        as Python floats.
    """
    predictions, labels = eval_pred
    logits = np.asarray(predictions)
    labels = np.asarray(labels)

    # Arg-max is taken on the raw logits. Sigmoid is monotonic, so it never changes
    # which class is largest, but in float32 it saturates to exactly 0.0 / 1.0 for
    # large-magnitude logits, creating artificial ties that argmax resolves to the
    # first column instead of the truly largest one.
    sentiment_pred_classes = np.argmax(logits[:, :n_sentiments], axis=1)
    topic_pred_classes = np.argmax(logits[:, n_sentiments:], axis=1)

    # Get true classes from the one-hot segments
    sentiment_true_classes = np.argmax(labels[:, :n_sentiments], axis=1)
    topic_true_classes = np.argmax(labels[:, n_sentiments:], axis=1)

    # Calculate accuracies
    sentiment_accuracy = float((sentiment_pred_classes == sentiment_true_classes).mean())
    topic_accuracy = float((topic_pred_classes == topic_true_classes).mean())

    return {
        "sentiment_accuracy": sentiment_accuracy,
        "topic_accuracy": topic_accuracy,
        "average_accuracy": (sentiment_accuracy + topic_accuracy) / 2
    }

In [None]:
# Hyper-parameters and bookkeeping for the Trainer run
training_args = TrainingArguments(
    # Output locations and reporting backend
    output_dir="./gpt2-qlora-classifier",
    logging_dir="./logs",
    report_to=["tensorboard"],
    # Optimisation settings
    num_train_epochs=3,
    learning_rate=2e-4,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint on the same 100-step cadence, keeping at most three
    # checkpoints and reloading the best one when training ends
    evaluation_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=3,
    load_best_model_at_end=True,
    # Log every 10 steps and keep progress bars enabled
    logging_strategy="steps",
    logging_steps=10,
    disable_tqdm=False,
)

In [None]:
# Custom callback
class TrainingCallback(TrainerCallback):
    """Mirror training progress and selected metrics to the standard logging module."""

    def on_train_begin(self, args, state, control, **kwargs):
        """Log the number of training examples when training starts."""
        # Reads the notebook-level train_dataset variable.
        logging.info(f"Starting training with {len(train_dataset)} examples")

    def on_epoch_begin(self, args, state, control, **kwargs):
        """Log the 1-based number of the epoch that is about to start."""
        # state.epoch is a fractional progress value (and optional), so it is
        # truncated to a whole number to avoid messages such as "2.0/3" and to
        # avoid a TypeError if it has not been set yet.
        current_epoch = int(state.epoch or 0) + 1
        logging.info(f"Starting epoch {current_epoch}/{args.num_train_epochs:g}")

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Log learning rate, loss and per-task accuracies when present in ``logs``."""
        if logs:
            if "learning_rate" in logs:
                logging.info(f"Learning Rate: {logs['learning_rate']:.2e}")
            if "loss" in logs:
                logging.info(f"Loss: {logs['loss']:.4f}")
            if "eval_sentiment_accuracy" in logs:
                logging.info(f"Sentiment Accuracy: {logs['eval_sentiment_accuracy']:.4f}")
            if "eval_topic_accuracy" in logs:
                logging.info(f"Topic Accuracy: {logs['eval_topic_accuracy']:.4f}")

In [None]:
# Wire the model, data splits, collator, metrics and logging callback into the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    callbacks=[TrainingCallback()],
)

# Summarise the run before it starts
print("\nTraining Configuration:")
print(f"Number of training examples: {len(train_dataset)}")
print(f"Number of evaluation examples: {len(eval_dataset)}")
print(f"Number of sentiment classes: {n_sentiments}")
print(f"Number of topic classes: {n_topics}")
print(f"Number of epochs: {training_args.num_train_epochs}")
print(f"Batch size: {training_args.per_device_train_batch_size}")
# Batches per epoch (length of the training dataloader) times the number of epochs
print(f"Total optimization steps: {len(trainer.get_train_dataloader()) * training_args.num_train_epochs}\n")

# Launch fine-tuning
trainer.train()

In [None]:
# Save the model
# Written to its own "final" directory, separate from the checkpoint output_dir
# passed to TrainingArguments.
# NOTE(review): this cell only calls trainer.save_model; the tokenizer and the fitted
# label encoders are presumably not persisted here — confirm before relying on the
# saved directory for standalone inference.
trainer.save_model("./gpt2-qlora-classifier-final")

In [None]:
# Modified prediction function
def predict(text):
    """Predict the sentiment and topic for a single prompt string.

    Uses the notebook-level ``model``, ``tokenizer``, ``sentiment_encoder``,
    ``topic_encoder`` and ``n_sentiments``.

    Args:
        text: Prompt string to classify.

    Returns:
        dict with the decoded 'sentiment' and 'topic' labels and, for each, the
        sigmoid score of the winning class ('sentiment_confidence',
        'topic_confidence').
    """
    inputs = tokenizer(
        text,
        padding='max_length',
        truncation=True,
        max_length=256,
        return_tensors="pt"
    )

    # Run on whatever device the model currently lives on instead of assuming CPU.
    device = next(model.parameters()).device
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # Switch to evaluation mode so dropout is disabled (training may have left the
    # model in train mode), and skip gradient tracking since this is inference only.
    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits
    predictions = torch.sigmoid(logits)

    # Split predictions for sentiment and topic
    sentiment_preds = predictions[0, :n_sentiments]
    topic_preds = predictions[0, n_sentiments:]

    # Get predicted classes
    sentiment_pred = sentiment_encoder.inverse_transform([torch.argmax(sentiment_preds).item()])[0]
    topic_pred = topic_encoder.inverse_transform([torch.argmax(topic_preds).item()])[0]

    return {
        "sentiment": sentiment_pred,
        "topic": topic_pred,
        "sentiment_confidence": torch.max(sentiment_preds).item(),
        "topic_confidence": torch.max(topic_preds).item()
    }

In [None]:
model.to("cpu")
# Test with your own text
# The prompt is assembled with the same "Review: ...\nQuery: ..." template that
# prepare_dataset uses for the training inputs, so the model sees the format it was
# fine-tuned on (the earlier single-line, bracketed form did not match that template).
review_text = "this is truly something good"
query_text = "are they going to make anything different"
your_text = f"Review: {review_text}\nQuery: {query_text}"
result = predict(your_text)
print("\nInput:", your_text)
print(f"Sentiment: {result['sentiment']} (Confidence: {result['sentiment_confidence']:.2f})")
print(f"Topic: {result['topic']} (Confidence: {result['topic_confidence']:.2f})")