# In [None]:
import os
import torch
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
    EarlyStoppingCallback
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# 1. Data Preparation


def prepare_sentiment_dataset(data_path):
    """
    Load a sentiment CSV and split it into train/validation/test datasets.

    The CSV must contain 'text' and 'sentiment' columns. Sentiment may be
    given as strings ('negative', 'neutral', 'positive'; matched
    case-insensitively, ignoring surrounding whitespace) or already encoded
    as numbers: negative=0, neutral=1, positive=2.

    Args:
        data_path: Path to the CSV file.

    Returns:
        A (train_dataset, val_dataset, test_dataset) tuple of HuggingFace
        datasets from a 70/15/15 split; each carries an integer 'label'
        column next to the original columns.

    Raises:
        ValueError: If a required column is missing or a sentiment value
            cannot be mapped to one of the three classes.
    """
    df = pd.read_csv(data_path)

    missing_columns = [
        col for col in ('text', 'sentiment') if col not in df.columns
    ]
    if missing_columns:
        raise ValueError(
            f"Dataset is missing required column(s): {missing_columns}"
        )

    # Ensure sentiments are properly encoded
    sentiment_map = {
        'negative': 0,
        'neutral': 1,
        'positive': 2
    }
    valid_labels = list(sentiment_map.values())

    # Map string labels to integers if needed. The check is "not numeric"
    # rather than "dtype == object" so that pandas' dedicated string dtype
    # is encoded as well.
    if pd.api.types.is_numeric_dtype(df['sentiment']):
        labels = df['sentiment']
    else:
        normalized = df['sentiment'].astype(str).str.strip().str.lower()
        labels = normalized.map(sentiment_map)

    # Unmapped strings become NaN and out-of-range numbers stay as they are;
    # both would otherwise only fail (obscurely) once training starts.
    invalid = ~labels.isin(valid_labels)
    if invalid.any():
        examples = df.loc[invalid, 'sentiment'].unique()[:5].tolist()
        raise ValueError(
            f"Found {int(invalid.sum())} row(s) with unrecognized sentiment "
            f"values, e.g. {examples}; expected one of "
            f"{list(sentiment_map)} or {valid_labels}"
        )

    # Classification losses need integer class ids, not floats.
    df['label'] = labels.astype('int64')

    # Split dataset: 70% train, then the remaining 30% halved into val/test
    train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42)
    val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

    # Convert to HuggingFace datasets
    train_dataset = Dataset.from_pandas(train_df)
    val_dataset = Dataset.from_pandas(val_df)
    test_dataset = Dataset.from_pandas(test_df)

    return train_dataset, val_dataset, test_dataset

# 2. Model and Tokenizer Configuration


def setup_sentiment_model():
    """
    Setup model with QLoRA configuration for sentiment analysis

    Loads "microsoft/deberta-v3-large" with 4-bit NF4 quantization and a
    3-class sequence-classification head, prepares it for k-bit training and
    wraps it with LoRA adapters on the attention projections.

    Returns:
        A (model, tokenizer) tuple: the PEFT-wrapped model and the tokenizer
        matching the base checkpoint.
    """
    # Quantization configuration
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=False
    )

    # Load base model and tokenizer
    model_name = "microsoft/deberta-v3-large"  # Good for sentiment analysis
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Keep the class names in the model config so saved checkpoints are
    # self-describing (same encoding as the training labels).
    id2label = {0: "negative", 1: "neutral", 2: "positive"}
    label2id = {name: idx for idx, name in id2label.items()}

    # Load model with 3 classes (negative, neutral, positive)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        num_labels=3,
        id2label=id2label,
        label2id=label2id,
        device_map="auto"
    )

    # Prepare model for k-bit training
    model = prepare_model_for_kbit_training(model)

    # LoRA configuration
    # DeBERTa-v2/v3 names its attention projections `query_proj`, `key_proj`
    # and `value_proj`; the BERT-style names "query"/"key"/"value" match no
    # module in this architecture, so PEFT would fail to find any target.
    # NOTE(review): the pooler of this checkpoint is presumably newly
    # initialised and may end up frozen in 4-bit; consider adding it to
    # `modules_to_save` (and excluding it from quantization) - confirm.
    lora_config = LoraConfig(
        r=16,                     # Rank dimension
        lora_alpha=32,           # Alpha parameter for scaling
        target_modules=["query_proj", "key_proj", "value_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="SEQ_CLS"      # Sequence classification
    )

    # Create PEFT model
    model = get_peft_model(model, lora_config)

    return model, tokenizer

# 3. Data Processing


def preprocess_function(examples, tokenizer):
    """
    Run the tokenizer over the 'text' field of a batch.

    Every sequence is truncated and padded to a fixed length of 256 tokens;
    whatever the tokenizer returns is handed back unchanged.
    """
    tokenizer_options = {
        "truncation": True,
        "max_length": 256,          # Adjust based on your needs
        "padding": "max_length",
    }
    texts = examples["text"]
    return tokenizer(texts, **tokenizer_options)

# 4. Training Configuration


def setup_training_args(output_dir):
    """
    Setup training arguments with sentiment-specific configurations

    Evaluation and checkpointing both happen every 100 steps so that the
    best checkpoint (by macro F1) can be reloaded at the end of training.

    Args:
        output_dir: Directory where checkpoints and logs are written.

    Returns:
        A configured `TrainingArguments` instance.
    """
    import dataclasses

    # `evaluation_strategy` was renamed to `eval_strategy` in newer
    # transformers releases and the old keyword was later removed; use
    # whichever name the installed version declares.
    declared_fields = {f.name for f in dataclasses.fields(TrainingArguments)}
    if "eval_strategy" in declared_fields:
        eval_strategy_kwarg = {"eval_strategy": "steps"}
    else:
        eval_strategy_kwarg = {"evaluation_strategy": "steps"}

    return TrainingArguments(
        output_dir=output_dir,
        learning_rate=2e-4,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=5,
        weight_decay=0.01,
        eval_steps=100,
        save_strategy="steps",
        save_steps=100,
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",
        greater_is_better=True,   # higher macro F1 is better
        push_to_hub=False,
        gradient_accumulation_steps=4,
        fp16=True,
        logging_steps=50,
        warmup_steps=500,
        **eval_strategy_kwarg
    )

# 5. Metrics Computation


def compute_metrics(eval_pred):
    """
    Compute metrics for sentiment analysis evaluation

    Args:
        eval_pred: A (predictions, labels) pair; predictions are per-class
            scores (or a tuple whose first element is), labels are class ids.

    Returns:
        Dict with accuracy, macro precision/recall/F1 and the per-class F1
        for negative (0), neutral (1) and positive (2).
    """
    predictions, labels = eval_pred
    # Some models return extra outputs alongside the logits; the logits
    # come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.argmax(predictions, axis=1)

    # Always score against all three classes so the macro average and the
    # per-class values cover the same class set even when a class is absent
    # from this evaluation split. zero_division=0 scores such a class as 0
    # without emitting a warning.
    class_ids = [0, 1, 2]

    # Calculate metrics
    precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
        labels, predictions, average='macro', labels=class_ids, zero_division=0
    )
    accuracy = accuracy_score(labels, predictions)

    # Calculate per-class metrics
    _, _, f1_per_class, _ = precision_recall_fscore_support(
        labels, predictions, average=None, labels=class_ids, zero_division=0
    )

    return {
        'accuracy': float(accuracy),
        'f1_macro': float(f1_macro),
        'precision_macro': float(precision_macro),
        'recall_macro': float(recall_macro),
        'f1_negative': float(f1_per_class[0]),
        'f1_neutral': float(f1_per_class[1]),
        'f1_positive': float(f1_per_class[2])
    }

# 6. Main Training Pipeline


def train_sentiment_model(data_path, output_dir):
    """
    Main function to train the sentiment analysis model

    Loads and splits the data, builds the QLoRA model, tokenizes all splits,
    trains with early stopping, reports test metrics and saves the model and
    tokenizer.

    Args:
        data_path: Path to the CSV with 'text' and 'sentiment' columns.
        output_dir: Directory for checkpoints and the final model/tokenizer.

    Returns:
        A (model, tokenizer) tuple for the trained model.
    """
    # Prepare dataset
    train_dataset, val_dataset, test_dataset = prepare_sentiment_dataset(
        data_path)

    # Setup model and tokenizer
    model, tokenizer = setup_sentiment_model()

    def tokenize_split(dataset):
        # Drop the raw columns once tokenized, but keep 'label': without it
        # the Trainer has no targets and cannot compute a loss or metrics.
        raw_columns = [
            name for name in dataset.column_names if name != "label"
        ]
        return dataset.map(
            lambda batch: preprocess_function(batch, tokenizer),
            batched=True,
            remove_columns=raw_columns
        )

    # Preprocess datasets
    train_dataset = tokenize_split(train_dataset)
    val_dataset = tokenize_split(val_dataset)
    test_dataset = tokenize_split(test_dataset)

    # Setup training arguments
    training_args = setup_training_args(output_dir)

    # Initialize trainer with early stopping
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    # Train model
    trainer.train()

    # Evaluate on test set
    test_results = trainer.evaluate(test_dataset)
    print(f"Test results: {test_results}")

    # Save model and tokenizer
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

    return model, tokenizer

# 7. Inference Function


def predict_sentiment(text, model, tokenizer):
    """
    Predict sentiment for new text

    Args:
        text: A single input string.
        model: Sequence-classification model producing three logits
            (negative, neutral, positive) and exposing a `device`.
        tokenizer: Tokenizer matching the model.

    Returns:
        Dict with the predicted 'sentiment' label, its 'confidence'
        (softmax probability) and the full 'probabilities' per class.
    """
    # Tokenize input
    inputs = tokenizer(
        text,
        truncation=True,
        max_length=256,
        padding="max_length",
        return_tensors="pt"
    ).to(model.device)

    # Run in eval mode so dropout is disabled and predictions are
    # deterministic; restore the caller's mode afterwards so an ongoing
    # training run is not affected.
    was_training = model.training
    model.eval()
    try:
        # Get prediction
        with torch.no_grad():
            outputs = model(**inputs)
            predictions = torch.softmax(outputs.logits, dim=1)
            predicted_class = torch.argmax(predictions, dim=1).item()
    finally:
        model.train(was_training)

    # Convert to sentiment label
    sentiment_map = {0: 'negative', 1: 'neutral', 2: 'positive'}
    confidence = predictions[0][predicted_class].item()

    return {
        'sentiment': sentiment_map[predicted_class],
        'confidence': confidence,
        'probabilities': {
            'negative': predictions[0][0].item(),
            'neutral': predictions[0][1].item(),
            'positive': predictions[0][2].item()
        }
    }


# Example usage
if __name__ == "__main__":
    # Input CSV (needs 'text' and 'sentiment' columns) and the directory
    # that receives the trained model and tokenizer.
    DATA_PATH, OUTPUT_DIR = (
        "path/to/your/sentiment_dataset.csv",
        "sentiment_model_output",
    )

    # Run the full training pipeline.
    model, tokenizer = train_sentiment_model(DATA_PATH, OUTPUT_DIR)

    # Classify one sample sentence with the freshly trained model.
    test_text = "I really enjoyed this product, it exceeded my expectations!"
    result = predict_sentiment(text=test_text, model=model, tokenizer=tokenizer)
    print(f"Sentiment analysis result: {result}")