# In [None]:
import os
import torch
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
    EarlyStoppingCallback
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import json


class IntentClassifier:
    """Fine-tune and serve a QLoRA sequence classifier for intent detection.

    Typical flow: ``train(csv_path)`` reads a CSV with ``text`` and ``intent``
    columns, fine-tunes ``model_name`` with 4-bit quantization + LoRA, writes
    the adapter, tokenizer, label mapping and test metrics to ``output_dir``,
    and keeps the trained model in memory so ``predict_intent`` can be called
    right away.
    """

    # Token budget shared by training and inference so both see the same
    # truncation/padding.
    _MAX_LENGTH = 128

    # Leaf names of attention projection layers that LoRA should wrap.
    # BERT-style encoders call them query/key/value, DeBERTa-v2 calls them
    # query_proj/key_proj/value_proj. PEFT matches list entries against the
    # exact leaf name, so the right spelling has to be picked per model.
    _ATTENTION_PROJECTION_NAMES = (
        "query", "key", "value",
        "query_proj", "key_proj", "value_proj",
    )

    def __init__(self, model_name="microsoft/deberta-v2-xlarge", output_dir="intent_model"):
        """Store configuration; no model or data is loaded here.

        Args:
            model_name: Hugging Face model id (or local path) of the base model.
            output_dir: Directory for checkpoints, the label mapping, test
                metrics, the saved adapter and the tokenizer.
        """
        self.model_name = model_name
        self.output_dir = output_dir
        self.label_encoder = LabelEncoder()
        # Populated by prepare_intent_dataset() or restored from disk on the
        # first predict_intent() call.
        self.num_labels = None
        # Populated by setup_intent_model() or restored from output_dir.
        self.tokenizer = None
        # Model reused across predict_intent() calls (set by train() or
        # lazily loaded from output_dir).
        self._inference_model = None

    def prepare_intent_dataset(self, data_path):
        """Load the CSV, encode intents and build stratified 70/15/15 splits.

        Side effects: fits ``self.label_encoder``, sets ``self.num_labels`` and
        writes ``intent_labels.json`` into ``self.output_dir``.

        Args:
            data_path: Path to a CSV file containing 'text' and 'intent' columns.

        Returns:
            Tuple ``(train_dataset, val_dataset, test_dataset)`` of
            ``datasets.Dataset`` objects that still contain the raw columns
            plus the integer ``label`` column.
        """
        df = pd.read_csv(data_path)

        # Encode intent labels as integer class ids.
        df['label'] = self.label_encoder.fit_transform(df['intent'])

        # Plain Python values so the mapping is JSON-serializable even when
        # the intents are not strings.
        labels = self.label_encoder.classes_.tolist()
        self.num_labels = len(labels)

        # The method may be called without train(), so make sure the target
        # directory exists before writing into it.
        os.makedirs(self.output_dir, exist_ok=True)
        with open(os.path.join(self.output_dir, 'intent_labels.json'), 'w') as f:
            json.dump(
                {
                    'labels': labels,
                    'label_to_id': {label: idx for idx, label in enumerate(labels)}
                },
                f
            )

        # 70% train, then the remaining 30% is halved into val/test.
        train_df, temp_df = train_test_split(
            df, test_size=0.3, random_state=42, stratify=df['label'])
        val_df, test_df = train_test_split(
            temp_df, test_size=0.5, random_state=42, stratify=temp_df['label'])

        # Convert to HuggingFace datasets
        train_dataset = Dataset.from_pandas(train_df)
        val_dataset = Dataset.from_pandas(val_df)
        test_dataset = Dataset.from_pandas(test_df)

        return train_dataset, val_dataset, test_dataset

    @classmethod
    def _find_lora_target_modules(cls, model):
        """Return the attention projection leaf names present in ``model``.

        Raises:
            ValueError: If none of the known projection names exist in the model.
        """
        leaf_names = {name.rsplit(".", 1)[-1] for name, _ in model.named_modules()}
        targets = [
            name for name in cls._ATTENTION_PROJECTION_NAMES if name in leaf_names
        ]
        if not targets:
            raise ValueError(
                "No attention projection modules found for LoRA; expected one of "
                f"{list(cls._ATTENTION_PROJECTION_NAMES)}"
            )
        return targets

    @staticmethod
    def _non_label_columns(column_names):
        """Return every column name except ``label`` (the training target)."""
        return [name for name in column_names if name != "label"]

    def setup_intent_model(self):
        """Load the 4-bit quantized base model and wrap it with LoRA adapters.

        Also loads ``self.tokenizer`` from ``self.model_name``.

        Returns:
            The PEFT-wrapped model ready for training.

        Raises:
            RuntimeError: If the number of labels is not known yet.
        """
        if self.num_labels is None:
            raise RuntimeError(
                "num_labels is unknown; call prepare_intent_dataset() first"
            )

        # Quantization configuration
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=False
        )

        # Load tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)

        # Load model
        model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name,
            quantization_config=bnb_config,
            num_labels=self.num_labels,
            device_map="auto"
        )

        # Prepare model for k-bit training
        model = prepare_model_for_kbit_training(model)

        # LoRA configuration. Target names are detected from the loaded model
        # because they differ between architectures (see class constant).
        lora_config = LoraConfig(
            r=16,                     # Rank dimension
            lora_alpha=32,           # Alpha parameter for scaling
            target_modules=self._find_lora_target_modules(model),
            lora_dropout=0.05,
            bias="none",
            task_type="SEQ_CLS"
        )

        # Create PEFT model
        model = get_peft_model(model, lora_config)
        model.print_trainable_parameters()

        return model

    def preprocess_function(self, examples):
        """Tokenize a batch of examples (reads ``examples["text"]``).

        Returns:
            The tokenizer output, truncated and padded to ``_MAX_LENGTH`` tokens.
        """
        return self.tokenizer(
            examples["text"],
            truncation=True,
            max_length=self._MAX_LENGTH,
            padding="max_length"
        )

    def _tokenize_dataset(self, dataset):
        """Tokenize ``dataset`` and drop raw columns while keeping ``label``.

        The ``label`` column must survive the map step: it is the only source
        of targets for the Trainer's loss and for ``compute_metrics``.
        """
        return dataset.map(
            self.preprocess_function,
            batched=True,
            remove_columns=self._non_label_columns(dataset.column_names)
        )

    def compute_metrics(self, eval_pred):
        """Compute accuracy, macro/weighted F1 and per-intent F1.

        Args:
            eval_pred: ``(predictions, labels)`` pair as passed by ``Trainer``.

        Returns:
            Dict with ``accuracy``, ``macro_f1``, ``weighted_f1`` and one
            ``f1_<intent>`` entry per known intent.
        """
        logits, labels = eval_pred
        if isinstance(logits, tuple):
            # Some models return extra outputs next to the logits.
            logits = logits[0]
        predicted_ids = np.argmax(logits, axis=1)

        class_names = [str(name) for name in self.label_encoder.classes_]

        # Pin the label set so the report has one row per known intent even
        # if a class is absent from this batch; zero_division=0 scores such
        # classes as 0 instead of emitting warnings.
        report = classification_report(
            labels,
            predicted_ids,
            labels=np.arange(len(class_names)),
            target_names=class_names,
            output_dict=True,
            zero_division=0
        )

        metrics = {
            'accuracy': float(accuracy_score(labels, predicted_ids)),
            'macro_f1': report['macro avg']['f1-score'],
            'weighted_f1': report['weighted avg']['f1-score']
        }

        # Add per-intent f1 scores
        for intent in class_names:
            metrics[f'f1_{intent}'] = report[intent]['f1-score']

        return metrics

    def setup_training_args(self):
        """Build the ``TrainingArguments`` used by ``train``.

        Evaluates and checkpoints every 50 steps and restores the checkpoint
        with the best ``weighted_f1`` at the end of training.
        """
        import inspect

        # Newer transformers releases renamed `evaluation_strategy` to
        # `eval_strategy`; use whichever keyword the installed version accepts.
        accepted = inspect.signature(TrainingArguments.__init__).parameters
        strategy_key = (
            "eval_strategy" if "eval_strategy" in accepted else "evaluation_strategy"
        )

        return TrainingArguments(
            output_dir=self.output_dir,
            learning_rate=2e-4,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            num_train_epochs=10,
            weight_decay=0.01,
            eval_steps=50,
            save_strategy="steps",
            save_steps=50,
            load_best_model_at_end=True,
            metric_for_best_model="weighted_f1",
            push_to_hub=False,
            gradient_accumulation_steps=2,
            fp16=True,
            logging_steps=10,
            warmup_ratio=0.1,
            lr_scheduler_type="cosine",
            report_to="tensorboard",
            **{strategy_key: "steps"}
        )

    def train(self, data_path):
        """Run the full pipeline: data prep, fine-tuning, test evaluation, saving.

        Args:
            data_path: Path to a CSV file containing 'text' and 'intent' columns.

        Returns:
            The metrics dict produced by ``trainer.evaluate`` on the test split
            (also written to ``test_results.json`` in ``self.output_dir``).
        """
        # Create output directory
        os.makedirs(self.output_dir, exist_ok=True)

        # Any previously cached inference model is stale once we retrain.
        self._inference_model = None

        # Prepare datasets
        train_dataset, val_dataset, test_dataset = self.prepare_intent_dataset(
            data_path)

        # Setup model (also loads the tokenizer needed for preprocessing)
        model = self.setup_intent_model()

        # Tokenize each split, keeping the `label` column for the Trainer.
        train_dataset = self._tokenize_dataset(train_dataset)
        val_dataset = self._tokenize_dataset(val_dataset)
        test_dataset = self._tokenize_dataset(test_dataset)

        # Setup training arguments
        training_args = self.setup_training_args()

        # Initialize trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=self.compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
        )

        # Train model
        trainer.train()

        # Evaluate on test set
        test_results = trainer.evaluate(test_dataset)

        # Save results
        with open(os.path.join(self.output_dir, 'test_results.json'), 'w') as f:
            json.dump(test_results, f, indent=4)

        # Save model and tokenizer
        trainer.save_model(self.output_dir)
        self.tokenizer.save_pretrained(self.output_dir)

        # Keep the trained model so predict_intent() does not have to reload
        # anything from disk.
        self._inference_model = trainer.model

        return test_results

    def _ensure_inference_ready(self):
        """Make sure labels, tokenizer and model are available for prediction.

        Anything not already in memory is restored from ``self.output_dir``:
        the label list from ``intent_labels.json``, the tokenizer from the
        saved tokenizer files, and the model as base model + saved adapter.
        """
        if self.num_labels is None:
            labels_path = os.path.join(self.output_dir, 'intent_labels.json')
            with open(labels_path) as f:
                labels = json.load(f)['labels']
            # Restoring `classes_` is enough for inverse_transform() to work.
            self.label_encoder.classes_ = np.array(labels)
            self.num_labels = len(labels)

        if self.tokenizer is None:
            self.tokenizer = AutoTokenizer.from_pretrained(self.output_dir)

        if self._inference_model is None:
            # Local import: only needed on the cold-load path.
            from peft import PeftModel

            # output_dir holds a LoRA adapter, not full model weights, so the
            # base model is rebuilt with the right head size and the adapter
            # is attached on top.
            base_model = AutoModelForSequenceClassification.from_pretrained(
                self.model_name,
                num_labels=self.num_labels
            )
            self._inference_model = PeftModel.from_pretrained(
                base_model, self.output_dir)

        # Disable dropout for deterministic predictions.
        self._inference_model.eval()

    def predict_intent(self, text):
        """Predict the intent of ``text``.

        Uses the model trained by ``train`` when available; otherwise loads
        the saved artifacts from ``self.output_dir`` once and caches them.

        Args:
            text: Input utterance.

        Returns:
            Dict with ``intent`` (predicted label), ``confidence`` (its
            probability) and ``probabilities`` (probability per intent).
        """
        self._ensure_inference_ready()
        model = self._inference_model

        # Tokenize input
        inputs = self.tokenizer(
            text,
            truncation=True,
            max_length=self._MAX_LENGTH,
            padding="max_length",
            return_tensors="pt"
        ).to(model.device)

        # Get prediction
        with torch.no_grad():
            outputs = model(**inputs)
            # Softmax in float32 regardless of the model's compute dtype.
            predictions = torch.softmax(outputs.logits.float(), dim=1)
            predicted_class = torch.argmax(predictions, dim=1).item()

        # Get intent label and confidence
        predicted_intent = self.label_encoder.inverse_transform([predicted_class])[
            0]
        confidence = predictions[0][predicted_class].item()

        # Get probabilities for all intents
        intent_probabilities = {
            intent: predictions[0][i].item()
            for i, intent in enumerate(self.label_encoder.classes_)
        }

        return {
            'intent': predicted_intent,
            'confidence': confidence,
            'probabilities': intent_probabilities
        }

# Example usage


def main():
    """Train an intent classifier on the example CSV and run one prediction."""
    # Input CSV must provide 'text' and 'intent' columns.
    dataset_csv = "path/to/your/intent_dataset.csv"
    artifacts_dir = "intent_classifier_output"

    # Fine-tune and report held-out test metrics.
    clf = IntentClassifier(output_dir=artifacts_dir)
    metrics = clf.train(dataset_csv)
    print(f"Test results: {metrics}")

    # Smoke-test inference on a sample utterance.
    sample = "What is the weather like today?"
    prediction = clf.predict_intent(sample)
    print(f"Predicted intent: {prediction}")


# Run the example only when executed as a script, not on import.
if __name__ == "__main__":
    main()