In [None]:
%pip install evaluate huggingface_hub wandb dotenv

In [None]:
import os
import sys
import wandb
import numpy as np
import huggingface_hub
import subprocess
import logging
import json
import pandas as pd
import evaluate
from datasets import Dataset
import transformers
from transformers import (
    RobertaConfig,
    AutoTokenizer,
    RobertaForSequenceClassification,
    Trainer,
    TrainingArguments
)
from dotenv import load_dotenv
import torch
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

def _init_logger():
    logger = logging.getLogger("model_building")
    logger.setLevel(logging.DEBUG)

    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setLevel(logging.DEBUG)

    file_handler = logging.FileHandler("errors.log")
    file_handler.setLevel(logging.ERROR)

    formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    console_handler.setFormatter(formatter)
    file_handler.setFormatter(formatter)

    logger.addHandler(console_handler)
    logger.addHandler(file_handler)

def get_tokens():
    """Retrieve the tokens and W&B names this script needs from the environment.

    Reads ``HF_TOKEN``, ``WANDB_API_KEY``, ``WANDB_PROJECT``,
    ``WANDB_RUN_NAME`` and ``ARTIFACT_NAME``. ``os.getenv`` returns ``None``
    for an unset variable instead of raising, so unset or empty variables are
    reported with a warning rather than being logged as a success.

    Returns:
        tuple: ``(HF_TOKEN, wandb_api_key, project_name, run_name,
        artifact_name)``. An entry is ``None`` when its variable is not set.

    Raises:
        Exception: Any unexpected error is logged and re-raised.
    """
    # Order matters: it defines the order of the returned tuple.
    env_names = (
        "HF_TOKEN",
        "WANDB_API_KEY",
        "WANDB_PROJECT",
        "WANDB_RUN_NAME",
        "ARTIFACT_NAME",
    )
    try:
        values = {name: os.getenv(name) for name in env_names}

        missing = [name for name, value in values.items() if not value]
        if missing:
            _logger.warning(
                "Missing or empty environment variables: %s", ", ".join(missing)
            )
        else:
            _logger.debug("Successfully retrieved all the needed tokens.")

        return tuple(values[name] for name in env_names)

    except Exception as e:
        _logger.error("An unexpected error occurred while retrieving the tokens: %s", e)
        raise

def login(HF_TOKEN, wandb_api_key):
    """Authenticate with the Hugging Face Hub and with Weights & Biases.

    Args:
        HF_TOKEN: Token passed to ``huggingface_hub.login``.
        wandb_api_key: API key passed to ``wandb.login``.

    Raises:
        Exception: Any failure is logged and then re-raised.
    """
    try:
        # Hugging Face first, then mark Hub telemetry as disabled.
        huggingface_hub.login(token=HF_TOKEN)
        os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"

        # Weights & Biases.
        wandb.login(key=wandb_api_key)
    except Exception as err:
        _logger.error("An unexpected error occurred: %s", err)
        raise
    else:
        _logger.debug("Successfully logged in to Hugging Face and W&B.")

def get_config(artifact_name, project_name, run_name):
    """Download ``config.json`` from a W&B model artifact and parse it.

    Starts a W&B run for ``project_name`` / ``run_name``, declares the
    artifact as an input of that run, downloads only ``config.json`` into
    ``src/model/config`` and loads it as JSON.

    Args:
        artifact_name: Name of the W&B artifact of type ``"model"``.
        project_name: W&B project in which the run is created.
        run_name: Display name of the W&B run.

    Returns:
        The parsed content of ``config.json``.

    Raises:
        Exception: Any failure is logged and then re-raised.
    """
    try:
        wandb_run = wandb.init(project=project_name, name=run_name)
        model_artifact = wandb_run.use_artifact(artifact_name, type="model")

        # path_prefix restricts the download to the configuration file.
        model_artifact.download(root="src/model/config", path_prefix="config.json")

        with open("src/model/config/config.json", "r") as config_file:
            config_dict = json.load(config_file)
    except Exception as err:
        _logger.error("An unexpected error occurred while retrieving model configuration: %s", err)
        raise

    _logger.debug("Successfully retrieved model configuration.")
    return config_dict



def load_data(file_path):
    """Read a CSV file and return it as a ``datasets.Dataset``.

    Missing values in every column are replaced with the empty string before
    the frame is converted.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The ``Dataset`` built from the CSV content.

    Raises:
        pandas.errors.ParserError: The file could not be parsed as CSV.
        Exception: Any other failure; all errors are logged and re-raised.
    """
    try:
        frame = pd.read_csv(file_path).fillna('')  # Fill any NaN values
        dataset = Dataset.from_pandas(frame)
    except pd.errors.ParserError as err:
        _logger.error('Failed to parse the CSV file: %s', err)
        raise
    except Exception as err:
        _logger.error('An unexpected error occurred while loading the data: %s', err)
        raise
    else:
        _logger.debug('Data loaded from %s', file_path)
        return dataset

# Metric objects keyed by name, so each one is loaded a single time per process.
_METRIC_CACHE = {}


def _get_metric(name):
    """Return the ``evaluate`` metric called *name*, loading it on first use only."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = evaluate.load(name)
    return _METRIC_CACHE[name]


def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    The metric objects are cached instead of being reloaded with
    ``evaluate.load`` on every call, because this function is invoked once per
    evaluation.

    Args:
        eval_pred: A pair that unpacks to ``(logits, labels)``. The predicted
            class is the argmax of ``logits`` over its last axis.

    Returns:
        dict: ``{"accuracy": ..., "f1": ...}`` where ``f1`` is macro-averaged.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    accuracy = _get_metric("accuracy").compute(
        predictions=predictions, references=labels
    )["accuracy"]
    f1 = _get_metric("f1").compute(
        predictions=predictions, references=labels, average="macro"
    )["f1"]

    return {"accuracy": accuracy, "f1": f1}

def save_model(trainer, file_path="roberta-model"):
    """Save the trained model to disk and log it to W&B as a model artifact.

    Args:
        trainer: Object exposing ``save_model(output_dir)``; here the
            ``Trainer`` that holds the fine-tuned model.
        file_path: Directory the model is written to and that is then added to
            the artifact. Previously this argument was ignored and
            ``"roberta-model"`` was always used; the default keeps that path.
    """
    trainer.save_model(file_path)

    # The artifact keeps a fixed name regardless of the local directory.
    artifact_name = "roberta-model"
    artifact = wandb.Artifact(artifact_name, type='model')

    artifact.add_dir(file_path)
    wandb.log_artifact(artifact)

    print(f"Model saved as artifact: {artifact_name}")
def _configure_cuda():
    """Free cached GPU memory and allow reduced-precision float32 matmuls."""
    if not torch.cuda.is_available():
        return

    torch.cuda.empty_cache()
    # TF32 and the 'medium' matmul precision trade float32 accuracy for
    # throughput; they do not reduce memory usage.
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    torch.set_float32_matmul_precision('medium')


def _log_test_results(trainer, encoded_eval):
    """Predict on *encoded_eval* and log the report and confusion matrix to W&B."""
    preds_output = trainer.predict(encoded_eval)
    preds = np.argmax(preds_output.predictions, axis=1)
    # Assumes the dataset carries labels, so label_ids is populated.
    labels = preds_output.label_ids

    # Classification report
    report = classification_report(labels, preds, output_dict=True)
    print(report)

    # Use every class seen in either the truth or the predictions, so that the
    # tick labels always match the rows and columns of the matrix.
    class_ids = sorted(set(labels) | set(preds))
    class_names = [str(c) for c in class_ids]
    cm = confusion_matrix(labels, preds, labels=class_ids)

    # Plot and log
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
        ax=ax,
    )
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    ax.set_title("Confusion Matrix")
    wandb.log({"confusion_matrix": wandb.Image(fig)})
    plt.show()
    plt.close(fig)  # release the figure once it has been logged and shown

    # Log metrics; scalar entries of the report (not dicts) are skipped.
    for label, metrics in report.items():
        if isinstance(metrics, dict):
            wandb.log({
                f"test_{label}_precision": metrics['precision'],
                f"test_{label}_recall": metrics['recall'],
                f"test_{label}_f1": metrics['f1-score']
            })


def main():
    """Fine-tune ``roberta-base`` as a 3-class classifier and log results to W&B.

    Steps: load environment variables, configure CUDA, load the train and test
    CSV files from fixed Kaggle input paths, log in to Hugging Face and W&B,
    fetch the training configuration from a W&B artifact, tokenize the
    ``text`` column, train for 4 epochs with per-epoch evaluation, log a
    classification report and confusion matrix, then save the model as a W&B
    artifact.

    Any exception raised by the pipeline is logged with its traceback, printed
    and not re-raised. The active W&B run, if one was started, is always
    finished, with a non-zero exit code when the pipeline failed.
    """
    # Load environment variables
    load_dotenv()

    _configure_cuda()

    exit_code = 0
    try:
        # Load training set
        train_dataset = load_data("/kaggle/input/youtube-train-csv/train.csv")

        # Retrieve necessary tokens and names
        HF_TOKEN, wandb_api_key, project_name, run_name, artifact_name = get_tokens()

        login(HF_TOKEN, wandb_api_key)

        # Retrieve model configuration
        config_dict = get_config(artifact_name, project_name, run_name)

        # Initialize model
        _logger.debug("Starting initialization")
        model = RobertaForSequenceClassification.from_pretrained(
            "roberta-base",
            num_labels=3
        )
        _logger.debug("Model has been initialized.")

        # Initialize tokenizer
        tokenizer = AutoTokenizer.from_pretrained("roberta-base")
        _logger.debug("Tokenizer has been initialized.")

        # Every example is truncated and padded to exactly 512 tokens.
        def tokenize_function(examples):
            return tokenizer(
                examples["text"],
                truncation=True,
                padding="max_length",
                max_length=512,
            )

        # batched=True hands the tokenizer lists of texts instead of one row at
        # a time; the encoded columns are the same.
        encoded = train_dataset.map(tokenize_function, batched=True)
        _logger.debug("Dataset has been tokenized.")

        # Get parameters from config, with fallbacks when a key is absent
        lr = config_dict.get("learning_rate", 5e-5)
        batch_size = config_dict.get("per_device_train_batch_size", 32)

        # Add evaluation dataset
        eval_dataset = load_data("/kaggle/input/youtube-train-csv/test.csv")
        encoded_eval = eval_dataset.map(tokenize_function, batched=True)

        training_args = TrainingArguments(
            output_dir="roberta_model",
            num_train_epochs=4,
            per_device_train_batch_size=batch_size,
            learning_rate=lr,
            logging_steps=10,
            save_strategy="epoch",
            # Mixed precision needs a CUDA device; fall back to full precision
            # instead of failing when none is available.
            fp16=torch.cuda.is_available(),
            dataloader_pin_memory=False,
            remove_unused_columns=True,
            dataloader_num_workers=0,
            save_total_limit=1,
            optim="adamw_torch",
            eval_strategy="epoch",
            metric_for_best_model="f1",
            load_best_model_at_end=True,
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=encoded,
            eval_dataset=encoded_eval,
            compute_metrics=compute_metrics
        )

        # Train and evaluate model
        trainer.train()
        _logger.debug("Model has been trained.")

        # Predict on the evaluation split and log the results
        _log_test_results(trainer, encoded_eval)

        # Save model
        save_model(trainer)

    except Exception as e:
        exit_code = 1
        # Clear GPU cache in case of error
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        # .exception keeps the traceback, which would otherwise be lost because
        # the error is not re-raised.
        _logger.exception("Failed to complete model building: %s", e)
        print(f"Error: {e}")

    finally:
        # Close the W&B run so it is not left open, e.g. in a notebook session.
        if wandb.run is not None:
            wandb.finish(exit_code=exit_code)

# Configure the "model_building" logger at import time, then bind it to the
# module-level name `_logger` that the functions in this file log through.
_init_logger()
_logger = logging.getLogger("model_building")

# Run the pipeline only when this file is the entry point, not when imported.
if __name__ == "__main__":
    main()