# Main Implementation

In [None]:
import os
import torch
from torch import nn
from transformers import AutoTokenizer, BertModel, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
import re
import warnings
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_curve, average_precision_score
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

warnings.filterwarnings('ignore')

def clean_text(text):
    """Return ``text`` as a string with whitespace runs collapsed and ends trimmed.

    Any input is first converted with ``str()``; every run of whitespace
    characters is then replaced by a single space and leading/trailing
    whitespace is removed.
    """
    collapsed = re.sub(r'\s+', ' ', str(text))
    return collapsed.strip()

def clean_title(title):
    """Clean an incident title, falling back to the original when nothing is left.

    Args:
        title: The raw title. Any type is accepted; it is converted with
            ``str()`` before processing, so missing values read from a CSV
            (``NaN``/``None``) no longer raise ``AttributeError``.

    Returns:
        The cleaned title as a string, or the stringified original title if
        cleaning leaves only whitespace.
    """
    original_title = str(title)
    # Work on the string form from here on so `.strip()` below is always valid.
    title = original_title

    # NOTE(review): the cleaning steps ("Step 1" to "Step 9") referenced by the
    # original placeholder comments are not present in this file; they belong
    # here and should transform `title` in place.

    # If cleaning results in an empty title, return the original
    if title.strip() == '':
        return original_title
    return title

class SingleLabelDataset(Dataset):
    """Map-style dataset that tokenizes one text per item.

    Each item is a dict holding the flattened ``input_ids`` and
    ``attention_mask`` tensors returned by the tokenizer, plus a ``labels``
    tensor (dtype ``torch.long``) when labels were supplied.
    """

    def __init__(self, texts, labels=None, tokenizer=None, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Texts are stringified so non-string entries are still tokenizable.
        encoded = self.tokenizer(
            str(self.texts[idx]),
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # The tokenizer output is flattened to 1-D tensors per item.
        sample = {
            key: encoded[key].flatten()
            for key in ('input_ids', 'attention_mask')
        }

        # Unlabelled datasets (inference) simply omit the 'labels' entry.
        if self.labels is None:
            return sample

        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

class EnhancedClassifier(nn.Module):
    """BERT encoder followed by dropout and a single linear classification head."""

    def __init__(self, model_name, num_labels, dropout_rate=0.3):
        super().__init__()
        self.base_model = BertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout_rate)
        # The head's input width is read from the encoder's own config.
        self.classifier = nn.Linear(
            self.base_model.config.hidden_size, num_labels
        )

    def forward(self, input_ids, attention_mask):
        """Return the classification logits for a batch of token ids."""
        encoded = self.base_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True,
        )
        # Classify from the encoder's pooled output, with dropout applied first.
        return self.classifier(self.dropout(encoded.pooler_output))

def compute_class_weights(labels, num_classes=None):
    """Compute smoothed inverse-frequency class weights.

    The weight of class ``c`` is
    ``total / (n_classes * (count_c + smoothing_factor * total))`` with
    ``smoothing_factor = 0.1``, so absent or very rare classes get a large
    but finite weight.

    Args:
        labels: 1-D array-like of non-negative integer class indices.
        num_classes: Optional total number of classes. When given, the
            returned vector has at least this many entries even if the
            highest-index classes do not occur in ``labels`` (e.g. after a
            non-stratified split), so it can match the classifier's output
            size. When omitted, the length is ``max(labels) + 1`` as before.

    Returns:
        A 1-D ``torch.float32`` tensor of per-class weights.
    """
    min_length = 0 if num_classes is None else num_classes
    class_counts = np.bincount(labels, minlength=min_length)
    total = len(labels)
    smoothing_factor = 0.1
    class_weights = total / (
        len(class_counts) * (class_counts + smoothing_factor * total))
    # torch.tensor with an explicit dtype replaces the legacy FloatTensor ctor.
    return torch.tensor(class_weights, dtype=torch.float32)

class EarlyStopping:
    """Signal when a higher-is-better validation score has stopped improving.

    Call the instance once per epoch with the validation score. After
    ``patience`` consecutive calls whose score is below
    ``best_score + min_delta``, ``early_stop`` becomes ``True``.
    """

    def __init__(self, patience=3, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_score = None
        self.early_stop = False

    def __call__(self, val_score):
        # The first score observed simply becomes the baseline.
        if self.best_score is None:
            self.best_score = val_score
            return

        threshold = self.best_score + self.min_delta
        if val_score < threshold:
            # No sufficient improvement: spend one unit of patience.
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
            return

        # Score reached the threshold: new baseline, patience restored.
        self.best_score = val_score
        self.counter = 0


class TrainingMetrics:
    """Accumulate per-epoch training/validation metrics and persist them.

    One entry is appended to each history list per call to ``update``; the
    best validation F1 seen so far and its zero-based epoch index are tracked
    alongside.
    """

    def __init__(self):
        self.train_losses = []
        self.val_losses = []
        self.train_f1s = []
        self.val_f1s = []
        self.timestamps = []
        self.epoch_times = []
        self.best_val_f1 = 0
        self.best_epoch = 0

    def update(self, train_loss, val_loss, train_f1, val_f1, epoch_time=None):
        """Record the metrics of one finished epoch.

        Args:
            train_loss: Mean training loss of the epoch.
            val_loss: Mean validation loss of the epoch.
            train_f1: Training F1 score of the epoch.
            val_f1: Validation F1 score of the epoch.
            epoch_time: Optional epoch duration in seconds. ``None`` means
                "not measured"; ``0`` is a valid duration and is recorded.
        """
        self.train_losses.append(train_loss)
        self.val_losses.append(val_loss)
        self.train_f1s.append(train_f1)
        self.val_f1s.append(val_f1)
        self.timestamps.append(datetime.now())
        # Compare against None rather than truthiness so that a 0.0 duration
        # is not silently dropped (which would misalign the histories).
        if epoch_time is not None:
            self.epoch_times.append(epoch_time)

        # Track best performance
        if val_f1 > self.best_val_f1:
            self.best_val_f1 = val_f1
            self.best_epoch = len(self.train_losses) - 1

    def save_metrics(self, task_name, output_dir):
        """Save metrics in both CSV and pickle formats.

        Writes ``<name>_metrics.csv``, ``<name>_summary.csv`` and
        ``<name>_metrics.pkl`` into ``<output_dir>/metrics``, where ``<name>``
        is ``sanitize_task_name(task_name)``. ``<output_dir>/plots`` is
        created as well.

        Args:
            task_name: Human-readable task name used for the file names.
            output_dir: Base directory for the ``metrics``/``plots`` folders.
        """
        metrics_dir = os.path.join(output_dir, 'metrics')
        plots_dir = os.path.join(output_dir, 'plots')

        os.makedirs(metrics_dir, exist_ok=True)
        os.makedirs(plots_dir, exist_ok=True)

        # Sanitize task name for filename
        safe_task_name = sanitize_task_name(task_name)

        num_epochs = len(self.train_losses)
        # Durations can only be attributed to epochs when every epoch has
        # one; otherwise leave the column empty instead of letting pandas
        # raise on columns of unequal length.
        if len(self.epoch_times) == num_epochs:
            epoch_time_column = self.epoch_times
        else:
            epoch_time_column = [None] * num_epochs

        # Save as CSV for easy viewing/plotting in other tools
        metrics_df = pd.DataFrame({
            'epoch': range(1, num_epochs + 1),
            'timestamp': self.timestamps,
            'train_loss': self.train_losses,
            'val_loss': self.val_losses,
            'train_f1': self.train_f1s,
            'val_f1': self.val_f1s,
            'epoch_time': epoch_time_column
        })

        # Add best model information (best_epoch is reported one-based)
        summary_df = pd.DataFrame({
            'metric': ['best_val_f1', 'best_epoch', 'total_epochs', 'total_training_time'],
            'value': [
                self.best_val_f1,
                self.best_epoch + 1,
                num_epochs,
                sum(self.epoch_times) if self.epoch_times else None
            ]
        })

        # Save detailed metrics
        metrics_filename = f'{safe_task_name}_metrics.csv'
        csv_path = os.path.join(metrics_dir, metrics_filename)
        metrics_df.to_csv(csv_path, index=False)

        # Save summary
        summary_filename = f'{safe_task_name}_summary.csv'
        summary_path = os.path.join(metrics_dir, summary_filename)
        summary_df.to_csv(summary_path, index=False)

        # Save as pickle for easier loading in Python
        pickle_filename = f'{safe_task_name}_metrics.pkl'
        pickle_path = os.path.join(metrics_dir, pickle_filename)
        with open(pickle_path, 'wb') as f:
            pickle.dump({
                'task_name': task_name,
                'train_losses': self.train_losses,
                'val_losses': self.val_losses,
                'train_f1s': self.train_f1s,
                'val_f1s': self.val_f1s,
                'timestamps': self.timestamps,
                'epoch_times': self.epoch_times,
                'best_val_f1': self.best_val_f1,
                'best_epoch': self.best_epoch
            }, f)

        print(f"\nMetrics saved for {task_name}:")
        print(f"- Detailed metrics: {metrics_filename}")
        print(f"- Summary metrics: {summary_filename}")
        print(f"- Pickle format: {pickle_filename}")


def load_metrics(task_name, output_dir):
    """Load the metrics dictionary previously pickled for ``task_name``.

    ``TrainingMetrics.save_metrics`` writes the pickle to
    ``<output_dir>/metrics/<sanitize_task_name(task_name)>_metrics.pkl``, so
    that location is tried first. For backward compatibility the file is also
    looked up directly inside ``output_dir``, with both the sanitized name and
    the older "lowercase + underscores for spaces" name.

    Args:
        task_name: Human-readable task name the metrics were saved under.
        output_dir: Base directory that was passed to ``save_metrics`` (or a
            directory that directly contains the pickle file).

    Returns:
        The unpickled metrics object.

    Raises:
        FileNotFoundError: If no candidate file exists.
    """
    safe_name = sanitize_task_name(task_name)
    legacy_name = task_name.lower().replace(" ", "_")
    candidates = [
        os.path.join(output_dir, 'metrics', f'{safe_name}_metrics.pkl'),
        os.path.join(output_dir, f'{safe_name}_metrics.pkl'),
        os.path.join(output_dir, f'{legacy_name}_metrics.pkl'),
    ]
    # Fall back to the primary location so that a missing file surfaces as
    # a FileNotFoundError naming the path save_metrics would have written.
    pickle_path = next((p for p in candidates if os.path.exists(p)), candidates[0])

    # NOTE(security): pickle.load can execute arbitrary code; only load
    # metrics files produced by this pipeline.
    with open(pickle_path, 'rb') as f:
        return pickle.load(f)


def sanitize_task_name(task_name):
    """Convert task name to a safe filename format.

    Lowercases the name and replaces spaces, slashes and hyphens with
    underscores.
    """
    return task_name.lower().replace(" ", "_").replace("/", "_").replace("-", "_")


def plot_training_curves(metrics, task_name, output_dir):
    """Write the loss and F1 learning-curve PNGs for ``task_name``.

    Two figures are saved into ``output_dir`` (created if missing):
    ``<stem>_loss.png`` and ``<stem>_f1.png``, where ``<stem>`` is the
    lower-cased task name with spaces replaced by underscores.

    Args:
        metrics: Object exposing ``train_losses``, ``val_losses``,
            ``train_f1s`` and ``val_f1s`` sequences.
        task_name: Task name used in the titles and file names.
        output_dir: Directory the images are written to.
    """
    os.makedirs(output_dir, exist_ok=True)
    file_stem = task_name.lower().replace(" ", "_")

    def draw(train_series, val_series, train_label, val_label,
             title, y_label, filename):
        # One figure per metric: training and validation curves together.
        plt.figure(figsize=(10, 5))
        plt.plot(train_series, label=train_label)
        plt.plot(val_series, label=val_label)
        plt.title(title)
        plt.xlabel('Epoch')
        plt.ylabel(y_label)
        plt.legend()
        plt.grid(True)
        plt.savefig(os.path.join(output_dir, filename))
        plt.close()

    draw(metrics.train_losses, metrics.val_losses,
         'Training Loss', 'Validation Loss',
         f'{task_name} - Training and Validation Loss',
         'Loss', f'{file_stem}_loss.png')
    draw(metrics.train_f1s, metrics.val_f1s,
         'Training F1', 'Validation F1',
         f'{task_name} - Training and Validation F1 Scores',
         'F1 Score', f'{file_stem}_f1.png')

def plot_confusion_matrix(y_true, y_pred, classes, task_name, output_dir):
    """Save an annotated confusion-matrix heatmap as a PNG.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        classes: Tick labels used on both heatmap axes.
        task_name: Task name used in the title and the file name.
        output_dir: Directory the image is written to.
    """
    matrix = confusion_matrix(y_true, y_pred)
    file_stem = task_name.lower().replace(" ", "_")
    target_path = os.path.join(output_dir, f'{file_stem}_confusion_matrix.png')

    plt.figure(figsize=(15, 15))
    sns.heatmap(
        matrix,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=classes,
        yticklabels=classes,
    )
    plt.title(f'{task_name} - Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    # Rotate the tick labels on both axes.
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=45)
    plt.tight_layout()
    plt.savefig(target_path)
    plt.close()

def plot_class_distribution(labels, classes, task_name, output_dir):
    """Save a horizontal count plot of ``labels``, most frequent class first.

    Args:
        labels: Label values to count.
        classes: Accepted for interface symmetry; not used by the plot.
        task_name: Task name used in the title and the file name.
        output_dir: Directory the image is written to.
    """
    # Bars are ordered by descending frequency.
    by_frequency = pd.Series(labels).value_counts().index
    file_stem = task_name.lower().replace(" ", "_")

    plt.figure(figsize=(15, 5))
    sns.countplot(y=labels, order=by_frequency)
    plt.title(f'{task_name} - Class Distribution')
    plt.xlabel('Count')
    plt.ylabel('Class')
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, f'{file_stem}_class_distribution.png'))
    plt.close()

def plot_precision_recall_curve(y_true, y_pred_proba, task_name, output_dir):
    """Save one-vs-rest precision-recall curves for every class in one PNG.

    Args:
        y_true: Array of integer class indices (compared with ``==`` against
            each column index).
        y_pred_proba: 2-D array of scores; column ``i`` holds the scores for
            class ``i``.
        task_name: Task name used in the title and the file name.
        output_dir: Directory the image is written to.
    """
    plt.figure(figsize=(10, 8))

    for class_idx in range(y_pred_proba.shape[1]):
        # Binarise the problem: current class vs. everything else.
        is_class = (y_true == class_idx).astype(int)
        class_scores = y_pred_proba[:, class_idx]

        precision, recall, _ = precision_recall_curve(is_class, class_scores)
        avg_precision = average_precision_score(is_class, class_scores)
        plt.plot(recall, precision, lw=2,
                 label=f'Class {class_idx} (AP = {avg_precision:.2f})')

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title(f'{task_name} - Precision-Recall Curves')
    plt.legend(loc='best', bbox_to_anchor=(1.05, 1))
    plt.grid(True)
    plt.tight_layout()

    file_stem = task_name.lower().replace(" ", "_")
    plt.savefig(os.path.join(output_dir, f'{file_stem}_pr_curve.png'))
    plt.close()


# Add a utility function to plot metrics from saved files
def plot_saved_metrics(task_name, output_dir='training_plots'):
    """Re-create the loss/F1 curves from a saved metrics file and print a summary.

    The metrics dictionary is obtained via ``load_metrics(task_name,
    output_dir)``; a side-by-side figure is written to
    ``<output_dir>/<stem>_metrics_plot.png``.

    Args:
        task_name: Task name the metrics were saved under.
        output_dir: Directory passed to ``load_metrics`` and used for the
            output image.
    """
    saved = load_metrics(task_name, output_dir)

    # Left panel: losses. Right panel: F1 scores.
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))
    panels = (
        ('train_losses', 'val_losses', 'Training Loss', 'Validation Loss',
         f'{task_name} - Training and Validation Loss', 'Loss'),
        ('train_f1s', 'val_f1s', 'Training F1', 'Validation F1',
         f'{task_name} - Training and Validation F1 Scores', 'F1 Score'),
    )
    for ax, (train_key, val_key, train_label, val_label, title, y_label) in zip(axes, panels):
        ax.plot(saved[train_key], label=train_label)
        ax.plot(saved[val_key], label=val_label)
        ax.set_title(title)
        ax.set_xlabel('Epoch')
        ax.set_ylabel(y_label)
        ax.legend()
        ax.grid(True)

    plt.tight_layout()
    file_stem = task_name.lower().replace(" ", "_")
    plt.savefig(os.path.join(output_dir, f'{file_stem}_metrics_plot.png'))
    plt.close()

    # Console summary; best_epoch is stored zero-based, shown one-based.
    print(f"\nTraining Summary for {task_name}:")
    print(f"Best Validation F1: {saved['best_val_f1']:.4f} (Epoch {saved['best_epoch'] + 1})")
    recorded_times = saved['epoch_times']
    if recorded_times:
        avg_epoch_time = sum(recorded_times) / len(recorded_times)
        print(f"Average Epoch Time: {avg_epoch_time:.2f}s")


def train_model(model, train_loader, val_loader, class_weights,
                task_name, num_epochs=10, learning_rate=2e-5,
                device='cuda', patience=3, gradient_accumulation_steps=1,
                output_dir='training_results'):
    """Fine-tune ``model`` and return the weights of its best epoch.

    Trains with AdamW, a linear warmup/decay schedule (10% warmup),
    class-weighted cross-entropy, gradient clipping at norm 1.0 and, on CUDA
    devices, automatic mixed precision. After every epoch the model is
    evaluated on ``val_loader``; macro-F1 drives both best-model selection
    and early stopping. Learning curves are re-plotted each epoch, the
    precision-recall plot is produced on the final epoch (including an
    early-stopped one), and metrics are saved once at the end.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns logits.
        train_loader: Sized iterable of batches with ``input_ids``,
            ``attention_mask`` and ``labels`` entries.
        val_loader: Same batch format, used for validation.
        class_weights: 1-D tensor of per-class loss weights.
        task_name: Name used in progress output and output file names.
        num_epochs: Maximum number of epochs.
        learning_rate: AdamW learning rate.
        device: Device (string or ``torch.device``) to train on.
        patience: Epochs without validation-F1 improvement before stopping.
        gradient_accumulation_steps: Batches accumulated per optimizer step.
        output_dir: Base directory for the ``plots`` and ``metrics`` folders.

    Returns:
        A deep-copied ``state_dict`` from the epoch with the highest
        validation macro-F1, or ``None`` if no epoch was run.
    """
    import copy

    print(f"\nStarting training for: {task_name}")

    model = model.to(device)
    metrics = TrainingMetrics()

    plots_dir = os.path.join(output_dir, 'plots')
    metrics_dir = os.path.join(output_dir, 'metrics')
    os.makedirs(plots_dir, exist_ok=True)
    os.makedirs(metrics_dir, exist_ok=True)

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate,
                                  weight_decay=0.01)

    num_batches = len(train_loader)
    # Ceil division: a trailing partial accumulation group also triggers an
    # optimizer step (see the flush condition in the training loop).
    updates_per_epoch = -(-num_batches // gradient_accumulation_steps)
    total_steps = updates_per_epoch * num_epochs

    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(0.1 * total_steps),
        num_training_steps=total_steps
    )

    criterion = nn.CrossEntropyLoss(weight=class_weights.to(device))
    early_stopping = EarlyStopping(patience=patience)

    # Mixed precision only makes sense (and only works) on CUDA; on any other
    # device autocast and the scaler become transparent no-ops.
    use_amp = torch.device(device).type == 'cuda'
    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

    # -inf so that the first evaluated epoch always produces a snapshot.
    best_val_f1 = float('-inf')
    best_state = None

    for epoch in range(num_epochs):
        epoch_start_time = datetime.now()
        model.train()
        optimizer.zero_grad()
        train_loss = 0
        train_preds = []
        train_labels = []

        for step, batch in enumerate(tqdm(train_loader,
                                          desc=f'{task_name} Epoch {epoch + 1}/{num_epochs}')):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            with torch.cuda.amp.autocast(enabled=use_amp):
                logits = model(input_ids, attention_mask)
                loss = criterion(logits, labels)
                # Scale so accumulated gradients average over the group.
                loss = loss / gradient_accumulation_steps

            scaler.scale(loss).backward()

            # Step on every full accumulation group, and also on the last
            # batch so leftover gradients are applied instead of leaking
            # into the next epoch.
            is_last_batch = (step + 1) == num_batches
            if (step + 1) % gradient_accumulation_steps == 0 or is_last_batch:
                # Unscale before clipping so the threshold applies to the
                # true gradient norm.
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                scheduler.step()

            # Undo the accumulation scaling for reporting.
            train_loss += loss.item() * gradient_accumulation_steps

            _, preds = torch.max(logits, 1)
            train_preds.extend(preds.detach().cpu().numpy())
            train_labels.extend(labels.detach().cpu().numpy())

        model.eval()
        val_loss = 0
        val_preds = []
        val_labels = []
        all_val_probs = []

        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                with torch.cuda.amp.autocast(enabled=use_amp):
                    logits = model(input_ids, attention_mask)
                    loss = criterion(logits, labels)

                val_loss += loss.item()

                probs = torch.softmax(logits, dim=1)
                _, preds = torch.max(logits, 1)
                val_preds.extend(preds.detach().cpu().numpy())
                val_labels.extend(labels.detach().cpu().numpy())
                all_val_probs.extend(probs.detach().cpu().numpy())

        train_f1 = f1_score(train_labels, train_preds, average='macro')
        val_f1 = f1_score(val_labels, val_preds, average='macro')

        epoch_time = (datetime.now() - epoch_start_time).total_seconds()

        # Update metrics
        metrics.update(
            train_loss / len(train_loader),
            val_loss / len(val_loader),
            train_f1,
            val_f1,
            epoch_time
        )

        print(f'\n{task_name} - Epoch {epoch + 1}')
        print(f'Training Loss: {metrics.train_losses[-1]:.4f}')
        print(f'Validation Loss: {metrics.val_losses[-1]:.4f}')
        print(f'Training F1: {train_f1:.4f}')
        print(f'Validation F1: {val_f1:.4f}')
        print(f'Epoch Time: {epoch_time:.2f}s')

        # Generate plots at each epoch
        plot_training_curves(metrics, task_name, plots_dir)

        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            # state_dict() returns references to the live parameters, so a
            # shallow copy would keep changing as training continues.
            best_state = copy.deepcopy(model.state_dict())

        # Update the stopper first so the final-epoch work below also runs
        # when training ends early.
        early_stopping(val_f1)

        if epoch == num_epochs - 1 or early_stopping.early_stop:
            # Generate final plots
            plot_precision_recall_curve(
                np.array(val_labels),
                np.array(all_val_probs),
                task_name,
                plots_dir
            )

        if early_stopping.early_stop:
            print(f"Early stopping triggered at epoch {epoch + 1}")
            break

    # Metrics are written exactly once, whichever way the loop ended.
    print(f"\nSaving final metrics for: {task_name}")
    metrics.save_metrics(task_name, output_dir)

    return best_state

def generate_predictions(model, data_loader, device):
    """Return the arg-max class index for every example in ``data_loader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns logits.
        data_loader: Iterable of batches with ``input_ids`` and
            ``attention_mask`` entries.
        device: Device the batch tensors are moved to.

    Returns:
        A NumPy array with one predicted class index per example, in
        loader order.
    """
    model.eval()
    collected = []
    progress = tqdm(data_loader, desc="Generating predictions")

    with torch.no_grad():
        for batch in progress:
            token_ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)

            with torch.cuda.amp.autocast():
                logits = model(token_ids, mask)

            # torch.max along dim=1 returns (values, indices); keep indices.
            batch_preds = torch.max(logits, dim=1)[1]
            collected.extend(batch_preds.detach().cpu().numpy())

    return np.array(collected)

def main(model_name='bert-base-uncased', subtask=1, epochs=5, batch_size=16, learning_rate=2e-5,
         max_length=128, gradient_accumulation_steps=1, stratification=True, synthetic_data_suffix='25'):
    """Train the hazard and product classifiers for one subtask and write a submission.

    Reads the hazard/product CSV files for the chosen subtask, label-encodes
    the targets, trains one ``EnhancedClassifier`` per target with
    ``train_model``, saves the best weights, and, if
    ``incidents_unlabeled_val.csv`` exists in the working directory, predicts
    on it and writes ``submission.csv`` plus a ZIP archive.

    Args:
        model_name: Name passed to ``AutoTokenizer.from_pretrained`` and
            ``EnhancedClassifier``.
        subtask: ``1`` uses the ``*_category_data_*`` files and the
            ``hazard-category``/``product-category`` columns; any other value
            uses ``hazard_data_*``/``product_data_*`` and the
            ``hazard``/``product`` columns.
        epochs: Maximum number of training epochs per model.
        batch_size: Batch size for all data loaders.
        learning_rate: Learning rate forwarded to ``train_model``.
        max_length: Tokenizer ``max_length`` for every dataset.
        gradient_accumulation_steps: Forwarded to ``train_model``.
        stratification: If true, the train/validation split is stratified on
            the target column.
        synthetic_data_suffix: Suffix inserted into the input CSV file names.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Create base output directory
    base_output_dir = 'training_results'

    # Select input files, target columns and naming for the requested subtask
    if subtask == 1:
        hazard_df = pd.read_csv(f'hazard_category_data_{synthetic_data_suffix}.csv')
        product_df = pd.read_csv(f'product_category_data_{synthetic_data_suffix}.csv')

        hazard_col = 'hazard-category'
        product_col = 'product-category'
        output_prefix = 'st1'
        hazard_task_name = "ST1 Hazard Category Classification"
        product_task_name = "ST1 Product Category Classification"
        output_dir = os.path.join(base_output_dir, 'st1_results')
    else:
        hazard_df = pd.read_csv(f'hazard_data_{synthetic_data_suffix}.csv')
        product_df = pd.read_csv(f'product_data_{synthetic_data_suffix}.csv')

        hazard_col = 'hazard'
        product_col = 'product'
        output_prefix = 'st2'
        hazard_task_name = "ST2 Hazard Vector Classification"
        product_task_name = "ST2 Product Vector Classification"
        output_dir = os.path.join(base_output_dir, 'st2_results')

    # Create task-specific output directories
    plots_dir = os.path.join(output_dir, 'plots')
    metrics_dir = os.path.join(output_dir, 'metrics')
    models_dir = os.path.join(output_dir, 'models')

    for directory in [plots_dir, metrics_dir, models_dir]:
        os.makedirs(directory, exist_ok=True)

    print(f"\nStarting {output_prefix.upper()} training...")
    print(f"Hazard task: {hazard_task_name}")
    print(f"Product task: {product_task_name}")
    print(f"Output directory: {output_dir}")

    # The model input is the cleaned 'title' column of each frame
    hazard_df['clean_title'] = hazard_df['title'].apply(clean_title)
    product_df['clean_title'] = product_df['title'].apply(clean_title)

    # Encoders are fitted on the full frames (before the train/val split)
    hazard_encoder = LabelEncoder()
    product_encoder = LabelEncoder()

    hazard_df['label'] = hazard_encoder.fit_transform(hazard_df[hazard_col])
    product_df['label'] = product_encoder.fit_transform(product_df[product_col])

    # Save encoders in the metrics directory
    encoder_file = os.path.join(metrics_dir, f'{output_prefix}_encoders.pkl')
    with open(encoder_file, 'wb') as f:
        pickle.dump({
            'hazard_encoder': hazard_encoder,
            'product_encoder': product_encoder
        }, f)

    # Class-distribution plots over the full (unsplit) data
    plot_class_distribution(
        hazard_df['label'].values,
        hazard_encoder.classes_,
        hazard_task_name,
        plots_dir
    )
    plot_class_distribution(
        product_df['label'].values,
        product_encoder.classes_,
        product_task_name,
        plots_dir
    )

    # 90/10 train/validation split with a fixed random_state, optionally stratified
    hazard_train, hazard_val = train_test_split(
        hazard_df, test_size=0.1, random_state=42,
        stratify=hazard_df[hazard_col] if stratification else None
    )
    product_train, product_val = train_test_split(
        product_df, test_size=0.1, random_state=42,
        stratify=product_df[product_col] if stratification else None
    )

    # One tokenizer is shared by all datasets
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # `.values` hands plain arrays to the datasets, so they are indexed by position
    hazard_train_dataset = SingleLabelDataset(
        texts=hazard_train['clean_title'].values,
        labels=hazard_train['label'].values,
        tokenizer=tokenizer,
        max_length=max_length
    )
    hazard_val_dataset = SingleLabelDataset(
        texts=hazard_val['clean_title'].values,
        labels=hazard_val['label'].values,
        tokenizer=tokenizer,
        max_length=max_length
    )
    product_train_dataset = SingleLabelDataset(
        texts=product_train['clean_title'].values,
        labels=product_train['label'].values,
        tokenizer=tokenizer,
        max_length=max_length
    )
    product_val_dataset = SingleLabelDataset(
        texts=product_val['clean_title'].values,
        labels=product_val['label'].values,
        tokenizer=tokenizer,
        max_length=max_length
    )

    # Only the training loaders shuffle
    hazard_train_loader = DataLoader(
        hazard_train_dataset,
        batch_size=batch_size,
        shuffle=True
    )
    hazard_val_loader = DataLoader(
        hazard_val_dataset,
        batch_size=batch_size
    )
    product_train_loader = DataLoader(
        product_train_dataset,
        batch_size=batch_size,
        shuffle=True
    )
    product_val_loader = DataLoader(
        product_val_dataset,
        batch_size=batch_size
    )

    # Loss weights are computed from the training split only
    hazard_weights = compute_class_weights(hazard_train['label'].values)
    product_weights = compute_class_weights(product_train['label'].values)

    # Output size of each head is the number of classes seen by its encoder
    hazard_model = EnhancedClassifier(model_name=model_name, num_labels=len(hazard_encoder.classes_))
    product_model = EnhancedClassifier(model_name=model_name, num_labels=len(product_encoder.classes_))

    # `patience` is not passed, so train_model's default applies
    print(f"\nTraining {hazard_task_name}...")
    hazard_best_state = train_model(
        hazard_model, hazard_train_loader, hazard_val_loader,
        hazard_weights, hazard_task_name,
        num_epochs=epochs,
        learning_rate=learning_rate,
        device=device,
        gradient_accumulation_steps=gradient_accumulation_steps,
        output_dir=output_dir  # Pass output directory to train_model
    )

    print(f"\nTraining {product_task_name}...")
    product_best_state = train_model(
        product_model, product_train_loader, product_val_loader,
        product_weights, product_task_name,
        num_epochs=epochs,
        learning_rate=learning_rate,
        device=device,
        gradient_accumulation_steps=gradient_accumulation_steps,
        output_dir=output_dir  # Pass output directory to train_model
    )

    # Save models in the models directory
    torch.save(hazard_best_state,
              os.path.join(models_dir, f'best_{output_prefix}_hazard_model.pt'))
    torch.save(product_best_state,
              os.path.join(models_dir, f'best_{output_prefix}_product_model.pt'))

    # Generate predictions if validation file exists
    if os.path.exists('incidents_unlabeled_val.csv'):
        val_df = pd.read_csv('incidents_unlabeled_val.csv')
        val_df['clean_title'] = val_df['title'].apply(clean_title)

        # No labels are passed, so items carry only input_ids/attention_mask
        val_dataset = SingleLabelDataset(
            texts=val_df['clean_title'].values,
            tokenizer=tokenizer,
            max_length=max_length
        )
        val_loader = DataLoader(val_dataset, batch_size=batch_size)

        # Reload the saved best weights into both models before predicting
        hazard_model.load_state_dict(
            torch.load(os.path.join(models_dir, f'best_{output_prefix}_hazard_model.pt')))
        product_model.load_state_dict(
            torch.load(os.path.join(models_dir, f'best_{output_prefix}_product_model.pt')))

        hazard_model = hazard_model.to(device)
        product_model = product_model.to(device)

        print("\nGenerating predictions...")
        hazard_preds = generate_predictions(hazard_model, val_loader, device)
        product_preds = generate_predictions(product_model, val_loader, device)

        # Map predicted class indices back to the original label strings
        hazard_preds_str = hazard_encoder.inverse_transform(hazard_preds)
        product_preds_str = product_encoder.inverse_transform(product_preds)

        submission_df = pd.DataFrame({
            hazard_col: hazard_preds_str,
            product_col: product_preds_str
        })

        # Save predictions in the metrics directory
        submission_path = os.path.join(metrics_dir, 'submission.csv')
        submission_df.to_csv(submission_path, index=False)

        print(f"\nSubmission file saved to {submission_path}")

        # Create ZIP file
        import zipfile
        zip_path = os.path.join(output_dir, f'submission_{output_prefix}.zip')
        with zipfile.ZipFile(zip_path, 'w') as zipf:
            # The archive holds the CSV at its top level as 'submission.csv'
            zipf.write(submission_path, arcname='submission.csv')

        # Quick sanity report: five most frequent predicted labels per target
        print(f"\n{hazard_col} predictions distribution:")
        print(submission_df[hazard_col].value_counts().head())
        print(f"\n{product_col} predictions distribution:")
        print(submission_df[product_col].value_counts().head())

        print("\nPrediction Statistics:")
        print(f"Total predictions: {len(submission_df)}")
        print(
            f"Unique {hazard_col} categories: {len(submission_df[hazard_col].unique())}")
        print(
            f"Unique {product_col} categories: {len(submission_df[product_col].unique())}")

# Set Seed

In [None]:
# Seed NumPy and PyTorch (CPU and, when present, every CUDA device) with one
# shared value so repeated runs are comparable.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Run ST1

In [None]:
# Subtask 1 run: category-level targets, trained on the '25_augmented' files.
main(
    model_name='bert-base-uncased',
    subtask=1,
    epochs=15,
    batch_size=16,
    learning_rate=2e-5,
    max_length=128,
    gradient_accumulation_steps=1,
    stratification=True,
    synthetic_data_suffix='25_augmented',
)

Using device: cuda

Starting ST1 training...
Hazard task: ST1 Hazard Category Classification
Product task: ST1 Product Category Classification
Output directory: training_results/st1_results

Generating predictions...


Generating predictions:   0%|          | 0/36 [00:00<?, ?it/s]

Generating predictions:   0%|          | 0/36 [00:00<?, ?it/s]


Submission file saved to training_results/st1_results/metrics/submission.csv

hazard-category predictions distribution:
hazard-category
allergens         211
biological        204
foreign bodies     64
fraud              32
chemical           29
Name: count, dtype: int64

product-category predictions distribution:
product-category
meat, egg and dairy products            146
cereals and bakery products              75
fruits and vegetables                    56
prepared dishes and snacks               55
soups, broths, sauces and condiments     33
Name: count, dtype: int64

Prediction Statistics:
Total predictions: 565
Unique hazard-category categories: 9
Unique product-category categories: 19


# Run ST2

In [None]:
# Subtask 2 run: 'hazard'/'product' targets, trained on the '25_augmented'
# files; parameters not listed here keep main()'s defaults.
main(
    subtask=2,
    epochs=15,
    batch_size=32,
    learning_rate=3e-5,
    max_length=100,
    gradient_accumulation_steps=1,
    synthetic_data_suffix='25_augmented',
)

Using device: cuda

Starting ST2 training...
Hazard task: ST2 Hazard Vector Classification
Product task: ST2 Product Vector Classification
Output directory: training_results/st2_results

Training ST2 Hazard Vector Classification...

Starting training for: ST2 Hazard Vector Classification


ST2 Hazard Vector Classification Epoch 1/15:   0%|          | 0/188 [00:00<?, ?it/s]


ST2 Hazard Vector Classification - Epoch 1
Training Loss: 4.6834
Validation Loss: 4.2293
Training F1: 0.0065
Validation F1: 0.0060
Epoch Time: 22.15s


ST2 Hazard Vector Classification Epoch 2/15:   0%|          | 0/188 [00:00<?, ?it/s]


ST2 Hazard Vector Classification - Epoch 2
Training Loss: 3.7966
Validation Loss: 3.2135
Training F1: 0.0630
Validation F1: 0.1835
Epoch Time: 22.16s


ST2 Hazard Vector Classification Epoch 3/15:   0%|          | 0/188 [00:00<?, ?it/s]


ST2 Hazard Vector Classification - Epoch 3
Training Loss: 2.9434
Validation Loss: 2.6360
Training F1: 0.2765
Validation F1: 0.4167
Epoch Time: 22.11s


ST2 Hazard Vector Classification Epoch 4/15:   0%|          | 0/188 [00:00<?, ?it/s]


ST2 Hazard Vector Classification - Epoch 4
Training Loss: 2.3853
Validation Loss: 2.2600
Training F1: 0.4810
Validation F1: 0.5663
Epoch Time: 22.88s


ST2 Hazard Vector Classification Epoch 5/15:   0%|          | 0/188 [00:00<?, ?it/s]


ST2 Hazard Vector Classification - Epoch 5
Training Loss: 1.9470
Validation Loss: 1.9617
Training F1: 0.6063
Validation F1: 0.6083
Epoch Time: 22.11s


ST2 Hazard Vector Classification Epoch 6/15:   0%|          | 0/188 [00:00<?, ?it/s]


ST2 Hazard Vector Classification - Epoch 6
Training Loss: 1.5970
Validation Loss: 1.7869
Training F1: 0.6916
Validation F1: 0.6582
Epoch Time: 22.46s


ST2 Hazard Vector Classification Epoch 7/15:   0%|          | 0/188 [00:00<?, ?it/s]


ST2 Hazard Vector Classification - Epoch 7
Training Loss: 1.3234
Validation Loss: 1.6741
Training F1: 0.7497
Validation F1: 0.6755
Epoch Time: 22.91s


ST2 Hazard Vector Classification Epoch 8/15:   0%|          | 0/188 [00:00<?, ?it/s]


ST2 Hazard Vector Classification - Epoch 8
Training Loss: 1.1005
Validation Loss: 1.5713
Training F1: 0.7826
Validation F1: 0.6776
Epoch Time: 22.14s


ST2 Hazard Vector Classification Epoch 9/15:   0%|          | 0/188 [00:00<?, ?it/s]


ST2 Hazard Vector Classification - Epoch 9
Training Loss: 0.9330
Validation Loss: 1.4977
Training F1: 0.8144
Validation F1: 0.7001
Epoch Time: 22.14s


ST2 Hazard Vector Classification Epoch 10/15:   0%|          | 0/188 [00:00<?, ?it/s]


ST2 Hazard Vector Classification - Epoch 10
Training Loss: 0.7839
Validation Loss: 1.4746
Training F1: 0.8502
Validation F1: 0.7109
Epoch Time: 22.17s


ST2 Hazard Vector Classification Epoch 11/15:   0%|          | 0/188 [00:00<?, ?it/s]


ST2 Hazard Vector Classification - Epoch 11
Training Loss: 0.6744
Validation Loss: 1.4463
Training F1: 0.8764
Validation F1: 0.7230
Epoch Time: 22.14s


ST2 Hazard Vector Classification Epoch 12/15:   0%|          | 0/188 [00:00<?, ?it/s]


ST2 Hazard Vector Classification - Epoch 12
Training Loss: 0.5940
Validation Loss: 1.4279
Training F1: 0.8988
Validation F1: 0.7168
Epoch Time: 22.16s


ST2 Hazard Vector Classification Epoch 13/15:   0%|          | 0/188 [00:00<?, ?it/s]


ST2 Hazard Vector Classification - Epoch 13
Training Loss: 0.5305
Validation Loss: 1.4103
Training F1: 0.9157
Validation F1: 0.7232
Epoch Time: 22.14s


ST2 Hazard Vector Classification Epoch 14/15:   0%|          | 0/188 [00:00<?, ?it/s]


ST2 Hazard Vector Classification - Epoch 14
Training Loss: 0.4860
Validation Loss: 1.4082
Training F1: 0.9248
Validation F1: 0.7201
Epoch Time: 22.18s


ST2 Hazard Vector Classification Epoch 15/15:   0%|          | 0/188 [00:00<?, ?it/s]


ST2 Hazard Vector Classification - Epoch 15
Training Loss: 0.4540
Validation Loss: 1.4063
Training F1: 0.9366
Validation F1: 0.7173
Epoch Time: 22.20s

Metrics saved for ST2 Hazard Vector Classification:
- Detailed metrics: st2_hazard_vector_classification_metrics.csv
- Summary metrics: st2_hazard_vector_classification_summary.csv
- Pickle format: st2_hazard_vector_classification_metrics.pkl

Saving final metrics for: ST2 Hazard Vector Classification

Metrics saved for ST2 Hazard Vector Classification:
- Detailed metrics: st2_hazard_vector_classification_metrics.csv
- Summary metrics: st2_hazard_vector_classification_summary.csv
- Pickle format: st2_hazard_vector_classification_metrics.pkl

Training ST2 Product Vector Classification...

Starting training for: ST2 Product Vector Classification


ST2 Product Vector Classification Epoch 1/15:   0%|          | 0/740 [00:00<?, ?it/s]


ST2 Product Vector Classification - Epoch 1
Training Loss: 6.8967
Validation Loss: 6.6054
Training F1: 0.0013
Validation F1: 0.0041
Epoch Time: 86.81s


ST2 Product Vector Classification Epoch 2/15:   0%|          | 0/740 [00:00<?, ?it/s]


ST2 Product Vector Classification - Epoch 2
Training Loss: 6.1755
Validation Loss: 5.4947
Training F1: 0.0363
Validation F1: 0.1405
Epoch Time: 86.98s


ST2 Product Vector Classification Epoch 3/15:   0%|          | 0/740 [00:00<?, ?it/s]


ST2 Product Vector Classification - Epoch 3
Training Loss: 5.1213
Validation Loss: 4.5055
Training F1: 0.2215
Validation F1: 0.4245
Epoch Time: 86.98s


ST2 Product Vector Classification Epoch 4/15:   0%|          | 0/740 [00:00<?, ?it/s]


ST2 Product Vector Classification - Epoch 4
Training Loss: 4.1858
Validation Loss: 3.6588
Training F1: 0.4548
Validation F1: 0.5972
Epoch Time: 87.04s


ST2 Product Vector Classification Epoch 5/15:   0%|          | 0/740 [00:00<?, ?it/s]


ST2 Product Vector Classification - Epoch 5
Training Loss: 3.3752
Validation Loss: 2.9496
Training F1: 0.6171
Validation F1: 0.6954
Epoch Time: 87.46s


ST2 Product Vector Classification Epoch 6/15:   0%|          | 0/740 [00:00<?, ?it/s]


ST2 Product Vector Classification - Epoch 6
Training Loss: 2.6864
Validation Loss: 2.3728
Training F1: 0.7322
Validation F1: 0.7593
Epoch Time: 87.35s


ST2 Product Vector Classification Epoch 7/15:   0%|          | 0/740 [00:00<?, ?it/s]


ST2 Product Vector Classification - Epoch 7
Training Loss: 2.1161
Validation Loss: 1.9261
Training F1: 0.8082
Validation F1: 0.7960
Epoch Time: 87.25s


ST2 Product Vector Classification Epoch 8/15:   0%|          | 0/740 [00:00<?, ?it/s]


ST2 Product Vector Classification - Epoch 8
Training Loss: 1.6740
Validation Loss: 1.6027
Training F1: 0.8606
Validation F1: 0.8198
Epoch Time: 87.11s


ST2 Product Vector Classification Epoch 9/15:   0%|          | 0/740 [00:00<?, ?it/s]


ST2 Product Vector Classification - Epoch 9
Training Loss: 1.3262
Validation Loss: 1.3520
Training F1: 0.8937
Validation F1: 0.8364
Epoch Time: 87.25s


ST2 Product Vector Classification Epoch 10/15:   0%|          | 0/740 [00:00<?, ?it/s]


ST2 Product Vector Classification - Epoch 10
Training Loss: 1.0700
Validation Loss: 1.1830
Training F1: 0.9213
Validation F1: 0.8518
Epoch Time: 87.20s


ST2 Product Vector Classification Epoch 11/15:   0%|          | 0/740 [00:00<?, ?it/s]


ST2 Product Vector Classification - Epoch 11
Training Loss: 0.8818
Validation Loss: 1.0611
Training F1: 0.9370
Validation F1: 0.8594
Epoch Time: 87.19s


ST2 Product Vector Classification Epoch 12/15:   0%|          | 0/740 [00:00<?, ?it/s]


ST2 Product Vector Classification - Epoch 12
Training Loss: 0.7435
Validation Loss: 0.9830
Training F1: 0.9525
Validation F1: 0.8567
Epoch Time: 87.18s


ST2 Product Vector Classification Epoch 13/15:   0%|          | 0/740 [00:00<?, ?it/s]


ST2 Product Vector Classification - Epoch 13
Training Loss: 0.6488
Validation Loss: 0.9299
Training F1: 0.9588
Validation F1: 0.8634
Epoch Time: 87.22s


ST2 Product Vector Classification Epoch 14/15:   0%|          | 0/740 [00:00<?, ?it/s]


ST2 Product Vector Classification - Epoch 14
Training Loss: 0.5861
Validation Loss: 0.8988
Training F1: 0.9657
Validation F1: 0.8651
Epoch Time: 87.22s


ST2 Product Vector Classification Epoch 15/15:   0%|          | 0/740 [00:00<?, ?it/s]


ST2 Product Vector Classification - Epoch 15
Training Loss: 0.5493
Validation Loss: 0.8870
Training F1: 0.9702
Validation F1: 0.8669
Epoch Time: 87.16s

Metrics saved for ST2 Product Vector Classification:
- Detailed metrics: st2_product_vector_classification_metrics.csv
- Summary metrics: st2_product_vector_classification_summary.csv
- Pickle format: st2_product_vector_classification_metrics.pkl

Saving final metrics for: ST2 Product Vector Classification

Metrics saved for ST2 Product Vector Classification:
- Detailed metrics: st2_product_vector_classification_metrics.csv
- Summary metrics: st2_product_vector_classification_summary.csv
- Pickle format: st2_product_vector_classification_metrics.pkl

Generating predictions...


Generating predictions:   0%|          | 0/18 [00:00<?, ?it/s]

Generating predictions:   0%|          | 0/18 [00:00<?, ?it/s]


Submission file saved to training_results/st2_results/metrics/submission.csv

hazard predictions distribution:
hazard
listeria monocytogenes          92
salmonella                      79
milk and products thereof       67
peanuts and products thereof    33
plastic fragment                27
Name: count, dtype: int64

product predictions distribution:
product
chicken based products       37
ice cream                    18
cookies                      12
ready to eat - cook meals    11
cheese                       11
Name: count, dtype: int64

Prediction Statistics:
Total predictions: 565
Unique hazard categories: 71
Unique product categories: 281


## Delete the `training_results` folder (uncomment only if necessary)

In [None]:
# Destructive cleanup, intentionally left disabled: uncommenting the two lines
# below recursively deletes the 'training_results' directory and its contents.
# import shutil
# shutil.rmtree('training_results')

import zipfile


In [None]:
import shutil
# Pack the 'training_results' directory into a zip archive named
# 'training_results'. Kept as the cell's final bare expression so the
# notebook still displays the archive path returned by make_archive.
shutil.make_archive(
    base_name='training_results',
    format='zip',
    root_dir='training_results',
)

'/content/training_results.zip'