<a href="https://colab.research.google.com/github/steliosg23/PDS-A2/blob/main/SUBMISSION%20Finetuned%20PubMedBERT%20PDS%20A2%20Food%20Hazard%20Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install necessary packages and import libraries
This section includes all the necessary imports for data manipulation, model training, and evaluation.
It also imports libraries for handling tokenization, model configuration, and metrics.


In [1]:
from google.colab import drive
import pandas as pd
import torch
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import os
from shutil import make_archive
import numpy as np


# Mount Google Drive


In [2]:
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Load and preview the training dataset
The dataset containing incident reports is loaded from Google Drive.
We remove any unnecessary columns like 'Unnamed: 0'.


In [3]:
from google.colab import drive
import pandas as pd


# Define the path to the file on Google Drive
train_path = '/content/drive/MyDrive/Data/augmented_incidents_train.csv'

# Load the dataset
df = pd.read_csv(train_path)

# Keep only the specified columns
columns_to_keep = ['year', 'month', 'day', 'country', 'title', 'text', 'hazard-category', 'product-category', 'hazard', 'product']
df = df[columns_to_keep]
df

Unnamed: 0,year,month,day,country,title,text,hazard-category,product-category,hazard,product
0,2008,11,7,au,Country Cuisine Pty Ltd—Malouf’s Spice Mezza,"PRA No. 2008/10424 Date Published Nov 7, 2008 ...",allergens,herbs and spices,nuts,spice mix
1,2011,7,11,au,Haigh's Manufacturing Pty Ltd—Haigh’s Aprichoc...,"PRA No. 2011/12730 Publication Date Jul 11, 20...",allergens,"cocoa and cocoa preparations, coffee and tea",nuts,chocolate
2,2012,2,21,au,Coles Supermarkets Limited—Coles Deli 200g Spi...,"PRA No. 2012/13032 Publication Date Feb 21, 20...",allergens,"soups, broths, sauces and condiments",nuts,sauce
3,2012,12,4,us,2012 - Price Chopper Supermarkets Recalls Cent...,"FOR IMMEDIATE RELEASE - October 21, 2012 - (Sc...",allergens,cereals and bakery products,nuts,cakes
4,2014,4,10,au,Coles Easter Eggs,Coles Supermarkets Australia Pty Ltd recalled ...,allergens,"cocoa and cocoa preparations, coffee and tea",nuts,chocolate eggs
...,...,...,...,...,...,...,...,...,...,...
11834,2016,9,29,au,Quality Bakers Australia Pty Limited — Various...,PRA No. 2016/15657 Date published 29 Sep 2016 ...,foreign bodies,cereals and bakery products,metal fragment,bread
11835,2016,8,18,au,Gluten Free Bakehouse Pty Ltd — Various Zehnde...,PRA No. 2016/15603 Date published 18 Aug 2016 ...,allergens,cereals and bakery products,soybeans and products thereof,bread
11836,2018,11,30,us,Tres Hermanos Bakery Issues Allergy Alert on U...,"Wyoming, MI - Tres Hermanos Bakery of Wyoming,...",allergens,cereals and bakery products,milk and products thereof,bread
11837,2005,9,27,au,Gold Coast Bakery Queensland Pty Ltd—Vogels—Fr...,PRA No. 2005/8073 Date published 27 Sep 2005 P...,foreign bodies,cereals and bakery products,plastic fragment,bread


# Define a function to clean text data
This function removes special characters, converts text to lowercase, and strips extra whitespace.
It is essential to clean the text data for better model performance.


In [4]:
import re

def clean_text(text):
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove non-alphanumeric characters
    text = text.lower()  # Convert text to lowercase
    text = ' '.join(text.split())  # Remove extra spaces
    return text


# Clean the text data and load the tokenizer
We apply the `clean_text` function to clean the 'text' column of the dataset.
Then, we initialize the PubMedBERT tokenizer to prepare for tokenization.


In [5]:
# Load the tokenizer for the PubMedBERT model, specifically fine-tuned for biomedical text
tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract")

# Apply the text cleaning function to the 'text' column in the DataFrame
# This function will preprocess each text entry by removing unwanted characters, stopwords, etc.
df['text'] = df['text'].apply(clean_text)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


# Define features and targets for classification tasks
We specify the input features like date and country and set the classification targets.



In [6]:
# Define the features for the model, which include the year, month, day, and country information
features = ['year', 'month', 'day', 'country']

# Define the target variables for Subtask 1, which are the hazard-category and product-category
targets_subtask1 = ['hazard-category', 'product-category']

# Define the target variables for Subtask 2, which are hazard and product
# Add other targets if necessary depending on the task
targets_subtask2 = ['hazard', 'product']


# Encode target labels
For classification, target labels need to be encoded as numeric values.
We use `LabelEncoder` to convert categorical labels into integers.


In [7]:
# Create an empty dictionary to store label encoders for each target
label_encoders = {}

# Iterate over both sets of targets (Subtask 1 and Subtask 2)
for target in targets_subtask1 + targets_subtask2:
    # Initialize a LabelEncoder for each target
    le = LabelEncoder()

    # Transform the target column values into numeric labels and update the DataFrame
    df[target] = le.fit_transform(df[target])

    # Store the fitted LabelEncoder in the dictionary for future use (e.g., inverse transformation)
    label_encoders[target] = le


# Define a custom PyTorch dataset for text classification
This dataset class will handle text tokenization and label processing.
It ensures the text is properly encoded, padded, and truncated to a fixed length for the model.


In [8]:
# Define a custom Dataset class for text data
class TextDataset(Dataset):
    # Initialize the dataset with texts, labels, tokenizer, and maximum sequence length
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts  # List of input texts
        self.labels = labels  # List of corresponding labels
        self.tokenizer = tokenizer  # Tokenizer for encoding the text
        self.max_len = max_len  # Maximum length for padding/truncation

    # Define the length of the dataset (number of samples)
    def __len__(self):
        return len(self.texts)

    # Define how to retrieve a single item from the dataset
    def __getitem__(self, item):
        text = str(self.texts[item])  # Get the text for the given index
        label = self.labels[item]  # Get the label for the given index

        # Use the tokenizer to encode the text (add special tokens, padding, truncation)
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,  # Add special tokens (e.g., [CLS], [SEP])
            max_length=self.max_len,  # Limit the sequence length
            padding='max_length',  # Pad sequences to max_length
            truncation=True,  # Truncate longer sequences
            return_tensors='pt'  # Return PyTorch tensors
        )

        # Return a dictionary with input_ids, attention_mask, and label
        return {
            'input_ids': encoding['input_ids'].flatten(),  # Flatten the tensor
            'attention_mask': encoding['attention_mask'].flatten(),  # Flatten the attention mask
            'label': torch.tensor(label, dtype=torch.long)  # Convert label to a tensor
        }


# Split the data into training and testing sets
We split the dataset into training and testing sets for each target.
This ensures that the model is trained on one set and evaluated on a separate, unseen set.


In [9]:
# Define a function to prepare data for model training and testing
def prepare_data(text_column):
    # Extract features and text column from the DataFrame
    X = df[features + [text_column]]  # Features include specified columns plus the text column
    # Extract target variables for Subtask 1 and Subtask 2
    y_subtask1 = df[targets_subtask1]
    y_subtask2 = df[targets_subtask2]

    # Initialize a dictionary to store data splits for each target
    data_splits = {}

    # Iterate over both sets of target variables (Subtask 1 and Subtask 2)
    for target in targets_subtask1 + targets_subtask2:
        # Split the data into training and testing sets (80% train, 20% test)
        X_train, X_test, y_train, y_test = train_test_split(
            X, df[target], test_size=0.2, random_state=42
        )

        # Reset the indices for the train and test sets (important for maintaining order after split)
        X_train = X_train.reset_index(drop=True)
        y_train = y_train.reset_index(drop=True)
        X_test = X_test.reset_index(drop=True)
        y_test = y_test.reset_index(drop=True)

        # Store the splits for the current target in the dictionary
        data_splits[target] = (X_train, X_test, y_train, y_test)

    # Return the dictionary containing data splits for each target
    return data_splits


# Prepare the data splits for text-based tasks
We apply the `prepare_data` function specifically for text tasks and save the splits for later use.


In [10]:
# Prepare the data splits for the 'text' column using the prepare_data function
text_splits = prepare_data('text')


# Set model configuration and define the device
Here, we configure key parameters for training like maximum sequence length, batch size, and learning rate.
We also determine whether to use GPU or CPU for training based on availability.


In [11]:
# Define configuration settings for the model training
config = {
    'max_len': 512,  # Maximum sequence length for input texts
    'batch_size': 16,  # Batch size for training
    'learning_rate': 2e-5,  # Learning rate for the optimizer
    'epochs': 5,  # Number of training epochs
    'model_name': "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract"  # Pre-trained model to use
}

# Determine the device to use for training (GPU if available, otherwise CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")


Using device: cuda


# Train and evaluate the model for each task
This function performs model training and evaluation for each target.
It uses a neural network to predict labels and calculates the F1 score for evaluation.


In [12]:
import os
import numpy as np
import torch
from torch.utils.data import DataLoader
from torch import nn, optim
from transformers import AutoModelForSequenceClassification, get_scheduler
from sklearn.metrics import f1_score, classification_report
from tqdm import tqdm

# Training and evaluation function
def train_and_evaluate_bert(data_splits, targets):
    f1_scores = []  # Store F1 scores for each task

    for target in targets:
        print(f"\nStarting training for task: {target}")

        # Retrieve data splits
        X_train, X_test, y_train, y_test = data_splits[target]

        # Extract the 'text' column for training and testing
        texts_train = X_train['text'].values
        texts_test = X_test['text'].values

        # Create datasets
        train_dataset = TextDataset(texts_train, y_train, tokenizer, config['max_len'])
        test_dataset = TextDataset(texts_test, y_test, tokenizer, config['max_len'])

        # Create DataLoaders
        train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True)
        test_loader = DataLoader(test_dataset, batch_size=config['batch_size'], shuffle=False)

        # Number of labels for classification
        num_labels = len(label_encoders[target].classes_)

        # Load pre-trained model
        model = AutoModelForSequenceClassification.from_pretrained(
            config['model_name'], num_labels=num_labels
        ).to(device)

        # Optimizer and scheduler
        optimizer = optim.AdamW(model.parameters(), lr=config['learning_rate'])
        scheduler = get_scheduler(
            "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * config['epochs']
        )

        # Class-weighted loss
        class_counts = np.bincount(y_train)
        class_weights = torch.tensor(1.0 / class_counts, dtype=torch.float).to(device)
        criterion = nn.CrossEntropyLoss(weight=class_weights)

        # Training loop
        for epoch in range(config['epochs']):
            print(f"Epoch {epoch+1}/{config['epochs']} - Training: {target}")
            model.train()
            train_loss = 0
            progress_bar = tqdm(train_loader, desc=f"Training Epoch {epoch+1}", leave=True)

            for batch in progress_bar:
                optimizer.zero_grad()

                input_ids = batch['input_ids'].squeeze(1).to(device)
                attention_mask = batch['attention_mask'].squeeze(1).to(device)
                labels = batch['label'].to(device)

                outputs = model(input_ids, attention_mask=attention_mask)
                loss = criterion(outputs.logits, labels)
                loss.backward()

                # Gradient clipping to prevent exploding gradients
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()
                scheduler.step()

                train_loss += loss.item()
                progress_bar.set_postfix(loss=loss.item())

            print(f"Training Loss: {train_loss / len(train_loader):.4f}")

        # Final evaluation on the test set
        print("Final evaluation on test set...")
        model.eval()
        test_preds, test_labels = [], []

        with torch.no_grad():
            for batch in tqdm(test_loader, desc="Testing", leave=True):
                input_ids = batch['input_ids'].squeeze(1).to(device)
                attention_mask = batch['attention_mask'].squeeze(1).to(device)
                labels = batch['label'].to(device)

                outputs = model(input_ids, attention_mask=attention_mask)
                _, preds = torch.max(outputs.logits, dim=1)
                test_preds.extend(preds.cpu().numpy())
                test_labels.extend(labels.cpu().numpy())

        # Calculate F1 score
        test_f1 = f1_score(test_labels, test_preds, average='macro')
        f1_scores.append(test_f1)
        print(f"Final Test MACRO F1 for {target}: {test_f1:.4f}")

        # Classification report
        print(f"Classification Report for {target}:\n")
        print(classification_report(test_labels, test_preds, zero_division=0))

        # Save the model and tokenizer
        os.makedirs(f'./model_{target}', exist_ok=True)
        model.save_pretrained(f'./model_{target}')
        tokenizer.save_pretrained(f'./model_{target}')

        # Save the label encoder
        np.save(f'./model_{target}/{target}_label_encoder.npy', label_encoders[target].classes_)
        print(f"Model and LabelEncoder for {target} saved in './model_{target}'")

    return f1_scores



text_f1_scores = train_and_evaluate_bert(text_splits, targets_subtask1 + targets_subtask2)



Starting training for task: hazard-category


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5 - Training: hazard-category


Training Epoch 1: 100%|██████████| 592/592 [03:11<00:00,  3.09it/s, loss=0.163]


Training Loss: 1.0411
Epoch 2/5 - Training: hazard-category


Training Epoch 2: 100%|██████████| 592/592 [03:10<00:00,  3.10it/s, loss=0.0174]


Training Loss: 0.2128
Epoch 3/5 - Training: hazard-category


Training Epoch 3: 100%|██████████| 592/592 [03:10<00:00,  3.10it/s, loss=0.00358]


Training Loss: 0.0856
Epoch 4/5 - Training: hazard-category


Training Epoch 4: 100%|██████████| 592/592 [03:10<00:00,  3.10it/s, loss=0.00431]


Training Loss: 0.0595
Epoch 5/5 - Training: hazard-category


Training Epoch 5: 100%|██████████| 592/592 [03:11<00:00,  3.10it/s, loss=0.00152]


Training Loss: 0.0363
Final evaluation on test set...


Testing: 100%|██████████| 148/148 [00:16<00:00,  8.99it/s]


Final Test MACRO F1 for hazard-category: 0.9568
Classification Report for hazard-category:

              precision    recall  f1-score   support

           0       0.96      0.98      0.97       710
           1       0.99      0.98      0.98       703
           2       0.99      0.99      0.99       269
           3       1.00      0.89      0.94        18
           4       0.97      0.96      0.96       258
           5       0.95      0.94      0.94       248
           7       0.93      0.95      0.94        39
           8       0.92      0.93      0.93        75
           9       0.96      0.96      0.96        48

    accuracy                           0.97      2368
   macro avg       0.96      0.95      0.96      2368
weighted avg       0.97      0.97      0.97      2368

Model and LabelEncoder for hazard-category saved in './model_hazard-category'

Starting training for task: product-category


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5 - Training: product-category


Training Epoch 1: 100%|██████████| 592/592 [03:11<00:00,  3.09it/s, loss=1.43]


Training Loss: 2.1730
Epoch 2/5 - Training: product-category


Training Epoch 2: 100%|██████████| 592/592 [03:11<00:00,  3.10it/s, loss=0.685]


Training Loss: 0.6330
Epoch 3/5 - Training: product-category


Training Epoch 3: 100%|██████████| 592/592 [03:11<00:00,  3.10it/s, loss=0.357]


Training Loss: 0.2434
Epoch 4/5 - Training: product-category


Training Epoch 4: 100%|██████████| 592/592 [03:11<00:00,  3.10it/s, loss=0.0396]


Training Loss: 0.1207
Epoch 5/5 - Training: product-category


Training Epoch 5: 100%|██████████| 592/592 [03:10<00:00,  3.10it/s, loss=0.0414]


Training Loss: 0.0764
Final evaluation on test set...


Testing: 100%|██████████| 148/148 [00:16<00:00,  8.95it/s]


Final Test MACRO F1 for product-category: 0.9466
Classification Report for product-category:

              precision    recall  f1-score   support

           0       0.98      1.00      0.99        51
           1       0.89      0.89      0.89       254
           2       0.97      0.81      0.88        74
           3       0.89      0.97      0.93       103
           4       1.00      0.87      0.93        69
           5       1.00      0.97      0.98        29
           6       1.00      1.00      1.00         6
           7       1.00      1.00      1.00         8
           8       1.00      1.00      1.00         4
           9       0.93      0.96      0.94       281
          10       0.99      0.94      0.96        78
          11       1.00      1.00      1.00        10
          12       0.90      0.83      0.86        42
          13       0.95      0.95      0.95       610
          14       0.93      0.97      0.95       116
          15       0.95      0.94      0.

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5 - Training: hazard


Training Epoch 1: 100%|██████████| 592/592 [03:11<00:00,  3.10it/s, loss=3.97]


Training Loss: 4.3100
Epoch 2/5 - Training: hazard


Training Epoch 2: 100%|██████████| 592/592 [03:11<00:00,  3.10it/s, loss=2.48]


Training Loss: 2.9756
Epoch 3/5 - Training: hazard


Training Epoch 3: 100%|██████████| 592/592 [03:10<00:00,  3.10it/s, loss=1.28]


Training Loss: 1.9392
Epoch 4/5 - Training: hazard


Training Epoch 4: 100%|██████████| 592/592 [03:11<00:00,  3.10it/s, loss=0.756]


Training Loss: 1.3166
Epoch 5/5 - Training: hazard


Training Epoch 5: 100%|██████████| 592/592 [03:11<00:00,  3.10it/s, loss=1.54]


Training Loss: 1.0287
Final evaluation on test set...


Testing: 100%|██████████| 148/148 [00:16<00:00,  8.96it/s]


Final Test MACRO F1 for hazard: 0.8681
Classification Report for hazard:

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       0.75      1.00      0.86         9
           2       1.00      1.00      1.00        11
           3       0.71      1.00      0.83         5
           4       0.71      0.56      0.62         9
           5       0.86      1.00      0.92        12
           6       1.00      1.00      1.00         5
           7       0.88      1.00      0.93         7
           8       1.00      1.00      1.00         8
           9       0.56      1.00      0.72         9
          10       0.75      1.00      0.86         3
          11       0.89      0.73      0.80        11
          12       1.00      1.00      1.00        10
          13       0.90      0.75      0.82        12
          14       1.00      0.69      0.81        16
          15       1.00      1.00      1.00        10
       

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  class_weights = torch.tensor(1.0 / class_counts, dtype=torch.float).to(device)


Epoch 1/5 - Training: product


Training Epoch 1: 100%|██████████| 592/592 [03:11<00:00,  3.09it/s, loss=5.73]


Training Loss: 6.1077
Epoch 2/5 - Training: product


Training Epoch 2: 100%|██████████| 592/592 [03:11<00:00,  3.10it/s, loss=4.83]


Training Loss: 5.4392
Epoch 3/5 - Training: product


Training Epoch 3: 100%|██████████| 592/592 [03:11<00:00,  3.10it/s, loss=5.32]


Training Loss: 4.8731
Epoch 4/5 - Training: product


Training Epoch 4: 100%|██████████| 592/592 [03:11<00:00,  3.10it/s, loss=4.67]


Training Loss: 4.4698
Epoch 5/5 - Training: product


Training Epoch 5: 100%|██████████| 592/592 [03:11<00:00,  3.10it/s, loss=4.29]


Training Loss: 4.2427
Final evaluation on test set...


Testing: 100%|██████████| 148/148 [00:16<00:00,  8.95it/s]


Final Test MACRO F1 for product: 0.5706
Classification Report for product:

              precision    recall  f1-score   support

           0       1.00      0.67      0.80         3
           2       1.00      0.33      0.50         6
           4       1.00      0.50      0.67         6
           5       0.00      0.00      0.00         0
           6       1.00      0.56      0.71         9
           7       0.00      0.00      0.00         4
           8       0.75      0.75      0.75        12
           9       0.50      0.17      0.25         6
          10       0.67      0.67      0.67         6
          11       0.92      1.00      0.96        11
          12       1.00      1.00      1.00         7
          13       0.80      1.00      0.89         4
          14       0.67      0.40      0.50         5
          15       0.00      0.00      0.00         1
          17       1.00      1.00      1.00         3
          18       1.00      0.67      0.80         6
     

# Generate predictions on the test data and print the predictions DataFrame
Here, we load the test dataset, use the trained model to generate predictions, and display the results.


In [13]:
# Import necessary libraries
import os
import torch
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.preprocessing import LabelEncoder

# Load the test data for predictions (CSV containing validation data)
test_path = '/content/drive/MyDrive/Data/validation_data/incidents.csv'
test_df = pd.read_csv(test_path, index_col=0)

# Define the predict function
def predict(texts, model_base_path, target):
    # Load the tokenizer for the specified pre-trained model
    tokenizer = AutoTokenizer.from_pretrained(model_base_path)

    # Load the correct label encoder for the given target
    label_encoder_path = f'{model_base_path}/{target}_label_encoder.npy'
    label_encoder = LabelEncoder()

    # Check if the label encoder file exists and load it
    if os.path.exists(label_encoder_path):
        label_encoder.classes_ = np.load(label_encoder_path, allow_pickle=True)
    else:
        # Print a warning if the label encoder is not found
        print(f"Warning: Label encoder not found for {target} at {label_encoder_path}")
        return None

    # Load the pre-trained model for sequence classification
    model = AutoModelForSequenceClassification.from_pretrained(model_base_path).to(device)

    # Tokenize the input texts
    inputs = tokenizer(
        texts,
        padding=True,  # Pad sequences to the max length
        truncation=True,  # Truncate sequences to the max length
        max_length=512,  # Limit sequence length to 512 tokens
        return_tensors="pt"  # Return PyTorch tensors
    ).to(device)

    # Put the model in evaluation mode
    model.eval()

    # Make predictions with no gradient calculation
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)  # Get the predicted class for each input

    # Decode the predictions using the label encoder
    decoded_predictions = label_encoder.inverse_transform(predictions.cpu().numpy())

    # Return the decoded predictions
    return decoded_predictions

# Define device for model prediction (use GPU if available, else use CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Prepare an empty dataframe to store the predictions
predictions = pd.DataFrame()

# Run predictions for all targets using the correct saved model
for column in targets_subtask1 + targets_subtask2:
    # Define the model path dynamically based on the target column
    model_path = f'./model_{column}'

    # Get the decoded predictions for the current target
    decoded_preds = predict(test_df['text'].tolist(), model_path, column)

    # If predictions were successfully made, store them in the dataframe
    if decoded_preds is not None:
        predictions[column] = decoded_preds

# Display the final predictions
print("\nFinal Predictions:\n")
print(predictions)


Using device: cuda

Final Predictions:

    hazard-category                                   product-category  \
0        biological                       meat, egg and dairy products   
1        biological                       meat, egg and dairy products   
2        biological                       meat, egg and dairy products   
3         allergens                                  ices and desserts   
4    foreign bodies                         prepared dishes and snacks   
..              ...                                                ...   
560       allergens                        cereals and bakery products   
561       allergens  dietetic foods, food supplements, fortified foods   
562  foreign bodies                        cereals and bakery products   
563       allergens                        cereals and bakery products   
564       allergens                                      confectionery   

                           hazard                      product  
0     

In [14]:
predictions

Unnamed: 0,hazard-category,product-category,hazard,product
0,biological,"meat, egg and dairy products",listeria spp,thermal processed beef meat
1,biological,"meat, egg and dairy products",escherichia coli,ground beef
2,biological,"meat, egg and dairy products",enteroviruses,chia seeds
3,allergens,ices and desserts,pecan nut,ice cream
4,foreign bodies,prepared dishes and snacks,listeria spp,tomato sauce
...,...,...,...,...
560,allergens,cereals and bakery products,milk and products thereof,cakes
561,allergens,"dietetic foods, food supplements, fortified foods",milk and products thereof,lollipops
562,foreign bodies,cereals and bakery products,plastic fragment,cakes
563,allergens,cereals and bakery products,peanuts and products thereof,ice cream


# Create the submission folder and archive the results
Finally, predictions and models are saved into a submission directory for easy sharing or evaluation.


In [15]:
import os
from shutil import make_archive
import pandas as pd
from google.colab import drive

# Define the Google Drive path where you want to save the files
output_folder = '/content/drive/MyDrive/submission_augmented_train_set_finetunedPUBMEDBERT/'

# Create the folder in Google Drive if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Save predictions to a CSV file named 'submission.csv' inside the folder
predictions.to_csv(f'{output_folder}submission.csv', index=False)

# Zip the folder for submission
make_archive(output_folder, 'zip', output_folder)

# Print confirmation message
print(f"Submission saved to Google Drive at {output_folder}")


Submission saved to Google Drive at /content/drive/MyDrive/submission_augmented_train_set_finetunedPUBMEDBERT/
