<a href="https://colab.research.google.com/github/steliosg23/PDS-A2/blob/main/SUBMISSION%20Finetuned%20PubMedBERT%20PDS%20A2%20Food%20Hazard%20Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import libraries
This section includes all the necessary imports for data manipulation, model training, and evaluation.
It also imports libraries for handling tokenization, model configuration, and metrics.


In [1]:
from google.colab import drive
import pandas as pd
import torch
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import os
from shutil import make_archive
import numpy as np


# Mount Google Drive


In [2]:
# Mount Google Drive at /content/drive; the dataset paths used later in this notebook live under that mount point.
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Load and preview the training dataset
The dataset containing incident reports is loaded from Google Drive.
We remove any unnecessary columns like 'Unnamed: 0'.


In [3]:
# Read the training incidents from Drive and discard the 'Unnamed: 0' column in a single chained expression.
train_path = '/content/drive/MyDrive/Data/incidents_train.csv'
df = pd.read_csv(train_path).drop(columns=['Unnamed: 0'])


# Define a function to clean text data
This function removes special characters, converts text to lowercase, and strips extra whitespace.
It is essential to clean the text data for better model performance.


In [4]:
import re

def clean_text(text):
    """Normalize a raw text value before tokenization.

    Removes every character that is not an ASCII letter, a digit, or
    whitespace, lowercases the result, and collapses runs of whitespace into
    single spaces (leading/trailing whitespace is dropped).

    Args:
        text: The raw value. ``None`` and float NaN (how pandas represents a
            missing cell) are treated as empty text; any other non-string
            value is converted with ``str()`` first.

    Returns:
        str: The cleaned text, or ``''`` for missing input.
    """
    # Missing cells arrive as None/NaN; re.sub would raise TypeError on them.
    # `text != text` is the dependency-free NaN check for floats.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove non-alphanumeric characters
    text = text.lower()  # Convert text to lowercase
    return ' '.join(text.split())  # Collapse whitespace runs and trim the ends


# Clean the text data and load the tokenizer
We apply the `clean_text` function to clean the 'text' column of the dataset.
Then, we initialize the PubMedBERT tokenizer to prepare for tokenization.


In [5]:
# Load the tokenizer that belongs to the PubMedBERT checkpoint fine-tuned later in this notebook
tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract")

# Apply the text cleaning function to the 'text' column in the DataFrame.
# clean_text strips non-alphanumeric characters, lowercases, and collapses whitespace;
# it does not remove stopwords. The column is overwritten, so the raw text is no longer
# available in `df` after this cell runs.
df['text'] = df['text'].apply(clean_text)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


# Define features and targets for classification tasks
We specify the input features like date and country and set the classification targets.



In [6]:
# Metadata columns that are carried through the train/test splits together with the text column.
# Note: the training function in this notebook only reads the 'text' column of each split,
# so these columns are not fed to the model.
features = ['year', 'month', 'day', 'country']

# Define the target variables for Subtask 1, which are the hazard-category and product-category
targets_subtask1 = ['hazard-category', 'product-category']

# Define the target variables for Subtask 2, which are hazard and product
# Add other targets if necessary depending on the task
targets_subtask2 = ['hazard', 'product']


# Encode target labels
For classification, target labels need to be encoded as numeric values.
We use `LabelEncoder` to convert categorical labels into integers.


In [7]:
# One fitted LabelEncoder per target column, keyed by the column name
label_encoders = {}

# Walk over the targets of Subtask 1 followed by those of Subtask 2
for target in [*targets_subtask1, *targets_subtask2]:
    # Learn the class vocabulary of this target from the labels currently in the column
    le = LabelEncoder().fit(df[target])

    # Overwrite the column with its integer codes.
    # NOTE: re-running this cell without reloading `df` would fit on the codes, not the raw labels.
    df[target] = le.transform(df[target])

    # Keep the encoder so predictions can later be mapped back with inverse_transform
    label_encoders[target] = le


# Define a custom PyTorch dataset for text classification
This dataset class will handle text tokenization and label processing.
It ensures the text is properly encoded, padded, and truncated to a fixed length for the model.


In [8]:
# Define a custom Dataset class for text data
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per access and pairs it with its label."""

    def __init__(self, texts, labels, tokenizer, max_len):
        """Store the samples and the tokenization settings.

        Args:
            texts: Indexable collection of input texts.
            labels: Indexable collection of labels, looked up with the same index as ``texts``.
            tokenizer: Object exposing ``encode_plus``; used to encode each text.
            max_len: Sequence length every sample is padded/truncated to.
        """
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return the number of samples, taken from ``texts``."""
        return len(self.texts)

    def __getitem__(self, item):
        """Return the encoded sample at index ``item``.

        Returns:
            dict with 1-D ``input_ids`` and ``attention_mask`` tensors and a
            ``label`` tensor of dtype ``torch.long``.
        """
        sample_text = str(self.texts[item])  # Coerce to str in case the stored value is not one
        sample_label = self.labels[item]

        # Tokenization settings: special tokens on, fixed-length padding, truncation, PyTorch tensors
        encode_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(sample_text, **encode_options)

        # Collapse each encoded tensor to 1-D so the DataLoader can stack samples into a batch
        sample = {name: encoded[name].flatten() for name in ('input_ids', 'attention_mask')}
        sample['label'] = torch.tensor(sample_label, dtype=torch.long)
        return sample


# Split the data into training and testing sets
We split the dataset into training and testing sets for each target.
This ensures that the model is trained on one set and evaluated on a separate, unseen set.


In [9]:
# Define a function to prepare data for model training and testing
def prepare_data(text_column):
    """Build an 80/20 train/test split of the module-level ``df`` for every target.

    Args:
        text_column: Name of the text column in ``df`` to keep alongside the
            columns listed in ``features``.

    Returns:
        dict: Maps each target name in ``targets_subtask1 + targets_subtask2``
        to a tuple ``(X_train, X_test, y_train, y_test)`` whose indices have
        been reset to start at 0.
    """
    # Inputs: the metadata feature columns plus the requested text column
    X = df[features + [text_column]]

    data_splits = {}
    for target in targets_subtask1 + targets_subtask2:
        # The same test_size and random_state are used for every target
        X_train, X_test, y_train, y_test = train_test_split(
            X, df[target], test_size=0.2, random_state=42
        )

        # Reset the indices: the splits are later indexed by integer position
        # (TextDataset reads labels with labels[item]), which needs labels 0..n-1.
        data_splits[target] = tuple(
            part.reset_index(drop=True)
            for part in (X_train, X_test, y_train, y_test)
        )

    return data_splits


# Prepare the data splits for text-based tasks
We apply the `prepare_data` function specifically for text tasks and save the splits for later use.


In [10]:
# Build the per-target train/test splits, keeping the (already cleaned) 'text' column as the text input
text_splits = prepare_data('text')


# Set model configuration and define the device
Here, we configure key parameters for training like maximum sequence length, batch size, and learning rate.
We also determine whether to use GPU or CPU for training based on availability.


In [11]:
# Hyperparameters and checkpoint name shared by every fine-tuning run
config = dict(
    max_len=512,  # Maximum sequence length for input texts
    batch_size=16,  # Batch size for training
    learning_rate=2e-5,  # Learning rate for the optimizer
    epochs=5,  # Number of training epochs
    model_name="microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract",  # Pre-trained model to use
)

# Prefer the GPU when CUDA is available; fall back to the CPU otherwise
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")


Using device: cuda


# Train and evaluate the model for each task
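This function fine-tunes a separate PubMedBERT sequence classifier for each target and evaluates it on the held-out split.
For every target it reports the weighted F1 score and a per-class classification report, then saves the model, tokenizer, and label classes to `./model_<target>`.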


In [12]:
# Train and evaluate the neural network for each target task
def train_and_evaluate_bert(data_splits, targets):
    """Fine-tune and evaluate one sequence-classification model per target.

    For each target a fresh model is loaded from ``config['model_name']``,
    trained on the 'text' column of the training split for
    ``config['epochs']`` epochs, evaluated on the test split, and saved
    (model, tokenizer, and label-encoder classes) under ``./model_{target}``.

    Relies on the module-level ``tokenizer``, ``config``, ``device`` and
    ``label_encoders``.

    Args:
        data_splits: Mapping from target name to
            ``(X_train, X_test, y_train, y_test)``; the X frames must contain
            a 'text' column.
        targets: Iterable of target names to train, each a key of
            ``data_splits`` and ``label_encoders``.

    Returns:
        list: The weighted F1 score of each target, in the order of ``targets``.
    """
    # Initialize an empty list to store F1 scores for each target task
    f1_scores = []

    # Loop through each target (task) for training and evaluation
    for target in targets:
        print(f"\nStarting training for task: {target}")

        # Retrieve the corresponding training and testing splits
        X_train, X_test, y_train, y_test = data_splits[target]

        # Extract the 'text' column for training and testing
        texts_train = X_train['text'].values
        texts_test = X_test['text'].values

        # Create datasets for training and testing
        train_dataset = TextDataset(texts_train, y_train, tokenizer, config['max_len'])
        test_dataset = TextDataset(texts_test, y_test, tokenizer, config['max_len'])

        # Create data loaders for batching during training and testing
        # (only the training loader shuffles; evaluation keeps the split order)
        train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True)
        test_loader = DataLoader(test_dataset, batch_size=config['batch_size'], shuffle=False)

        # Determine the number of labels for classification
        num_labels = len(label_encoders[target].classes_)

        # Load the pre-trained model with the appropriate number of labels for classification
        model = AutoModelForSequenceClassification.from_pretrained(config['model_name'], num_labels=num_labels).to(device)

        # Initialize the optimizer and loss function
        optimizer = optim.Adam(model.parameters(), lr=config['learning_rate'])
        criterion = nn.CrossEntropyLoss()

        # Training loop
        model.train()
        for epoch in range(config['epochs']):
            print(f"Epoch {epoch+1}/{config['epochs']} - Training: {target}")
            progress_bar = tqdm(train_loader, desc=f"Training Epoch {epoch+1}", total=len(train_loader), leave=True)
            for batch in progress_bar:
                optimizer.zero_grad()

                # Get the input data and labels for the current batch.
                # NOTE(review): TextDataset already returns flattened 1-D tensors, so squeeze(1)
                # only changes anything if dimension 1 has size 1 — presumably a no-op here.
                input_ids = batch['input_ids'].squeeze(1).to(device)
                attention_mask = batch['attention_mask'].squeeze(1).to(device)
                labels = batch['label'].to(device)

                # Forward pass; the loss is computed here from the logits rather than by the model
                outputs = model(input_ids, attention_mask=attention_mask)
                loss = criterion(outputs.logits, labels)

                # Backward pass and optimization step
                loss.backward()
                optimizer.step()

                # Update the progress bar with the current loss (loss of this batch only)
                progress_bar.set_postfix(loss=loss.item())

        # Evaluate the model after training
        print(f"Evaluating model for task: {target}")
        model.eval()
        y_preds = []
        y_true = []

        # Evaluate without computing gradients
        with torch.no_grad():
            for batch in tqdm(test_loader, desc="Evaluating", total=len(test_loader), leave=True):
                input_ids = batch['input_ids'].squeeze(1).to(device)
                attention_mask = batch['attention_mask'].squeeze(1).to(device)
                labels = batch['label'].to(device)

                # Forward pass
                outputs = model(input_ids, attention_mask=attention_mask)

                # Get predictions (index of the largest logit per sample) and append them to the list
                _, preds = torch.max(outputs.logits, dim=1)
                y_preds.extend(preds.cpu().numpy())
                y_true.extend(labels.cpu().numpy())

        # Decode the predictions and true labels back to the original label values
        decoded_preds = label_encoders[target].inverse_transform(y_preds)
        decoded_true = label_encoders[target].inverse_transform(y_true)

        # Calculate the weighted F1 score
        f1 = f1_score(decoded_true, decoded_preds, average='weighted')
        f1_scores.append(f1)
        print(f"F1-Score for {target}: {f1}")

        # Print the classification report
        print(f"Classification Report for {target}:\n")
        print(classification_report(decoded_true, decoded_preds, zero_division=0))

        # Save the model and tokenizer for future use
        os.makedirs(f'./model_{target}', exist_ok=True)
        model.save_pretrained(f'./model_{target}')
        tokenizer.save_pretrained(f'./model_{target}')

        # Save the label encoder classes for the current target next to the model,
        # so predictions can be decoded without refitting the encoder
        np.save(f'./model_{target}/{target}_label_encoder.npy', label_encoders[target].classes_)
        print(f"Model and LabelEncoder for {target} saved in './model_{target}'")

    # Return the F1 scores for each target task
    return f1_scores

# Train and evaluate for all targets (subtasks 1 and 2); one weighted F1 score is returned per target, in this order
text_f1_scores = train_and_evaluate_bert(text_splits, targets_subtask1 + targets_subtask2)



Starting training for task: hazard-category


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5 - Training: hazard-category


Training Epoch 1: 100%|██████████| 255/255 [01:26<00:00,  2.96it/s, loss=0.0447]


Epoch 2/5 - Training: hazard-category


Training Epoch 2: 100%|██████████| 255/255 [01:25<00:00,  3.00it/s, loss=0.0163]


Epoch 3/5 - Training: hazard-category


Training Epoch 3: 100%|██████████| 255/255 [01:25<00:00,  2.99it/s, loss=0.0174]


Epoch 4/5 - Training: hazard-category


Training Epoch 4: 100%|██████████| 255/255 [01:25<00:00,  2.99it/s, loss=0.00763]


Epoch 5/5 - Training: hazard-category


Training Epoch 5: 100%|██████████| 255/255 [01:25<00:00,  3.00it/s, loss=0.00675]


Evaluating model for task: hazard-category


Evaluating: 100%|██████████| 64/64 [00:08<00:00,  7.96it/s]


F1-Score for hazard-category: 0.9403554393464585
Classification Report for hazard-category:

                                precision    recall  f1-score   support

                     allergens       0.94      0.99      0.97       377
                    biological       0.99      0.99      0.99       339
                      chemical       0.89      0.97      0.93        68
food additives and flavourings       0.50      0.20      0.29         5
                foreign bodies       0.96      0.99      0.97       111
                         fraud       0.91      0.62      0.74        68
                     migration       0.00      0.00      0.00         1
          organoleptic aspects       0.75      0.60      0.67        10
                  other hazard       0.72      0.78      0.75        27
              packaging defect       1.00      0.55      0.71        11

                      accuracy                           0.94      1017
                     macro avg       0.77

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5 - Training: product-category


Training Epoch 1: 100%|██████████| 255/255 [01:25<00:00,  2.99it/s, loss=1.77]


Epoch 2/5 - Training: product-category


Training Epoch 2: 100%|██████████| 255/255 [01:25<00:00,  2.99it/s, loss=2.98]


Epoch 3/5 - Training: product-category


Training Epoch 3: 100%|██████████| 255/255 [01:25<00:00,  2.99it/s, loss=0.43]


Epoch 4/5 - Training: product-category


Training Epoch 4: 100%|██████████| 255/255 [01:25<00:00,  2.99it/s, loss=0.037]


Epoch 5/5 - Training: product-category


Training Epoch 5: 100%|██████████| 255/255 [01:25<00:00,  2.99it/s, loss=0.35]


Evaluating model for task: product-category


Evaluating: 100%|██████████| 64/64 [00:08<00:00,  7.89it/s]


F1-Score for product-category: 0.7645394969222248
Classification Report for product-category:

                                                   precision    recall  f1-score   support

                              alcoholic beverages       0.71      0.71      0.71         7
                      cereals and bakery products       0.71      0.79      0.75       123
     cocoa and cocoa preparations, coffee and tea       0.67      0.86      0.75        49
                                    confectionery       0.64      0.57      0.61        40
dietetic foods, food supplements, fortified foods       0.68      0.79      0.73        24
                                    fats and oils       1.00      0.50      0.67         4
                                   feed materials       0.00      0.00      0.00         3
                           food contact materials       0.00      0.00      0.00         1
                            fruits and vegetables       0.74      0.79      0.76     

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5 - Training: hazard


Training Epoch 1: 100%|██████████| 255/255 [01:25<00:00,  2.99it/s, loss=0.357]


Epoch 2/5 - Training: hazard


Training Epoch 2: 100%|██████████| 255/255 [01:25<00:00,  2.99it/s, loss=0.0411]


Epoch 3/5 - Training: hazard


Training Epoch 3: 100%|██████████| 255/255 [01:25<00:00,  2.99it/s, loss=0.29]


Epoch 4/5 - Training: hazard


Training Epoch 4: 100%|██████████| 255/255 [01:25<00:00,  2.99it/s, loss=0.0186]


Epoch 5/5 - Training: hazard


Training Epoch 5: 100%|██████████| 255/255 [01:25<00:00,  2.99it/s, loss=0.0187]


Evaluating model for task: hazard


Evaluating: 100%|██████████| 64/64 [00:08<00:00,  7.94it/s]


F1-Score for hazard: 0.8194393623685172
Classification Report for hazard:

                                                   precision    recall  f1-score   support

                                        Aflatoxin       1.00      1.00      1.00         4
                                  alcohol content       0.00      0.00      0.00         1
                                        alkaloids       0.00      0.00      0.00         2
                                        allergens       0.00      0.00      0.00         4
                                           almond       0.88      1.00      0.93        14
             altered organoleptic characteristics       0.00      0.00      0.00         2
                                        amygdalin       0.00      0.00      0.00         2
                           antibiotics, vet drugs       0.00      0.00      0.00         1
                                    bacillus spp.       1.00      1.00      1.00         1
              

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5 - Training: product


Training Epoch 1: 100%|██████████| 255/255 [01:25<00:00,  2.99it/s, loss=6.3]


Epoch 2/5 - Training: product


Training Epoch 2: 100%|██████████| 255/255 [01:25<00:00,  2.99it/s, loss=6.67]


Epoch 3/5 - Training: product


Training Epoch 3: 100%|██████████| 255/255 [01:25<00:00,  2.99it/s, loss=7.04]


Epoch 4/5 - Training: product


Training Epoch 4: 100%|██████████| 255/255 [01:25<00:00,  2.99it/s, loss=6.16]


Epoch 5/5 - Training: product


Training Epoch 5: 100%|██████████| 255/255 [01:25<00:00,  2.99it/s, loss=2.89]


Evaluating model for task: product


Evaluating: 100%|██████████| 64/64 [00:08<00:00,  7.95it/s]


F1-Score for product: 0.23249023103383826
Classification Report for product:

                                                   precision    recall  f1-score   support

                           Catfishes (freshwater)       0.80      0.80      0.80         5
                            Fishes not identified       0.25      1.00      0.40         6
                         Not classified pork meat       0.00      0.00      0.00         3
                       Pangas catfishes (generic)       0.00      0.00      0.00         1
              Precooked cooked pork meat products       0.00      0.00      0.00         1
                                    Veggie Burger       0.00      0.00      0.00         2
                               after dinner mints       0.00      0.00      0.00         1
                                  alfalfa sprouts       0.00      0.00      0.00         0
                                            algae       0.00      0.00      0.00         3
           

# Generate predictions on the test data and print the predictions DataFrame
Here, we load the test dataset, use the trained model to generate predictions, and display the results.


In [13]:
# Import necessary libraries
import os
import torch
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.preprocessing import LabelEncoder

# Load the data to predict on (the validation-set CSV); the first CSV column is used as the DataFrame index
test_path = '/content/drive/MyDrive/Data/validation_data/incidents.csv'
test_df = pd.read_csv(test_path, index_col=0)

# Define the predict function
def predict(texts, model_base_path, target, batch_size=32):
    """Predict decoded labels for ``texts`` with the model saved for ``target``.

    The texts are tokenized and run through the model in chunks of
    ``batch_size`` so memory use does not grow with the number of texts.
    Relies on the module-level ``device``.

    Args:
        texts: Sequence of input strings.
        model_base_path: Directory holding the saved model, tokenizer, and
            ``{target}_label_encoder.npy``.
        target: Target name; selects the label-encoder file.
        batch_size: Number of texts per forward pass. Defaults to 32.

    Returns:
        Array of decoded labels, one per input text (empty for empty input),
        or ``None`` when the label-encoder file does not exist.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    # Check for the label-encoder file first so a missing artifact is reported
    # before any tokenizer or model weights are loaded.
    label_encoder_path = f'{model_base_path}/{target}_label_encoder.npy'
    if not os.path.exists(label_encoder_path):
        print(f"Warning: Label encoder not found for {target} at {label_encoder_path}")
        return None

    # NOTE(security): allow_pickle=True executes pickled data on load; only use it
    # on label files written by this notebook, never on untrusted files.
    classes = np.load(label_encoder_path, allow_pickle=True)

    texts = list(texts)
    if not texts:
        # Nothing to predict; return an empty array with the dtype of the labels
        return classes[:0]

    label_encoder = LabelEncoder()
    label_encoder.classes_ = classes

    # Load the tokenizer and the fine-tuned model saved in the same directory
    tokenizer = AutoTokenizer.from_pretrained(model_base_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_base_path).to(device)

    # Put the model in evaluation mode
    model.eval()

    batch_predictions = []
    # Make predictions with no gradient calculation
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            inputs = tokenizer(
                texts[start:start + batch_size],
                padding=True,  # Pad to the longest sequence in this batch
                truncation=True,  # Truncate sequences longer than max_length
                max_length=512,  # Limit sequence length to 512 tokens
                return_tensors="pt"  # Return PyTorch tensors
            ).to(device)
            logits = model(**inputs).logits
            # Predicted class index per text; moved to CPU so GPU memory is released per batch
            batch_predictions.append(torch.argmax(logits, dim=-1).cpu())

    predicted_ids = torch.cat(batch_predictions).numpy()

    # Decode the predictions using the label encoder
    return label_encoder.inverse_transform(predicted_ids)

# Pick the prediction device: the GPU when CUDA is available, the CPU otherwise
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

# Clean the texts exactly as the training texts were cleaned: the models were fine-tuned on
# clean_text output, so feeding raw text would not match what they saw during training.
# Missing texts become empty strings first so cleaning cannot fail on NaN.
test_texts = test_df['text'].fillna('').astype(str).map(clean_text).tolist()

# Prepare an empty dataframe to store the predictions
predictions = pd.DataFrame()

# Run predictions for all targets using the correct saved model
for column in targets_subtask1 + targets_subtask2:
    # Define the model path dynamically based on the target column
    model_path = f'./model_{column}'

    # Get the decoded predictions for the current target
    decoded_preds = predict(test_texts, model_path, column)

    # If predictions were successfully made, store them in the dataframe
    # (predict returns None when the label-encoder file is missing)
    if decoded_preds is not None:
        predictions[column] = decoded_preds

# Display the final predictions
print("\nFinal Predictions:\n")
print(predictions)


Using device: cuda

Final Predictions:

    hazard-category                                   product-category  \
0        biological                       meat, egg and dairy products   
1        biological                       meat, egg and dairy products   
2        biological                       meat, egg and dairy products   
3         allergens                                  ices and desserts   
4    foreign bodies                       meat, egg and dairy products   
..              ...                                                ...   
560       allergens                              fruits and vegetables   
561       allergens  dietetic foods, food supplements, fortified foods   
562  foreign bodies                        cereals and bakery products   
563       allergens                        cereals and bakery products   
564       allergens       cocoa and cocoa preparations, coffee and tea   

                           hazard                 product  
0          

In [14]:
# Show the predictions table as the cell's output (bare last expression)
predictions

Unnamed: 0,hazard-category,product-category,hazard,product
0,biological,"meat, egg and dairy products",listeria monocytogenes,ham
1,biological,"meat, egg and dairy products",escherichia coli,salads
2,biological,"meat, egg and dairy products",enteroviruses,chicken based products
3,allergens,ices and desserts,pecan nut,ice cream
4,foreign bodies,"meat, egg and dairy products",metal fragment,chicken based products
...,...,...,...,...
560,allergens,fruits and vegetables,cashew,salads
561,allergens,"dietetic foods, food supplements, fortified foods",milk and products thereof,biscuits
562,foreign bodies,cereals and bakery products,plastic fragment,cookies
563,allergens,cereals and bakery products,peanuts and products thereof,biscuits


# Create the submission folder and archive the results
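Finally, the predictions are written to `submission.csv` in a submission directory on Google Drive, and that directory is zipped for easy sharing or evaluation. The fine-tuned models stay in the local `./model_<target>` folders and are not copied into the submission directory.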


In [15]:
import os
from shutil import make_archive
import pandas as pd
from google.colab import drive

# Define the Google Drive path where you want to save the files
output_folder = '/content/drive/MyDrive/submission_finetunedPUBMEDBERT/'

# Create the folder in Google Drive if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Save predictions to a CSV file named 'submission.csv' inside the folder
predictions.to_csv(os.path.join(output_folder, 'submission.csv'), index=False)

# Zip the folder for submission.
# make_archive appends '.zip' to its base name. With the trailing '/' kept, that name can
# resolve to a hidden '.zip' file inside the very folder being archived (depending on the
# Python version), so strip the separator to create the archive next to the folder instead.
archive_base = output_folder.rstrip('/')
archive_path = make_archive(archive_base, 'zip', root_dir=output_folder)

# Print confirmation message
print(f"Submission saved to Google Drive at {output_folder}")
print(f"Archive created at {archive_path}")


Submission saved to Google Drive at /content/drive/MyDrive/submission_finetunedPUBMEDBERT/
