[link text](https://)# Pre-Trained Word Embeddings for Text Classification

# Imports/Installations

In [8]:
!pip install transformers
!pip install -U datasets
!pip install setfit

from datasets import load_dataset
from sentence_transformers.losses import CosineSimilarityLoss

from setfit import SetFitModel, SetFitTrainer



In [9]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [10]:
from collections import defaultdict, Counter
import json
import numpy as np
import torch
import pandas as pd
import random
random_seed = 10
random.seed(random_seed)


from matplotlib import pyplot as plt

from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel
from datasets import load_dataset, DatasetDict
from torch.utils.data import DataLoader
from transformers import DistilBertConfig, DistilBertTokenizer, DistilBertForSequenceClassification, DistilBertModel
from transformers import get_linear_schedule_with_warmup
from tqdm.notebook import tqdm
from torch.optim import AdamW
from transformers import set_seed
from sklearn.metrics import accuracy_score, f1_score
from datasets import load_dataset, DatasetDict, Dataset
from torch.utils.data import DataLoader

## New Way of Choosing Data

In [11]:
from sklearn.model_selection import train_test_split
dominant_topic_df = pd.read_csv('/content/gdrive/MyDrive/Group 3: palm oil topic classifier/Data/Labeled Data/dominant_topic_results.csv')

# Assuming dominant_topic_df is the DataFrame you want to split
# Define the features (X) and the target (y)
# Here, we'll use the 'Text' column as the feature
X = dominant_topic_df['Text']

# And the topic probability columns and Dominant_Topic as the target (y)
# Changed to include Topic_0 through Topic_5 instead of the percentage columns
topic_columns = [col for col in dominant_topic_df.columns if col.startswith('Topic_') and len(col) == 7 and not col.endswith('_Perc')]
y = dominant_topic_df[['Document_Num'] + topic_columns]


# Perform the 80/20 split
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,  # 20% for validation
    random_state=random_seed # Use the predefined random_seed for reproducibility
    # Stratify is not directly applicable to multi-output targets like this,
    # so we will remove it.
)

print("Training set size:", len(X_train))
print("Validation set size:", len(X_val))
print("\nShape of X_train:", X_train.shape)
print("Shape of X_val:", X_val.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_val:", y_val.shape)

y

Training set size: 309
Validation set size: 78

Shape of X_train: (309,)
Shape of X_val: (78,)
Shape of y_train: (309, 7)
Shape of y_val: (78, 7)


Unnamed: 0,Document_Num,Topic_0,Topic_1,Topic_2,Topic_3,Topic_4,Topic_5
0,0,0,0,0,0,1,0
1,1,0,0,0,0,1,0
2,2,1,0,0,0,0,0
3,3,0,0,1,0,0,0
4,4,0,0,0,0,0,1
...,...,...,...,...,...,...,...
382,382,1,0,0,0,0,0
383,383,0,0,1,0,0,1
384,384,0,1,0,0,0,0
385,385,0,1,0,1,0,0


# Loading and formatting CSVs

In [12]:
# df = pd.read_csv('/content/gdrive/MyDrive/Group 3: palm oil topic classifier/Data/Labeled Data/BERTopic_results.csv')
# test = pd.read_csv('/content/gdrive/MyDrive/Group 3: palm oil topic classifier/Data/Labeled Data/BERTopic_TEST.csv')
# train = pd.read_csv('/content/gdrive/MyDrive/Group 3: palm oil topic classifier/Data/Labeled Data/BERTopic_TRAIN.csv')
# validation = pd.read_csv('/content/gdrive/MyDrive/Group 3: palm oil topic classifier/Data/Labeled Data/BERTopic_VAL.csv')

# # turning into hugging face format
# test = Dataset.from_pandas(test)
# train = Dataset.from_pandas(train)
# validation = Dataset.from_pandas(validation)

In [13]:
# df = pd.read_csv('/content/gdrive/MyDrive/Group 3: palm oil topic classifier/Data/Labeled Data/BERTopic_results.csv')
# test = pd.read_csv('/content/gdrive/MyDrive/Group 3: palm oil topic classifier/Data/Labeled Data/BERTopic_TEST.csv')
# train = pd.read_csv('/content/gdrive/MyDrive/Group 3: palm oil topic classifier/Text Classification Models/Classification Data/Base/BERTopic_TRAIN_80.csv')
# validation = pd.read_csv('/content/gdrive/MyDrive/Group 3: palm oil topic classifier/Text Classification Models/Classification Data/Base/BERTopic_VAL_20.csv')
# # dominant_topic_df = pd.read_csv('/content/gdrive/MyDrive/Group 3: palm oil topic classifier/Data/Labeled Data/dominant_topic_results.csv')

# # turning into hugging face format
# test = Dataset.from_pandas(test)
# train = Dataset.from_pandas(train)
# validation = Dataset.from_pandas(validation)

In [14]:
test_df = pd.read_csv('/content/gdrive/MyDrive/Group 3: palm oil topic classifier/Text Classification Models/gbt_classified_grievances')

# test_df = test_df[['pk', 'summary', 'GPT_Topic_0', 'GPT_Topic_1', 'GPT_Topic_2', 'GPT_Topic_3', 'GPT_Topic_4', "GPT_Topic_5"]]
test_df = test_df[['pk', 'summary', 'Topic_0', 'Topic_1', 'Topic_2', 'Topic_3', 'Topic_4', "Topic_5"]]

display(test_df.head())

FileNotFoundError: [Errno 2] No such file or directory: '/content/gdrive/MyDrive/Group 3: palm oil topic classifier/Text Classification Models/gbt_classified_grievances'

In [None]:
# word_counts = df['Text'].astype(str).apply(lambda x: len(x.split()))

# max_words = word_counts.max()
# min_words = word_counts.min()
# mean_words = word_counts.mean()

# print(f"Max words in summary: {max_words}")
# print(f"Min words in summary: {min_words}")
# print(f"Mean words in summary: {mean_words}")


# Dataset Tokenization

## Example Tokenization

In [None]:
name = "SpanBERT/spanbert-large-cased"
tokenizer = AutoTokenizer.from_pretrained(name)

sample_input = "We want to use a pretrained tokenizer."
tokenized_inputs = tokenizer(sample_input,
                             return_tensors="pt",
                             padding=True,
                             truncation=True,
                             max_length=512)
print(tokenized_inputs["input_ids"])

We will use the function that we use to test the tokenizer on a single input.

## Tokenized Train Dataset

In [None]:
tokenizer_length = 512

In [None]:
# tokenizer = AutoTokenizer.from_pretrained("SpanBERT/spanbert-large-cased")

# # Apply tokenization using map
# tokenized_train = train.map(
#     lambda example: tokenizer(example['Text'],
#                              padding="max_length",
#                              truncation=True,
#                              max_length=tokenizer_length)  # not sure what length to use
# )

# # Remove the original text column (we don't need it after tokenization)
# tokenized_train = tokenized_train.remove_columns(['Text'])

# # Rename 'Dominant_Topic' to 'labels' (standard for transformers)
# tokenized_train = tokenized_train.rename_column("Dominant_Topic", "labels")

# # Step 4: Set format to torch tensors
# tokenized_train.set_format("torch")

# # Check the results
# print("Tokenized dataset features:", tokenized_train.column_names)
# print("Dataset size:", len(tokenized_train))
# print("\nSample data shapes:")
# #print(f"- input_ids: {tokenized_train[0]['input_ids'].shape}")
# print(f"- attention_mask: {tokenized_train[0]['attention_mask'].shape}")
# print(f"- labels: {tokenized_train[0]['labels']}")
# print(f"- labels type: {type(tokenized_train[0]['labels'])}")

In [None]:
topic_columns = [col for col in dominant_topic_df.columns if col.startswith('Topic_') and len(col) == 7 and not col.endswith('_Perc')]
topic_columns_test = [col for col in test_df.columns if col.startswith('GPT_Topic_') and len(col) == 10 and not col.endswith('_Perc')]
train_dataset = Dataset.from_pandas(pd.DataFrame({'Text': X_train, 'labels': y_train[topic_columns].values.tolist()}))
val_dataset = Dataset.from_pandas(pd.DataFrame({'Text': X_val, 'labels': y_val[topic_columns].values.tolist()}))

# Apply tokenization using map
tokenized_train = train_dataset.map(
    lambda example: tokenizer(example['Text'],
                             padding="max_length",
                             truncation=True,
                             max_length=tokenizer_length),
    batched=True
)

# Remove the original text column (we don't need it after tokenization)
tokenized_train = tokenized_train.remove_columns(['Text'])

# Set format to torch tensors
tokenized_train.set_format("torch")

# Check the results
print("Tokenized training dataset features:", tokenized_train.column_names)
print("Training dataset size:", len(tokenized_train))
print("\nSample training data shapes:")
print(f"- input_ids: {tokenized_train[0]['input_ids'].shape}")
print(f"- attention_mask: {tokenized_train[0]['attention_mask'].shape}")
print(f"- labels shape: {tokenized_train[0]['labels'].shape}")
print(f"- labels type: {type(tokenized_train[0]['labels'])}")


tokenized_validation = val_dataset.map(
    lambda example: tokenizer(example['Text'],
                             padding="max_length",
                             truncation=True,
                             max_length=tokenizer_length,),
    batched=True
)

tokenized_validation = tokenized_validation.remove_columns(['Text'])

tokenized_validation.set_format("torch")

print("\nTokenized validation dataset size:", len(tokenized_validation))
print("Sample validation data shapes:")
print(f"- input_ids: {tokenized_validation[0]['input_ids'].shape}")
print(f"- attention_mask: {tokenized_validation[0]['attention_mask'].shape}")
print(f"- labels shape: {tokenized_validation[0]['labels'].shape}")
print(f"- labels type: {type(tokenized_validation[0]['labels'])}")


# Test
# Separate features and labels for the test set
# Use 'summary' as the text feature and the multi-hot encoded topics as the labels
X_test = test_df['summary']
# Convert the multi-hot encoded topic labels to a list of lists
y_test_labels_list = test_df[topic_columns_test].values.tolist()
test_pk = test_df['pk'] # Keep 'pk' separately

test_dataset = Dataset.from_pandas(pd.DataFrame({'Text': X_test, 'labels': y_test_labels_list, 'pk': test_pk}))

tokenized_test = test_dataset.map(
     lambda example: tokenizer(example['Text'],
                              padding="max_length",
                              truncation=True,
                              max_length=tokenizer_length),
    batched=True
)

tokenized_test = tokenized_test.remove_columns(['Text'])
tokenized_test.set_format("torch")

print("\nTokenized test dataset size:", len(tokenized_test))
print("Sample test data shapes:")
print(f"- input_ids: {tokenized_test[0]['input_ids'].shape}")
print(f"- attention_mask: {tokenized_test[0]['attention_mask'].shape}")
# The labels for the test set are now the multi-hot encoded labels
print(f"- labels: {tokenized_test[0]['labels']}")
print(f"- labels type: {type(tokenized_test[0]['labels'])}")
print(f"- pk: {tokenized_test[0]['pk']}")
print(f"- pk type: {type(tokenized_test[0]['pk'])}")

## Tokenized Test Dataset

In [None]:
# # Apply the same process to your test set
# tokenized_test = test.map(
#     lambda example: tokenizer(example['Text'],
#                              padding="max_length",
#                              truncation=True,
#                              max_length=tokenizer_length)
# )

# tokenized_test = tokenized_test.remove_columns(['Text'])
# tokenized_test = tokenized_test.rename_column("Dominant_Topic", "labels")
# tokenized_test.set_format("torch")

# print("Test dataset size:", len(tokenized_test))

In [None]:
# tokenized_test['labels']

In [None]:
tokenized_train['labels'][5]

## Tokenized Validation Dataset

In [None]:
# Apply the same process to your test set
tokenized_validation = validation.map(
    lambda example: tokenizer(example['Text'],
                             padding="max_length",
                             truncation=True,
                             max_length=tokenizer_length,)
)

tokenized_validation = tokenized_validation.remove_columns(['Text'])
tokenized_validation = tokenized_validation.rename_column("Dominant_Topic", "labels")
tokenized_validation.set_format("torch")

print("Test dataset size:", len(tokenized_validation))

## Check Tokenization on train and test

In [None]:
#lets check our tokenization for a few samples
tokenized_train[0:2]

In [None]:
# #lets check our tokenization for a few samples
# tokenized_test[0:2]

In [None]:
tokenized_validation[0:2]

# Using DataLoader to Batchify data
Make sure to send your datasets to the Dataloader in order to segment your dataset into batches. Remember that we need batches to run our iterative optimization procedure which is typically some form of Mini-batch Gradient Descent.

In the interest of time we want to finetune our model on a sample of the training set with 309 records instead of the entire 62K sample size

## batch_size

In [None]:
batch_size = 4

In [None]:
# dataset = load_dataset("SetFit/SentEval-CR")
tokenized_train

In [None]:
# df = pd.read_csv('/content/gdrive/MyDrive/Group 3: palm oil topic classifier/Data/Labeled Data/BERTopic_results.csv')
# test = pd.read_csv('/content/gdrive/MyDrive/Group 3: palm oil topic classifier/Data/Labeled Data/BERTopic_TEST.csv')
# train = pd.read_csv('/content/gdrive/MyDrive/Group 3: palm oil topic classifier/Text Classification Models/Classification Data/Base/BERTopic_TRAIN_80.csv')
# validation = pd.read_csv('/content/gdrive/MyDrive/Group 3: palm oil topic classifier/Text Classification Models/Classification Data/Base/BERTopic_VAL_20.csv')

# turning into hugging face format
test = Dataset.from_pandas(test)
train = Dataset.from_pandas(train)
validation = Dataset.from_pandas(validation)

In [None]:
train_dataset = tokenized_train.shuffle(seed=1111).select(range(305))
train_dataloader = DataLoader(train_dataset, batch_size=batch_size)
eval_dataloader = DataLoader(tokenized_validation, batch_size=batch_size)

In [None]:
# # Select N examples per class (8 in this case)
# train_ds = tokenized_train.shuffle(seed=42).select(range(8 * 6))
# test_ds = dataset["test"]

# Training and Validation
We have now gone through all the required preprocessing to prep the data for training. Instead of the Trainer module, it will be a good practice to initially write our own training loops so that we are mindful of all the steps that required for training neural networks.

Other than our training and validation data we need to select:

An optimizer to run backpropagation
A scheduler that sets a protocol for parameter updates at the end of a batch
We would also like to set a seed at the start of computation. This ensures that we are able to generate reproducicble results across multiple training sessions.

We run validation at the end of each epoch.

**At the end of this step we want to report the best validation loss obtained during training. We also want to save the model corresponding to the epoch that reported the best validation loss.**

In [None]:
model_path = '/content/gdrive/MyDrive/Group 3: palm oil topic classifier/Text Classification Models/Spanbert_Saved_Models/Spanbert '

In [None]:
from torch.nn.utils import clip_grad_norm_
set_seed(42)

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = AutoModelForSequenceClassification.from_pretrained(
    "SpanBERT/spanbert-large-cased",
    num_labels=6
).to(device)

num_epochs = 20
num_training_steps = len(train_dataloader) * num_epochs

# optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)
optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay=0.005)  # Halved Learning Rate

# lr_scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * num_training_steps),  # 10% warmup
    num_training_steps=num_training_steps
)

# For Graph
x_epochs = []
y_train = []
y_val = []

best_val_loss = float("inf")

early_stopping = True
early_count = 0 # +1 every time validation loss doesnt improve

progress_bar = tqdm(range(num_training_steps))
for epoch in range(num_epochs):
    x_epochs.append(epoch)

    # training
    model.train()
    training_losses = []
    for batch_i, batch in enumerate(train_dataloader):

        optimizer.zero_grad()

        # copy input to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # output = model(**batch)
        output = model(input_ids, attention_mask=attention_mask, labels=labels)
        training_loss = output.loss
        training_losses.append(training_loss.item())

        #backprop and update params by taking an optimization step
        output.loss.backward()
        optimizer.step()
        lr_scheduler.step()
        progress_bar.update(1)
    print(f"Epoch {epoch}:")
    print("Mean Training Loss", np.mean(training_losses))
    y_train.append(np.mean(training_losses))

    # validation
    val_loss = 0
    model.eval() #important to call because we dont want to collect gradients
    for batch_i, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            # copy input to device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            # output = model(**batch)
            output = model(input_ids, attention_mask=attention_mask, labels=labels)
        val_loss += output.loss

    avg_val_loss = val_loss / len(eval_dataloader)
    print(f"Validation loss: {avg_val_loss}")
    y_val.append(avg_val_loss.cpu())

    if avg_val_loss < best_val_loss:
        print("Saving checkpoint!")
        early_count = 0 # Reset counter
        best_val_loss = avg_val_loss
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            # 'optimizer_state_dict': optimizer.state_dict(),
            'val_loss': best_val_loss,
            },
            f"{model_path}best_model.pt"
        )
    elif early_stopping:
        early_count += 1

        if early_count == 5:
            print(f"Validation loss has not improved for {early_count} iterations; Early Stopping.")
            break

    print()

In [None]:
# model = SetFitModel.from_pretrained("sentence-transformers/paraphrase-mpnet-base-v2")

# trainer = SetFitTrainer(
#     model=model,
#     train_dataset=train_ds,
#     eval_dataset=test_ds,
#     loss_class=CosineSimilarityLoss,
#     batch_size=16,
#     num_iterations=20,
#     num_epochs=1,
#     learning_rate=1e-5,   # <-- adjust this
#     weight_decay=0.005     # <-- optional
# )

# # For Graph
# x_epochs = []
# y_train = []
# y_val = []

# best_val_loss = float("inf")

# early_stopping = True
# early_count = 0 # +1 every time validation loss doesnt improve

# progress_bar = tqdm(range(num_training_steps))
# for epoch in range(num_epochs):
#     x_epochs.append(epoch)

#     # training
#     trainer.train()
#     metrics = trainer.evaluate()
#     training_losses = []
#     for batch_i, batch in enumerate(train_dataloader):

#         optimizer.zero_grad()

#         # copy input to device
#         input_ids = batch['input_ids'].to(device)
#         attention_mask = batch['attention_mask'].to(device)
#         labels = batch['labels'].to(device)

#         # output = model(**batch)
#         output = model(input_ids, attention_mask=attention_mask, labels=labels)
#         training_loss = output.loss
#         training_losses.append(training_loss.item())

#         #backprop and update params by taking an optimization step
#         output.loss.backward()
#         optimizer.step()
#         lr_scheduler.step()
#         progress_bar.update(1)
#     print(f"Epoch {epoch}:")
#     print("Mean Training Loss", np.mean(training_losses))
#     y_train.append(np.mean(training_losses))

#     # validation
#     val_loss = 0
#     model.eval() #important to call because we dont want to collect gradients
#     for batch_i, batch in enumerate(eval_dataloader):
#         with torch.no_grad():
#             # copy input to device
#             input_ids = batch['input_ids'].to(device)
#             attention_mask = batch['attention_mask'].to(device)
#             labels = batch['labels'].to(device)
#             # output = model(**batch)
#             output = model(input_ids, attention_mask=attention_mask, labels=labels)
#         val_loss += output.loss

#     avg_val_loss = val_loss / len(eval_dataloader)
#     print(f"Validation loss: {avg_val_loss}")
#     y_val.append(avg_val_loss.cpu())

#     if avg_val_loss < best_val_loss:
#         print("Saving checkpoint!")
#         early_count = 0 # Reset counter
#         best_val_loss = avg_val_loss
#         torch.save({
#             'epoch': epoch,
#             'model_state_dict': model.state_dict(),
#             # 'optimizer_state_dict': optimizer.state_dict(),
#             'val_loss': best_val_loss,
#             },
#             f"{model_path}best_model.pt"
#         )
#     elif early_stopping:
#         early_count += 1

#         if early_count == 5:
#             print(f"Validation loss has not improved for {early_count} iterations; Early Stopping.")
#             break

#     print()

In [None]:
!pip install optuna

In [None]:
# plt.plot(x_epochs, y_val)

plt.plot(x_epochs, y_val, label="Validation Loss")
plt.plot(x_epochs, y_train, label="Mean Training Loss")
plt.legend()
plt.style.use('fivethirtyeight')
plt.title("Validation Loss")
plt.xlabel("Epochs")

In [None]:
# Load the best model
model = AutoModelForSequenceClassification.from_pretrained(
    "SpanBERT/spanbert-large-cased",
    num_labels=6
).to(device)

best_model_path = f"{model_path}best_model.pt"
checkpoint = torch.load(best_model_path, map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
model.eval() # Set the model to evaluation mode

# Create a DataLoader for the test set
test_dataloader = DataLoader(tokenized_test, batch_size=batch_size)

predictions = []
pk_values = [] # To store pk values

with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Predicting on test data"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Move pk to CPU and convert to numpy
        pk = batch['pk'].cpu().numpy()
        pk_values.extend(pk)


        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Apply sigmoid to get probabilities
        probs = torch.sigmoid(logits)
        predictions.extend(probs.cpu().numpy())

# Convert predictions to a DataFrame
prediction_df = pd.DataFrame(predictions, columns=[f'Topic_{i}_Prob' for i in range(6)])

# Add the original text and pk back to the prediction DataFrame
# We need to get the original text and pk from the test_df or tokenized_test dataset
# Since tokenized_test has 'pk' and we can get 'summary' from test_df using 'pk'
# Let's create a mapping from pk to summary from the original test_df
pk_to_text = test_df.set_index('pk')['summary'].to_dict()

# Add 'pk' and 'summary' to the prediction_df
prediction_df['pk'] = pk_values # Use the collected numpy array of pk values
prediction_df['summary'] = prediction_df['pk'].map(pk_to_text)

# Rearrange columns to have pk and summary first
cols = ['pk', 'summary'] + [col for col in prediction_df.columns if col not in ['pk', 'summary']]
prediction_df = prediction_df[cols]

display(prediction_df.head())

In [None]:
# # Modify the location per model
# model_location = "/content/gdrive/MyDrive/Group 3: palm oil topic classifier/Text Classification Models/Spanbert_Saved_Models/Spanbert best_model.pt"
# checkpoint = torch.load(model_location)

# # Recreate the model architecture
# model = AutoModelForSequenceClassification.from_pretrained(
#     "SpanBERT/spanbert-large-cased",
#     num_labels=6
# ).to(device)
# #
# # Load the saved weights
# model.load_state_dict(checkpoint["model_state_dict"])
# model.to(device)
# model.eval()

In [None]:
# from sklearn.model_selection import train_test_split

# # Assuming dominant_topic_df is the DataFrame you want to split
# # Define the features (X) and the target (y)
# # Here, we'll use the 'Text' column as the feature
# X = dominant_topic_df['Text']

# # And the topic probability columns and Dominant_Topic as the target (y)
# # Changed to include Topic_0 through Topic_5 instead of the percentage columns
# topic_columns = [col for col in dominant_topic_df.columns if col.startswith('Topic_') and len(col) == 7 and not col.endswith('_Perc')]
# y = dominant_topic_df[['Document_Num'] + topic_columns]


# # Perform the 80/20 split
# X_train, X_val, y_train, y_val = train_test_split(
#     X, y,
#     test_size=0.2,  # 20% for validation
#     random_state=random_seed # Use the predefined random_seed for reproducibility
#     # Stratify is not directly applicable to multi-output targets like this,
#     # so we will remove it.
# )

# print("Training set size:", len(X_train))
# print("Validation set size:", len(X_val))
# print("\nShape of X_train:", X_train.shape)
# print("Shape of X_val:", X_val.shape)
# print("Shape of y_train:", y_train.shape)
# print("Shape of y_val:", y_val.shape)

# y

In [None]:
test_df = pd.read_csv('/content/gdrive/MyDrive/Group 3: palm oil topic classifier/Text Classification Models/classified_grievances_multilabel.csv')

test_df = test_df[['pk', 'summary', 'Topic_0', 'Topic_1', 'Topic_2', 'Topic_3', 'Topic_4', "Topic_5"]]

#### **Evaluate your model on Test Data**

Now we use our finetuned model to evaluate the test set. We use performance metrics from `sklearn.metrics` to test the effectiveness of our model on unseen test data.

In order to do that, run the finetuned model you have just saved on your test data and report the following performance metrics:



*   Accuracy
*   F1 Score

In [None]:
eval_dataloader = DataLoader(tokenized_validation, batch_size=len(tokenized_validation))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.eval()
test_batch_logits = []
y_true = []
for batch_i, batch in enumerate(eval_dataloader):
    with torch.no_grad():
        # copy input to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].cpu().detach().numpy()

        # output = model(**batch)
        output = model(input_ids, attention_mask=attention_mask)
        test_batch_logits.append(output.logits)
        y_true.extend(labels)

In [None]:
print(len(test_batch_logits),len(eval_dataloader))
test_logits = torch.cat(test_batch_logits, dim=0)

#sanity check -> dimension 0 of your logits tensor should be same as the size of the test dataset
print(test_logits.shape,len(tokenized_validation),len(y_true))

In [None]:
#Convert the logits to predicted labels
y_pred = torch.argmax(test_logits, dim = 1).cpu().numpy()
print(y_true[:10])
print(y_pred[:10])

#sanity check: should have as many predictions as labels
assert len(y_pred)==len(y_true)

## F1 & Accuracy Scores

In [None]:
print('F1 Score (macro):', f1_score(y_true, y_pred, average='macro'))
print('F1 Score (weighted):', f1_score(y_true, y_pred, average='weighted'))
print('F1 Score (micro):', f1_score(y_true, y_pred, average='micro'))
print('Accuracy Score:', accuracy_score(y_true, y_pred))

In [None]:
from sklearn.metrics import classification_report

print(classification_report(y_true, y_pred, target_names=['Failed Compensation/Land Rights??? (0)',
                                                          'Environmental Impact??? (1)',
                                                          'Administrative??? (2)',
                                                          'Deforestation??? (3)',
                                                          'Labour Rights (4)',
                                                          'Illegal or Contaminated FFB (5)']))

In [None]:
from sklearn.metrics import classification_report

print(classification_report(y_true, y_pred, target_names=['0','1','2','3','4','5']))

# Predicting

In [None]:
model_name = "SpanBERT"

## Evaluate

In [None]:
# Load new data for classification
new_data_path = '/content/gdrive/MyDrive/Group 3: palm oil topic classifier/Data/NEW_grievances_formatted.csv'
new_df = pd.read_csv(new_data_path)

print(f"Loaded {len(new_df)} entries for classification")
print("Columns in new data:", new_df.columns.tolist())

# Prepare the text data (assuming 'summary' column contains the text to classify)
# If your text column has a different name, change 'summary' below
text_column = 'Text'

# Convert to Dataset format
new_dataset = Dataset.from_pandas(new_df[[text_column]])

# Tokenize the new data
tokenized_new_data = new_dataset.map(
    lambda example: tokenizer(str(example[text_column]),  # Convert to string to handle NaN
                             padding="max_length",
                             truncation=True,
                             max_length=tokenizer_length)
)

# Set format to torch tensors
tokenized_new_data.set_format("torch")

# Create DataLoader for inference
inference_dataloader = DataLoader(tokenized_new_data, batch_size=batch_size)

# Run inference
model.eval()   # LOAD BEST MSODEL
all_predictions = []

with torch.no_grad():
    for batch in tqdm(inference_dataloader, desc="Classifying"):
        # Move batch to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # Get model predictions
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Convert logits to predicted labels
        predictions = torch.argmax(logits, dim=1).cpu().numpy()
        all_predictions.extend(predictions)


# Read Classification CSV
results_df = pd.read_csv('/content/gdrive/MyDrive/Group 3: palm oil topic classifier/Text Classification Models/classified_grievances.csv')
results_df[f'{model_name}_label'] = all_predictions[:120]


print(f"\nClassification complete!")
print(f"Label distribution:")
# Add Label
print(results_df[f'{model_name}_label'].value_counts().sort_index())

output_path = '/content/gdrive/MyDrive/Group 3: palm oil topic classifier/Text Classification Models/classified_grievances.csv'
results_df.to_csv(output_path, index=False)

print("\nFirst 10 classified entries:")
print(results_df[['summary', f'{model_name}_label']].head(10))

In [None]:
results_df

In [None]:
label_map = {
    0: 'Failed Compensation/Land Rights',
    1: 'Environmental Impact',
    2: 'Administrative',
    3: 'Deforestation',
    4: 'Labour Rights',
    5: 'Illegal or Contaminated FFB'
}

results_df[f'{model_name}_topic'] = results_df[f'{model_name}_label'].map(label_map)
results_df

In [None]:
import matplotlib.pyplot as plt

topic_counts = results_df[f'{model_name}_topic'].value_counts()
print(topic_counts)

plt.figure(figsize=(10, 6))
topic_counts.plot(kind='bar', edgecolor='black')
plt.title('Number of Grievances per Topic')
plt.xlabel('Topic')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
results_df[results_df[f'{model_name}_label'] == 0]

## Comparison between Manual Labels and Transforemers' labels

In [None]:
results_df

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Get counts
manual_counts = results_df['manual_label'].value_counts()
distilbert_counts = results_df['DistilBERT_label'].value_counts()
spanbert_counts = results_df['SpanBERT_label'].value_counts()
electra_counts = results_df['Electra_label'].value_counts()


combined_counts = pd.DataFrame({
    'Manual': manual_counts,
    'DistilBERT': distilbert_counts,
    'SpanBERT': spanbert_counts
}).fillna(0)

combined_counts = combined_counts.sort_index()

# Plot
combined_counts.plot(kind='bar', figsize=(12, 6), edgecolor='black')
plt.title('Number of Grievances per Topic: Manual vs DistilBERT vs SpanBERT')
plt.xlabel('Topic')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')
plt.legend(title='Label')
plt.tight_layout()
plt.show()

## Accuracy and F1 for Testing Data

In [None]:
# --- Evaluation ---
true_labels = results_df['manual_label'].astype(int).values
pred_labels = np.array(all_predictions)

print("Evaluation on New Data:")
print("F1 Score (macro):", f1_score(true_labels, pred_labels, average='macro'))
print("F1 Score (weighted):", f1_score(true_labels, pred_labels, average='weighted'))
print("F1 Score (micro):", f1_score(true_labels, pred_labels, average='micro'))
print("Accuracy:", accuracy_score(true_labels, pred_labels))

# Classification Report

print("\nClassification Report:")
print(classification_report(true_labels, pred_labels, target_names=[
    'Failed Compensation/Land Rights',
    'Environmental Impact',
    'Administrative',
    'Deforestation',
    'Labour Rights',
    'Illegal or Contaminated FFB'
]))

In [None]:
# Modify the location per model
model_location = "/content/gdrive/MyDrive/Group 3: palm oil topic classifier/Text Classification Models/Spanbert_Saved_Models/Spanbert best_model.pt"
checkpoint = torch.load(model_location)

# Recreate the model architecture
model = AutoModelForSequenceClassification.from_pretrained(
    "SpanBERT/spanbert-large-cased",
    num_labels=6
).to(device)

# Load the saved weights
model.load_state_dict(checkpoint["model_state_dict"])
model.to(device)
model.eval()

In [None]:
for i in range(6):
  print(f"Length of results with manual label {i} is: {len(results_df[results_df['manual_label'] == i])}")

In [None]:
# Load the best model
best_model_path = f"{model_path}best_model.pt"
checkpoint = torch.load(best_model_path, map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
model.eval() # Set the model to evaluation mode

# Create a DataLoader for the test set
test_dataloader = DataLoader(tokenized_test, batch_size=batch_size)

predictions = []
pk_values = [] # To store pk values

with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Predicting on test data"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Move pk to CPU and convert to numpy
        pk = batch['pk'].cpu().numpy()
        pk_values.extend(pk)


        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Apply sigmoid to get probabilities
        probs = torch.sigmoid(logits)
        predictions.extend(probs.cpu().numpy())

# Convert predictions to a DataFrame
prediction_df = pd.DataFrame(predictions, columns=[f'Topic_{i}_Prob' for i in range(6)])

# Add the original text and pk back to the prediction DataFrame
# We need to get the original text and pk from the test_df or tokenized_test dataset
# Since tokenized_test has 'pk' and we can get 'summary' from test_df using 'pk'
# Let's create a mapping from pk to summary from the original test_df
pk_to_text = test_df.set_index('pk')['summary'].to_dict()

# Add 'pk' and 'summary' to the prediction_df
prediction_df['pk'] = pk_values # Use the collected numpy array of pk values
prediction_df['summary'] = prediction_df['pk'].map(pk_to_text)

# Rearrange columns to have pk and summary first
cols = ['pk', 'summary'] + [col for col in prediction_df.columns if col not in ['pk', 'summary']]
prediction_df = prediction_df[cols]

display(prediction_df.head())

In [None]:
from sklearn.metrics import accuracy_score, f1_score

# Define the threshold for converting probabilities to binary labels
evaluation_threshold = 0.5

# Get the predicted probabilities for the topic columns from prediction_df
topic_prob_columns = [col for col in prediction_df.columns if col.startswith('Topic_') and col.endswith('_Prob')]
predicted_probs = prediction_df[topic_prob_columns].values

# Convert probabilities to binary predictions using the threshold
binary_predictions = (predicted_probs >= evaluation_threshold).astype(int)

# Get the true multi-hot encoded labels from test_df
# Ensure alignment with prediction_df based on 'pk'
# Assuming test_df contains 'pk' and 'Topic_0' through 'Topic_5'
true_labels_for_eval = pd.merge(prediction_df[['pk']], test_df[['pk'] + [f'Topic_{i}' for i in range(6)]], on='pk', how='left')

# Extract the true labels in the correct order
true_labels_values = true_labels_for_eval[[f'Topic_{i}' for i in range(6)]].values

# Create a DataFrame for binary predictions (optional, but requested)
binary_prediction_df = pd.DataFrame(binary_predictions, columns=[f'Topic_{i}' for i in range(6)])
# Add pk for potential merging or inspection
binary_prediction_df['pk'] = prediction_df['pk']
# Rearrange columns
cols = ['pk'] + [col for col in binary_prediction_df.columns if col != 'pk']
binary_prediction_df = binary_prediction_df[cols]

# Display the head of the binary predictions DataFrame
print("Binary Predictions (first 5 rows):")
display(binary_prediction_df.head(20))

# Calculate Accuracy
accuracy = accuracy_score(true_labels_values, binary_predictions)

# Calculate F1 Score (using micro average for multi-label)
f1 = f1_score(true_labels_values, binary_predictions, average='micro')

print(f"\nEvaluation Results (Threshold = {evaluation_threshold}):")
print(f"Accuracy: {accuracy:.4f}")
print(f"Micro F1 Score: {f1:.4f}")

# Note: You can easily change the 'evaluation_threshold' variable above and re-run this cell
# to see the impact on accuracy and F1 score.

In [None]:
# Calculate the number of correctly classified topics for each topic
correctly_classified_counts = {}
total_instances_per_topic = np.sum(true_labels_values, axis=0) # Total true instances for each topic

print("\nEvaluation per Topic (Threshold =", evaluation_threshold, "):")
print("------------------------------------------------------------")

for i in range(6):
    topic_col_name = f'Topic_{i}'
    # Count where binary prediction matches true label for the current topic
    correct_predictions_for_topic = np.sum(binary_predictions[:, i] == true_labels_values[:, i])
    correctly_classified_counts[topic_col_name] = correct_predictions_for_topic

    # Calculate percentage of correctly classified instances for this topic
    # This is the overall accuracy for this specific binary classification task
    total_instances = len(true_labels_values) # Total number of documents
    percentage_correct = (correct_predictions_for_topic / total_instances) * 100

    # Calculate F1 Score for the current label
    f1 = f1_score(true_labels_values[:, i], binary_predictions[:, i])


    print(f"{topic_col_name}: Correctly Classified = {correct_predictions_for_topic}, Total Instances = {total_instances}, Percentage Correct = {percentage_correct:.2f}%, F1 Score = {f1:.4f}")

print("------------------------------------------------------------")

# Create a bar chart to visualize the percentage of correctly classified instances per topic
topics = list(correctly_classified_counts.keys())
# Recalculate percentages for plotting as it was not explicitly stored
percentages = [(correctly_classified_counts[topic] / len(true_labels_values)) * 100 for topic in topics]


plt.figure(figsize=(10, 6))
plt.bar(topics, percentages, color='skyblue')
plt.title('Percentage of Correctly Classified Topics on Test Data')
plt.xlabel('Topic')
plt.ylabel('Percentage Correctly Classified')
plt.ylim(0, 100) # Set y-axis limit to 100%
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()