# Pre-Trained Word Embeddings for Text Classification

# Imports/Installations

In [78]:
!pip install transformers
!pip install -U datasets



In [79]:
# Mount Google Drive at /content/gdrive; the CSV and checkpoint paths used in
# the cells below all live under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [80]:
from collections import defaultdict, Counter
import json
import numpy as np
import torch
import pandas as pd

from matplotlib import pyplot as plt

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset, DatasetDict
from torch.utils.data import DataLoader
from transformers import DistilBertConfig, DistilBertTokenizer, DistilBertForSequenceClassification, DistilBertModel
from transformers import get_linear_schedule_with_warmup
from tqdm.notebook import tqdm
from torch.optim import AdamW
from transformers import set_seed
from sklearn.metrics import accuracy_score, f1_score
from datasets import load_dataset, DatasetDict, Dataset
from torch.utils.data import DataLoader

# Loading and formatting CSVs

In [81]:
# OLD 60/20/20 DATA SPLIT

# df = pd.read_csv('/content/gdrive/MyDrive/Group 3: palm oil topic classifier/Data/Labeled Data/BERTopic_results.csv')
# test = pd.read_csv('/content/gdrive/MyDrive/Group 3: palm oil topic classifier/Data/Labeled Data/BERTopic_TEST.csv')
# train = pd.read_csv('/content/gdrive/MyDrive/Group 3: palm oil topic classifier/Data/Labeled Data/BERTopic_TRAIN.csv')
# validation = pd.read_csv('/content/gdrive/MyDrive/Group 3: palm oil topic classifier/Data/Labeled Data/BERTopic_VAL.csv')

# # turning into hugging face format
# test = Dataset.from_pandas(test)
# train = Dataset.from_pandas(train)
# validation = Dataset.from_pandas(validation)

In [82]:
# Full labelled results table; used below for the word-count statistics.
df = pd.read_csv('/content/gdrive/MyDrive/Group 3: palm oil topic classifier/Data/Labeled Data/BERTopic_results.csv')

# Read the train / validation CSVs (files named *_TRAIN_80 and *_VAL_20) and
# wrap each one directly as a Hugging Face Dataset.
# The separate test split (BERTopic_TEST.csv) is not loaded in this configuration.
train = Dataset.from_pandas(
    pd.read_csv('/content/gdrive/MyDrive/Group 3: palm oil topic classifier/Text Classification Models/Classification Data/Base/BERTopic_TRAIN_80.csv')
)
validation = Dataset.from_pandas(
    pd.read_csv('/content/gdrive/MyDrive/Group 3: palm oil topic classifier/Text Classification Models/Classification Data/Base/BERTopic_VAL_20.csv')
)

In [83]:
# Number of whitespace-separated tokens in each 'Text' entry
# (values are cast to str first, so missing entries are counted as text too).
word_counts = df['Text'].astype(str).str.split().str.len()

# Summary statistics over all documents.
max_words = word_counts.max()
min_words = word_counts.min()
mean_words = word_counts.mean()

print(f"Max words in summary: {max_words}")
print(f"Min words in summary: {min_words}")
print(f"Mean words in summary: {mean_words}")


Max words in summary: 677
Min words in summary: 3
Mean words in summary: 40.064599483204134


# Dataset Tokenization

## Example Tokenization

In [84]:
# Checkpoint id, written without the "distilbert/" prefix.
name = "distilbert-base-cased"
tokenizer = DistilBertTokenizer.from_pretrained(name)

sample_input = "We want to use a pretrained tokenizer."

# Encode the single sentence and return PyTorch tensors.
encoding_options = {
    "return_tensors": "pt",
    "padding": True,
    "truncation": True,
    "max_length": 512,
}
tokenized_inputs = tokenizer(sample_input, **encoding_options)
print(tokenized_inputs["input_ids"])

tensor([[  101,  1284,  1328,  1106,  1329,   170,  3073,  4487,  9044, 22559,
         17260,   119,   102]])


We will apply the same tokenizer call that we just tested on a single input to every record in the dataset. We will also split our data into batches; the batch size is set in the `batch_size` section below (8 in this run).

In [85]:
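# TODO: decide on max_length. The word-count cell above reports a mean of ~40 words
# and a maximum of 677 words per text, so a long limit mostly adds padding.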

## Tokenized Train Dataset

In [86]:
# Sequence length (in tokens) passed as `max_length` to every tokenizer call below;
# with padding="max_length" and truncation=True, each input is padded or truncated to this length.
tokenizer_length = 512

In [87]:
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-cased")


def _encode_train_example(example):
    """Tokenize one record's 'Text', padded/truncated to `tokenizer_length` tokens."""
    return tokenizer(
        example['Text'],
        padding="max_length",
        truncation=True,
        max_length=tokenizer_length,
    )


# Tokenize every record, drop the raw text (not needed after tokenization) and
# rename 'Dominant_Topic' to 'labels', the column name the model call below uses.
tokenized_train = (
    train.map(_encode_train_example)
    .remove_columns(['Text'])
    .rename_column("Dominant_Topic", "labels")
)

# Return torch tensors when rows are accessed.
tokenized_train.set_format("torch")

# Check the results on the first record.
first_row = tokenized_train[0]
print("Tokenized dataset features:", tokenized_train.column_names)
print("Dataset size:", len(tokenized_train))
print("\nSample data shapes:")
print(f"- attention_mask: {first_row['attention_mask'].shape}")
print(f"- labels: {first_row['labels']}")
print(f"- labels type: {type(first_row['labels'])}")

Map:   0%|          | 0/309 [00:00<?, ? examples/s]

Tokenized dataset features: ['labels', 'input_ids', 'attention_mask']
Dataset size: 309

Sample data shapes:
- attention_mask: torch.Size([512])
- labels: 3
- labels type: <class 'torch.Tensor'>


In [88]:
# Label of the sixth training record (row index 5).
tokenized_train[5]['labels']

tensor(0)

## Tokenized Test Dataset

In [89]:
# # Apply the same process to your test set
# tokenized_test = test.map(
#     lambda example: tokenizer(example['Text'],
#                              padding="max_length",
#                              truncation=True,
#                              max_length=tokenizer_length)
# )

# tokenized_test = tokenized_test.remove_columns(['Text'])
# tokenized_test = tokenized_test.rename_column("Dominant_Topic", "labels")
# tokenized_test.set_format("torch")

# print("Test dataset size:", len(tokenized_test))

In [90]:
# tokenized_test['labels']

## Tokenized Validation Dataset

In [91]:
# Apply the same tokenization process to the validation split.
tokenized_validation = validation.map(
    lambda example: tokenizer(example['Text'],
                             padding="max_length",
                             truncation=True,
                             max_length=tokenizer_length)
)

# Drop the raw text, rename the target column to 'labels', and return torch tensors.
tokenized_validation = tokenized_validation.remove_columns(['Text'])
tokenized_validation = tokenized_validation.rename_column("Dominant_Topic", "labels")
tokenized_validation.set_format("torch")

# This is the validation split, so the message says so (it previously said "Test").
print("Validation dataset size:", len(tokenized_validation))

Map:   0%|          | 0/78 [00:00<?, ? examples/s]

Test dataset size: 78


## Check Tokenization on train and validation

In [92]:
# Inspect the first two tokenized training records.
tokenized_train[:2]

{'labels': tensor([3, 3]),
 'input_ids': tensor([[ 101, 2448, 3186,  ...,    0,    0,    0],
         [ 101,  129, 1260,  ...,    0,    0,    0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]])}

In [93]:
#lets check our tokenization for a few samples
# tokenized_test[0:2]

In [94]:
# Inspect the first two tokenized validation records.
tokenized_validation[:2]

{'labels': tensor([0, 1]),
 'input_ids': tensor([[  101,  1672,  3756,  ...,     0,     0,     0],
         [  101, 16370,  4033,  ...,     0,     0,     0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]])}

# Using DataLoader to Batchify data
Make sure to pass your datasets to a `DataLoader` in order to segment them into batches. Remember that we need batches to run our iterative optimization procedure, which is typically some form of mini-batch gradient descent.

In the interest of time, we fine-tune our model on a small sample of the training set (305 records drawn from the 309-record tokenized training set) instead of the entire 62K sample size.

## batch_size

In [95]:
# Number of examples per batch; used by the training, validation and inference DataLoaders below.
batch_size = 8

In [96]:
# Upper bound on the number of training records used for fine-tuning.
train_subset_size = 305

# Shuffle once with a fixed seed, then keep at most `train_subset_size` records.
# Clamping to the dataset length keeps select() from failing on an out-of-range
# index when the training CSV has fewer rows than the cap.
train_dataset = tokenized_train.shuffle(seed=1111).select(
    range(min(train_subset_size, len(tokenized_train)))
)

# Batch both splits with the same batch size.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size)
eval_dataloader = DataLoader(tokenized_validation, batch_size=batch_size)

# Training and Validation
We have now gone through all the preprocessing required to prepare the data for training. Instead of using the `Trainer` module, it is good practice to first write our own training loop so that we are mindful of all the steps that are required for training neural networks.

Other than our training and validation data we need to select:

- An optimizer that updates the model parameters using the gradients computed by backpropagation
- A scheduler that adjusts the learning rate after each batch

We would also like to set a seed at the start of computation. This ensures that we are able to generate reproducible results across multiple training sessions.

We run validation at the end of each epoch.

**At the end of this step we want to report the best validation loss obtained during training. We also want to save the model corresponding to the epoch that reported the best validation loss.**

In [97]:
# Checkpoint path prefix. The training cell appends 'best_model.pt' directly to this
# string, so the trailing space is part of the saved file name ('DistilBERT best_model.pt').
model_path = '/content/gdrive/MyDrive/Group 3: palm oil topic classifier/Text Classification Models/DistilBERT_Saved_Models/DistilBERT '

In [98]:
from torch.nn.utils import clip_grad_norm_

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pretrained config first, modify it, then build the model from it.
config = DistilBertConfig.from_pretrained(name)
config.dropout = 0.1  # DistilBERT's dropout setting
config.seq_classif_dropout = 0.1  # classifier dropout
config.num_labels = 6

# Pre-flight label check, done on CPU before any GPU work.
# The classification loss requires every label to be an integer in
# [0, num_labels). On GPU a violation only shows up as an opaque
# "CUDA error: device-side assert triggered", so fail early with a readable
# message instead.
for split_name, loader in (("train", train_dataloader), ("validation", eval_dataloader)):
    label_batches = [batch['labels'].reshape(-1) for batch in loader]
    if not label_batches:
        raise ValueError(f"The {split_name} dataloader is empty; nothing to train or evaluate on.")
    split_labels = torch.cat(label_batches)
    if split_labels.is_floating_point() and not torch.isfinite(split_labels).all():
        raise ValueError(f"The {split_name} labels contain missing or non-finite values.")
    out_of_range = (split_labels < 0) | (split_labels >= config.num_labels)
    if out_of_range.any():
        bad_values = sorted(set(split_labels[out_of_range].tolist()))
        raise ValueError(
            f"The {split_name} labels must lie in [0, {config.num_labels - 1}] "
            f"but found {bad_values}; remap or drop these rows before training."
        )

# Seed right before the model is built so the run is reproducible
# (nothing above this line draws random numbers).
set_seed(42)

model = DistilBertForSequenceClassification.from_pretrained(name, config=config).to(device)

num_epochs = 20
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.005, eps=1e-6)

# The scheduler is stepped once per batch, so it spans every batch of every epoch.
num_training_steps = len(train_dataloader) * num_epochs
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.05 * num_training_steps),  # first 5% of steps are warmup
    num_training_steps=num_training_steps
)

# Per-epoch history for the loss plot. All three lists are appended together at
# the end of an epoch so they always have the same length.
x_epochs = []
y_train = []
y_val = []

best_val_loss = float("inf")

# Early stopping (limits overfitting): stop after `early_limit` consecutive
# epochs without a new best validation loss.
early_stopping = True
early_count = 0  # +1 every time validation loss doesn't improve
early_limit = 5

progress_bar = tqdm(range(num_training_steps))
for epoch in range(num_epochs):

    # ---- training ----
    model.train()
    training_losses = []
    for batch in train_dataloader:

        optimizer.zero_grad()

        # copy input to device; labels are cast to long for the classification loss
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device, dtype=torch.long)

        output = model(input_ids, attention_mask=attention_mask, labels=labels)
        training_losses.append(output.loss.item())

        # backprop, clip the gradient norm, then update parameters and learning rate
        output.loss.backward()
        clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        lr_scheduler.step()
        progress_bar.update(1)

    mean_train_loss = float(np.mean(training_losses))
    print(f"Epoch {epoch}")
    print("Mean Training Loss", mean_train_loss)

    # ---- validation ----
    # eval() switches layers such as dropout to inference behaviour;
    # torch.no_grad() is what disables gradient tracking.
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch in eval_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device, dtype=torch.long)
            output = model(input_ids, attention_mask=attention_mask, labels=labels)
            # .item() keeps the running total a plain float rather than a device tensor
            val_loss += output.loss.item()

    # Mean of the per-batch mean losses.
    avg_val_loss = val_loss / len(eval_dataloader)
    print(f"Validation loss: {avg_val_loss}")

    x_epochs.append(epoch)
    y_train.append(mean_train_loss)
    y_val.append(avg_val_loss)

    if avg_val_loss < best_val_loss:
        print("Saving checkpoint!")
        early_count = 0  # reset counter
        best_val_loss = avg_val_loss
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'val_loss': best_val_loss,  # plain float, loadable on any device
            },
            f"{model_path}best_model.pt"
        )
    elif early_stopping:
        early_count += 1

        if early_count >= early_limit:
            print(f"Validation loss has not improved for {early_count} iterations; Early Stopping.")
            break
    print()

progress_bar.close()

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# Select the style before drawing so that it applies to this figure.
plt.style.use('fivethirtyeight')

# Loss values may be plain floats or 0-d tensors; float() handles both.
val_curve = [float(v) for v in y_val]
train_curve = [float(v) for v in y_train]

fig, ax = plt.subplots()
# Slice the epoch axis to each curve's length so an interrupted run still plots.
ax.plot(x_epochs[:len(val_curve)], val_curve, label="Validation loss")
ax.plot(x_epochs[:len(train_curve)], train_curve, label="Training loss")
ax.set_title("Training and Validation Loss")
ax.set_xlabel("Epochs")
ax.set_ylabel("Loss")
ax.legend()
plt.show()

In [None]:
# Modify the location per model
model_location = "/content/gdrive/MyDrive/Group 3: palm oil topic classifier/Text Classification Models/DistilBERT_Saved_Models/DistilBERT best_model.pt"

# Define the device here so this cell also works on a fresh kernel
# without re-running the training cell.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# map_location remaps tensors saved during a GPU run onto the current device,
# so the checkpoint can also be loaded on a CPU-only runtime.
checkpoint = torch.load(model_location, map_location=device)

# Recreate the model architecture, then load the saved fine-tuned weights.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-cased",
    num_labels=6
)
model.load_state_dict(checkpoint["model_state_dict"])
model.to(device)
model.eval()

#### **Evaluate your model on Test Data**

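Now we use our fine-tuned model to evaluate on held-out data. We use performance metrics from `sklearn.metrics` to test the effectiveness of our model on unseen data. Note that the test-split cells in this notebook are commented out, so the evaluation cells below run on the validation split (`tokenized_validation`).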

In order to do that, run the finetuned model you have just saved on your test data and report the following performance metrics:



*   Accuracy
*   F1 Score

In [None]:
# Rebuild the loader so the whole validation split comes back as a single batch.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
eval_dataloader = DataLoader(tokenized_validation, batch_size=len(tokenized_validation))

model.eval()
test_batch_logits, y_true = [], []
with torch.no_grad():
    for batch in eval_dataloader:
        # Forward pass without labels: only the logits are needed here.
        batch_logits = model(
            batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        ).logits
        test_batch_logits.append(batch_logits)
        # Keep the reference labels as NumPy values for the sklearn metrics.
        y_true.extend(batch['labels'].detach().cpu().numpy())

In [None]:
# One logits tensor was collected per batch, so these two counts should match.
n_logit_batches = len(test_batch_logits)
n_eval_batches = len(eval_dataloader)
print(n_logit_batches, n_eval_batches)

# Join the per-batch logits along dimension 0.
test_logits = torch.cat(test_batch_logits)

# Sanity check: dimension 0 of the logits should equal the evaluated dataset size.
print(test_logits.shape, len(tokenized_validation), len(y_true))

In [None]:
# Predicted class = index of the largest logit in each row.
y_pred = test_logits.argmax(dim=1).cpu().numpy()

# Show the first 20 reference labels, then the first 20 predictions.
for label_preview in (y_true[:20], y_pred[:20]):
    print(label_preview)

# Sanity check: there must be exactly one prediction per label.
assert len(y_pred)==len(y_true)

## F1 & Accuracy Scores

In [None]:
# Compute every metric first, then report them together.
macro_f1 = f1_score(y_true, y_pred, average='macro')
weighted_f1 = f1_score(y_true, y_pred, average='weighted')
micro_f1 = f1_score(y_true, y_pred, average='micro')
accuracy = accuracy_score(y_true, y_pred)

print('F1 Score (macro):', macro_f1)
print('F1 Score (weighted):', weighted_f1)
print('F1 Score (micro):', micro_f1)
print('Accuracy Score:', accuracy)

In [None]:
from sklearn.metrics import classification_report

# Pin the report to all six class ids. Without `labels`, sklearn infers the
# classes from y_true/y_pred and raises if their number does not match the six
# target names (e.g. when one class never occurs in this split).
class_ids = list(range(6))
print(classification_report(
    y_true,
    y_pred,
    labels=class_ids,
    target_names=[str(class_id) for class_id in class_ids],
))

# Predicting

In [None]:
# Prefix for the result columns written below: '<model_name>_label' and '<model_name>_topic'.
model_name = "DistilBERT"

In [None]:
# Classify the new grievances with the current `model` and store the predictions
# as a '<model_name>_label' column in classified_grievances.csv.

# Load new data for classification
new_data_path = '/content/gdrive/MyDrive/Group 3: palm oil topic classifier/Data/NEW_grievances_formatted.csv'
new_df = pd.read_csv(new_data_path)

print(f"Loaded {len(new_df)} entries for classification")
print("Columns in new data:", new_df.columns.tolist())

# Name of the column in the new-data CSV that holds the text to classify.
# If your text column has a different name, change the value below.
text_column = 'Text'

# Convert to Dataset format
new_dataset = Dataset.from_pandas(new_df[[text_column]])

# Tokenize the new data with the same padding/truncation settings used for training
tokenized_new_data = new_dataset.map(
    lambda example: tokenizer(str(example[text_column]),  # str() so missing values are still tokenized as text
                             padding="max_length",
                             truncation=True,
                             max_length=tokenizer_length)
)

# Set format to torch tensors
tokenized_new_data.set_format("torch")

# Create DataLoader for inference
inference_dataloader = DataLoader(tokenized_new_data, batch_size=batch_size)

# Run inference
model.eval()   # uses whichever `model` is in memory; run the checkpoint-loading cell first to use the best model
all_predictions = []

with torch.no_grad():
    for batch in tqdm(inference_dataloader, desc="Classifying"):
        # Move batch to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # Get model predictions
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Convert logits to predicted labels
        predictions = torch.argmax(logits, dim=1).cpu().numpy()
        all_predictions.extend(predictions)


# Read the existing classification CSV and attach this model's predictions.
# NOTE(review): predictions are assigned by position, which assumes this CSV has the
# same rows in the same order as NEW_grievances_formatted.csv — TODO confirm.
results_df = pd.read_csv('/content/gdrive/MyDrive/Group 3: palm oil topic classifier/Text Classification Models/classified_grievances.csv')
results_df[f'{model_name}_label'] = all_predictions


print(f"\nClassification complete!")
print(f"Label distribution:")
# Count of predictions per label id
print(results_df[f'{model_name}_label'].value_counts().sort_index())

# Write back to the same file that was read above (overwrites it in place).
output_path = '/content/gdrive/MyDrive/Group 3: palm oil topic classifier/Text Classification Models/classified_grievances.csv'
results_df.to_csv(output_path, index=False)

# NOTE(review): this preview reads a 'summary' column from classified_grievances.csv,
# while the model input above came from the 'Text' column — confirm both exist.
print("\nFirst 10 classified entries:")
print(results_df[['summary', f'{model_name}_label']].head(10))

In [None]:
# Display the results table, including the new '<model_name>_label' column.
results_df

In [None]:
# Topic names listed in label-id order: position i is the name for label id i.
topic_names = [
    'Failed Compensation/Land Rights',
    'Environmental Impact',
    'Administrative',
    'Deforestation',
    'Labour Rights',
    'Illegal or Contaminated FFB',
]
label_map = dict(enumerate(topic_names))

# Translate the numeric predictions into readable topic names.
predicted_ids = results_df[f'{model_name}_label']
results_df[f'{model_name}_topic'] = predicted_ids.map(label_map)
results_df

In [None]:
import matplotlib.pyplot as plt

# Number of grievances per manually assigned label.
topic_counts = results_df['manual_label'].value_counts()
print(topic_counts)

# Draw the counts as a bar chart on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 6))
topic_counts.plot(kind='bar', edgecolor='black', ax=ax)
ax.set_title('Number of Grievances per Topic')
ax.set_xlabel('Topic')
ax.set_ylabel('Count')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

## Comparison between Manual Labels and Transformers' Labels

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Per-label row counts for the manual annotation and for each plotted model.
# Only the columns that are drawn are read: the previous unused lookup of
# 'Electra_label' could make this cell fail without contributing to the chart.
manual_counts = results_df['manual_label'].value_counts()
distilbert_counts = results_df['DistilBERT_label'].value_counts()
spanbert_counts = results_df['SpanBERT_label'].value_counts()

# Align the three count series on label id; a label missing from one source counts as 0.
combined_counts = pd.DataFrame({
    'Manual': manual_counts,
    'DistilBERT': distilbert_counts,
    'SpanBERT': spanbert_counts
}).fillna(0)

combined_counts = combined_counts.sort_index()

# Grouped bar chart: one group per label id, one bar per labelling source.
fig, ax = plt.subplots(figsize=(12, 6))
combined_counts.plot(kind='bar', edgecolor='black', ax=ax)
ax.set_title('Number of Grievances per Topic: Manual vs DistilBERT vs SpanBERT')
ax.set_xlabel('Topic')
ax.set_ylabel('Count')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.legend(title='Label')
fig.tight_layout()
plt.show()

In [None]:
# --- Evaluation ---
# Imported here so this cell does not depend on an earlier cell having run the import.
from sklearn.metrics import classification_report

# Topic names in label-id order: position i names label id i.
topic_names = [
    'Failed Compensation/Land Rights',
    'Environmental Impact',
    'Administrative',
    'Deforestation',
    'Labour Rights',
    'Illegal or Contaminated FFB'
]
class_ids = list(range(len(topic_names)))

# Manual labels are the reference; predictions come from the inference cell above.
true_labels = results_df['manual_label'].astype(int).values
pred_labels = np.array(all_predictions)

print("\nEvaluation on New Data:")
print("F1 Score (macro):", f1_score(true_labels, pred_labels, average='macro'))
print("F1 Score (weighted):", f1_score(true_labels, pred_labels, average='weighted'))
print("F1 Score (micro):", f1_score(true_labels, pred_labels, average='micro'))
print("Accuracy:", accuracy_score(true_labels, pred_labels))

# Classification Report
# Passing `labels` pins the report to all six classes. Without it, sklearn raises
# when a class is absent from both the manual labels and the predictions, because
# the number of observed classes no longer matches the six target names.
print("\nClassification Report:")
print(classification_report(
    true_labels,
    pred_labels,
    labels=class_ids,
    target_names=topic_names,
))

In [None]:
# Number of rows whose manual label is 1.
int(results_df['manual_label'].eq(1).sum())

In [None]:
# Number of rows whose manual label is 0.
int(results_df['manual_label'].eq(0).sum())

In [None]:
# Number of rows whose manual label is 2.
int(results_df['manual_label'].eq(2).sum())

In [None]:
# Number of rows whose manual label is 4.
int(results_df['manual_label'].eq(4).sum())

In [None]:
# Number of rows whose manual label is 5.
int(results_df['manual_label'].eq(5).sum())