In [None]:
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

import torch
import torch.nn as nn
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Once data has finished processing, load data from folder structure
# text-data/
# ├─ advertisement/
# ├─ email/
# ├─ invoice/
# ....
data = load_files('../text-data', encoding='utf-8', decode_error='ignore')

X = data.data               
y = data.target             
class_names = data.target_names 

# split data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
#initialize tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

def model_init(trial):
    "creates a new RoBERTa model each run"
    return RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels = len(class_names))


# gives the options of value for each run -- to find the best on
def hyperparameters(trial)

    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True),
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32]),
        "num_train_epochs": trial.suggest_int("num_train_epochs", 3, 10),
        "weight_decay": trial.suggest_float("weight_decay", 0.0, 0.1),
    }

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# convert the data to string
def convert_byte_to_string(data):
    list_text = []
    for text in data:
        if isinstance(text, bytes):
            list_text.append(text.decode('utf-8'))
        else:
            #not byte, must be a string
            list_text.append(text)
    return list_text

# Tokenize with map function
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)

X_train_convert = convert_byte_to_string(X_train)
X_test_convert = convert_byte_to_string(X_test)

train_data = Dataset.from_dict({
    'text': X_train_convert,
    'labels': y_train.tolist()
})

test_data = Dataset.from_dict({
    'text': X_test_convert,
    'labels': y_test.tolist()
})

train_dataset = train_data.map(tokenize_function, batched=True)
test_dataset = test_data.map(tokenize_function, batched=True)

Map: 100%|██████████| 4000/4000 [00:05<00:00, 758.92 examples/s]
Map: 100%|██████████| 1000/1000 [00:01<00:00, 691.28 examples/s]


In [None]:
# training location
output_dir = '../models/RoBERTa-data'  

training_args = TrainingArguments(
    output_dir=output_dir,
    weight_decay=0.01,
    eval_strategy='epoch',
    save_strategy='no',
    load_best_model_at_end=True,
    logging_dir=f'{output_dir}/logs',
    logging_steps=10,
    report_to='none'
)

# Use with Trainer directly
roberta_trainer = Trainer(
    model=model_init,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer = tokenizer
)

best_roberta_trial = roberta_trainer.hyperparameter_search(
    direction="maximize",
    hp_space=hyperparameters,
    n_trials=15,
)



Epoch,Training Loss,Validation Loss
1,1.1798,1.009929
2,0.8765,0.809657
3,0.669,0.804893
4,0.3815,0.746269
5,0.2923,0.803384
6,0.153,0.857463




SafetensorError: Error while serializing: I/O error: No space left on device (os error 28)

In [None]:
# final training arguments with the best hyperparameters
# the values in each variable are a fallback incase something goes wrong
final_training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=best_roberta_trial.hyperparameters.get('num_train_epochs', 10),
    per_device_train_batch_size=best_roberta_trial.hyperparameters.get('per_device_train_batch_size', 16),
    per_device_eval_batch_size=best_roberta_trial.hyperparameters.get('per_device_train_batch_size', 16),
    learning_rate=best_roberta_trial.hyperparameters.get('learning_rate', 2e-5),
    weight_decay=best_roberta_trial.hyperparameters.get('weight_decay', .001),
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    logging_dir=f'{output_dir}/final_roberta_model/logs',
    logging_steps=10,
    report_to='none',
    save_total_limit=2,
)

# used as final run
final_roberta_model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base", 
    num_labels=len(class_names)
)

best_roberta_trainer = Trainer(
    model=final_roberta_model,
    args=final_training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)

# training
best_roberta_trainer.train()

# Save the final model
best_roberta_trainer.save_model(f'{output_dir}/final_roberta_model')
tokenizer.save_pretrained(f'{output_dir}/final_roberta_model')

In [None]:
# metrics
predictions = roberta_trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=-1)
labels = predictions.label_ids
accuracy = (preds == labels).mean()

print(f"Accuracy: {accuracy*100}")

In [None]:
# # evaluate model
final_roberta_model.eval()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# graphs
plt.figure(figsize=(16, 12))
cm = confusion_matrix(labels, preds)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title(f'Confusion Matrix - RoBERTa Model\nAccuracy: {accuracy*100:.2f}%')
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.savefig('../models/RoBERTa-data/confusion_matrix.png', dpi=300)
plt.show()