In [1]:
import torch
torch.cuda.is_available()
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import evaluate
import accelerate
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, pipeline
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer


In [2]:
train_dataset = Dataset.from_file(r"processed_dataset\train\data-00000-of-00001.arrow")
train_dataset[:5]

{'id': ['0000997932d777bf',
  '000103f0d9cfb60f',
  '000113f07ec002fd',
  '0001b41b1c6bb37e',
  '0001d958c54c6e35'],
 'comment_text': ["Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",
  "D'aww! He matches this background colour I'm seemingly stuck with. Thanks.  (talk) 21:51, January 11, 2016 (UTC)",
  "Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.",
  '"\nMore\nI can\'t make any real suggestions on improvement - I wondered if the section statistics should be later on, or a subsection of ""types of accidents""  -I think the references may need tidying so that they are all in the exact 

In [3]:
test_dataset = Dataset.from_file(r"processed_dataset\test\data-00000-of-00001.arrow")
test_dataset[0]

{'id': '0001ea8717f6de06',
 'comment_text': 'Thank you for understanding. I think very highly of you and would not revert without discussion.',
 'toxic': 0,
 'severe_toxic': 0,
 'obscene': 0,
 'threat': 0,
 'insult': 0,
 'identity_hate': 0,
 'cyberbullying': 0}

In [13]:
label_map = {'toxic':0, 'severe_toxic':1, 'obscene':2, 'threat':3, 'insult':4, 'identity_hate':5, 'cyberbullying':6}

In [None]:
from transformers import AutoTokenizer

# 1. Choose a BERT model and tokenizer
model_name = "bert-base-uncased"  # Or any other BERT variant
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 2. Define a function to tokenize the text and encode labels
def preprocess_function(examples):
    tokenized_inputs = tokenizer(examples["comment_text"], truncation=True, padding="max_length", max_length=128) #adjust max length as needed
    # Map labels to numerical IDs
    labels = [label_map[label] for label in examples["label"]]
    tokenized_inputs["labels"] = labels # Add labels to the dictionary
    return tokenized_inputs

# 3. Apply the preprocessing function to the dataset
tokenized_datasets = train_dataset.map(preprocess_function, batched=True)

# 4. Remove unnecessary columns
tokenized_datasets = tokenized_datasets.remove_columns(["id", "comment_text",])

# 5. Rename the label column to labels (required by some Transformers models)
# tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

# 6. Set the format to PyTorch tensors (or TensorFlow, if you prefer)
tokenized_datasets.set_format("torch")

print(tokenized_datasets)


Map:   0%|          | 0/159571 [00:00<?, ? examples/s]


KeyError: 'label'

In [None]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
import numpy as np

# 1. Load the BERT model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4, hidden_dropout_prob=0.1)
model.resize_token_embeddings(len(tokenizer)) # need to resize due to new tokens added

# 2. Define training arguments
metric_name = 'f1'
# model_name = model_checkpoint.split("/")[-1]

args = TrainingArguments(
    f"./snapshots/{model_name}-finetuned",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    save_total_limit = 3,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    push_to_hub=False,
    fp16=True
)

# 3. Define a metric to compute during training
metric = evaluate.load(metric_name)

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=-1)
    return metric.compute(predictions=predictions, references=labels, average="micro")

# 4. Create a Trainer instance
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# 5. Train the model
train_log = trainer.train()

trainer.save_model("./models/myFinetunedModel") # for saving your model


In [None]:
# Finally, we perform our evaluation on our test set using the fine-tuned model from earlier.
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, device="cuda:0")
results = classifier(dataset['test']['text'], max_length=max_len, padding="max_length", truncation=True)
dfResults = pd.DataFrame.from_dict(results)
dfResults['label'] = dfResults['label'].str.replace('LABEL_','')
f1 = metric.compute(predictions=dfResults['label'].tolist(), references=dataset['test']['label'], average='micro')
print(f1)