In [None]:
import torch
torch.cuda.is_available()
import warnings
warnings.filterwarnings('ignore')
!pip install -U evaluate
!pip install -U datasets
!pip install -U accelerate
!pip install -U transformers

import numpy as np
import pandas as pd
import evaluate
import accelerate
from datasets import load_dataset
from transformers import AutoTokenizer, pipeline
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer


In [2]:
# Load the "abusive-founta" configuration of the SocialMediaIE meta corpus.
# The result is a DatasetDict; printing it lists each split with its columns and row count.
dataset = load_dataset(
    path="socialmediaie/SocialMediaIE-MetaCorpus-v1",
    name="abusive-founta",
)

print(dataset)

Generating test split: 100%|██████████| 11657/11657 [00:00<00:00, 1201390.81 examples/s]
Generating train split: 100%|██████████| 41961/41961 [00:00<00:00, 1904504.77 examples/s]
Generating validation split: 100%|██████████| 4663/4663 [00:00<00:00, 867973.17 examples/s]

DatasetDict({
    test: Dataset({
        features: ['tweet_id', 'text', 'label'],
        num_rows: 11657
    })
    train: Dataset({
        features: ['tweet_id', 'text', 'label'],
        num_rows: 41961
    })
    validation: Dataset({
        features: ['tweet_id', 'text', 'label'],
        num_rows: 4663
    })
})





In [5]:
# Inventory of the label values in the training split.
# The previous loop tested every label for membership in a set built from those
# same labels, so its condition could never be true and the cell printed nothing.
train_labels = list(dataset['train']['label'])
unique_values = set(train_labels)

# How often each label occurs, most frequent first.
train_label_counts = pd.Series(train_labels).value_counts()

print(f"Unique label values: {sorted(unique_values)}")
print(train_label_counts)

In [None]:
# 1. Choose a BERT model and tokenizer
#    (AutoTokenizer is already imported in the first cell of the notebook.)
model_name = "bert-base-uncased"  # Or any other BERT variant
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Length (in tokens) every text is padded/truncated to; adjust as needed.
max_len = 128

# String label -> numerical class id. Defined once here instead of being rebuilt
# for every batch inside preprocess_function.
label_map = {"normal": 0, "abusive": 1, "hateful": 2, "spam": 3}


# 2. Define a function to tokenize the text and encode labels
def preprocess_function(examples):
    """Tokenize a batch of texts and attach numerical labels.

    Args:
        examples: a dictionary with keys "text" and "label", each holding one
            entry per example of the batch (this function is mapped with
            batched=True).

    Returns:
        The tokenizer output for the batch, padded/truncated to ``max_len``
        tokens, with an extra "labels" entry containing the ids from ``label_map``.

    Raises:
        KeyError: if a label is not a key of ``label_map``.
    """
    tokenized_inputs = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=max_len,
    )
    tokenized_inputs["labels"] = [label_map[label] for label in examples["label"]]
    return tokenized_inputs


# 3. Apply the preprocessing function to every split of the dataset
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# 4. Remove the raw columns; the numerical ids already live in the "labels"
#    column written by preprocess_function, so no rename step is needed.
tokenized_datasets = tokenized_datasets.remove_columns(["tweet_id", "text", "label"])

# 5. Set the format to PyTorch tensors
tokenized_datasets.set_format("torch")

print(tokenized_datasets)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Map: 100%|██████████████████████████████████████████████████████████████| 11657/11657 [00:03<00:00, 3322.90 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████| 41961/41961 [00:12<00:00, 3454.28 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████| 4663/4663 [00:01<00:00, 3505.54 examples/s]

DatasetDict({
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 11657
    })
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 41961
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 4663
    })
})





In [None]:
# np, torch, evaluate, AutoModelForSequenceClassification, TrainingArguments and
# Trainer are imported in the first cell of the notebook.
import inspect

# 1. Load the BERT model for sequence classification
NUM_LABELS = 4  # normal / abusive / hateful / spam
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=NUM_LABELS, hidden_dropout_prob=0.1
)
# Keeps the embedding matrix size equal to len(tokenizer). No tokens are added to
# the tokenizer in this notebook, so this is only a safeguard.
model.resize_token_embeddings(len(tokenizer))

# 2. Define training arguments
metric_name = 'f1'

# The notebook installs transformers with `-U`, so the installed version is not
# fixed. Newer releases call this argument `eval_strategy`, older ones
# `evaluation_strategy`; pick whichever name the installed version accepts.
training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
eval_strategy_kwarg = (
    "eval_strategy" if "eval_strategy" in training_arg_names else "evaluation_strategy"
)

args = TrainingArguments(
    f"./snapshots/{model_name}-finetuned",
    save_strategy="epoch",
    save_total_limit=3,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    push_to_hub=False,
    # fp16 mixed precision needs a CUDA device; fall back to full precision without one.
    fp16=torch.cuda.is_available(),
    **{eval_strategy_kwarg: "epoch"},
)

# 3. Define a metric to compute during training
metric = evaluate.load(metric_name)


def compute_metrics(eval_pred):
    """Compute the micro-averaged F1 score for one evaluation pass.

    Args:
        eval_pred: a pair ``(predictions, labels)``; the predicted class is taken
            as the argmax of ``predictions`` over its last axis.

    Returns:
        The result of ``metric.compute`` for the predicted classes against ``labels``.
    """
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=-1)
    return metric.compute(predictions=predictions, references=labels, average="micro")


# 4. Create a Trainer instance
# Same version tolerance as above: newer releases take the tokenizer as
# `processing_class`, older ones as `tokenizer`.
trainer_arg_names = inspect.signature(Trainer.__init__).parameters
tokenizer_kwarg = "processing_class" if "processing_class" in trainer_arg_names else "tokenizer"

trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
    **{tokenizer_kwarg: tokenizer},
)

# 5. Train the model
train_log = trainer.train()

trainer.save_model("./models/myFinetunedModel") # for saving your model


In [None]:
# Finally, we perform our evaluation on our test set using the fine-tuned model from earlier.
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, device="cuda:0")
results = classifier(dataset['test']['text'], max_length=max_len, padding="max_length", truncation=True)
dfResults = pd.DataFrame.from_dict(results)
dfResults['label'] = dfResults['label'].str.replace('LABEL_','')
f1 = metric.compute(predictions=dfResults['label'].tolist(), references=dataset['test']['label'], average='micro')
print(f1)