### **Import Dataset**

- Import the dataset from its Hugging Face Hub repository.

In [None]:
from datasets import load_dataset

# Repository id of the hate/offensive speech dataset used throughout the notebook.
DATASET_ID = "badmatr11x/hate-offensive-speech"

# Later cells index the result by the "train", "validation" and "test" splits.
dataset = load_dataset(DATASET_ID)

### **Tokenization Process**

- Fetch the pre-trained tokenizer from the repository.
- Apply the `map` function to the dataset so that every tweet is passed to the tokenizer and converted into tokens.

In [None]:
from transformers import AutoTokenizer
# Name of the pre-trained checkpoint whose tokenizer is loaded.
TOKENIZER_CHECKPOINT = "distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

def preprocess_function(examples):
    """Tokenize the ``tweet`` field of a batch of examples.

    Args:
        examples: A batch from the dataset; its ``"tweet"`` entry is handed
            to the module-level ``tokenizer``.

    Returns:
        Whatever the tokenizer produces for those tweets, with
        ``truncation=True`` applied.
    """
    tweets = examples["tweet"]
    encoded = tokenizer(tweets, truncation=True)
    return encoded

# Tokenize the dataset, handing the examples to preprocess_function in batches.
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
)

### **Split the Dataset**

- Create the training and validation datasets and shuffle them for better and unbiased training.

In [None]:
# One fixed seed for both splits so every run shuffles them the same way.
SHUFFLE_SEED = 107

# Tip: for a quick trial run, chain .select(range(1000)) onto the training split
# and .select(range(100)) onto the validation split.
train_dataset = tokenized_dataset["train"].shuffle(seed=SHUFFLE_SEED)
validation_dataset = tokenized_dataset["validation"].shuffle(seed=SHUFFLE_SEED)

### **Add Padding to the dataset**

In [4]:
from transformers import DataCollatorWithPadding

# Collator built around the tokenizer above; it is handed to the Trainer objects
# below so that examples are padded when they are assembled into batches.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

### **Setup the wandb account**

- Set up the Weights & Biases (wandb) account.
- This is an online tool to monitor the training and evaluation process.

In [5]:
### To use wandb, set up your account and provide your details here, such as the API key, username and project name.

import os
import wandb
# Only fill in values that are not already configured in the environment, so a
# real key/entity/project exported before launching the notebook is never
# overwritten by these placeholders. Prefer exporting WANDB_API_KEY outside the
# notebook over pasting the secret into this cell.
os.environ.setdefault("WANDB_API_KEY", "api-key")
os.environ.setdefault("WANDB_ENTITY", "username")
os.environ.setdefault("WANDB_PROJECT", "projectname")

### **Create the log function to report all the evaluation data**

- Create the function to calculate the F1-score, precision, recall and accuracy of the model.
- This function is called by the trainer object while evaluating the model, and the calculated metrics are sent back to the wandb account.

In [6]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
import numpy as np

def compute_metrics(eval_pred):
    """Compute classification metrics for one evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` holds the
            per-class scores (the argmax over axis 1 is taken as the predicted
            class) and ``labels`` the reference class ids.

    Returns:
        dict: The micro-averaged ``overall-*`` metrics under their original
        keys, plus ``macro-f1``, ``macro-precision`` and ``macro-recall``.
    """
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    output = {
        "overall-f1": f1_score(labels, predictions, average='micro'),
        "overall-accuracy": accuracy_score(labels, predictions),
        "overall-precision": precision_score(labels, predictions, average='micro'),
        "overall-recall": recall_score(labels, predictions, average='micro'),
    }

    # With exactly one label per example, micro-averaged precision, recall and
    # F1 all collapse to the accuracy, so the four values above are identical.
    # Macro averages weight every class equally and expose how the rarer
    # classes are doing. zero_division=0 scores a never-predicted class as 0
    # instead of emitting a warning on every evaluation.
    macro_scorers = (
        ("macro-f1", f1_score),
        ("macro-precision", precision_score),
        ("macro-recall", recall_score),
    )
    for key, scorer in macro_scorers:
        output[key] = scorer(labels, predictions, average='macro', zero_division=0)

    return output

### **Label Mapping**

In [7]:
# Class id -> label name, and the inverse lookup derived from it.
id2label = dict(enumerate(["HATE-SPEECH", "OFFENSIVE-LANGUAGE", "NEITHER"]))
label2id = {name: idx for idx, name in id2label.items()}

### **Fine Tuning the Model**

- Fetch the 'distilroberta-base' model and fine tune it for classification purposes.

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Classification-head settings: three classes plus the id/label lookups
# defined in the label-mapping cell.
classifier_kwargs = {
    "num_labels": 3,
    "id2label": id2label,
    "label2id": label2id,
}
model = AutoModelForSequenceClassification.from_pretrained(
    "distilroberta-base", **classifier_kwargs
)

### **Setting up the Training arguments and create the Trainer Object to train the Model**

# Training Arguments
- Here, set up the model's output directory, learning rate, training batch size, validation batch size, number of epochs and weight decay.

# Trainer Object
- Pass the model, datasets, metrics function and arguments to the object.

In [9]:
training_args = TrainingArguments(
    # Output directory for this run.
    output_dir="speech-multiclassifier-run-2",
    # Optimisation settings.
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and save once per epoch, and load the best model when training ends.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Report to Weights & Biases. Without a wandb account set this to "none":
    # simply deleting the argument still reports to every installed integration.
    report_to="wandb",
)

# Bundle the model, its run configuration, the shuffled splits and the
# evaluation callback into the Trainer that drives fine-tuning.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    compute_metrics=compute_metrics,
)

### **Call the Trainer Object and report data to wandb**

- Train the model and report evaluation data back to the wandb.

In [None]:
# Run the fine-tuning loop and keep its return value for later inspection.
train_output = trainer.train()

# Close the wandb run once training is over (drop this line if wandb is not set up).
wandb.finish()

### **Fetch the test dataset**

In [None]:
# Take the tokenized test split and shuffle it with a fixed seed.
test_split = tokenized_dataset["test"]
test_dataset = test_split.shuffle(seed=23)

### **Create the testing Object from the initial trainer object**

In [None]:
# Separate Trainer used only for inference on the fine-tuned model. It is
# created without TrainingArguments, so it runs with the library defaults
# (the log output reports output_dir=tmp_trainer).
tester = Trainer(
    model=model,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


### **Put Model into the evaluation mode**

In [None]:
# Switch the fine-tuned model to evaluation mode before running predictions.
# As the last expression of the cell, the call's return value is displayed.
model.eval()

### **Predict the tweet labels of test_dataset**

In [None]:
# Predict with the dedicated `tester` object created above rather than the
# training `trainer`, whose wandb run has already been finished.
test_output = tester.predict(test_dataset)

# Display only the metrics computed by compute_metrics; the raw predictions and
# label ids stay available on `test_output` without flooding the cell output.
test_output.metrics