In [1]:
%pip install huggingface-hub
from huggingface_hub import notebook_login
notebook_login()



VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
%%capture
%pip install transformers
%pip install datasets
%pip install evaluate
%pip install torch
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
import os

In [15]:
dataset = load_dataset("json", data_files={"train": "/content/train_aws.jsonl", "test": "/content/validation_aws.jsonl"})
dataset = dataset.map(
    lambda examples: {
        "input": examples["prompt"],
        "output": examples["completion"],
    },
    remove_columns=["prompt", "completion"],
)
dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'output'],
        num_rows: 453
    })
    test: Dataset({
        features: ['input', 'output'],
        num_rows: 114
    })
})

In [16]:
checkpoint = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

mapDict = {
    "No Hate Speech": 0,
    "Hate Speech": 1,
}

def transform_labels(label):
    label = label["output"]
    result = []
    for l in label:
        result.append(mapDict[l])
    return {"label": result}


def tokenize_function(example):
    return tokenizer(example["input"], padding=True, truncation=True)

In [17]:
tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.map(transform_labels, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'input_ids', 'token_type_ids', 'attention_mask', 'label'],
        num_rows: 453
    })
    test: Dataset({
        features: ['input', 'output', 'input_ids', 'token_type_ids', 'attention_mask', 'label'],
        num_rows: 114
    })
})

In [18]:
from transformers import TrainingArguments

output_dir = "./bert-hate-speech-test"

training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    logging_dir="./logs.log",
    logging_steps=100,
    eval_strategy="steps",
    eval_steps=200,
    save_total_limit=2,
    save_steps=200,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none",
)

In [19]:
from transformers import Trainer, AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

os.environ["WANDB_DISABLE"] = "true"
os.environ["WANDB_MODE"] = "offline"

In [20]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

def compute_metric(eval_perd):
  logits, labels = eval_perd
  predictions = np.argmax(logits, axis=-1)
  return metric.compute(predictions=predictions, references=labels)

In [21]:
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metric
)

  trainer = Trainer(


In [22]:
trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=171, training_loss=0.6061805145085206, metrics={'train_runtime': 1223.2163, 'train_samples_per_second': 1.111, 'train_steps_per_second': 0.14, 'total_flos': 47489916326328.0, 'train_loss': 0.6061805145085206, 'epoch': 3.0})

In [23]:
trainer.evaluate()

{'eval_loss': 0.7912555932998657,
 'eval_accuracy': 0.6403508771929824,
 'eval_runtime': 47.4482,
 'eval_samples_per_second': 2.403,
 'eval_steps_per_second': 0.316,
 'epoch': 3.0}

In [24]:
trainer.save_model()

In [25]:
trainer.push_to_hub("vitorhugoclz/modelhate")

training_args.bin:   0%|          | 0.00/5.24k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/vitorhugoclz/bert-hate-speech-test/commit/1f2106c1330d88297a82378bb6397625b383e3f7', commit_message='vitorhugoclz/modelhate', commit_description='', oid='1f2106c1330d88297a82378bb6397625b383e3f7', pr_url=None, repo_url=RepoUrl('https://huggingface.co/vitorhugoclz/bert-hate-speech-test', endpoint='https://huggingface.co', repo_type='model', repo_id='vitorhugoclz/bert-hate-speech-test'), pr_revision=None, pr_num=None)

In [26]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-classification", model="vitorhugoclz/bert-hate-speech-test")

config.json:   0%|          | 0.00/881 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [27]:
pipe("Que legal! Eu sou uma cotista da flip de um delta 36 em Paraty. Recomendo!")

[{'label': 'LABEL_0', 'score': 0.8343678116798401}]