In [None]:
"""
Code to fine-tune a bert model on hatexplain dataset
I/p: Sentence, Explanation, Keywords (Maybe)
O/p: Hate-Speech Classification
"""


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
!pip install transformers[torch]
!pip install datasets
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Installing collected packages: responses, evaluate
Successfully installed evaluate-0.4.0 responses-0.18.0


In [5]:
from datasets import load_dataset
import ast
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer
from transformers import EarlyStoppingCallback, IntervalStrategy
import numpy as np
import evaluate
import torch
import os
import pandas as pd
os.environ["WANDB_DISABLED"] = "true"

In [6]:
### Input
train_path = '/content/drive/MyDrive/cs4nlp/CS4NLP-HateXplain/data/bert_modeling/df_train.csv'
val_path = '/content/drive/MyDrive/cs4nlp/CS4NLP-HateXplain/data/bert_modeling/df_val.csv'
test_path = '/content/drive/MyDrive/cs4nlp/CS4NLP-HateXplain/data/bert_modeling/df_test.csv'

### Output
train_processed_path = '/content/drive/MyDrive/cs4nlp/CS4NLP-HateXplain/data/bert_modeling/df_train_processed.csv'
val_processed_path = '/content/drive/MyDrive/cs4nlp/CS4NLP-HateXplain/data/bert_modeling/df_val_processed.csv'
test_processed_path = '/content/drive/MyDrive/cs4nlp/CS4NLP-HateXplain/data/bert_modeling/df_test_processed.csv'
model_path = '/content/drive/MyDrive/cs4nlp/CS4NLP-HateXplain/data/bert_modeling/bert_model.pt'

In [7]:
def load_data(path):
    df = pd.read_csv(path)
    return df

def save_data(df, path):
    df.to_csv(path, index=False)

In [8]:
def preprocess(in_path, out_path):
    df = load_data(in_path)
    df_copy = df.copy()
    df_copy['mix_keywords'] = df_copy['mix_keywords'].apply(str).apply(lambda x: ' '.join(k for k in ast.literal_eval(x)))
    df_copy['gt_comm_target'] = df_copy['gt_comm_target'].apply(lambda x: ' '.join(k for k in x))
    df_copy['bert_ip1'] = df_copy.apply(lambda row: row['unmasked_sentence'] + '.' + row['hugchat_explanation'], axis=1)
    keywords_prefix = " keywords suggesting this explanation are: "
    df_copy['bert_ip2'] = df_copy.apply(lambda row: row['unmasked_sentence'] + '.' + row['hugchat_explanation'] + keywords_prefix + row['mix_keywords'], axis=1)
    df_copy['gt_label'] = df_copy['gt_label'].map({'hate':0, 'offensive':2, 'normal':1})
    df_copy = df_copy[['bert_ip1','bert_ip2','gt_label']].copy()
    df_copy.rename(columns={'gt_label':'label'}, inplace=True)
    save_data(df_copy, out_path)
    return df_copy


In [9]:
def tokenize_data(example):
    return tokenizer(example['bert_ip1'], padding='max_length')

In [10]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


In [11]:
df_train = preprocess(train_path, train_processed_path)
df_val = preprocess(val_path, val_processed_path)
df_test = preprocess(test_path, test_processed_path)

In [12]:
dataset = load_dataset('csv', data_files={'train': train_processed_path,
                                          'val' : val_processed_path,
                                          'test': test_processed_path}, encoding = "ISO-8859-1")

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-1bd84bef3dc7ef42/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating val split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-1bd84bef3dc7ef42/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [13]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
dataset = dataset.map(tokenize_data, batched=True)
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=3)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Map:   0%|          | 0/14057 [00:00<?, ? examples/s]

Map:   0%|          | 0/1786 [00:00<?, ? examples/s]

Map:   0%|          | 0/1759 [00:00<?, ? examples/s]

Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initi

In [19]:
metric = evaluate.load("accuracy")
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/cs4nlp/CS4NLP-HateXplain/data/bert_modeling/",
    num_train_epochs=10,
    learning_rate=2e-5,
    evaluation_strategy = IntervalStrategy.STEPS, # "steps"
    eval_steps = 50, # Evaluation and Save happens every 50 steps
    save_total_limit = 5, # Only last 5 models are saved. Older ones are deleted
    weight_decay=0.01,
    load_best_model_at_end=True)

train_dataset = dataset['train']
val_dataset = dataset['val']

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset = train_dataset,
    eval_dataset = val_dataset,
    compute_metrics = compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=10)]
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [20]:
trainer.train()



Step,Training Loss,Validation Loss,Accuracy
50,No log,0.760786,0.669093
100,No log,0.753708,0.674692
150,No log,0.77679,0.663494
200,No log,0.795309,0.658455
250,No log,0.831774,0.656215
300,No log,0.794056,0.665174
350,No log,0.772271,0.665733
400,No log,0.894997,0.667413
450,No log,0.866378,0.671333
500,0.625300,0.957951,0.656215


TrainOutput(global_step=2250, training_loss=0.6357710978190104, metrics={'train_runtime': 1497.6333, 'train_samples_per_second': 93.861, 'train_steps_per_second': 11.739, 'total_flos': 4734199725179904.0, 'train_loss': 0.6357710978190104, 'epoch': 1.28})

In [21]:
trainer.evaluate()


{'eval_loss': 0.7391132116317749,
 'eval_accuracy': 0.6808510638297872,
 'eval_runtime': 19.6276,
 'eval_samples_per_second': 90.994,
 'eval_steps_per_second': 11.413,
 'epoch': 1.28}

In [22]:
torch.save(model.state_dict(),model_path)

In [25]:
import pickle
test_dataset = dataset['test']
predictions = trainer.predict(test_dataset)
with open('/content/drive/MyDrive/cs4nlp/CS4NLP-HateXplain/data/bert_modeling/predictions_try1.pkl', 'wb') as handle:
  pickle.dump(predictions, handle)
print(predictions)

PredictionOutput(predictions=array([[-0.67801493,  0.0557027 ,  0.27052096],
       [-1.1874441 , -1.1793572 ,  1.4180373 ],
       [ 2.0769153 , -1.0682856 , -0.47879332],
       ...,
       [-1.051589  , -1.2775222 ,  1.303146  ],
       [ 2.180442  , -1.210871  , -0.83561766],
       [-1.9870688 ,  1.0810469 ,  0.27455693]], dtype=float32), label_ids=array([1, 2, 0, ..., 2, 0, 0]), metrics={'test_loss': 0.7433924078941345, 'test_accuracy': 0.6657191586128482, 'test_runtime': 22.9043, 'test_samples_per_second': 76.798, 'test_steps_per_second': 9.605})


In [27]:
with open('/content/drive/MyDrive/cs4nlp/CS4NLP-HateXplain/data/bert_modeling/predictions_try1.pkl', 'rb') as handle:
  loaded_predictions = pickle.load(handle)

loaded_predictions

PredictionOutput(predictions=array([[-0.67801493,  0.0557027 ,  0.27052096],
       [-1.1874441 , -1.1793572 ,  1.4180373 ],
       [ 2.0769153 , -1.0682856 , -0.47879332],
       ...,
       [-1.051589  , -1.2775222 ,  1.303146  ],
       [ 2.180442  , -1.210871  , -0.83561766],
       [-1.9870688 ,  1.0810469 ,  0.27455693]], dtype=float32), label_ids=array([1, 2, 0, ..., 2, 0, 0]), metrics={'test_loss': 0.7433924078941345, 'test_accuracy': 0.6657191586128482, 'test_runtime': 22.9043, 'test_samples_per_second': 76.798, 'test_steps_per_second': 9.605})

In [31]:
predictions[2]

{'test_loss': 0.7433924078941345,
 'test_accuracy': 0.6657191586128482,
 'test_runtime': 22.9043,
 'test_samples_per_second': 76.798,
 'test_steps_per_second': 9.605}