In [None]:
"""
Code to fine-tune a bert model on hatexplain dataset
I/p: Sentence, Explanation, Keywords (Keywords mixed from Hugchat and Annotators)
O/p: Hate-Speech Classification
To do: Change from bert-cased to uncased
"""


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install transformers[torch]
!pip install datasets
!pip install evaluate

Collecting transformers[torch]
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m81.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers[torch])
  Downloading huggingface_hub-0.16.2-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.5/268.5 kB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers[torch])
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m107.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers[torch])
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m

In [3]:
from datasets import load_dataset
import ast
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer
from transformers import EarlyStoppingCallback, IntervalStrategy
import numpy as np
import evaluate
import torch
import os
import pandas as pd
os.environ["WANDB_DISABLED"] = "true"

In [4]:
### Input
train_path = '/content/drive/MyDrive/cs4nlp/CS4NLP-HateXplain/data/bert_modeling/df_train.csv'
val_path = '/content/drive/MyDrive/cs4nlp/CS4NLP-HateXplain/data/bert_modeling/df_val.csv'
test_path = '/content/drive/MyDrive/cs4nlp/CS4NLP-HateXplain/data/bert_modeling/df_test.csv'

### Output
train_processed_path = '/content/drive/MyDrive/cs4nlp/CS4NLP-HateXplain/data/bert_modeling/df_train_processed.csv'
val_processed_path = '/content/drive/MyDrive/cs4nlp/CS4NLP-HateXplain/data/bert_modeling/df_val_processed.csv'
test_processed_path = '/content/drive/MyDrive/cs4nlp/CS4NLP-HateXplain/data/bert_modeling/df_test_processed.csv'
model_path = '/content/drive/MyDrive/cs4nlp/CS4NLP-HateXplain/data/bert_modeling/bert_model_v3.pt'

In [5]:
def load_data(path):
    df = pd.read_csv(path)
    return df

def save_data(df, path):
    df.to_csv(path, index=False)

In [6]:
df_train = load_data(train_path)
df_train.head()

Unnamed: 0,id,unmasked_sentence,hugchat_keywords_processed,hugchat_explanation,mix_keywords,gt_comm_target,gt_label
0,23107796_gab,u really think i would not have been raped by ...,"['muslim', 'nazi', 'neo', 'raped', 'rape', 'fe...",the language used in the sentence suggests an...,"['raped', 'feral', 'hindu', 'muslim', 'neo', '...","['Islam', 'Other', 'Hindu']",offensive
1,9995600_gab,the uk has threatened to return radioactive wa...,"['migrants', 'waste', 'radioactive', 'paki']",this statement contains derogatory language t...,"['send', 'back', 'all', 'the', 'paki', 'migran...","['Islam', 'Indian', 'Minority', 'Refugee']",offensive
2,1227920812235051008_twitter,if english is not imposition then hindi is als...,"['imposition', 'hindi', 'shut', 'retards', 'ch...",this sentence contains several derogatory ter...,"['chutiya', 'retards', 'stophindiimposition']","['Other', 'Hindu']",offensive
3,1204931715778543624_twitter,no liberal congratulated hindu refugees post c...,"['refugees', 'cab', 'hindus', 'liberal', 'hind...",the statement seems factually correct as ther...,"['hate', 'hindus']",['Hindu'],offensive
4,1179102559241244672_twitter,he said bro even your texts sound redneck what...,"['redneck', 'bro']",this sentence appears to be casual conversati...,['redneck'],"['Caucasian', 'Economic']",offensive


In [7]:
def preprocess(in_path, out_path):
    df = load_data(in_path)
    df_copy = df.copy()
    df_copy['mix_keywords'] = df_copy['mix_keywords'].apply(str).apply(lambda x: ' '.join(k for k in ast.literal_eval(x)))
    df_copy['gt_comm_target'] = df_copy['gt_comm_target'].apply(lambda x: ' '.join(k for k in x))
    df_copy['bert_ip1'] = df_copy.apply(lambda row: row['unmasked_sentence'] + '.' + row['hugchat_explanation'], axis=1)
    keywords_prefix = " keywords suggesting this explanation are: "
    df_copy['bert_ip2'] = df_copy.apply(lambda row: row['unmasked_sentence'] + '.' + row['hugchat_explanation'] + keywords_prefix + row['mix_keywords'], axis=1)
    df_copy['gt_label'] = df_copy['gt_label'].map({'hate':0, 'offensive':2, 'normal':1})
    df_copy = df_copy[['bert_ip1','bert_ip2','gt_label']].copy()
    df_copy.rename(columns={'gt_label':'label'}, inplace=True)
    save_data(df_copy, out_path)
    return df_copy


In [8]:
def tokenize_data(example):
    return tokenizer(example['bert_ip2'], padding='max_length')

In [9]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


In [10]:
df_train = preprocess(train_path, train_processed_path)
df_val = preprocess(val_path, val_processed_path)
df_test = preprocess(test_path, test_processed_path)

In [11]:
dataset = load_dataset('csv', data_files={'train': train_processed_path,
                                          'val' : val_processed_path,
                                          'test': test_processed_path}, encoding = "ISO-8859-1")

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-fea8c0e18adbecd5/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating val split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-fea8c0e18adbecd5/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [19]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
dataset = dataset.map(tokenize_data, batched=True)
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=3)

Map:   0%|          | 0/14057 [00:00<?, ? examples/s]

Map:   0%|          | 0/1786 [00:00<?, ? examples/s]

Map:   0%|          | 0/1759 [00:00<?, ? examples/s]

Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initi

In [20]:
metric = evaluate.load("accuracy")
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/cs4nlp/CS4NLP-HateXplain/data/bert_modeling/",
    num_train_epochs=10,
    learning_rate=2e-5,
    evaluation_strategy = IntervalStrategy.STEPS, # "steps"
    eval_steps = 50, # Evaluation and Save happens every 50 steps
    save_total_limit = 5, # Only last 5 models are saved. Older ones are deleted
    weight_decay=0.01,
    load_best_model_at_end=True)

train_dataset = dataset['train']
val_dataset = dataset['val']

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset = train_dataset,
    eval_dataset = val_dataset,
    compute_metrics = compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=10)]
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [21]:
trainer.train()



Step,Training Loss,Validation Loss,Accuracy
50,No log,0.901068,0.591825
100,No log,0.779526,0.643897
150,No log,0.679401,0.707167
200,No log,0.592388,0.762598
250,No log,0.759728,0.713886
300,No log,0.572454,0.761478
350,No log,0.584191,0.743561
400,No log,0.542753,0.796193
450,No log,0.546469,0.783875
500,0.691600,0.578035,0.779955


TrainOutput(global_step=3800, training_loss=0.46125347338224715, metrics={'train_runtime': 2634.2015, 'train_samples_per_second': 53.363, 'train_steps_per_second': 6.674, 'total_flos': 7994964311083008.0, 'train_loss': 0.46125347338224715, 'epoch': 2.16})

In [22]:
model_path = '/content/drive/MyDrive/cs4nlp/CS4NLP-HateXplain/data/bert_modeling/bert_model_v3.pt'
torch.save(model.state_dict(),model_path)
import pickle
test_dataset = dataset['test']
predictions = trainer.predict(test_dataset)
with open('/content/drive/MyDrive/cs4nlp/CS4NLP-HateXplain/data/bert_modeling/predictions_try3.pkl', 'wb') as handle:
  pickle.dump(predictions, handle)
print(predictions)

PredictionOutput(predictions=array([[-1.9229037 ,  2.1349223 , -0.40406778],
       [-1.2776855 , -1.2274743 ,  2.6747532 ],
       [ 1.8586452 , -1.7376754 , -0.56135714],
       ...,
       [-1.0297472 , -1.3865654 ,  2.612711  ],
       [ 2.105703  , -2.0974367 , -0.42289534],
       [ 1.8762162 , -2.3682    ,  0.17175215]], dtype=float32), label_ids=array([1, 2, 0, ..., 2, 0, 0]), metrics={'test_loss': 0.45313817262649536, 'test_accuracy': 0.8146674246731097, 'test_runtime': 19.6802, 'test_samples_per_second': 89.379, 'test_steps_per_second': 11.179})


In [23]:
predictions[2]

{'test_loss': 0.45313817262649536,
 'test_accuracy': 0.8146674246731097,
 'test_runtime': 19.6802,
 'test_samples_per_second': 89.379,
 'test_steps_per_second': 11.179}