In [1]:
# !pip install demoji

In [2]:
import pandas as pd
from datasets import Dataset, DatasetDict

  from .autonotebook import tqdm as notebook_tqdm


OLID data

In [3]:
data_olid = pd.read_csv('olid_data/olid-training-v1.0.tsv', sep='\t')
data_olid = data_olid[['tweet', 'subtask_a']]

In [4]:
data_olid['subtask_a'].value_counts()

subtask_a
NOT    8840
OFF    4400
Name: count, dtype: int64

In [5]:
data_olid = data_olid.rename(columns={"tweet": "text", "subtask_a": "label"})

In [6]:
import demoji
import re

def handle_emoji(x):
    x = demoji.replace_with_desc(x)
    return re.sub(r":", " ", x)

data_olid["text"] = data_olid["text"].apply(lambda x: x.strip("'"))
data_olid["text"] = data_olid["text"].apply(lambda x: handle_emoji(x))
data_olid["text"] = data_olid["text"].str.replace(r"@[A-Za-z0-9_]+", "", regex=True)
data_olid["text"] = data_olid["text"].str.replace(r"\w*\d\w*", "", regex=True)
data_olid["text"] = data_olid["text"].str.replace("\shttps?\s", "", regex=True)

In [7]:
data_olid['label'] = data_olid['label'].replace('OFF',1)
data_olid['label'] = data_olid['label'].replace('NOT',0)

In [8]:
data_olid

Unnamed: 0,text,label
0,She should ask a few native Americans what th...,1
1,Go home you’re drunk!!! #MAGA # oncoming f...,1
2,Amazon is investigating Chinese employees who ...,0
3,"Someone should'veTaken"" this piece of shit to...",1
4,Obama wanted liberals &amp; illegals to move...,0
...,...,...
13235,Sometimes I get strong vibes from people and ...,1
13236,Benidorm check mark button Creamfields che...,0
13237,And why report this garbage. We don't give a...,1
13238,Pussy,1


MBIC data

In [9]:
data_mbic = pd.read_csv('MBIC/labeled_dataset_mbic.csv')

In [10]:
data_mbic = data_mbic[['sentence', 'Label_bias']]
data_mbic = data_mbic.rename(columns={"sentence": "text", "Label_bias": "label"})

In [11]:
data_mbic['label'] = data_mbic['label'].replace('Biased',1)
data_mbic['label'] = data_mbic['label'].replace('Non-biased',0)

In [12]:
data_mbic.drop(data_mbic.index[data_mbic['label'] == 'No agreement'], inplace = True)

In [13]:
data_mbic['label'].value_counts()

label
1    1018
0     533
Name: count, dtype: int64

MD_gender_bias

In [14]:
data_md_gen = pd.read_csv('MD_gender/my-dataset-train.csv')

In [15]:
data_md_gen = data_md_gen[['text', 'label']]
data_md_gen

Unnamed: 0,text,label
0,"wow , four sisters . just watching game of thr...",1
1,that's nice . moms are pretty cool too .,1
2,i'm asian and have no hair .,1
3,i'm great enjoying the football season,1
4,"lol , i can imagine . i'll be reading a lot wh...",1
...,...,...
27663,wow you must be an expert now,0
27664,"whistle ? , no never have been a whistler , ca...",0
27665,me too ! did you also like the new ghostbusters ?,0
27666,"i respect that . i could not do that , i do no...",0


In [16]:
print('MBIC')
print('Shape', data_mbic.shape)
print(data_mbic.label.value_counts())

MBIC
Shape (1551, 2)
label
1    1018
0     533
Name: count, dtype: int64


In [17]:
print('OLID')
print('Shape', data_olid.shape)
print(data_olid.label.value_counts())

OLID
Shape (13240, 2)
label
0    8840
1    4400
Name: count, dtype: int64


In [18]:
print('data_md_gen')
print('Shape', data_md_gen.shape)
print(data_md_gen.label.value_counts())

data_md_gen
Shape (27668, 2)
label
1    13834
0    13834
Name: count, dtype: int64


In [19]:
data = pd.concat([data_mbic, data_md_gen, data_olid])
data.shape

(42459, 2)

In [20]:
from sklearn.model_selection import train_test_split

train, val = train_test_split(data, test_size=0.1, random_state=42, stratify=data['label'])
# # train, test = train_test_split(train, test_size=0.2, random_state=42, stratify=train['label'])

In [21]:
print(train.shape, val.shape)

(38213, 2) (4246, 2)


In [22]:
tds = Dataset.from_pandas(train)
vds = Dataset.from_pandas(val)


In [23]:
data_dict = DatasetDict()
data_dict['train'] = tds
data_dict['validation'] = vds
# # data_dict['test'] = ttds

In [24]:
data_dict

DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 38213
    })
    validation: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 4246
    })
})

In [25]:
# !huggingface-cli login --token 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

In [26]:
# data_dict.push_to_hub('pranjali97/Bias-detection-combined')

In [27]:
from transformers import (RobertaTokenizerFast,RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig,
)

In [28]:
model_name = 'roberta-base'
tokenizer = RobertaTokenizerFast.from_pretrained(model_name)

In [29]:
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True, max_length=256)

In [30]:
train_dataset = data_dict['train'].map(tokenize, batched=True, batch_size=len(data_dict['train']))
val_dataset = data_dict['validation'].map(tokenize, batched=True, batch_size=len(data_dict['train']))


                                                                                                                                          

In [31]:
train_dataset

Dataset({
    features: ['text', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 38213
})

In [32]:
train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
val_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

In [33]:
model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels = 2)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

In [34]:
import numpy as np
import evaluate

acc_metric = evaluate.load('accuracy')
f1_metric = evaluate.load('f1')

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    result = {}
    result.update(acc_metric.compute(predictions=predictions, references=labels))
    result.update(f1_metric.compute(predictions=predictions, references=labels, average = "macro"))
    return result

In [35]:
from transformers import TrainingArguments, EarlyStoppingCallback, IntervalStrategy

In [36]:
# TrainingArguments
training_args = TrainingArguments(
    output_dir='bias_detection1',
    num_train_epochs=20,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy="epoch",
    logging_strategy="steps",
    logging_steps=10,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500,
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=2,
    metric_for_best_model = 'f1',
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)],
) 

In [37]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 38213
  Num Epochs = 20
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 23900
  Number of trainable parameters = 124647170


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3749,0.357124,0.843853,0.843587
2,0.2542,0.391576,0.855629,0.85556
3,0.2652,0.341503,0.8561,0.854573
4,0.1291,0.475545,0.861752,0.860689
5,0.1283,0.474542,0.86293,0.861549
6,0.1758,0.504318,0.862223,0.861275
7,0.0893,0.652053,0.857042,0.8565
7,0.1132,0.722262,0.853274,0.852846


The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 4246
  Batch size = 32
Saving model checkpoint to bias_detection1/checkpoint-1195
Configuration saved in bias_detection1/checkpoint-1195/config.json
Model weights saved in bias_detection1/checkpoint-1195/pytorch_model.bin
Deleting older checkpoint [bias_detection1/checkpoint-4777] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num 

KeyboardInterrupt: 

In [38]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text, __index_level_0__. If text, __index_level_0__ are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 4246
  Batch size = 32


{'eval_loss': 0.7222622632980347,
 'eval_accuracy': 0.8532736693358455,
 'eval_f1': 0.8528456295381499}

In [48]:
test_set = pd.read_csv('annotated_test.csv')

test_set = test_set[['data', 'Unnamed: 1']]
test_set = test_set.rename(columns={"data": "text", "Unnamed: 1": "manual_label"})

In [49]:
texts = test_set['text'].tolist()

In [58]:
model1 = model.to('cpu') 
def inference_func(text):
    encoding = tokenizer(text, return_tensors="pt")
    outputs = model1(**encoding)
    prediction = outputs.logits.argmax(-1)
    return int(prediction)

In [59]:
predictions = []
for text in texts:
    predictions.append(inference_func(text))

In [60]:
predictions

[1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0]

In [61]:
test_set['model_preds'] = predictions

In [63]:
test_set.to_csv('annotation_results_combined.csv')