In [3]:
!pip install evaluate datasets transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting evaluate
  Using cached evaluate-0.4.0-py3-none-any.whl (81 kB)
Collecting datasets
  Using cached datasets-2.10.1-py3-none-any.whl (469 kB)
Collecting transformers
  Using cached transformers-4.27.3-py3-none-any.whl (6.8 MB)
Collecting dill
  Using cached dill-0.3.6-py3-none-any.whl (110 kB)
Collecting multiprocess
  Using cached multiprocess-0.70.14-py39-none-any.whl (132 kB)
Collecting xxhash
  Using cached xxhash-3.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
Collecting responses<0.19
  Using cached responses-0.18.0-py3-none-any.whl (38 kB)
Collecting huggingface-hub>=0.7.0
  Using cached huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
Collecting aiohttp
  Using cached aiohttp-3.8.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Using cached tokenizers-0.13.2-cp39-cp39-manylinux_2_17

In [4]:
import torch
import numpy as np
import pickle
from tqdm.auto import tqdm, trange
import evaluate
from datasets import Dataset

In [5]:
from transformers import (AutoModelForSequenceClassification, 
                          AutoTokenizer, 
                          RobertaTokenizer, 
                          RobertaForSequenceClassification, 
                          TrainingArguments, 
                          Trainer)

In [6]:
import pandas as pd

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
!unzip '/content/drive/MyDrive/bias_data.zip'

Archive:  /content/drive/MyDrive/bias_data.zip
   creating: bias_data/
   creating: bias_data/real_world_samples/
  inflating: bias_data/real_world_samples/news_left  
   creating: __MACOSX/
   creating: __MACOSX/bias_data/
   creating: __MACOSX/bias_data/real_world_samples/
  inflating: __MACOSX/bias_data/real_world_samples/._news_left  
  inflating: bias_data/real_world_samples/ibc_left  
  inflating: __MACOSX/bias_data/real_world_samples/._ibc_left  
  inflating: bias_data/real_world_samples/ibc_right  
  inflating: __MACOSX/bias_data/real_world_samples/._ibc_right  
  inflating: bias_data/real_world_samples/speeches  
  inflating: __MACOSX/bias_data/real_world_samples/._speeches  
  inflating: bias_data/real_world_samples/news_right  
  inflating: __MACOSX/bias_data/real_world_samples/._news_right  
  inflating: bias_data/real_world_samples/news_sensationalist_dramatic  
  inflating: __MACOSX/bias_data/real_world_samples/._news_sensationalist_dramatic  
  inflating: __MACOSX/bias_d

In [9]:
biased_corpus_path = '/content/bias_data/WNC/biased.full'

In [None]:
# !unzip '/content/drive/MyDrive/bias_data.zip'

In [20]:
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels = 2)
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', max_length = 128)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classi

In [11]:
from sklearn.model_selection import train_test_split

In [None]:
# biased_corpus_path = '/kaggle/input/bias-data/bias_data/WNC/biased.full'
# # norm_corpus_path = '/kaggle/input/bias-data/bias_data/WNC/neutral'

In [12]:
colnames = ['id','src_tok','tgt_tok','src_raw',
            'tgt_raw','src_POS_tags','tgt_parse_tags']

In [13]:
df = pd.read_csv(biased_corpus_path, 
                        sep='\t', on_bad_lines='skip', names=colnames)

In [14]:
bias = []
for punkt in df['src_raw'][:100000]:
    bias.append({'text':punkt, 'label':1})
norm = []
for punkt in df['tgt_raw'][:100000]:
    norm.append({'text':punkt, 'label':0})

In [15]:
biased_df = pd.DataFrame(bias)
norm_df = pd.DataFrame(norm)

In [16]:
data = pd.concat([biased_df, norm_df])

In [17]:
data

Unnamed: 0,text,label
0,"during the campaign, controversy erupted over ...",1
1,nicaea was convoked by the emperor constantine...,1
2,it was rather unfortunate that he vehemently o...,1
3,dennis the menace is an american animated seri...,1
4,"today, on large farms, motorcycles, dogs or me...",1
...,...,...
99995,the permanent exhibition shows the main struct...,0
99996,sheyla tadeo (born sheyla osiris tadeo bringas...,0
99997,"constitutes ""child pornography"" remains debata...",0
99998,primitive people held that gods were subject t...,0


In [18]:
X_train, X_test, y_train, y_test = train_test_split(data['text'].tolist(), data['label'].tolist(), test_size=0.3, random_state=42, shuffle=True)

In [21]:
def prep(text, tokenizer=tokenizer):
    return tokenizer(text, padding = 'max_length', max_length = 128, truncation=True, return_tensors='pt')

In [22]:
X_train = prep(X_train)
X_test = prep(X_test)

In [23]:
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = torch.tensor(labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = TextDataset(X_train, y_train)
val_dataset = TextDataset(X_test, y_test)

In [24]:
metric = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [25]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=[np.argmax(np.asarray(i)) for i in labels])

In [26]:
training_args = TrainingArguments(output_dir='./results',
                                    save_total_limit=2,
                                    evaluation_strategy='epoch',
                                    save_strategy = 'epoch',
                                    num_train_epochs=2,
                                    per_device_train_batch_size=16,
                                    per_device_eval_batch_size=32,
                                    warmup_steps=10,
                                    load_best_model_at_end=True
                                 )

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

In [27]:
trainer.train()

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


Epoch,Training Loss,Validation Loss,Accuracy
1,0.694,0.693743,1.0
2,0.6937,0.699115,1.0


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


TrainOutput(global_step=17500, training_loss=0.6946696341378348, metrics={'train_runtime': 7172.0918, 'train_samples_per_second': 39.04, 'train_steps_per_second': 2.44, 'total_flos': 1.84177738752e+16, 'train_loss': 0.6946696341378348, 'epoch': 2.0})

In [28]:
tokenizer.save_pretrained('roberta_bias_clf')
trainer.model.save_pretrained('roberta_bias_clf')