In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.utils.class_weight import compute_class_weight
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback

In [None]:
from google.colab import files
uploaded = files.upload()


Saving allsides_balanced_news_headlines-texts.csv to allsides_balanced_news_headlines-texts.csv


In [None]:
path = 'allsides_balanced_news_headlines-texts.csv'

In [None]:
df = pd.read_csv(path)
df.head()

Unnamed: 0.1,Unnamed: 0,title,tags,heading,source,text,bias_rating
0,0,Gun Violence Over Fourth of July Weekend,"['Protests', 'Fourth Of July', 'Gun Control An...",Chicago Gun Violence Spikes and Increasingly F...,New York Times (News),As Yasmin Miller drove home from a laundromat ...,left
1,1,Gun Violence Over Fourth of July Weekend,"['Protests', 'Fourth Of July', 'Gun Control An...",‘Bullets just came from nowhere’: Fourth of Ju...,Chicago Tribune,As many Chicagoans were celebrating the Fourth...,center
2,2,Gun Violence Over Fourth of July Weekend,"['Protests', 'Fourth Of July', 'Gun Control An...",Dozens of shootings across US mark bloody July...,New York Post (News),The nation’s 4th of July weekend was marred by...,right
3,3,Yellen Warns Congress of 'Economic Recession' ...,"['Janet Yellen', 'Debt Ceiling', 'Economic Pol...",Federal Government Will Run Out of Cash on Oct...,The Epoch Times,Treasury Secretary Janet Yellen on Tuesday war...,right
4,4,Yellen Warns Congress of 'Economic Recession' ...,"['Janet Yellen', 'Debt Ceiling', 'Economic Pol...",Yellen tells Congress that U.S. will run out o...,Washington Post,Treasury Secretary Janet Yellen on Tuesday tol...,left


In [None]:
df.drop(columns=['Unnamed: 0', 'tags', 'source'], inplace=True)

In [None]:
# drop duplicates
df.drop_duplicates(subset='text', inplace=True)

In [None]:
# Combine fields
df['content'] = (df['title'].fillna('') + ' ' + df['heading'].fillna('') + ' ' + df['text'].fillna(''))

In [None]:
df.drop(columns=['title', 'text','heading'], axis =1, inplace=True)

In [None]:
texts = df["content"].tolist()

In [None]:
labels = df["bias_rating"].map({"left":0, "center":1, "right":2}).tolist()

In [None]:
# Cleanlab
!pip install cleanlab



In [None]:
from cleanlab.filter import find_label_issues
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn.utils import shuffle

In [None]:
texts, labels = shuffle(texts, labels, random_state=42)

In [None]:
import numpy as np
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(texts)
y = np.array(labels)


In [None]:
# cross-validated predicted probabilities
clf = LogisticRegression(max_iter=1000)
pred_probs = cross_val_predict(clf, X, y, cv=5, method="predict_proba")

In [None]:
# Label issues
issues = find_label_issues(labels=y, pred_probs=pred_probs)

In [None]:
# Clean samples
clean_texts = [texts[i] for i in range(len(texts)) if not issues[i]]
clean_labels = [labels[i] for i in range(len(labels)) if not issues[i]]

In [None]:
import random
seed = 42
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)


torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
train_texts, val_texts, train_labels, val_labels = train_test_split(clean_texts, clean_labels, test_size=0.2, random_state=42)


In [None]:
# tokenize
tokenizer = AutoTokenizer.from_pretrained("roberta-base")


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
def tokenize(batch):
  return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=256)

In [None]:
train_dataset = Dataset.from_dict({"text": train_texts, "label": train_labels}).map(tokenize, batched=True)
val_dataset = Dataset.from_dict({"text": val_texts, "label": val_labels}).map(tokenize, batched=True)


Map:   0%|          | 0/10687 [00:00<?, ? examples/s]

Map:   0%|          | 0/2672 [00:00<?, ? examples/s]

In [None]:
# model
model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels =3)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# balace dataset
import numpy as np
class_weights = compute_class_weight("balanced", classes =np.array([0,1,2]), y=train_labels)
# convert numpy array to PyTorch tensor
class_weights = torch.tensor(class_weights, dtype=torch.float)

In [None]:
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs["labels"]
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights.to(logits.device))
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss



In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# Training
training_args = TrainingArguments(
    output_dir="./bias_model_roberta",
    do_eval=True,

    learning_rate=2e-5,
    metric_for_best_model="f1",
    greater_is_better=True,
    warmup_ratio=0.1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    report_to="none"
)

In [None]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average="weighted")
    return {"accuracy": acc, "f1": f1}

In [None]:
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)


In [None]:
trainer.train()

Step,Training Loss
50,1.1077
100,1.0989
150,1.0952
200,1.0622
250,1.0386
300,0.9896
350,0.8928
400,0.8722
450,0.8577
500,0.8221


TrainOutput(global_step=2004, training_loss=0.6800706845795561, metrics={'train_runtime': 1528.4074, 'train_samples_per_second': 20.977, 'train_steps_per_second': 1.311, 'total_flos': 4217839642888704.0, 'train_loss': 0.6800706845795561, 'epoch': 3.0})

In [None]:
metrics = trainer.evaluate()
print(metrics)

{'eval_loss': 0.7272691130638123, 'eval_accuracy': 0.7406437125748503, 'eval_f1': 0.7417476279911442, 'eval_runtime': 34.5029, 'eval_samples_per_second': 77.443, 'eval_steps_per_second': 4.84, 'epoch': 3.0}


In [None]:

print("Training metrics:", trainer.evaluate(train_dataset))  # Training data


Training metrics: {'eval_loss': 0.3357989192008972, 'eval_accuracy': 0.8757368765790212, 'eval_f1': 0.8767408582082442, 'eval_runtime': 139.7494, 'eval_samples_per_second': 76.473, 'eval_steps_per_second': 4.78, 'epoch': 3.0}


In [None]:
predictions = trainer.predict(val_dataset)
print("Test metrics:", predictions.metrics)


Test metrics: {'test_loss': 0.7272691130638123, 'test_accuracy': 0.7406437125748503, 'test_f1': 0.7417476279911442, 'test_runtime': 34.8416, 'test_samples_per_second': 76.69, 'test_steps_per_second': 4.793}


In [None]:
import pickle
import os

save_dir = "./saved_bias_model"
os.makedirs(save_dir, exist_ok=True)

# Save model & tokenizer
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)

preprocess_data = {
    "label2id": {"left": 0, "center": 1, "right": 2},
    "id2label": {0: "left", 1: "center", 2: "right"},
    "max_length": 256
}
with open(os.path.join(save_dir, "preprocessing.pkl"), "wb") as f:
    pickle.dump(preprocess_data, f)


In [None]:
import shutil
shutil.make_archive("bias_model_files", 'zip', save_dir)


'/content/bias_model_files.zip'

In [None]:
from google.colab import files
files.download("bias_model_files.zip")



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>