# **BERT Text Classification for Media Bias Detection**


In [None]:
!pip install transformers datasets evaluate

In [None]:
!huggingface-cli login

In [None]:
# Fraction of the labelled examples held out for validation (passed to
# `train_test_split` as `test_size`); changed between runs to compare splits.
VAL_SPLIT = 0.1

In [None]:
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import string
# The characters of `string.punctuation`, one single-character string per entry.
regular_punct = [*string.punctuation]

def remove_punctuation(text, punct_list):
    """Replace every occurrence of each string in ``punct_list`` with a space.

    Args:
        text: The string to clean.
        punct_list: Iterable of substrings (typically single punctuation
            characters) to blank out.

    Returns:
        ``text`` with each listed substring replaced by one space and with
        leading/trailing whitespace stripped. Interior runs of spaces are
        left as they are.
    """
    # The ``return`` has to sit after the loop: returning from inside it would
    # stop after the first entry of ``punct_list`` and leave the rest untouched.
    for punc in punct_list:
        text = text.replace(punc, ' ')
    return text.strip()

# Load the labelled training/validation CSV; its rows are read from the
# "train" split of the loaded dataset.
dataset = load_dataset("csv", data_files="dataset_train_val.csv")

# Build a new list of cleaned texts. Assigning to the loop variable inside a
# plain `for text in texts` loop would discard every cleaned string and leave
# the list itself unchanged.
texts = [remove_punctuation(text, regular_punct) for text in dataset["train"]["text"]]

labels = dataset["train"]["label"]

# Hold out VAL_SPLIT of the examples for validation; texts and labels are
# split in the same call so they stay aligned.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=VAL_SPLIT
)

In [None]:
# Show the label and then the text of the first training example, one per line.
print(train_labels[0], train_texts[0], sep="\n")

In [None]:
# Load the held-out test CSV. This rebinds `dataset`, replacing the
# train/validation data loaded earlier.
dataset = load_dataset("csv", data_files="dataset_test.csv")

# Clean into a new list; assigning to the loop variable of a plain `for` loop
# would leave the texts unchanged.
test_texts = [remove_punctuation(text, regular_punct) for text in dataset["train"]["text"]]

# Labels of the test examples. The name `text_labels` (rather than
# `test_labels`) is kept because a later cell reads it.
text_labels = dataset["train"]["label"]

In [None]:
# Report how many examples each split contains.
for size_caption, split_texts in (
    ("Train Dataset Size:", train_texts),
    ("Validation Dataset Size:", val_texts),
    ("Test Dataset Size:", test_texts),
):
    print(size_caption, len(split_texts))

In [None]:
from transformers import AutoTokenizer
# Tokenizer for "distilbert-base-uncased", the same checkpoint name the model
# cells below load.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [None]:
# Tokenize the training and validation texts with identical settings
# (truncation and padding both enabled), training split first.
train_encodings, val_encodings = (
    tokenizer(split_texts, truncation=True, padding=True)
    for split_texts in (train_texts, val_texts)
)

In [None]:
import torch

class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to an indexable collection of
    per-example values; ``labels`` is an indexable collection with one entry
    per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of examples is defined by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert every encoding field of example `idx` to a tensor, then add
        # the label under the 'labels' key.
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings together with its labels.
train_dataset, val_dataset = (
    Dataset(train_encodings, train_labels),
    Dataset(val_encodings, val_labels),
)

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoConfig

# Load DistilBERT with a 3-label sequence-classification head.
#
# `dropout` (tuned separately for each train/val split) is passed to
# `from_pretrained` so that it is part of the config the model is built from.
# The dropout layers are created when the model is constructed, so assigning
# `model.config.dropout` afterwards would not change them.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=3, dropout=0.5
)

print(model)

In [None]:
from datasets import load_metric
import numpy as np

# Accuracy scorer used by `compute_metrics`.
# NOTE(review): `load_metric` appears to be deprecated in recent `datasets`
# releases in favour of the separate `evaluate` package - confirm against the
# installed version.
metric = load_metric('accuracy')


def compute_metrics(eval_pred):
    """Score evaluation outputs with the module-level accuracy ``metric``.

    Args:
        eval_pred: A pair ``(predictions, labels)``. The arg-max of
            ``predictions`` is taken along axis 1 to get one predicted class
            per row.

    Returns:
        The result of ``metric.compute`` for the predicted classes compared
        against ``labels``.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_classes, references=references)

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoConfig

# Fine-tuning configuration shared by the Trainer instances in this notebook.
training_args = TrainingArguments(
    output_dir="my_awesome_model",
    # Optimisation settings.
    learning_rate=5e-6,
    weight_decay=0.1,
    num_train_epochs=15,
    # One example per device per step, for both training and evaluation.
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    # Evaluate and checkpoint once per epoch, then reload the best checkpoint
    # when training finishes.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [None]:
# Trainer for the already-loaded `model`: trains on the training split and
# reports `compute_metrics` on the validation split.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

In [None]:
# Run fine-tuning with the model, arguments and datasets given to `trainer`.
trainer.train()

In [None]:
# Tokenize the test texts with the same settings as the other splits and pair
# them with the test labels (held in `text_labels`).
test_encodings = tokenizer(test_texts, padding=True, truncation=True)
test_dataset = Dataset(encodings=test_encodings, labels=text_labels)

In [None]:
# Print the evaluation results for the train, validation and test splits, in
# that order.
for split_dataset in (train_dataset, val_dataset, test_dataset):
    print(trainer.evaluate(split_dataset))

In [None]:
!pip install sigopt

In [None]:
def sigopt_hp_space(trial):
    """Return the SigOpt parameter definitions for the hyperparameter search.

    Args:
        trial: Accepted so the function can be used as an ``hp_space``
            callback; it is not read.

    Returns:
        A list of three parameter dicts: a double range for ``learning_rate``,
        a categorical choice for ``per_device_train_batch_size`` (values given
        as strings) and a double range for ``weight_decay``.
    """
    learning_rate = dict(
        bounds=dict(min=1e-6, max=1e-4),
        name="learning_rate",
        type="double",
    )
    train_batch_size = dict(
        categorical_values=["1", "2", "4"],
        name="per_device_train_batch_size",
        type="categorical",
    )
    weight_decay = dict(
        bounds=dict(min=0.01, max=0.1),
        name="weight_decay",
        type="double",
    )
    return [learning_rate, train_batch_size, weight_decay]

In [None]:
def model_init(trial):
    """Load a new "distilbert-base-uncased" 3-label classifier.

    Args:
        trial: Accepted so the function can be used as a ``model_init``
            callback; it is not read.

    Returns:
        The model returned by
        ``AutoModelForSequenceClassification.from_pretrained``.
    """
    fresh_model = AutoModelForSequenceClassification.from_pretrained(
        "distilbert-base-uncased",
        num_labels=3,
    )
    return fresh_model

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoConfig

# Trainer used for the hyperparameter search: no model is passed directly;
# `model_init` supplies one instead.
trainer = Trainer(
    model=None,
    model_init=model_init,
    args=training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [None]:
import os
from sigopt import Connection
# The SigOpt client token must be available in the "SIGOPT_API_TOKEN"
# environment variable before the project is created and the search is run.
# Replace the "INSERT_API_TOKEN" placeholder in both lines below with the
# account's token.
# NOTE(review): avoid committing a real token in the notebook - consider
# reading it from the environment or a prompt instead.
conn = Connection(client_token="INSERT_API_TOKEN")
os.environ['SIGOPT_API_TOKEN'] = "INSERT_API_TOKEN"
!sigopt create project --project 'huggingface'

In [None]:
# Run 5 search trials through the SigOpt backend over the space defined by
# `sigopt_hp_space`, maximising the search objective.
best_trial = trainer.hyperparameter_search(
    hp_space=sigopt_hp_space,
    backend="sigopt",
    n_trials=5,
    direction="maximize",
)