In [None]:
import os
import re
import string
import wandb
import evaluate
import numpy as np
import pandas as pd
from datasets import Dataset
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          Trainer, TrainingArguments, EarlyStoppingCallback)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Set TOKENIZERS_PARALLELISM before any tokenizer is created in this notebook.
# NOTE(review): presumably this avoids the Hugging Face tokenizers
# fork/parallelism warning during training — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
# Authenticate with Weights & Biases up front; the TrainingArguments cell
# further down sets report_to="wandb".
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mhsl023[0m ([33mhsl023-uc-san-diego[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [3]:
# Load the labelled headlines and print a plain-text preview of the first rows.
DATA_PATH = "data.csv"
df = pd.read_csv(DATA_PATH)
print(df.head())

                                            headline  clickbait
0  !Sdrawkcab: Missy Elliott, the Beatles and the...          0
1  "Apprentice" contestant sues Trump for defamation          0
2  "Big morale boost": George H.W. Bush tweets im...          0
3  "Bring it on": Students sue Trump administrati...          0
4  "God made me bulletproof," oft-shot rapper Yun...          0


## Preprocessing
### Text cleaning
- Lowercase
- Remove URLs, punctuation, stopwords if necessary
- Optionally apply lemmatization/stemming

In [4]:
def clean_text(text, keep=""):
    """Normalize a raw headline for modelling.

    Steps, in order: strip HTML markup, lowercase, drop digits, delete ASCII
    punctuation, and replace every remaining non-word character with a space.

    HTML is stripped first, while ``<``, ``>``, ``&`` and ``;`` are still
    present. Running the parser after punctuation removal leaves it nothing
    to recognise, so tag names and entity names would leak into the text.

    Args:
        text: Raw headline. Non-string values (e.g. the NaN pandas uses for
            an empty cell) yield an empty string instead of raising.
        keep: Optional characters to preserve rather than remove, for example
            ``"!?"``. Defaults to ``""``, which removes all punctuation.

    Returns:
        The cleaned headline as a ``str``. Runs of whitespace are not
        collapsed.
    """
    if not isinstance(text, str):
        return ""
    text = BeautifulSoup(text, "html.parser").get_text()  # Remove HTML tags
    text = text.lower()  # Lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    # Delete ASCII punctuation, except anything the caller asked to keep.
    removable = ''.join(ch for ch in string.punctuation if ch not in keep)
    text = text.translate(str.maketrans('', '', removable))  # Remove punctuation
    # With an empty `keep` this class is equivalent to r'\W'.
    text = re.sub(r'[^\w' + re.escape(keep) + r']', ' ', text)  # Remove special characters
    return text
# Clean every headline and write the result back over the original column,
# then show the frame as the cell output.
cleaned_headlines = df['headline'].apply(clean_text)
df['headline'] = cleaned_headlines
df

Unnamed: 0,headline,clickbait
0,sdrawkcab missy elliott the beatles and the jo...,0
1,apprentice contestant sues trump for defamation,0
2,big morale boost george hw bush tweets image w...,0
3,bring it on students sue trump administration ...,0
4,god made me bulletproof oftshot rapper yung ma...,0
...,...,...
53024,flip or flop to end in following hosts split,0
53025,groundhog day broadway musical giving away t...,0
53026,scientific racism is on the rise on the righ...,0
53027,the walking dead star to play the punisher i...,0


We preprocess the dataset using standard natural language processing techniques. The main steps are text cleaning and tokenization. During cleaning, we convert the text to lowercase and remove punctuation, special characters, and digits. Note, however, that exclamation marks and question marks are common in clickbait titles and can be strong indicators, so removing them may discard useful signal and should be taken into consideration. For tokenization, we split headlines into tokens using the tokenizer that matches the target language model (e.g., `BertTokenizer` for a BERT model). Finally, the data is split into training, validation, and test sets; the exact proportions will be determined later.

In [5]:
# Split 80/20 with a fixed seed, stratifying on the clickbait label.
label_col = df['clickbait']
X_train, X_test, y_train, y_test = train_test_split(
    df['headline'],
    label_col,
    stratify=label_col,
    test_size=0.2,
    random_state=42,
)

# Re-assemble text and label into one frame per split.
train_df = pd.DataFrame({'headline': X_train, 'clickbait': y_train})
test_df = pd.DataFrame({'headline': X_test, 'clickbait': y_test})

# Work on the first 1000 rows of each split only.
small_train = train_df.iloc[:1000]
small_eval = test_df.iloc[:1000]

In [6]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize(batch):
    """Tokenize the "headline" field of a batch.

    Every sequence is truncated and padded to a fixed length of 128 tokens.
    Returns whatever the tokenizer call produces for the batch.
    """
    encoded = tokenizer(
        batch["headline"],
        max_length=128,
        padding="max_length",
        truncation=True,
    )
    return encoded

def _to_torch_dataset(frame):
    """Turn a headline/clickbait frame into a tokenized, torch-formatted Dataset."""
    ds = Dataset.from_pandas(frame[["headline","clickbait"]])
    ds = ds.map(tokenize, batched=True)
    # The label column is renamed to "labels" before being exposed as a tensor.
    ds = ds.rename_column("clickbait", "labels")
    ds.set_format(type="torch", columns=["input_ids","attention_mask","labels"])
    return ds

train_ds = _to_torch_dataset(small_train)
test_ds = _to_torch_dataset(small_eval)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [7]:
# Readable names for the two class ids; the inverse mapping is derived from
# the same dict so the two cannot drift apart.
id2label = {0:"not_clickbait", 1:"clickbait"}
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# NOTE(review): `metric` is not referenced by compute_metrics below, which
# uses scikit-learn's accuracy_score instead — confirm whether this load is
# still needed.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair; predictions are the argmax of
            ``logits`` over the last axis.

    Returns:
        dict of plain floats under the keys "accuracy", "precision",
        "recall" and "f1". Precision, recall and F1 use ``average="binary"``,
        so they score the positive class (label 1) only, not each label.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    # zero_division=0 yields 0.0 without a warning when a pass contains no
    # positive predictions or no positive labels.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="binary", zero_division=0  # use "macro" if multiclass
    )
    acc = accuracy_score(labels, preds)

    return {
        "accuracy": float(acc),
        "precision": float(precision),
        "recall": float(recall),
        "f1": float(f1),
    }

In [20]:
# Hyperparameters for the run — edit the values here.
cfg = dict(
    run_name="test-run",
    max_length=128,
    learning_rate=1e-5,
    batch_train=16,
    batch_eval=32,
    num_epochs=3,
    weight_decay=0.01,
    seed=42,
)

In [23]:
# Training configuration: log every 50 steps, evaluate once per epoch, and
# report to Weights & Biases. All tunable values come from `cfg`.
training_args = TrainingArguments(
    output_dir="outputs",
    logging_strategy="steps",
    logging_steps=50,
    eval_strategy="epoch",
    report_to="wandb",
    run_name=cfg["run_name"],
    learning_rate=cfg["learning_rate"],
    per_device_train_batch_size=cfg["batch_train"],
    per_device_eval_batch_size=cfg["batch_eval"],
    num_train_epochs=cfg["num_epochs"],
    weight_decay=cfg["weight_decay"],
    # Pass the configured seed explicitly; previously cfg["seed"] was defined
    # but never used, so changing it had no effect on training.
    seed=cfg["seed"],
    dataloader_pin_memory=False
)

In [26]:
# Wire the model, its training arguments, both datasets and the metric
# function into a single Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=test_ds,      # scored at each evaluation
    train_dataset=train_ds,
)

In [27]:
# Run fine-tuning; the returned TrainOutput is shown as the cell result.
# NOTE(review): `model` is updated in place, so re-running this cell without
# re-creating the model presumably continues from already fine-tuned weights.
# The very low first-epoch training loss in the recorded output may be a sign
# of that — confirm with Restart & Run All.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.005,0.870529,0.89,0.869919,0.83812,0.853723
2,0.001,0.890669,0.895,0.892655,0.825065,0.857531
3,0.0001,0.89925,0.894,0.890141,0.825065,0.856369


TrainOutput(global_step=189, training_loss=0.0016301620391902154, metrics={'train_runtime': 105.6081, 'train_samples_per_second': 28.407, 'train_steps_per_second': 1.79, 'total_flos': 99350548992000.0, 'train_loss': 0.0016301620391902154, 'epoch': 3.0})

In [None]:
# Evaluate once more on the Trainer's eval dataset, then log and save the
# resulting metrics under the "eval" split name and close the W&B run.
metrics = trainer.evaluate()
for record in (trainer.log_metrics, trainer.save_metrics):
    record("eval", metrics)
wandb.finish()



{'eval_loss': 0.41008061170578003, 'eval_accuracy': 0.897, 'eval_runtime': 8.2833, 'eval_samples_per_second': 120.726, 'eval_steps_per_second': 15.091, 'epoch': 3.0}
