In [1]:
!pip install -qqq transformers datasets evaluate

In [2]:
import warnings
# Silence all Python warnings. This is done before the remaining imports so
# that warnings emitted while importing them are suppressed as well.
warnings.simplefilter("ignore")
import numpy as np 
import pandas as pd 
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback
from datasets import Dataset
import torch
import os
# Disable the Weights & Biases integration. The Trainer log further down in
# this notebook reports this variable as deprecated in favour of `report_to`.
os.environ["WANDB_DISABLED"] = "true"

In [3]:
# Directory holding the competition's JSON-lines splits (a Kaggle input path);
# the train/val/test files are read relative to it.
data_path = '/kaggle/input/clickbait-detection-msci641-s23'

In [4]:
# Read the three JSON-lines splits from `data_path`, one DataFrame per split.
train, test, val = (
    pd.read_json(f'{data_path}/{split}.jsonl', lines=True)
    for split in ('train', 'test', 'val')
)

In [5]:
# Tokenizer for the same "roberta-large" checkpoint that is fine-tuned below;
# used by tokenize() and by the padding collator.
tokenizer = AutoTokenizer.from_pretrained("roberta-large")

Downloading (…)lve/main/config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [6]:
def tokenize(examples):
    """Tokenize the ``"text"`` field of ``examples`` with truncation enabled.

    Uses the module-level ``tokenizer``. No padding is requested here.

    Args:
        examples: Mapping with a ``"text"`` entry (a batch when used with
            ``Dataset.map(..., batched=True)``).

    Returns:
        The encoding produced by the tokenizer call.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [7]:
# Spoiler-type label vocabulary. `label2id` is the single source of truth and
# `id2label` is derived from it so the two mappings cannot drift apart.
label2id = {'passage': 0, 'phrase': 1, 'multi': 2}
id2label = {class_id: name for name, class_id in label2id.items()}

In [8]:
def _build_text(row):
    """Join one record's post and target-page fields into a single input string."""
    platform = row['postPlatform']
    post = ' '.join(row['postText'])
    title = row['targetTitle']
    description = row['targetDescription']
    paragraph = ' '.join(row['targetParagraphs'])
    keyword = row['targetKeywords']
    return f"{platform} Post: {post} Website Title: {title} Website Description: {description} Website Paragraph: {paragraph} Website Keyword: {keyword}"


def preprocess_data(df, test=False):
    """Convert a raw split into a tokenized ``datasets.Dataset``.

    Every row is flattened into one ``text`` string (see ``_build_text``).
    Labelled splits additionally get an integer ``labels`` column, taken from
    the first entry of ``tags`` and mapped through ``label2id``. The test
    split instead keeps its ``id`` column.

    The intermediate DataFrame is built once after all rows are collected
    (building it inside the row loop is quadratic in the number of rows), and
    its columns are given explicitly so an empty ``df`` no longer fails with an
    unbound ``ret_df``.

    Args:
        df: DataFrame with the columns ``postPlatform``, ``postText``,
            ``targetTitle``, ``targetDescription``, ``targetParagraphs``,
            ``targetKeywords`` and either ``tags`` (labelled) or ``id`` (test).
        test: When True, emit ``text``/``id`` instead of ``text``/``labels``.

    Returns:
        The dataset produced by mapping ``tokenize`` over the records in
        batched mode.

    Raises:
        KeyError: If a row's first tag is not a key of ``label2id``.
    """
    if test:
        columns = ['text', 'id']
        records = [
            {'text': _build_text(row), 'id': row['id']}
            for _, row in df.iterrows()
        ]
    else:
        columns = ['text', 'labels']
        records = [
            {'text': _build_text(row), 'labels': label2id[row['tags'][0]]}
            for _, row in df.iterrows()
        ]

    ret_df = pd.DataFrame(records, columns=columns)
    data = Dataset.from_pandas(ret_df)
    tokenized_data = data.map(tokenize, batched=True)
    return tokenized_data

In [9]:
# Build the tokenized datasets for each split. Despite the `_df` suffix these
# hold the objects returned by preprocess_data (tokenized datasets), not
# pandas DataFrames.
train_df = preprocess_data(train)
val_df = preprocess_data(val)
# The test split has no tags, so it is processed in test mode and keeps `id`.
test_df = preprocess_data(test, test=True)

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [10]:
import evaluate
# Load the F1 metric used by compute_metrics.
# NOTE(review): `average='macro'` is also passed explicitly at compute time in
# compute_metrics; confirm whether passing it to load() has any effect.
f1 = evaluate.load("f1", average='macro')

def compute_metrics(eval_pred):
    """Compute macro-averaged F1 for an evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``; the predictions are reduced
            to class ids with an argmax over axis 1.

    Returns:
        Whatever the module-level ``f1`` metric's ``compute`` returns for the
        predicted ids against the reference labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return f1.compute(predictions=predicted_ids, references=references, average='macro')

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

In [11]:
# Load the pretrained "roberta-large" checkpoint with a 3-class sequence
# classification head and attach the label <-> id mappings defined above.
# The classifier weights are newly initialized (see the load log below), so
# the model has to be fine-tuned before it is useful for prediction.
model = AutoModelForSequenceClassification.from_pretrained(
    "roberta-large", num_labels=3, id2label=id2label, label2id=label2id
)

Downloading model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias']
You should 

In [12]:
from transformers import DataCollatorWithPadding

# tokenize() requests truncation but no padding, so padding is left to this
# collator, which is handed to the Trainer below.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [13]:
# Fine-tuning configuration.
#
# NOTE(review): no `eval_steps` / `save_steps` are given, so the library
# defaults decide how often evaluation and checkpointing happen (the training
# log in this notebook shows an evaluation every 10 steps). Confirm that the
# checkpoint interval lines up with the evaluation interval, since best-model
# selection relies on saved checkpoints.
training_args = TrainingArguments(
    output_dir="model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_checkpointing=True,
    fp16=True,
    num_train_epochs=50,
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_steps=10,
    evaluation_strategy="steps",
    save_strategy="steps",
    # Bound the number of checkpoints kept on disk; each one holds a copy of a
    # ~1.4 GB model. With load_best_model_at_end the best checkpoint is kept.
    save_total_limit=2,
    load_best_model_at_end=True,
    # Explicitly disable reporting integrations. This is the replacement the
    # Trainer log recommends for the deprecated WANDB_DISABLED variable; note
    # that it turns off every reporting integration, not only W&B.
    report_to="none",
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [14]:
# Early stopping with a patience of 5 evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=5)

# Wire model, data, padding collator, metric function and callback together.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_df,
    eval_dataset=val_df,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

In [None]:
# Run fine-tuning; the table below logs training loss, validation loss and F1
# at each evaluation step.
trainer.train()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,F1
10,1.1732,1.142088,0.288052
20,1.1209,1.097181,0.192171
30,1.0888,1.061802,0.192171
40,1.045,1.069235,0.192171


In [None]:
# Score the test split and turn each row's highest-scoring class into its
# spoiler-type name.
pred = trainer.predict(test_df).predictions
pred_ids = pred.argmax(axis=1)
# Rows of `test_df` were built in the same order as `test`, so the predictions
# are assigned positionally.
test['spoilerType'] = [id2label[class_id] for class_id in pred_ids.tolist()]
submissions = test[['id', 'spoilerType']]

In [None]:
# Write the `id` / `spoilerType` pairs to the working directory without the
# DataFrame index.
submissions.to_csv('submissions.csv', index=False)