In [1]:
!pip install -qqq transformers datasets evaluate

In [2]:
import warnings
warnings.simplefilter("ignore")
import numpy as np 
import pandas as pd 
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback
from datasets import Dataset

In [3]:
data_path = '/kaggle/input/clickbait-detection-msci641-s23'

In [4]:
train = pd.read_json(f'{data_path}/train.jsonl', lines=True)
test = pd.read_json(f'{data_path}/test.jsonl', lines=True)
val = pd.read_json(f'{data_path}/val.jsonl', lines=True)

In [5]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [6]:
def tokenize(examples):
    return tokenizer(examples["text"], truncation=True)

In [7]:
id2label = {0: 'passage', 1: 'phrase', 2: 'multi'}
label2id = {'passage': 0, 'phrase': 1, 'multi': 2}

In [8]:
def preprocess_data(df, test=False):
    ret = []
    if test:
        for _, i in df.iterrows():
            ret += [{'text': ' '.join(i['postText']) + ' - ' + i['targetTitle'] + ' ' + ' '.join(i['targetParagraphs']), 'id': i['id']}]
            ret_df = pd.DataFrame(ret)
        
    else:
        for _, i in df.iterrows():
            ret += [{'text': ' '.join(i['postText']) + ' - ' + i['targetTitle'] + ' ' + ' '.join(i['targetParagraphs']), 'labels': i['tags'][0]}]
            ret_df = pd.DataFrame(ret)
            ret_df['labels'] = ret_df['labels'].apply(lambda x: label2id[x])
    
    data = Dataset.from_pandas(ret_df)
    tokenized_data = data.map(tokenize, batched=True)
    return tokenized_data

In [9]:
train_df = preprocess_data(train)
val_df = preprocess_data(val)
test_df = preprocess_data(test, test=True)

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [10]:
import evaluate
f1 = evaluate.load("f1", average='macro')

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return f1.compute(predictions=predictions, references=labels, average='macro')

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

In [11]:
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=3, id2label=id2label, label2id=label2id
)

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'pre_classifier.bias', 'classifier.

In [12]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [13]:
training_args = TrainingArguments(
    output_dir="model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=50,
    weight_decay=0.01,
    logging_steps=50,
    evaluation_strategy="steps",
    save_strategy="steps",
    load_best_model_at_end=True,
)

In [14]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_df,
    eval_dataset=val_df,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=5)],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [15]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,F1
50,1.0395,1.035813,0.378252
100,0.986,0.974648,0.487063
150,0.9399,0.930674,0.515496
200,0.8652,0.853406,0.607029
250,0.7851,0.836279,0.630371
300,0.8024,0.793515,0.667045
350,0.7055,0.77584,0.67301
400,0.7416,0.750707,0.688792
450,0.586,0.800649,0.654615
500,0.543,0.788656,0.67693


TrainOutput(global_step=750, training_loss=0.6809373346964518, metrics={'train_runtime': 439.5418, 'train_samples_per_second': 364.015, 'train_steps_per_second': 22.751, 'total_flos': 1589637132288000.0, 'train_loss': 0.6809373346964518, 'epoch': 3.75})

In [16]:
pred = trainer.predict(test_df).predictions
pred_ids = np.argmax(pred, 1)
test['spoilerType'] = pred_ids
test['spoilerType'] = test['spoilerType'].apply(lambda x: id2label[x])
submissions = test[['id', 'spoilerType']]

In [17]:
submissions.to_csv('submissions.csv', index=False)