In [1]:
!pip install -qqq transformers datasets evaluate

In [2]:
import warnings
warnings.simplefilter("ignore")
import numpy as np 
import pandas as pd 
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset

In [3]:
data_path = '/kaggle/input/clickbait-detection-msci641-s23'

In [4]:
train = pd.read_json(f'{data_path}/train.jsonl', lines=True)
test = pd.read_json(f'{data_path}/test.jsonl', lines=True)
val = pd.read_json(f'{data_path}/val.jsonl', lines=True)

In [5]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [6]:
def tokenize(examples):
    return tokenizer(examples["text"], truncation=True)

In [7]:
id2label = {0: 'passage', 1: 'phrase', 2: 'multi'}
label2id = {'passage': 0, 'phrase': 1, 'multi': 2}

In [8]:
def preprocess_data(df, test=False):
    ret = []
    if test:
        for _, i in df.iterrows():
            ret += [{'text': ' '.join(i['postText']) + ' - ' + i['targetTitle'] + ' ' + ' '.join(i['targetParagraphs']), 'id': i['id']}]
            ret_df = pd.DataFrame(ret)
        
    else:
        for _, i in df.iterrows():
            ret += [{'text': ' '.join(i['postText']) + ' - ' + i['targetTitle'] + ' ' + ' '.join(i['targetParagraphs']), 'labels': i['tags'][0]}]
            ret_df = pd.DataFrame(ret)
            ret_df['labels'] = ret_df['labels'].apply(lambda x: label2id[x])
    
    data = Dataset.from_pandas(ret_df)
    tokenized_data = data.map(tokenize, batched=True)
    return tokenized_data

In [9]:
train_df = preprocess_data(train)
val_df = preprocess_data(val)
test_df = preprocess_data(test, test=True)

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [10]:
import evaluate
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [11]:
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=3, id2label=id2label, label2id=label2id
)

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias', 'classifier.we

In [12]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [13]:
training_args = TrainingArguments(
    output_dir="model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [14]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_df,
    eval_dataset=val_df,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [15]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01666940296666629, max=1.0)…

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.982282,0.53
2,No log,0.828408,0.645


TrainOutput(global_step=400, training_loss=0.9140534973144532, metrics={'train_runtime': 230.1563, 'train_samples_per_second': 27.807, 'train_steps_per_second': 1.738, 'total_flos': 847806470553600.0, 'train_loss': 0.9140534973144532, 'epoch': 2.0})

In [21]:
pred = trainer.predict(test_df).predictions
pred_ids = np.argmax(pred, 1)
test['spoilerType'] = pred_ids
test['spoilerType'] = test['spoilerType'].apply(lambda x: id2label[x])
submissions = test[['id', 'spoilerType']]

In [36]:
submissions.to_csv('submissions.csv', index=False)