# Summary source prediction: Model finetuning

Sébastien Meyer

In [1]:
import numpy as np
import pandas as pd

from datasets import load_dataset, load_metric

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer

from sentence_transformers import CrossEncoder

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

## Load the data set

In [2]:
df = load_dataset("csv", data_files="data/train_set.csv")

Using custom data configuration default-0b9c419e93425e40
Reusing dataset csv (/home/sebastien/.cache/huggingface/datasets/csv/default-0b9c419e93425e40/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/1 [00:00<?, ?it/s]

## Initialize the model

These are huggingface models that have been tested with our data set:

- distilbert-base-uncased-finetuned-sst-2-english
- microsoft/xtremedistil-l6-h384-uncased 
- roberta-base-openai-detector 
- textattack/distilbert-base-cased-SST-2 
- textattack/distilbert-base-uncased-SST-2 

In [3]:
tokenizer = AutoTokenizer.from_pretrained(
    # "distilbert-base-uncased-finetuned-sst-2-english",
    # "roberta-base-openai-detector",
    # "baykenney/bert-base-gpt2detector-random",
    # "textattack/distilbert-base-uncased-SST-2",
    "huwendeng/distilroberta_b",
    cache_dir="data/transformers_cache"
)

def tokenize_function(examples):
    return tokenizer(examples["summary"], padding="max_length", truncation=True)

tokenized_df = df.map(tokenize_function, batched=True)

Loading cached processed dataset at /home/sebastien/.cache/huggingface/datasets/csv/default-0b9c419e93425e40/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-938d8004706f71b3.arrow


In [4]:
tokenized_df_train = tokenized_df["train"].shuffle(seed=42).select(range(7000))
tokenized_df_eval = tokenized_df["train"].shuffle(seed=42).select(range(7001, 8000))

Loading cached shuffled indices for dataset at /home/sebastien/.cache/huggingface/datasets/csv/default-0b9c419e93425e40/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-5ffb12dee33fed2a.arrow
Loading cached shuffled indices for dataset at /home/sebastien/.cache/huggingface/datasets/csv/default-0b9c419e93425e40/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-5ffb12dee33fed2a.arrow


In [5]:
model = AutoModelForSequenceClassification.from_pretrained(
    # "distilbert-base-uncased-finetuned-sst-2-english",
    # "roberta-base-openai-detector",
    # "baykenney/bert-base-gpt2detector-random",
    # "textattack/distilbert-base-uncased-SST-2",
    "huwendeng/distilroberta_b",
    cache_dir="data/transformers_cache"
)

In [6]:
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

## Train the model

Do not hesitate to tweak the training parameters to achieve better scores!

In [7]:
training_args = TrainingArguments(
    output_dir="data/test_trainer", 
    num_train_epochs=5,
    evaluation_strategy="steps",
    weight_decay=0.01,             
    load_best_model_at_end=True,
    logging_steps=-1,
    eval_steps=500
)

In [8]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_df_train,
    eval_dataset=tokenized_df_eval,
    compute_metrics=compute_metrics,
)

In [9]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: summary, id, document. If summary, id, document are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 7000
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 4375


Step,Training Loss,Validation Loss,Accuracy
500,1.711,0.804055,0.845846
1000,0.0549,0.366867,0.873874
1500,0.0604,0.27778,0.87988
2000,0.5841,0.353295,0.886887
2500,0.0026,0.44794,0.897898
3000,0.0014,0.660449,0.882883
3500,0.2368,0.685001,0.88989
4000,0.0013,0.836651,0.885886


The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: summary, id, document. If summary, id, document are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 999
  Batch size = 8
Saving model checkpoint to data/test_trainer/checkpoint-500
Configuration saved in data/test_trainer/checkpoint-500/config.json
Model weights saved in data/test_trainer/checkpoint-500/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: summary, id, document. If summary, id, document are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 999
  Batch size = 8
Saving model checkpoint to data/test_trainer/checkpoint-1000
C

TrainOutput(global_step=4375, training_loss=0.33573853277868143, metrics={'train_runtime': 1761.8925, 'train_samples_per_second': 19.865, 'train_steps_per_second': 2.483, 'total_flos': 4636358952960000.0, 'train_loss': 0.33573853277868143, 'epoch': 5.0})