In [2]:
!pip install evaluate --quiet

[0m

In [36]:
from datasets import load_dataset
from nltk import sent_tokenize, word_tokenize
import random
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import RobertaTokenizer, RobertaForSequenceClassification, TrainingArguments, Trainer
import os
import torch
import evaluate
import numpy as np

In [4]:
data = load_dataset('onestop_english')

Downloading builder script:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

Downloading and preparing dataset onestop_english/default (download: 1.17 MiB, generated: 2.17 MiB, post-processed: Unknown size, total: 3.34 MiB) to /root/.cache/huggingface/datasets/onestop_english/default/1.1.0/6b19eec5680862ad1cf1990e98b06a98d1fa4c85f3585dc4dfab93f52b89d9cf...


Downloading data:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/567 [00:00<?, ? examples/s]

Dataset onestop_english downloaded and prepared to /root/.cache/huggingface/datasets/onestop_english/default/1.1.0/6b19eec5680862ad1cf1990e98b06a98d1fa4c85f3585dc4dfab93f52b89d9cf. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [5]:
data

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 567
    })
})

In [37]:
sentences_hard = []
sentences_simple = []
for i in range(len(data['train'])):
    if data['train'][i]['label'] == 2:
        for sentence in sent_tokenize(data['train'][i]['text']):
            sentences_hard.append(' '.join(word_tokenize(sentence.lower())))
    elif data['train'][i]['label'] == 0:
        for sentence in sent_tokenize(data['train'][i]['text']):
            sentences_simple.append(' '.join(word_tokenize(sentence.lower())))

In [7]:
random.shuffle(sentences_hard)
random.shuffle(sentences_simple)

In [40]:
sentences_hard_train = sentences_hard[1001:len(sentences_hard)]
sentences_hard_test = sentences_hard[0:1000]

sentences_simple_train = sentences_simple[1001:len(sentences_simple)]
sentences_simple_test = sentences_simple[0:1000]

In [9]:
def collect_dataset(sentences_hard, sentences_simple):
    dataset = []
    for sentence_hard in sentences_hard:
        dataset.append({'text': sentence_hard, 'label': 1})
    for sentence_simple in sentences_simple:
        dataset.append({'text': sentence_simple, 'label': 0})
    random.shuffle(dataset)
    return dataset

In [41]:
train_dataset = pd.DataFrame.from_dict(collect_dataset(sentences_hard_train, sentences_simple_train))
test_dataset = pd.DataFrame.from_dict(collect_dataset(sentences_hard_test, sentences_simple_test))

In [42]:
datasets_train_test = DatasetDict({
    "train": Dataset.from_pandas(train_dataset),
    "test": Dataset.from_pandas(test_dataset)
    })

In [15]:
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
device = torch.device('cuda:0')
device = torch.device('cpu')

In [16]:
model = RobertaForSequenceClassification.from_pretrained('roberta-base')
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classi

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

In [17]:
model.to(device);

In [18]:
def tokenization(text):
    return tokenizer(text["text"], padding='max_length', truncation=True)

In [43]:
datasets_train_test = datasets_train_test.map(tokenization, batched=True)

  0%|          | 0/11 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [44]:
datasets_train_test

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 10109
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
})

In [45]:
datasets_train_test = datasets_train_test.class_encode_column('label')
datasets_train_test.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

Stringifying the column:   0%|          | 0/11 [00:00<?, ?ba/s]

Casting to class labels:   0%|          | 0/11 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/2 [00:00<?, ?ba/s]

Stringifying the column:   0%|          | 0/2 [00:00<?, ?ba/s]

Casting to class labels:   0%|          | 0/2 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

In [28]:
clf_metrics = evaluate.combine(["accuracy"])

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return clf_metrics.compute(predictions=predictions, references=labels)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [46]:
training_args = TrainingArguments(
    output_dir='results',
    save_total_limit=2,
    load_best_model_at_end=True,
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    warmup_steps=10
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [47]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=datasets_train_test['train'],
    eval_dataset=datasets_train_test['test'],
    compute_metrics = compute_metrics
)

In [31]:
os.environ["WANDB_DISABLED"] = "true"

In [48]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10109
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1264
  Number of trainable parameters = 124647170


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5041,0.454365,0.8105
2,0.4506,0.385208,0.834


The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 32
Saving model checkpoint to results/checkpoint-632
Configuration saved in results/checkpoint-632/config.json
Model weights saved in results/checkpoint-632/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 32
Saving model checkpoint to results/checkpoint-1264
Configuration saved in results/checkpoint-1264/config.json
Model weights saved in results/checkpoint-1264/p

TrainOutput(global_step=1264, training_loss=0.4576151702977434, metrics={'train_runtime': 1891.6068, 'train_samples_per_second': 10.688, 'train_steps_per_second': 0.668, 'total_flos': 5319579317268480.0, 'train_loss': 0.4576151702977434, 'epoch': 2.0})