In [None]:
import pandas as pd
import numpy as np
import torch
from torch import cuda
import random
import os
import torch
from torch import nn
from transformers import Trainer
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
from sklearn.utils.class_weight import compute_class_weight
import datasets
from datasets import Dataset, load_metric
import transformers
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding

# Record the library versions this notebook was run with.
for lib in (torch, transformers):
    print(lib.__version__)

2022-04-28 11:52:38.963786: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


1.9.0+cu111
4.17.0


In [None]:
# Labelled training data: keep only the tweet text and its target, and expose
# the target under the column name `label`, which the later cells rely on.
train_path = './nlp-getting-started/train.csv'
train_data = pd.read_csv(train_path)
train_data = train_data[['text', 'target']].rename(columns={"target": "label"})

# Unlabelled test data. This previously re-read train.csv, so "test"
# predictions were made on training tweets.
# NOTE(review): assumes test.csv sits next to train.csv, as in the Kaggle download.
test_path = './nlp-getting-started/test.csv'
test_data = pd.read_csv(test_path)
# Only the text is kept; there is no target column to rename here.
test_data = test_data[['text']].copy()

In [None]:
# Hold out 10% of the labelled data as a dev set, stratified on the label so
# both splits keep the same class balance. A fixed random_state makes the split
# (and therefore every downstream metric) reproducible across kernel restarts.
train_data, dev_data = train_test_split(
    train_data,
    test_size=0.1,
    shuffle=True,
    stratify=train_data['label'],
    random_state=42,
)

print("Train dataset labels count = ", Counter(train_data['label']))
print("Dev dataset labels count = ", Counter(dev_data['label']))
# The test dataset has no target column, so there is no label count to report for it.

Train dataset labels count =  Counter({0: 3907, 1: 2944})
Dev dataset labels count =  Counter({0: 435, 1: 327})


In [None]:
# Pretrained checkpoint shared by the tokenizer and the classification model.
model_checkpoint = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=False)

# Use the GPU when one is present, otherwise fall back to CPU so the notebook
# still runs (slowly) on machines without CUDA instead of crashing here.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2).to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'classifier

In [None]:
def preprocess_function(examples):
    """Tokenize the ``text`` field of ``examples`` with truncation enabled.

    Uses the module-level ``tokenizer`` and returns whatever it produces for
    the given texts.
    """
    texts = examples['text']
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [None]:
# Wrap each pandas frame in a `datasets.Dataset` (train, dev, test, in that order).
train_data, dev_data, test_data = [
    Dataset.from_pandas(frame) for frame in (train_data, dev_data, test_data)
]

# Tokenize every split in batches with the shared preprocessing function.
encoded_dataset_train, encoded_dataset_dev, encoded_dataset_test = [
    split.map(preprocess_function, batched=True)
    for split in (train_data, dev_data, test_data)
]

  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/8 [00:00<?, ?ba/s]

In [None]:
# Columns passed to set_format for each split; the test split has no `label`.
columns_to_return = ['input_ids', 'label', 'attention_mask']
columns_to_return_test = ['input_ids', 'attention_mask']

for labelled_split in (encoded_dataset_train, encoded_dataset_dev):
    labelled_split.set_format(columns=columns_to_return)
encoded_dataset_test.set_format(columns=columns_to_return_test)

In [None]:
# Run configuration: batch size, the metric used to pick the best checkpoint,
# and the pieces of the output directory name.
batch_size = 8
metric_name = "f1"
model_name = model_checkpoint.rsplit("/", 1)[-1]  # drop any "org/" prefix
task = 'tweet'

args = TrainingArguments(
    output_dir=f"./save_model/{model_name}-finetuned-{task}",
    # Evaluate and checkpoint once per epoch; with load_best_model_at_end the
    # checkpoint scoring best on `metric_name` is restored after training.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    # Optimisation settings.
    num_train_epochs=4,
    learning_rate=1e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=False,
)

In [None]:
metric = load_metric('f1')


def compute_metrics(eval_pred):
    """Return the macro-averaged F1 for an evaluation batch.

    ``eval_pred`` unpacks into ``(scores, labels)``; the predicted class is the
    argmax of ``scores`` along axis 1.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return metric.compute(
        predictions=predicted_classes,
        references=references,
        average='macro',
    )

In [None]:
# Standard Trainer: trains on the encoded train split, evaluates on the dev
# split, and reports the metrics returned by `compute_metrics`.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset_train,
    eval_dataset=encoded_dataset_dev,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the model; evaluation and checkpointing follow the settings in `args`.
trainer.train()

In [None]:
# Run inference on the test split and take the highest-scoring class per example.
predictions = trainer.predict(encoded_dataset_test)
preds = np.argmax(predictions.predictions, axis=-1)

# Print up to the first 100 examples alongside their predicted class.
# The text column is fetched once (not once per row), and slicing keeps this
# safe when the test split has fewer than 100 rows.
test_texts = encoded_dataset_test['text']
for text, pred in zip(test_texts[:100], preds[:100]):
    print(text, pred, sep='\t')

### Optional: fine-tune with custom class weights

In [None]:
# Derive "balanced" class weights from the training label distribution.
train_labels = encoded_dataset_train['label']
label_counts = np.bincount(train_labels)
print(label_counts)

class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_labels),
    y=list(train_labels),
)
print(class_weights)

[3907 2944]
[0.87675966 1.16355299]


In [None]:
class CustomTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the global ``class_weights``."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute class-weighted cross-entropy for one batch.

        Args:
            model: The model to run the forward pass with.
            inputs: Batch mapping passed to the model; must contain ``"labels"``.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just the loss.
            num_items_in_batch: Accepted and ignored, so the override also works
                with newer transformers releases that pass this argument.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs`` is set.
        """
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Build the weight tensor on the same device as the logits rather than
        # hard-coding 'cuda', so this also works on CPU or a non-default GPU.
        weight = torch.tensor(class_weights, dtype=torch.float, device=logits.device)
        loss_fct = nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [None]:
# Start the class-weighted run from a fresh pretrained checkpoint. Reusing
# `model` here would continue training the weights already fine-tuned by the
# unweighted run above, making the two runs impossible to compare.
weighted_model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)

# NOTE: `args` is shared with the first run, so checkpoints are written to the
# same output directory.
trainer = CustomTrainer(
    weighted_model,
    args,
    train_dataset=encoded_dataset_train,
    eval_dataset=encoded_dataset_dev,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
# Run training again, this time through CustomTrainer's overridden compute_loss.
trainer.train()