### A few functions taken from "Natural Language Processing with Transformers: Building Language Applications with Hugging Face"

This notebook demonstrates how to fine-tune a transformer model. I used only the basic DistilBERT model, without any special tuning, and it performed well.

In [None]:
import numpy as np
import pandas as pd
import torch
# Prefer the GPU when one is visible to torch; otherwise run everything on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

In [None]:
# Training frame, indexed by the first CSV column. The "target" column is renamed
# to "label" so that transformers recognizes it as the label.
X = (
    pd.read_csv("/kaggle/input/nlp-getting-started/train.csv", index_col=0)
    .rename(columns={"target": "label"})
)
# The test frame has no ground truth; a placeholder label of 0 gives it the same
# structure as the training frame.
X_test = (
    pd.read_csv("/kaggle/input/nlp-getting-started/test.csv", index_col=0)
    .assign(label=0)
)
# Template for the final submission file.
submission = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv", index_col=0)

print(X.head())
print(X_test.head())
print(submission.head())

In [None]:
# Simple validation: each row gets a uniform draw in [0, 1); rows whose draw falls
# below `val_pct` form the validation set and the rest form the training set.
val_pct = 0.3
# Use a seeded generator so the split (and therefore the validation metrics)
# is identical on every Restart & Run All instead of changing per run.
split_rng = np.random.default_rng(42)
train_test_roll = split_rng.uniform(size=X.shape[0])
X_train = X[train_test_roll >= val_pct]
X_val = X[train_test_roll < val_pct]

In [None]:
# Prepare data for ingestion
from datasets import Dataset, DatasetDict

# Wrap the "text" and "label" columns of each frame in a Dataset, then bundle the
# three splits under the keys used by the rest of the notebook.
dataset_train = Dataset.from_pandas(X_train[["text", "label"]])
dataset_val = Dataset.from_pandas(X_val[["text", "label"]])
dataset_test = Dataset.from_pandas(X_test[["text", "label"]])
dataset = DatasetDict(
    {
        "train": dataset_train,
        "val": dataset_val,
        "test": dataset_test,
    }
)
dataset

In [None]:
from transformers import AutoTokenizer

# Checkpoint name; the same value is passed to the tokenizer here and to the
# classification model loaded further down.
model_ckpt = "distilbert-base-uncased"
# Tokenizer matching the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [None]:
def tokenize(batch):
    """Tokenize the "text" entries of a batch with the notebook-level tokenizer.

    Args:
        batch: mapping with a "text" key holding the strings to encode.

    Returns:
        The tokenizer's output for those strings, with padding and
        truncation both enabled.
    """
    texts = batch["text"]
    return tokenizer(texts, padding=True, truncation=True)

In [None]:
# Tokenize every split by applying `tokenize` in batched mode.
# NOTE(review): with batch_size=None each split is presumably handed to `tokenize`
# as a single batch, so padding=True pads a whole split to one length — confirm
# against the datasets documentation.
dataset_encoded = dataset.map(tokenize, batched=True, batch_size=None)


In [None]:
# Display the encoded splits to check the columns added by tokenization.
dataset_encoded

In [None]:
# Extract the last hidden state at the first token position
def extract_hidden_states(batch):
    """Run the notebook-level `model` on a batch and return its first-token vectors.

    NOTE(review): this helper is not called anywhere in the visible notebook, and
    the `model` created further down is a sequence-classification model, whose
    output presumably has no `last_hidden_state` attribute — confirm before use.

    Args:
        batch: mapping of column name to tensor; only the entries whose key is
            listed in `tokenizer.model_input_names` are passed to the model.

    Returns:
        dict with a single "hidden_state" key holding a NumPy array, taken from
        position 0 along the second axis of the model's `last_hidden_state`.
    """
    # Keep only the model inputs and move them to the selected device
    accepted_keys = tokenizer.model_input_names
    inputs = {}
    for name, tensor in batch.items():
        if name in accepted_keys:
            inputs[name] = tensor.to(device)
    # Forward pass without gradient tracking
    with torch.no_grad():
        outputs = model(**inputs)
        last_hidden_state = outputs.last_hidden_state
    # Vector for the [CLS] token, moved back to the CPU as a NumPy array
    cls_vectors = last_hidden_state[:, 0]
    return {"hidden_state": cls_vectors.cpu().numpy()}

In [None]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    """Score a prediction object with accuracy and weighted F1.

    Args:
        pred: object exposing `label_ids` (true labels) and `predictions`
            (per-class scores; the predicted class is the argmax over the
            last axis).

    Returns:
        dict with keys "accuracy" and "f1" (F1 uses average="weighted").
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average="weighted"),
    }

In [None]:
# Prepare the encoded splits for the model: return the token ids, attention mask
# and label columns in "torch" format.
dataset_encoded.set_format("torch", columns=["input_ids", "attention_mask", "label"])


In [None]:
from transformers import AutoModelForSequenceClassification

# Two output classes, matching the 0/1 label column.
num_labels = 2
# Load the checkpoint with a classification head sized for `num_labels`,
# then move it to the selected device ready for fine-tuning.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=num_labels,
).to(device)

In [None]:
from transformers import Trainer, TrainingArguments

batch_size = 64
# Number of full batches in the training split; used as the logging interval.
logging_steps = len(dataset_encoded["train"]) // batch_size
model_name = f"{model_ckpt}-finetuned-disaster"
# Settings for the validation run: evaluate once per epoch, no external reporting.
training_args = TrainingArguments(
    output_dir=model_name,
    num_train_epochs=4,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    disable_tqdm=False,
    report_to="none",
    logging_steps=logging_steps,
    push_to_hub=False,
    log_level="error",
)

In [None]:
from transformers import Trainer

# Fine-tune on the training split and score on the held-out validation split
# with `compute_metrics`.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=dataset_encoded["train"],
    eval_dataset=dataset_encoded["val"],
    tokenizer=tokenizer,
)
trainer.train();

In [None]:
# Rebuild the dataset for the final fit: every labelled row becomes training data
# (no validation split), alongside the test rows.
dataset_train_full = Dataset.from_pandas(X[["text", "label"]])
dataset_test = Dataset.from_pandas(X_test[["text", "label"]])
dataset = DatasetDict(
    {
        "train": dataset_train_full,
        "test": dataset_test,
    }
)
dataset

In [None]:
# Tokenize the full-train and test splits; this rebinds `dataset_encoded`,
# replacing the encoded train/val/test splits used for the validation run.
dataset_encoded = dataset.map(tokenize, batched=True, batch_size=None)


In [None]:
# Return the token ids, attention mask and label columns in "torch" format,
# as was done for the validation run.
dataset_encoded.set_format("torch", columns=["input_ids", "attention_mask", "label"])


In [None]:
# Reload the checkpoint from scratch so the final fit does not continue from the
# model already fine-tuned during the validation run.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=num_labels,
).to(device)

In [None]:
model_name = f"{model_ckpt}-finetuned-disaster_full"
# Recompute the logging interval from the current (full) training split. The value
# left over from the validation run was derived from the smaller train split, so
# reusing it would no longer correspond to one log per pass over the data.
# The floor of 1 guards against a zero interval on very small datasets.
logging_steps = max(1, len(dataset_encoded["train"]) // batch_size)
# Same hyper-parameters as the validation run, minus the per-epoch evaluation:
# there is no held-out split in this final fit.
training_args = TrainingArguments(
    output_dir=model_name,
    num_train_epochs=4,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    disable_tqdm=False,
    report_to="none",
    logging_steps=logging_steps,
    push_to_hub=False,
    log_level="error",
)

In [None]:
from transformers import Trainer

# Final fit on every labelled row; no eval_dataset is supplied for this run.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=dataset_encoded["train"],
    tokenizer=tokenizer,
)
trainer.train();

In [None]:
# Run the trained model over the encoded test split.
# NOTE(review): the test "label" column is a placeholder of 0 (set when test.csv
# is loaded), so any metrics reported in this output are presumably not
# meaningful — only the predictions are used afterwards.
preds_output = trainer.predict(dataset_encoded["test"])
preds_output

In [None]:
# The predictions hold one score per class, so the predicted class is the argmax
# over the last axis — the same rule `compute_metrics` applies. Thresholding only
# the class-1 score at 0 ignores the class-0 score and can disagree with the argmax.
submission["target"] = preds_output.predictions.argmax(axis=-1).astype(int)
submission

In [None]:
# Write the submission frame (index included) to the working directory.
submission.to_csv("bert_finetuning.csv")