In [1]:
from datasets import load_dataset
import numpy as np

# imdb = load_dataset("imdb")

## Dataset

### SNIPS

In [32]:
from sklearn.preprocessing import LabelEncoder
from pathlib import Path
from urllib.request import urlretrieve
from datasets import Dataset,DatasetDict
import pandas as pd

# Base URL of the SNIPS data files on GitHub; each file name plus "?raw=true"
# is appended below to build the download URL.
SNIPS_DATA_BASE_URL = (
    "https://github.com/ogrisel/slot_filling_and_intent_detection_of_SLU/blob/"
    "master/data/snips/"
)

# Fetch each file into the working directory, skipping files already present
# so that re-running the cell does not download them again.
for filename in ("train", "valid", "test", "vocab.intent", "vocab.slot"):
    path = Path(filename)
    if path.exists():
        continue
    print(f"Downloading {filename}...")
    urlretrieve(f"{SNIPS_DATA_BASE_URL}{filename}?raw=true", path)

# One utterance per line; strip first so a trailing newline yields no empty entry.
lines_train = Path("train").read_text("utf-8").strip().splitlines()

def parse_line(line):
    """Turn one raw SNIPS line into an intent-classification record.

    The line is split on " <=> " into the annotated utterance and the intent
    label. Every whitespace-separated token of the utterance is cut at its
    last ":"; the part before it is kept as the word and the part after it
    is discarded, since only the text and the intent are used here.

    Args:
        line: A single line of a SNIPS split file.

    Returns:
        A dict with "label" (the intent string) and "text" (the words joined
        by single spaces).

    Raises:
        ValueError: If the line does not contain exactly one " <=> " separator.
    """
    utterance_data, intent_label = line.split(" <=> ")
    # Split on the LAST colon only, so a word that itself contains ":" is kept whole.
    words = [item.rsplit(":", 1)[0] for item in utterance_data.split()]
    return {
        "label": intent_label,
        "text": " ".join(words),
    }

# Training split: parse every line, then build a frame with "label" and "text" columns.
parsed = list(map(parse_line, lines_train))
df_train = pd.DataFrame(list(filter(lambda record: record is not None, parsed)))

# Validation and test splits are read from the files in the working directory.
lines_valid, lines_test = (
    Path(split_file).read_text("utf-8").strip().splitlines()
    for split_file in ("valid", "test")
)

df_valid = pd.DataFrame(list(map(parse_line, lines_valid)))
df_test = pd.DataFrame(list(map(parse_line, lines_test)))

# Fit the encoder ONCE, on the training intents, and reuse that single mapping
# for the other splits. Calling fit_transform on every split re-fits the encoder
# each time, so the same intent can receive a different id in train/test/valid
# whenever the splits do not contain exactly the same set of intents.
le = LabelEncoder()
df_train.label = le.fit_transform(df_train.label)
# transform() raises ValueError on an intent that was not seen in training,
# which surfaces a label mismatch instead of silently mis-numbering it.
df_test.label = le.transform(df_test.label)
df_valid.label = le.transform(df_valid.label)

# intent name -> integer id, as learned from the training split.
label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))

# integer id -> intent name (inverse of label_mapping).
id_mapping = {v: k for k, v in label_mapping.items()}

# Wrap each encoded DataFrame in a Hugging Face Dataset.
dataset_train, dataset_test, dataset_valid = (
    Dataset.from_pandas(frame) for frame in (df_train, df_test, df_valid)
)

# Bundle the three splits under the keys "train", "test" and "validation".
all_dataset = DatasetDict(
    {
        "train": dataset_train,
        "test": dataset_test,
        "validation": dataset_valid,
    }
)




### CLINC

In [3]:
# loading clinc dataset
# Load the "small" configuration of clinc_oos and expose its "intent" column
# as "label". This reassigns `all_dataset`, replacing whatever it held before.
all_dataset = load_dataset("clinc_oos", "small").rename_column("intent", "label")

# labels = clinc["train"].features["label"].names
# label2id = {labels[i] : i for i in range(len(labels))}
# id2label = {i: labels[i]  for i in range(len(labels))}

Found cached dataset clinc_oos (/work/pi_adrozdov_umass_edu/syerawar_umass_edu/hf_cache/datasets/clinc_oos/small/1.0.0/abcc41d382f8137f039adc747af44714941e8196e845dfbdd8ae7a7e020e6ba1)


  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
# Spot-check one test example to see the columns each record carries.
all_dataset["test"][100]

{'text': 'what does assiduous mean', 'label': 139}

In [6]:
from transformers import AutoTokenizer

# Tokenizer for the "roberta-base" checkpoint; the classification model below
# is loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

In [7]:
def preprocess_function(examples):
    """Tokenize the "text" field of `examples` with the module-level tokenizer.

    Truncation is enabled; no padding is requested here.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [8]:
# Tokenize every split in batches.
# NOTE(review): the variable is still called `tokenized_imdb`, but it holds
# whatever `all_dataset` currently is (not IMDB) — consider renaming everywhere.
tokenized_imdb = all_dataset.map(preprocess_function,batched= True)

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

Map:   0%|          | 0/3100 [00:00<?, ? examples/s]

Map:   0%|          | 0/5500 [00:00<?, ? examples/s]

In [9]:
# Display the tokenized splits with their features and row counts.
tokenized_imdb

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 7600
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 3100
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 5500
    })
})

In [10]:
# Sentinel output marking that the cells above have finished running.
print("Done")

Done


In [13]:
# tokenized_imdb["train"][1]
# Drop the raw "text" column and rename "label" to "labels" (the keyword used
# when a batch is unpacked into the model call), then return torch tensors.
tokenized_imdb = (
    tokenized_imdb
    .remove_columns(["text"])
    .rename_column("label", "labels")
)
tokenized_imdb.set_format("torch")

In [11]:
from transformers import DataCollatorWithPadding

# Collator that pads the examples of a batch using the tokenizer; it is handed
# to the Trainer below.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [12]:
import evaluate

# Accuracy metric object used by compute_metrics.
accuracy = evaluate.load("accuracy")

In [13]:
def compute_metrics(eval_pred):
    """Compute accuracy for an evaluation pass.

    `eval_pred` unpacks into (model scores, reference labels); the predicted
    class of each row is the argmax of its scores along axis 1.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [14]:
# Inspect the feature type of the "label" column of the training split.
type(all_dataset["train"].features['label'])

datasets.features.features.ClassLabel

In [15]:
from datasets import ClassLabel, Sequence

# Distinct label values that occur in the training split, in sorted order.
label_names = sorted(set(all_dataset["train"]["label"]))

# Generic names "Label_0", "Label_1", ... — one per distinct training label —
# mapped to and from their integer ids.
id2label = {idx: f"Label_{idx}" for idx in range(len(label_names))}
label2id = {name: idx for idx, name in id2label.items()}

In [16]:
# id2label = {0: "NEGATIVE", 1: "POSITIVE"}
# label2id = {"NEGATIVE": 0, "POSITIVE": 1}

from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Sequence-classification model loaded from "roberta-base" with one output
# per entry of label_names and the id/name maps built above.
model = AutoModelForSequenceClassification.from_pretrained(
    "roberta-base", num_labels=len(label_names), id2label=id2label, label2id=label2id
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier

In [17]:
# Hyper-parameters and bookkeeping options for the Trainer run below.
training_args = TrainingArguments(
    output_dir="my_awesome_model",  # directory passed to the Trainer for its outputs
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    # Evaluation and saving use the same "epoch" strategy.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,  # nothing is uploaded to the Hub by this run
)

In [18]:
import os
from getpass import getpass

from huggingface_hub.hf_api import HfFolder

# SECURITY: never write an access token into the notebook — it ends up in version
# control and in every shared copy. Any token that was ever pasted into this cell
# must be treated as leaked and revoked in the Hugging Face account settings.
# The token is read from the HF_TOKEN environment variable; when that is unset,
# it is asked for interactively without echoing it.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
HfFolder.save_token(hf_token)

In [53]:
from huggingface_hub import notebook_login
# Interactive Hugging Face login for the notebook.
# NOTE(review): push_to_hub is False in the TrainingArguments above, so this
# login is presumably optional for the training run — confirm before keeping it.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [19]:
# Trainer wiring: trains on the "train" split and evaluates on "validation";
# the "test" split is not passed to it.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_imdb["train"],
    eval_dataset=tokenized_imdb["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the Trainer configured above.
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


In [19]:
from torch.utils.data import DataLoader

# The examples were tokenized with truncation only (no padding), so sequences in
# a batch have different lengths and the default collate function cannot stack
# them. `data_collator` pads each batch to a common length.
train_loader = DataLoader(
    tokenized_imdb["train"].shuffle(seed=42),
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator,
)
# Evaluation does not depend on batch order, so the test loader keeps the fixed,
# seeded order instead of reshuffling on every pass.
test_loader = DataLoader(
    tokenized_imdb["test"].shuffle(seed=42),
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
)

In [20]:
import torch
from torch.optim import AdamW
from transformers import get_scheduler
from tqdm.auto import tqdm

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=5e-5)

# One scheduler step is taken per training batch, so the schedule spans
# every batch of every epoch.
num_epochs = 3
num_steps = num_epochs * len(train_loader)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_steps,
)

In [21]:
# Show which device the model currently lives on.
model.device

device(type='cuda', index=0)

In [15]:
# Manual training loop: one optimizer step and one scheduler step per batch.
progress_bar = tqdm(range(num_steps))
model.train()

for epoch in range(num_epochs):
    for batch in train_loader:
        # Move each tensor with `.to(device)`, which keeps its dtype. The legacy
        # `torch.Tensor(...)` constructor is not a copy/move helper and must not
        # be used on integer tensors such as input ids and labels.
        bt = {k: v.to(device) for k, v in batch.items()}
        out = model(**bt)
        loss = out.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        # Clear gradients so they do not accumulate into the next batch.
        optimizer.zero_grad()
        progress_bar.update(1)

100%|██████████| 4689/4689 [57:16<00:00,  1.59it/s]

In [20]:
# Accuracy over the test loader, accumulated batch by batch.
metric = evaluate.load("accuracy")
model.eval()
for batch in test_loader:
    # `.to(device)` keeps each tensor's dtype; the legacy `torch.Tensor(...)`
    # constructor must not be used on integer tensors such as input ids.
    bt = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        out = model(**bt)

    logits = out.logits
    # Predicted class = index of the largest logit in each row.
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
metric.compute()

{'accuracy': 0.93396}

In [28]:
# Classify a single sentence and show the configured name of the predicted class.
text = "This was fine."

# Tokenize into PyTorch tensors and move them to the model's device.
inputs = tokenizer(text, return_tensors="pt").to(device)

with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits

# Index of the largest logit, as a plain Python int.
predicted_class_id = logits.argmax().item()
model.config.id2label[predicted_class_id]

'POSITIVE'

In [29]:
# Persist the fine-tuned model.
# NOTE(review): the path says "bert_imdb", but the model above is loaded from
# "roberta-base" and the data is clinc_oos — consider a name that matches.
model.save_pretrained("./bert_imdb_3_1e5.pt")