In [27]:
import comet_ml
from comet_ml import Experiment
import torch
import random

In [25]:
# Building an experiment with your API key
experiment = Experiment(
    api_key="IkRq4zmkwF7SO5EiZAG4UjEVQ",
    project_name="CLINC",
    workspace="gdhanania",
)

# Setting hyperparameters
hyper_params = {
    "seed" : 34,
    "learning_rate" : 2e-5,
    "per_device_train_batch_size" : 5,
    "per_device_eval_batch_size" : 5,
    "num_train_epochs" : 10,
    "weight_decay" : 0.1,
    "dataset_subset" : "small"}

# Logging hyperparamters
experiment.log_parameters(hyper_params)

COMET INFO: ---------------------------
COMET INFO: Comet.ml Experiment Summary
COMET INFO: ---------------------------
COMET INFO:   Data:
COMET INFO:     display_summary_level : 1
COMET INFO:     url                   : https://www.comet.com/gdhanania/clinc/98101069246a41598441c13cef34a203
COMET INFO:   Parameters:
COMET INFO:     dataset_subset              : small
COMET INFO:     learning_rate               : 2e-05
COMET INFO:     num_train_epochs            : 10
COMET INFO:     per_device_eval_batch_size  : 5
COMET INFO:     per_device_train_batch_size : 5
COMET INFO:     seed                        : foo
COMET INFO:     weight_decay                : 0.1
COMET INFO:   Uploads:
COMET INFO:     conda-environment-definition : 1
COMET INFO:     conda-info                   : 1
COMET INFO:     conda-specification          : 1
COMET INFO:     environment details          : 1
COMET INFO:     filename                     : 1
COMET INFO:     git metadata                 : 1
COMET INFO:    

In [29]:
torch.manual_seed(hyper_params['seed'])
random.seed(hyper_params['seed'])

In [30]:
# NOTE: Based on your requirements, make changes to the variables:  checkpoints_out_dir, dataset_subset
# model and checkpoints_out_dir directory
L_Model = "roberta-base"
checkpoints_out_dir = "../checkpoints/clinc_small"
# dataset
dataset_name = 'clinc_oos'
dataset_subset = hyper_params['dataset_subset']
# device
device = 'cuda:0'

In [31]:
# load train and validation dataset
from datasets import load_dataset
dataset = load_dataset(dataset_name, dataset_subset)
print(dataset)
dataset = dataset.rename_column("intent", "label")
train_data, valid_data = dataset['train'], dataset['validation']

Found cached dataset clinc_oos (/work/pi_adrozdov_umass_edu/gdhanania_umass_edu/hf_cache/datasets/clinc_oos/small/1.0.0/abcc41d382f8137f039adc747af44714941e8196e845dfbdd8ae7a7e020e6ba1)
100%|██████████| 3/3 [00:00<00:00, 153.82it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'intent'],
        num_rows: 7600
    })
    validation: Dataset({
        features: ['text', 'intent'],
        num_rows: 3100
    })
    test: Dataset({
        features: ['text', 'intent'],
        num_rows: 5500
    })
})





In [32]:
# generating labels

labels = train_data.features["label"].names
label2id = {labels[i] : i for i in range(len(labels))}
id2label = {i: labels[i]  for i in range(len(labels))}

In [33]:
# preprocessing: 
# convert text --> ids
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [34]:
# Initialise tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(L_Model)

In [35]:
# Tokenize train and validation dataset
train_data = train_data.map(preprocess_function, batched=True)
valid_data = valid_data.map(preprocess_function, batched=True)

Loading cached processed dataset at /work/pi_adrozdov_umass_edu/gdhanania_umass_edu/hf_cache/datasets/clinc_oos/small/1.0.0/abcc41d382f8137f039adc747af44714941e8196e845dfbdd8ae7a7e020e6ba1/cache-2e95d7b0c2fff21f.arrow
                                                                  

In [36]:
# data collator to form a batch from list of training dataset
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [37]:
# Evaluate metrics
import evaluate
import numpy as np

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis= 1)
    return accuracy.compute(predictions=predictions, references=labels)

In [38]:
# Define model
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    L_Model, num_labels=len(labels), id2label=id2label, label2id=label2id, return_dict=True)

# load the model into GPU
model = model.to(device)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classi

In [39]:
model.device

device(type='cuda', index=0)

In [40]:
# define training arguments
training_args = TrainingArguments(
    output_dir=checkpoints_out_dir,
    learning_rate=hyper_params['learning_rate'],
    per_device_train_batch_size=hyper_params['per_device_train_batch_size'],
    per_device_eval_batch_size=hyper_params['per_device_eval_batch_size'],
    num_train_epochs=hyper_params['num_train_epochs'],
    weight_decay=hyper_params['weight_decay'],
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

# define trainer
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_data,
    eval_dataset = valid_data,
    tokenizer = tokenizer,
    data_collator = data_collator,
    compute_metrics = compute_metrics,
)

# Train model
trainer.train()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.8833,1.274216,0.833226
2,0.4166,0.380624,0.932903
3,0.1039,0.296894,0.943226
4,0.0482,0.31518,0.948065
5,0.0254,0.305236,0.952258
6,0.0253,0.338674,0.948065
7,0.0123,0.318732,0.953226
8,0.0068,0.326316,0.954194
9,0.0033,0.334304,0.952581
10,0.005,0.322711,0.955806


TrainOutput(global_step=15200, training_loss=0.40196096307941176, metrics={'train_runtime': 1513.6347, 'train_samples_per_second': 50.21, 'train_steps_per_second': 10.042, 'total_flos': 611751420952770.0, 'train_loss': 0.40196096307941176, 'epoch': 10.0})

In [42]:
experiment.end()

COMET INFO: ---------------------------
COMET INFO: Comet.ml Experiment Summary
COMET INFO: ---------------------------
COMET INFO:   Data:
COMET INFO:     display_summary_level : 1
COMET INFO:     url                   : https://www.comet.com/gdhanania/clinc/fc46937cf28d496386168027db55d709
COMET INFO:   Parameters:
COMET INFO:     dataset_subset              : small
COMET INFO:     learning_rate               : 2e-05
COMET INFO:     num_train_epochs            : 10
COMET INFO:     per_device_eval_batch_size  : 5
COMET INFO:     per_device_train_batch_size : 5
COMET INFO:     seed                        : 34
COMET INFO:     weight_decay                : 0.1
COMET INFO:   Uploads:
COMET INFO:     conda-environment-definition : 1
COMET INFO:     conda-info                   : 1
COMET INFO:     conda-specification          : 1
COMET INFO:     environment details          : 1
COMET INFO:     filename                     : 1
COMET INFO:     git metadata                 : 1
COMET INFO:     