In [1]:
%pip install comet_ml torch datasets transformers scikit-learn

[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
import comet_ml
from datasets import load_dataset

from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import AutoTokenizer, Trainer, TrainingArguments, AutoModelForSequenceClassification, DataCollatorWithPadding

comet_ml.init(project_name = "Hugging Face Text Classification")

df = load_dataset("imdb")
print(df)

  from .autonotebook import tqdm as notebook_tqdm
2023-08-31 18:47:45.014077: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-08-31 18:47:45.062828: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [3]:
PRE_TRAINED_MODEL_NAME = "distilbert-base-uncased"
random_seed = 42

In [4]:
tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

def tokenize_function(data):
    return tokenizer(data["text"], padding="max_length", truncation=True)


tokenize_df = df.map(tokenize_function, batched=True)
train_df = tokenize_df["train"].shuffle(seed=random_seed).select(range(200))
test_df = tokenize_df["test"].shuffle(seed=random_seed).select(range(100))

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map: 100%|██████████| 25000/25000 [00:05<00:00, 4209.35 examples/s]


In [5]:
model = AutoModelForSequenceClassification.from_pretrained(
    PRE_TRAINED_MODEL_NAME, num_labels=2
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
training_arguments = TrainingArguments(
    seed=random_seed,
    optim="adamw_torch",
    learning_rate=5e-5,
    num_train_epochs=10,
    output_dir="./results",
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=25,
    save_strategy="steps",
    save_total_limit=10,
    save_steps=25
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [7]:
def compute_metrics(pred):

    #get global experiments
    experiment = comet_ml.get_global_experiment()

    #get y_true and y_preds for eval_dataset
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    #compute precision, recall, and F1 score
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='macro')

    #compute accuracy score
    acc = accuracy_score(labels, preds)

    #log confusion matrix
    if experiment:
        epoch = int(experiment.curr_epoch) if experiment.curr_epoch is not None else 0
        experiment.set_epoch(epoch)
        experiment.log_confusion_matrix(
            y_true=labels,
            y_predicted=preds,
            labels=["negative", "postive"]
        )

    return {"accuracy": acc,
            "f1": f1,
            "precision": precision,
            "recall": recall
            }

In [8]:
%env COMET_MODE=ONLINE
%env COMET_LOG_ASSETS=TRUE
trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=train_df,
    eval_dataset=test_df,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)
trainer.train()

env: COMET_MODE=ONLINE
env: COMET_LOG_ASSETS=TRUE


[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/var/wd_smit/localdata' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/smit-shah-x38/hugging-face-text-classification/cb41b4d1429a46deb138456d6265238f

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
25,No log,0.607352,0.7,0.655331,0.819277,0.680851
50,No log,0.482981,0.8,0.799277,0.815626,0.806503
75,No log,0.55636,0.82,0.818182,0.822113,0.816941
100,No log,0.898055,0.8,0.79992,0.80072,0.801686
125,No log,0.806516,0.84,0.838969,0.840404,0.838218
150,No log,0.892161,0.82,0.819712,0.819528,0.820554
175,No log,0.933429,0.82,0.819712,0.819528,0.820554
200,No log,0.952241,0.82,0.819712,0.819528,0.820554
225,No log,0.961955,0.82,0.819712,0.819528,0.820554
250,No log,0.965012,0.82,0.819712,0.819528,0.820554


[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/smit-shah-x38/hugging-face-text-classification/cb41b4d1429a46deb138456d6265238f
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     epoch [11]                     : (1.0, 10.0)
[1;38;5;39mCOMET INFO:[0m     eval/accuracy [10]             : (0.7, 0.84)
[1;38;5;39mCOMET INFO:[0m     eval/f1 [10]                   : (0.6553308823529411, 0.8389694041867954)
[1;38;5;39mCOMET INFO:[0m     eval/loss [10]                 : (0.4829811453819275, 0.9650123715400696)
[1;38;5;39mCOMET INFO:[0m     eval

TrainOutput(global_step=250, training_loss=0.11192475128173827, metrics={'train_runtime': 337.6673, 'train_samples_per_second': 5.923, 'train_steps_per_second': 0.74, 'total_flos': 264934797312000.0, 'train_loss': 0.11192475128173827, 'epoch': 10.0})