In [1]:
!pip install accelerate -U
!pip install transformers[torch]



In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [3]:
checkpoint = "microsoft/xtremedistil-l6-h256-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/xtremedistil-l6-h256-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
import torch, torch.nn as nn
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = model.to(device)

In [5]:
import pandas as pd

data_file_path = "/content/drive/MyDrive/Colab Notebooks/sentiment-ONNX/datasets/polls_sentiments.csv"
data_frame = pd.read_csv(data_file_path)
print(data_frame.head())

                             Visitor ID                 Date  \
0  002ffa2f-3b44-4d73-8f1f-482b36619e4e  2023-10-12 16:01:04   
1  0036d1e6-f7e1-4d37-a679-d1430083c0d8  2023-04-18 16:30:05   
2  0039c9c5-b962-472c-a611-12c68e71abe3  2023-05-04 11:08:30   
3  003aad10-4968-4df3-977a-e70d3dc292be  2023-06-19 16:19:57   
4  00436950-a784-423b-ab6b-7208b48a8ea7  2023-10-11 15:00:17   

                                            Response  \
0                          The ID portion was a pain   
1  I was pleasantly surprised. It was easy and we...   
2                       super easy - very nice agent   
3  I can't think of anything that you have to imp...   
4  Everything went well especially with pre-signi...   

  Most recent operating system (All Apps) Sentiment  Positive Score  \
0                                Mac OS X  NEGATIVE         0.03122   
1                                 Windows  POSITIVE         0.95270   
2                                 Windows  POSITIVE         0.950

In [6]:
sentiment_dict = {'NEGATIVE': 0, 'negative': 0, 'POSITIVE': 1, 'positive': 1}

data_frame = data_frame.dropna(subset=['Sentiment'])

data_frame['Sentiment'] = data_frame['Sentiment'].apply(lambda x: sentiment_dict[x])

data_frame[:10]

Unnamed: 0,Visitor ID,Date,Response,Most recent operating system (All Apps),Sentiment,Positive Score,Negative Score
0,002ffa2f-3b44-4d73-8f1f-482b36619e4e,2023-10-12 16:01:04,The ID portion was a pain,Mac OS X,0,0.03122,0.97935
1,0036d1e6-f7e1-4d37-a679-d1430083c0d8,2023-04-18 16:30:05,I was pleasantly surprised. It was easy and we...,Windows,0,0.9527,0.02145
2,0039c9c5-b962-472c-a611-12c68e71abe3,2023-05-04 11:08:30,super easy - very nice agent,Windows,0,0.95098,0.022
3,003aad10-4968-4df3-977a-e70d3dc292be,2023-06-19 16:19:57,I can't think of anything that you have to imp...,Windows,0,0.86841,0.07014
4,00436950-a784-423b-ab6b-7208b48a8ea7,2023-10-11 15:00:17,Everything went well especially with pre-signi...,Windows,0,0.95083,0.02201
5,00507fcc-7d56-49c5-8e96-b8dca6220a82,2023-06-05 19:22:53,well prepared,Windows,0,0.95025,0.02224
6,00597eab-d751-41c1-ad2e-b95b17213f92,2023-05-08 10:35:10,Give better notification/communication before ...,Mac OS X,0,0.09478,0.95567
7,0066d60a-09e5-49be-aa2f-f6bc4d908470,2023-04-10 21:51:05,Just not. Familiar with Chrome as we use diffe...,iOS,0,0.02781,0.9807
8,00690cb1-14aa-481d-bcb0-880261e8ec79,2022-11-18 18:18:30,Thank you,Mac OS X,0,0.94889,0.02275
9,0072da4f-07be-4c96-ad16-eacb9654a53b,2023-03-23 14:04:59,i was not able to sign in,Mac OS X,0,0.02589,0.98122


In [7]:
!pip install datasets



In [8]:
import datasets
raw_datasets = datasets.Dataset.from_pandas(data_frame)
raw_datasets

Dataset({
    features: ['Visitor ID', 'Date', 'Response', 'Most recent operating system (All Apps)', 'Sentiment', 'Positive Score', 'Negative Score', '__index_level_0__'],
    num_rows: 5520
})

In [9]:
def tokenize_function(examples):
  return {
      'input_ids': tokenizer(examples["Response"], truncation=True)["input_ids"],
      'labels': examples["Sentiment"]
      }

In [10]:
from transformers import DataCollatorWithPadding

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

data_collator = DataCollatorWithPadding(tokenizer)

tokenized_datasets = tokenized_datasets.train_test_split(test_size=0.2)

tokenized_datasets

Map:   0%|          | 0/5520 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


DatasetDict({
    train: Dataset({
        features: ['Visitor ID', 'Date', 'Response', 'Most recent operating system (All Apps)', 'Sentiment', 'Positive Score', 'Negative Score', '__index_level_0__', 'input_ids', 'labels'],
        num_rows: 4416
    })
    test: Dataset({
        features: ['Visitor ID', 'Date', 'Response', 'Most recent operating system (All Apps)', 'Sentiment', 'Positive Score', 'Negative Score', '__index_level_0__', 'input_ids', 'labels'],
        num_rows: 1104
    })
})

In [11]:
tokenized_datasets['train'][0]

{'Visitor ID': '125de30f-2fec-4caa-b67e-6fa31f6befc3',
 'Date': '2022-12-03 10:52:07',
 'Response': 'Very easy to use!',
 'Most recent operating system (All Apps)': 'Windows',
 'Sentiment': 0,
 'Positive Score': 0.95056,
 'Negative Score': 0.02214,
 '__index_level_0__': 383,
 'input_ids': [101, 2200, 3733, 2000, 2224, 999, 102],
 'labels': 0}

In [12]:
def compute_metrics(eval_preds):
  metric = datasets.load_metric("accuracy")
  logits, labels = eval_preds
  predictions = np.argmax(logits, axis=-1)
  return metric.compute(predictions=predictions, references=labels)

In [13]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments("test-trainer",
                                  per_device_train_batch_size=8,
                                  per_device_eval_batch_size=8,
                                  num_train_epochs=10,
                                  learning_rate=2e-5,
                                  weight_decay=0.01,
                                  )

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [14]:
trainer.train()

Step,Training Loss
500,0.1388
1000,0.0077
1500,0.0027
2000,0.0015
2500,0.0009
3000,0.0006
3500,0.0005
4000,0.0004
4500,0.0003
5000,0.0003


TrainOutput(global_step=5520, training_loss=0.013947973193506292, metrics={'train_runtime': 175.1792, 'train_samples_per_second': 252.085, 'train_steps_per_second': 31.511, 'total_flos': 39110164238496.0, 'train_loss': 0.013947973193506292, 'epoch': 10.0})

In [15]:
!pip install numpy




In [16]:
import numpy as np
predictions = trainer.predict(tokenized_datasets["test"])
print(predictions.predictions.shape, predictions.label_ids.shape)

metric = datasets.load_metric("accuracy")
preds = np.argmax(predictions.predictions, axis=-1)
metric.compute(predictions=preds, references=predictions.label_ids)

  metric = datasets.load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

(1104, 2) (1104,)


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'accuracy': 1.0}

In [17]:
trainer.evaluate(tokenized_datasets["test"])

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'eval_loss': 0.00022372820239979774,
 'eval_accuracy': 1.0,
 'eval_runtime': 2.2743,
 'eval_samples_per_second': 485.426,
 'eval_steps_per_second': 60.678,
 'epoch': 10.0}

In [18]:
tuned_model_path = "/content/drive/MyDrive/Colab Notebooks/sentiment-ONNX/tuned_xtreme_model"

model.save_pretrained(tuned_model_path)
tokenizer.save_pretrained(tuned_model_path)

('/content/drive/MyDrive/Colab Notebooks/sentiment-ONNX/tuned_xtreme_model/tokenizer_config.json',
 '/content/drive/MyDrive/Colab Notebooks/sentiment-ONNX/tuned_xtreme_model/special_tokens_map.json',
 '/content/drive/MyDrive/Colab Notebooks/sentiment-ONNX/tuned_xtreme_model/vocab.txt',
 '/content/drive/MyDrive/Colab Notebooks/sentiment-ONNX/tuned_xtreme_model/added_tokens.json',
 '/content/drive/MyDrive/Colab Notebooks/sentiment-ONNX/tuned_xtreme_model/tokenizer.json')