In [1]:
!pip install datasets
!pip install accelerate -U
!pip install pip install transformers[torch]


Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datasets

In [2]:
from datasets import load_dataset, load_metric
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer, DataCollatorWithPadding
import numpy as np


In [22]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [23]:
url = 'https://docs.google.com/spreadsheets/d/1NW4dkPRsOzBdnOMMALy5So8WsS6nef5EK6SYRSNFDGk/export?format=csv'

ds = load_dataset('csv', data_files=url)
ds = ds["train"].train_test_split(test_size=0.2)
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 7999
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [24]:
model_name = 'distilbert-base-uncased'
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model_tokenizer = AutoTokenizer.from_pretrained(model_name)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
def tokenize(examples):
  outputs = model_tokenizer(examples["text"], truncation=True, padding=True)
  return outputs

tokenizer_ds = ds.map(tokenize, batched=True)
data_collator = DataCollatorWithPadding(model_tokenizer)

Map:   0%|          | 0/7999 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [28]:
def compute_metrics(eval_preds):
  metric = load_metric("accuracy")
  logits, labels = eval_preds
  predictions = np.argmax(logits, axis=-1)
  return metric.compute(predictions=predictions, references=labels)

path = F"/content/drive/My Drive/distilber-dana-mini"
training_args = TrainingArguments(num_train_epochs=1,
                                  output_dir=path,
                                  push_to_hub=False,
                                  per_device_train_batch_size=32,
                                  per_device_eval_batch_size=32,
                                  learning_rate=3e-4,
                                  evaluation_strategy="epoch")

trainer = Trainer(model=model, tokenizer=model_tokenizer,
                  data_collator= data_collator,
                  args=training_args,
                  train_dataset = tokenizer_ds["train"],
                  eval_dataset = tokenizer_ds["test"],
                  compute_metrics=compute_metrics)



In [29]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.240411,0.897


  metric = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

TrainOutput(global_step=250, training_loss=0.323573974609375, metrics={'train_runtime': 162.5845, 'train_samples_per_second': 49.199, 'train_steps_per_second': 1.538, 'total_flos': 471425599277904.0, 'train_loss': 0.323573974609375, 'epoch': 1.0})

In [30]:
trainer.save_model()