# DistilBERT Endianness Classifier Training

## Setup

In [None]:
# Install the libraries this notebook imports into the kernel's environment.
# `-U` upgrades the named package to the newest available release.
# NOTE(review): no versions are pinned, so a fresh run may pick up different
# releases than the ones that produced the saved outputs — consider pinning.
%pip install transformers[torch] -U
%pip install accelerate -U
%pip install datasets
%pip install evaluate

Collecting datasets
  Using cached datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Using cached pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Using cached xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Using cached multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Using cached datasets-3.0.0-py3-none-any.whl (474 kB)
Using cached dill-0.3.8-py3-none-any.whl (116 kB)
Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m55.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134

### Check Device

In [None]:
import torch

# Select the compute device: the GPU when CUDA is usable, the CPU otherwise
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Device:', device)

Device: cuda


## Load Dataset and prepare data

In [None]:
from datasets import load_dataset, Dataset
import pandas as pd

In [None]:
# Fetch the endianness dataset (train and test splits) from the Hugging Face Hub
dataset = load_dataset(path="ryfye181/endianness")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

shard_0.parquet:   0%|          | 0.00/100M [00:00<?, ?B/s]

shard_1.parquet:   0%|          | 0.00/102M [00:00<?, ?B/s]

shard_10.parquet:   0%|          | 0.00/102M [00:00<?, ?B/s]

shard_11.parquet:   0%|          | 0.00/100M [00:00<?, ?B/s]

shard_12.parquet:   0%|          | 0.00/108M [00:00<?, ?B/s]

shard_13.parquet:   0%|          | 0.00/82.7M [00:00<?, ?B/s]

shard_2.parquet:   0%|          | 0.00/103M [00:00<?, ?B/s]

shard_3.parquet:   0%|          | 0.00/96.0M [00:00<?, ?B/s]

shard_4.parquet:   0%|          | 0.00/102M [00:00<?, ?B/s]

shard_5.parquet:   0%|          | 0.00/103M [00:00<?, ?B/s]

shard_6.parquet:   0%|          | 0.00/103M [00:00<?, ?B/s]

shard_7.parquet:   0%|          | 0.00/97.5M [00:00<?, ?B/s]

shard_8.parquet:   0%|          | 0.00/100M [00:00<?, ?B/s]

shard_9.parquet:   0%|          | 0.00/105M [00:00<?, ?B/s]

shard_0.parquet:   0%|          | 0.00/103M [00:00<?, ?B/s]

shard_1.parquet:   0%|          | 0.00/104M [00:00<?, ?B/s]

shard_2.parquet:   0%|          | 0.00/4.32M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7051520 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1049246 [00:00<?, ? examples/s]

In [None]:
# Pull the train and test splits out of the loaded dataset
ds_train, ds_test = dataset["train"], dataset["test"]

In [None]:
# Rename columns for model input and
# give number values to labels

def preprocess(examples):
    """Convert one batch of raw rows into the columns the model consumes.

    Args:
        examples: Batch mapping with a 'data' column and an 'endianness'
            column of equal length.

    Returns:
        A dict with 'text' (the 'data' values, unchanged) and 'labels'
        (0 where the endianness is "little", 1 for any other value).
    """
    texts = examples['data']
    # A comparison result converts to 0/1: "little" -> 0, everything else -> 1
    numeric_labels = [int(kind != "little") for kind in examples['endianness']]
    return {'text': texts, 'labels': numeric_labels}

# Run `preprocess` batch-wise over both splits (train first, then test) and
# drop the original columns so only 'text' and 'labels' remain
ds_train, ds_test = (
    split.map(preprocess, batched=True, remove_columns=['data', 'endianness'])
    for split in (ds_train, ds_test)
)

Map:   0%|          | 0/7051520 [00:00<?, ? examples/s]

Map:   0%|          | 0/1049246 [00:00<?, ? examples/s]

In [None]:
# Shuffle both splits with a fixed seed so the ordering is reproducible
ds_train, ds_test = ds_train.shuffle(seed=42), ds_test.shuffle(seed=42)

## Load tokenizer and tokenize data

In [None]:
# Hub checkpoint used to load both the tokenizer and the classifier
model_name = 'distilbert/distilbert-base-uncased'

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# Define the tokenization function
def tokenize_dataset(dataset):
    """Tokenize the 'text' column of a batch.

    Args:
        dataset: Batch mapping that contains a 'text' column.

    Returns:
        The tokenizer output for the batch, truncated to the model's
        maximum length and left unpadded.
    """
    # Truncate only. Padding is deferred to the DataCollatorWithPadding handed
    # to the Trainer, which pads each batch when it is assembled; padding every
    # example to the maximum length here would bloat the stored dataset and
    # every batch for no benefit.
    return tokenizer(dataset["text"], truncation=True)

# Tokenize both splits in batches of 64 examples across 4 worker processes
# NOTE(review): 64 looks smaller than the `datasets` default batch size
# (1000, if memory serves), so this is not a "larger" batch — confirm the value is intended
ds_train_tokenized, ds_test_tokenized = (
    split.map(tokenize_dataset, batched=True, num_proc=4, batch_size=64)
    for split in (ds_train, ds_test)
)

Map (num_proc=4):   0%|          | 0/7051520 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1049246 [00:00<?, ? examples/s]

## Load model

In [None]:
from transformers import AutoModelForSequenceClassification

# Class index <-> endianness name; the reverse map is derived from the forward
# one so the two cannot drift apart
id2label = {0: "little", 1: "big"}
label2id = {name: index for index, name in id2label.items()}

# Load the pretrained checkpoint with a classification head sized to the labels
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Set Training Arguments

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="EndianClassifier/saved_model",
    learning_rate=5e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=1,

    # Saving configuration
    save_strategy="steps",
    save_steps=10000,
    save_total_limit=2, # Keep only the 2 most recent checkpoints

    # Evaluation configuration
    # `eval_strategy` is the current name of this option; the older
    # `evaluation_strategy` spelling is deprecated and rejected by newer
    # transformers releases, which the `-U` install in the setup cell pulls in.
    eval_strategy="steps",
    eval_steps=1000,

    # Load the best model at the end of training.
    # This requires matching save/eval strategies and save_steps being a
    # round multiple of eval_steps (10000 is a multiple of 1000).
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)



## Set Data Collator

In [None]:
from transformers import DataCollatorWithPadding

# Collator that pads the examples of each batch using the notebook's tokenizer
data_collator = DataCollatorWithPadding(tokenizer)

## Set Metrics

In [None]:
import evaluate
import numpy as np

# Accuracy metric shared by every evaluation pass
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score one evaluation pass with the accuracy metric.

    Args:
        eval_pred: Pair of (logits, labels) produced during evaluation.

    Returns:
        The result of `metric.compute` for the arg-max predictions against
        the reference labels.
    """
    scores, references = eval_pred
    # The predicted class is the index of the largest logit on the last axis
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_ids, references=references)


## Set Trainer

In [None]:
from transformers import Trainer

# Wire the model, data, batching and metrics into a single Trainer.
# NOTE(review): eval_dataset is the whole tokenized test split, and
# training_args evaluates every 1000 steps — confirm that evaluating the full
# split that often is intended.
# NOTE(review): newer transformers releases appear to rename the `tokenizer`
# argument to `processing_class` — check against the installed version.
trainer = Trainer(
    # What to train and how
    model=model,
    args=training_args,
    # Data
    train_dataset=ds_train_tokenized,
    eval_dataset=ds_test_tokenized,
    # Batch assembly
    data_collator=data_collator,
    tokenizer=tokenizer,
    # Evaluation
    compute_metrics=compute_metrics,
)

## Train Model

In [None]:
# Start fine-tuning; evaluation and checkpointing follow the step schedule
# configured in training_args
trainer.train()

Step,Training Loss,Validation Loss,Accuracy
1000,0.3347,0.332731,0.7921
2000,0.3224,0.331198,0.762224
3000,0.3148,0.324842,0.76787
4000,0.3062,0.316083,0.767834
5000,0.3017,0.316431,0.770814


KeyboardInterrupt: 

## Save Model

In [None]:
# Write the model's weights and config to the output directory.
# `save_pretrained` is a method and has to be called; assigning a path to it
# would overwrite the method and save nothing.
model.save_pretrained("EndianClassifier/saved_model")