# Endian Classifier

## Prerequisites

In [None]:
# Mount the user's Google Drive into the Colab filesystem at /content/drive.
# This only works inside Google Colab (google.colab is not available elsewhere).
# NOTE(review): none of the visible cells read from /content/drive — confirm the mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers[torch] -U
!pip install datasets -U

Collecting datasets
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.0-py3-none-any.whl (474 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.3/474.3 kB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K

In [None]:
import torch

# Select the compute device: CUDA when PyTorch reports one, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Device:', device)

Device: cuda


## Load Dataset

In [None]:
from datasets import load_dataset, Dataset
import pandas as pd

# Load the endianness dataset from the Hugging Face Hub.
# data_dir="test" points the loader at the repo's "test" directory.
# NOTE(review): split='train' is presumably the default split name assigned to
# files loaded through data_dir — confirm this really is the held-out test data.
dataset = load_dataset(
    "ryfye181/endianness",
    data_dir="test",
    split='train',
)

In [None]:
# Size of the evaluation subset (the first N_SAMPLES rows of the dataset).
N_SAMPLES = 1000

# take(n) keeps the first n rows; clamp n to the dataset length so a dataset
# with fewer rows yields all of them instead of an out-of-range error.
small_ds = dataset.take(min(N_SAMPLES, len(dataset)))

In [None]:
# Show the subset's summary (feature names and row count) as the cell output.
small_ds

Dataset({
    features: ['data', 'endianness'],
    num_rows: 1000
})

## Load Model and Tokenizer

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Hugging Face Hub id of the checkpoint; the tokenizer and the model are both loaded from it.
model_name = "ryfye181/distilbert_endian_classifier"

In [None]:
# Tokenize dataset

# Load the tokenizer stored under the same Hub id as the model (model_name).
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_dataset(dataset):
    """Tokenize the "data" entry of a row or batch of rows.

    Args:
        dataset: Mapping with a "data" key holding the text(s) to encode.
            When used with ``map(..., batched=True)`` this is a batch of rows,
            not the whole dataset.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts, with
        truncation enabled and padding set to "max_length".
    """
    texts = dataset["data"]
    return tokenizer(texts, truncation=True, padding="max_length")
# Run tokenize_dataset over the subset in batches (batched=True passes several rows per call).
# NOTE(review): ds_test_tokenized is not referenced by the other visible cells —
# the pipeline is fed raw text and tokenizes it itself — confirm it is still needed.
ds_test_tokenized = small_ds.map(tokenize_dataset, batched=True)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
# Load model
# The sequence-classification model comes from the same Hub id (model_name) as the tokenizer.
model = AutoModelForSequenceClassification.from_pretrained(model_name)

config.json:   0%|          | 0.00/732 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

## Load Pipeline for Sequence Classification

In [None]:
from transformers import pipeline

# Wrap the already-loaded model and tokenizer in a text-classification pipeline
# that runs on the device selected earlier.
classifier = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=device,
)

In [None]:
# Pick one row of the subset and print its text alongside its `endianness` label.
index = 902
sample = small_ds[index]  # fetch the row once instead of once per column
test_data = sample["data"]
test_label = sample["endianness"]
print(f"Data column = {test_data}")
print(f"Label = {test_label}")

Data column = 00 526b 5449 0000 0000 556b 0050 726b 0069 6974 0063 2d5b 2065 616e 656d 5d73 5b20 6f2d 6420 7269 205d  2d5b 2052 616e 656d 205d 2d5b 5b76 5d6e 205d 2d5b 5d56 5b20 772d 6e5b 5d5d 5b20 312d 4361 6344 4766 4967 4c4b 724e 5473 5574 5d78 7320 756f 6372 2d65 6966 656c  000a 0000 704f 6974 6e6f 3a73 200a 2d20 2030 2020 2020 2020 2020 6f66 6d72 7461 7420 6172 736e 616c 6974 6e6f 6f20 7475 7570 2074 6c61 206c 6163 6170 6962 696c  6974 7365 6f20 206e 6e6f 2065 696c 656e 200a 2d20 2031 2020 2020 2020 2020 6f66 6d72 7
Label = big


In [None]:
# Re-read the sample text so this cell depends only on `index` and `small_ds`.
test_data = small_ds[index]["data"]

# truncation=True matches the setting used in tokenize_dataset: a sample longer
# than the tokenizer's maximum length is cut instead of being sent to the model
# at full length. Samples within the limit are unaffected.
classifier(test_data, truncation=True)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


[{'label': 'little', 'score': 0.5135465860366821}]