<a href="https://colab.research.google.com/github/robinborth/ardupilot/blob/main/classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
!git clone https://github.com/robinborth/ardupilot.git
%cd ardupilot

fatal: destination path 'ardupilot' already exists and is not an empty directory.
/content/ardupilot


In [21]:
import pandas as pd
import json

df = pd.read_csv('output.csv')
df.rename(columns={'sentence': 'text'}, inplace=True)
df['label'] = df['pairs_i'].apply(lambda x: int(bool(json.loads(x))))
df = df.query("is_ardu == False").reset_index(drop=True)
df = df[['text', 'label']]

In [22]:
from sklearn.model_selection import train_test_split
train, validation = train_test_split(df, train_size=0.8, random_state=0)

In [23]:
from datasets import Dataset, DatasetDict

ardupilot = DatasetDict()
ardupilot['train'] = Dataset.from_pandas(train, preserve_index=False)
ardupilot['validation'] = Dataset.from_pandas(validation, preserve_index=False)

In [6]:
from transformers import AutoTokenizer

model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [24]:
def tokenize(batch):
  return tokenizer(batch['text'], padding=True, truncation=True)

ardupilot_encoded = ardupilot.map(tokenize, batched=True, batch_size=None)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [8]:
ardupilot_encoded

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 375
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 94
    })
})

In [9]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [10]:
from transformers import AutoModelForSequenceClassification

num_labels = 2
model = (AutoModelForSequenceClassification
         .from_pretrained(model_ckpt, num_labels=num_labels)
         .to(device))

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

In [11]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  f1 = f1_score(labels, preds, average="weighted")
  acc = accuracy_score(labels, preds)
  return {"accurracy": acc, "f1": f1}

In [26]:
from transformers import Trainer, TrainingArguments

batch_size = 64
logging_steps = len(ardupilot_encoded["train"]) // batch_size
model_name = f"{model_ckpt}-finetuned-ardupilot"
training_args = TrainingArguments(output_dir=model_name,
                                  num_train_epochs=20,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  evaluation_strategy="epoch",
                                  disable_tqdm=False,
                                  logging_steps=logging_steps,
                                  push_to_hub=False, 
                                  log_level="error")

In [27]:
trainer = Trainer(model=model, args=training_args, 
                  compute_metrics=compute_metrics,
                  train_dataset=ardupilot_encoded["train"],
                  eval_dataset=ardupilot_encoded["validation"],
                  tokenizer=tokenizer)
trainer.train()



Epoch,Training Loss,Validation Loss,Accurracy,F1
1,0.0758,0.126368,0.95,0.949875
2,0.0568,0.249398,0.85,0.846547
3,0.0337,0.317046,0.85,0.846547
4,0.0303,0.26207,0.85,0.846547
5,0.0256,0.174616,0.95,0.949875
6,0.0202,0.136345,0.95,0.949875
7,0.0177,0.130772,0.95,0.949875
8,0.0135,0.149415,0.95,0.949875
9,0.013,0.177898,0.95,0.949875
10,0.0111,0.205823,0.95,0.949875


TrainOutput(global_step=40, training_loss=0.02185290912166238, metrics={'train_runtime': 5.2074, 'train_samples_per_second': 307.255, 'train_steps_per_second': 7.681, 'total_flos': 19042188556800.0, 'train_loss': 0.02185290912166238, 'epoch': 20.0})

In [20]:
from sklearn.dummy import DummyClassifier

dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(train['text'], train['label'])
dummy_clf.score(validation['text'], validation['label'])

0.6276595744680851