In [1]:
! pip -q install transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m42.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m36.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import numpy as np
import os
import pandas as pd
import torch

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments,
)

In [3]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [5]:
raw = pd.read_csv("/content/sample_data/C500.csv")

In [6]:
raw.head()

Unnamed: 0,Fair/Unfair,Label,Date,Comment,Unnamed: 4,Unnamed: 5,Unnamed: 6
0,Fair,0,10/29/2018,I had an accident with an Uber driver in Mexic...,,,
1,Unfair,1,10/28/2018,I have had my account completely hacked to whe...,,,
2,Unfair,1,10/27/2018,I requested an 8 mile ride in Boston on a Satu...,,,
3,Unfair,1,10/25/2018,Uber is overcharging for Toll fees. When In Fl...,,,
4,Unfair,1,10/24/2018,I had an airport flight today. Uber would not ...,,,


In [7]:
X = list(raw["Comment"])
y = list(raw["Label"])


train_texts, test_texts, train_labels, test_labels = train_test_split(
    X, y, random_state=42, test_size=0.2, stratify=y
)

In [8]:
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

train_encodings = tokenizer(train_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [9]:
class uberdataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


train_dataset = uberdataset(train_encodings, train_labels)
test_dataset = uberdataset(test_encodings, test_labels)

In [10]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [11]:
# parameters below based on own my trials 

training_args = TrainingArguments(
    output_dir="results",  # output directory
    overwrite_output_dir=True,
    num_train_epochs=3,  # total number of training epochs
    per_device_train_batch_size=4,  # batch size per device during training
    per_device_eval_batch_size=4,  # batch size for evaluation
    warmup_steps=100,  # number of warmup steps for learning rate scheduler
    weight_decay=0.01,  # strength of weight decay
    logging_dir="logs",  # directory for storing logs
    logging_steps=500,  # default: 500
    save_steps=500,  # default: 500
    learning_rate=1e-5,
    do_train=True,
    do_eval=True,
    seed=16,
    gradient_accumulation_steps=8,  # reduce memory usage while allowing bigger overall batch size.
)

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

trainer = Trainer(
    model=model,  # the instantiated Transformers model to be trained
    args=training_args,  # training arguments, defined above
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,  # training dataset
    eval_dataset=test_dataset,  # test dataset
)

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier

In [12]:
%%time

trainer.train()



Step,Training Loss


CPU times: user 45min 25s, sys: 44 s, total: 46min 9s
Wall time: 46min 45s


TrainOutput(global_step=36, training_loss=0.6916143099466959, metrics={'train_runtime': 2805.2283, 'train_samples_per_second': 0.415, 'train_steps_per_second': 0.013, 'total_flos': 123989485142016.0, 'train_loss': 0.6916143099466959, 'epoch': 2.97})

In [13]:
trainer.predict(test_dataset)

PredictionOutput(predictions=array([[ 0.00393603,  0.06319065],
       [ 0.02344913,  0.07912374],
       [-0.02410943,  0.09780347],
       [ 0.04059775,  0.01509743],
       [ 0.08062731,  0.01486654],
       [ 0.12578207,  0.02691425],
       [ 0.01279336,  0.06715517],
       [ 0.05664479, -0.0045085 ],
       [ 0.03601463,  0.08402047],
       [ 0.05779634,  0.03970193],
       [-0.02023201,  0.06927618],
       [ 0.03956794,  0.06184826],
       [ 0.02396086,  0.05792006],
       [ 0.059096  ,  0.06358361],
       [ 0.01751234,  0.08377938],
       [ 0.01826943,  0.02430863],
       [ 0.02855199,  0.0354025 ],
       [ 0.02058886,  0.03118461],
       [ 0.0304328 ,  0.05685468],
       [ 0.02816382, -0.00104594],
       [ 0.11586766,  0.03082512],
       [-0.01673704,  0.06961372],
       [ 0.01274513,  0.09578863],
       [-0.01047415,  0.10806223],
       [ 0.00953337,  0.0558719 ],
       [-0.00818567,  0.08643514],
       [ 0.04159279, -0.00866919],
       [-0.01371717,  0.05

In [14]:
trainer.predict(test_dataset).metrics

{'test_loss': 0.6811155080795288,
 'test_accuracy': 0.6391752577319587,
 'test_f1': 0.7058823529411766,
 'test_precision': 0.6086956521739131,
 'test_recall': 0.84,
 'test_runtime': 110.6628,
 'test_samples_per_second': 0.877,
 'test_steps_per_second': 0.226}

In [18]:
#error_rate = 1.0 - calculations['test_accuracy']
error_rate = 1.0 - trainer.predict(test_dataset).metrics['test_accuracy']
print(f"Error Rate: {error_rate:.4f}")

Error Rate: 0.3608
