In [1]:
! pip -q install transformers

In [2]:
import numpy as np
import os
import pandas as pd
import torch

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments,
)

In [4]:
# from google.colab import drive
# drive.mount('/content/drive/')

In [5]:
raw = pd.read_csv("c_500_modified.csv")

In [6]:
raw.head()

Unnamed: 0,Label,Comment
0,0,I had an accident with an Uber driver in Mexic...
1,1,I have had my account completely hacked to whe...
2,1,I requested an 8 mile ride in Boston on a Satu...
3,1,Uber is overcharging for Toll fees. When In Fl...
4,1,I had an airport flight today. Uber would not ...


In [7]:
X = list(raw["Comment"])
y = list(raw["Label"])


train_texts, test_texts, train_labels, test_labels = train_test_split(
    X, y, random_state=42, test_size=0.2, stratify=y
)

In [8]:
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

train_encodings = tokenizer(train_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [9]:
class uberdataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


train_dataset = uberdataset(train_encodings, train_labels)
test_dataset = uberdataset(test_encodings, test_labels)

In [10]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [11]:
# parameters below based on own my trials 

training_args = TrainingArguments(
    output_dir="results",  # output directory
    overwrite_output_dir=True,
    num_train_epochs=3,  # total number of training epochs
    per_device_train_batch_size=4,  # batch size per device during training
    per_device_eval_batch_size=4,  # batch size for evaluation
    warmup_steps=100,  # number of warmup steps for learning rate scheduler
    weight_decay=0.01,  # strength of weight decay
    logging_dir="logs",  # directory for storing logs
    logging_steps=500,  # default: 500
    save_steps=500,  # default: 500
    learning_rate=1e-5,
    do_train=True,
    do_eval=True,
    seed=16,
    gradient_accumulation_steps=8,  # reduce memory usage while allowing bigger overall batch size.
)

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

trainer = Trainer(
    model=model,  # the instantiated Transformers model to be trained
    args=training_args,  # training arguments, defined above
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,  # training dataset
    eval_dataset=test_dataset,  # test dataset
)

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'pre_classifi

In [12]:
%%time

trainer.train()



Step,Training Loss


CPU times: user 13.6 s, sys: 813 ms, total: 14.4 s
Wall time: 12.9 s


TrainOutput(global_step=18, training_loss=0.6992773479885526, metrics={'train_runtime': 12.9138, 'train_samples_per_second': 92.692, 'train_steps_per_second': 1.394, 'total_flos': 152337508454400.0, 'train_loss': 0.6992773479885526, 'epoch': 2.88})

In [13]:
trainer.predict(test_dataset)

PredictionOutput(predictions=array([[ 0.08487902,  0.04541938],
       [ 0.13084127,  0.03504547],
       [ 0.12499198,  0.04955647],
       [ 0.11130191,  0.00968991],
       [ 0.11830247,  0.0408888 ],
       [ 0.10628816,  0.04370093],
       [ 0.10609939,  0.02141306],
       [ 0.11534595,  0.02725543],
       [ 0.11316184,  0.06991667],
       [ 0.17578147,  0.02623028],
       [ 0.11729523,  0.01353721],
       [ 0.10850637,  0.01462714],
       [ 0.11887817,  0.01741946],
       [ 0.04839353,  0.05308366],
       [ 0.11094422,  0.0463131 ],
       [ 0.11995437,  0.04465192],
       [ 0.11952945,  0.02519054],
       [ 0.13307516,  0.00413141],
       [ 0.14940782,  0.04773632],
       [ 0.09687695,  0.01898511],
       [ 0.05665282,  0.0650939 ],
       [ 0.0924088 ,  0.03123762],
       [ 0.12658139,  0.03294255],
       [ 0.11080057,  0.02597435],
       [ 0.11820336,  0.06262028],
       [ 0.11018533,  0.03901093],
       [ 0.09381311,  0.04376597],
       [ 0.1016248 ,  0.04

In [14]:
trainer.predict(test_dataset).metrics



{'test_loss': 0.6937807202339172,
 'test_accuracy': 0.5,
 'test_f1': 0.07407407407407407,
 'test_precision': 1.0,
 'test_recall': 0.038461538461538464,
 'test_runtime': 0.3809,
 'test_samples_per_second': 262.522,
 'test_steps_per_second': 34.128}

In [15]:
#error_rate = 1.0 - calculations['test_accuracy']
error_rate = 1.0 - trainer.predict(test_dataset).metrics['test_accuracy']
print(f"Error Rate: {error_rate:.4f}")



Error Rate: 0.5000


In [16]:
# predictions

dft=pd.read_csv("project_test.csv")
dft


Unnamed: 0,Label1,Comments1
0,0,Our driver never showed up and Uber cancelled ...
1,1,"My driver, Rohan was nice, but when I tried to..."
2,1,Uber is overcharging for Toll fees. When In Fl...
3,0,I called uber for going home two days ago and ...
4,1,"Ordering a ride, then putting your 10, 12, and..."


In [19]:
dft_comments=list(dft["Comments1"])
dft_comments

['Our driver never showed up and Uber cancelled our ride, not us. They charged us a cancellation fee for them cancelling. When I disputed it they would only credit it on future ride instead of refund back to our credit card. Why am I only getting a credit on account instead of a full refund to my credit card used?? This is not right what you have done when it was a Uber error.',
 "My driver, Rohan was nice, but when I tried to add tip I noticed that he had switched my $9 9am fare in French Quarter to someone else's 2pm trip in Metairie for $28.00! UBER refused to acknowledge. 5 attempts to rectify problem and received nothing but irrelevant automated replies. If you get in the car with Rohan in NOLA and he says he can't get his system to work, it's a a SCAM. Terrible customer service from UBER with zero relevant assistance after 7 days of requesting corrected bill. BEWARE.",
 "Uber is overcharging for Toll fees. When In Florida the Tolls fees estimates between $ .75 to $ 2.00. Uber pri

In [23]:
train_sequences = tokenizer(dft_comments, truncation=True, padding=True)

In [26]:
new_labels=dft.Label1
new_labels

0    0
1    1
2    1
3    0
4    1
Name: Label1, dtype: int64

In [27]:
new_test = uberdataset(train_sequences, new_labels)

In [29]:
trainer.predict(new_test)



  _warn_prf(average, modifier, msg_start, len(result))


PredictionOutput(predictions=array([[0.1252688 , 0.020713  ],
       [0.09799315, 0.00530998],
       [0.1159725 , 0.01782852],
       [0.11458293, 0.05429412],
       [0.18432824, 0.01636701]], dtype=float32), label_ids=array([0, 1, 1, 0, 1]), metrics={'test_loss': 0.7153967618942261, 'test_accuracy': 0.4, 'test_f1': 0.0, 'test_precision': 0.0, 'test_recall': 0.0, 'test_runtime': 0.0219, 'test_samples_per_second': 228.291, 'test_steps_per_second': 45.658})

In [30]:
trainer.predict(new_test).metrics



  _warn_prf(average, modifier, msg_start, len(result))


{'test_loss': 0.7153967618942261,
 'test_accuracy': 0.4,
 'test_f1': 0.0,
 'test_precision': 0.0,
 'test_recall': 0.0,
 'test_runtime': 0.0207,
 'test_samples_per_second': 242.09,
 'test_steps_per_second': 48.418}