In [87]:
! pip -q install transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [88]:
!pip -q install tensorflow

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [89]:
import numpy as np
import os
import pandas as pd
import torch

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, mean_squared_error
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments,
)

In [90]:
# from google.colab import drive
# drive.mount('/content/drive/')

In [91]:
raw = pd.read_csv("c_final - Sheet1.csv")

In [92]:
raw.head()

Unnamed: 0,Label,Comment
0,0.0,I had an accident with an Uber driver in Mexic...
1,1.0,I have had my account completely hacked to whe...
2,1.0,I requested an 8 mile ride in Boston on a Satu...
3,1.0,Uber is overcharging for Toll fees. When In Fl...
4,1.0,I had an airport flight today. Uber would not ...


In [93]:
raw[raw["Label"].isnull()]

Unnamed: 0,Label,Comment
706,,
707,,
708,,
709,,
710,,
711,,
712,,
713,,
714,,
715,,


In [94]:
raw=raw.dropna()
raw

Unnamed: 0,Label,Comment
0,0.0,I had an accident with an Uber driver in Mexic...
1,1.0,I have had my account completely hacked to whe...
2,1.0,I requested an 8 mile ride in Boston on a Satu...
3,1.0,Uber is overcharging for Toll fees. When In Fl...
4,1.0,I had an airport flight today. Uber would not ...
...,...,...
1698,1.0,I tried to log in to Uber. I have only one ema...
1699,1.0,When I saw an ad online for the soon to be Ube...
1700,1.0,As a driver I was assaulted by a couple guys. ...
1701,1.0,I attempted to sign up as a driver about a mon...


In [95]:
raw["Label"]=raw["Label"].astype(int)

In [96]:
X = list(raw["Comment"])
y = list(raw["Label"])


train_texts, test_texts, train_labels, test_labels = train_test_split(
    X, y, random_state=42, test_size=0.2, stratify=y
)

In [97]:
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

train_encodings = tokenizer(train_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

In [98]:
class uberdataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


train_dataset = uberdataset(train_encodings, train_labels)
test_dataset = uberdataset(test_encodings, test_labels)

In [99]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [100]:
# parameters below based on own my trials 

training_args = TrainingArguments(
    output_dir="results",  # output directory
    overwrite_output_dir=True,
    num_train_epochs=3,  # total number of training epochs
    per_device_train_batch_size=4,  # batch size per device during training
    per_device_eval_batch_size=4,  # batch size for evaluation
    warmup_steps=100,  # number of warmup steps for learning rate scheduler
    weight_decay=0.01,  # strength of weight decay
    logging_dir="logs",  # directory for storing logs
    logging_steps=500,  # default: 500
    save_steps=500,  # default: 500
    learning_rate=1e-5,
    do_train=True,
    do_eval=True,
    seed=16,
    gradient_accumulation_steps=8,  # reduce memory usage while allowing bigger overall batch size.
)

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")



trainer = Trainer(
    model=model,  # the instantiated Transformers model to be trained
    args=training_args,  # training arguments, defined above
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,  # training dataset
    eval_dataset=test_dataset,  # test dataset
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'classifier

In [101]:
%%time

trainer.train()



Step,Training Loss


CPU times: user 53.5 s, sys: 780 ms, total: 54.3 s
Wall time: 48.8 s


TrainOutput(global_step=63, training_loss=0.6884864928230406, metrics={'train_runtime': 48.8192, 'train_samples_per_second': 83.021, 'train_steps_per_second': 1.29, 'total_flos': 533843616583680.0, 'train_loss': 0.6884864928230406, 'epoch': 2.98})

In [102]:
predicted=trainer.predict(test_dataset)

In [103]:
trainer.predict(test_dataset).metrics



{'test_loss': 0.6717835664749146,
 'test_accuracy': 0.6479289940828402,
 'test_f1': 0.6020066889632106,
 'test_precision': 0.6521739130434783,
 'test_recall': 0.5590062111801242,
 'test_runtime': 1.7508,
 'test_samples_per_second': 193.056,
 'test_steps_per_second': 24.56}

In [104]:
#error_rate = 1.0 - calculations['test_accuracy']
error_rate = 1.0 - trainer.predict(test_dataset).metrics['test_accuracy']
print(f"Error Rate: {error_rate:.4f}")



Error Rate: 0.3521


In [105]:
# predictions

dft=pd.read_csv("project_test.csv")
dft


Unnamed: 0,Label1,Comments1
0,0,Our driver never showed up and Uber cancelled ...
1,1,"My driver, Rohan was nice, but when I tried to..."
2,1,Uber is overcharging for Toll fees. When In Fl...
3,0,I called uber for going home two days ago and ...
4,1,"Ordering a ride, then putting your 10, 12, and..."


In [106]:
dft_comments=list(dft["Comments1"])
dft_comments

['Our driver never showed up and Uber cancelled our ride, not us. They charged us a cancellation fee for them cancelling. When I disputed it they would only credit it on future ride instead of refund back to our credit card. Why am I only getting a credit on account instead of a full refund to my credit card used?? This is not right what you have done when it was a Uber error.',
 "My driver, Rohan was nice, but when I tried to add tip I noticed that he had switched my $9 9am fare in French Quarter to someone else's 2pm trip in Metairie for $28.00! UBER refused to acknowledge. 5 attempts to rectify problem and received nothing but irrelevant automated replies. If you get in the car with Rohan in NOLA and he says he can't get his system to work, it's a a SCAM. Terrible customer service from UBER with zero relevant assistance after 7 days of requesting corrected bill. BEWARE.",
 "Uber is overcharging for Toll fees. When In Florida the Tolls fees estimates between $ .75 to $ 2.00. Uber pri

In [107]:
train_sequences = tokenizer(dft_comments, truncation=True, padding=True)

In [108]:
new_labels=dft.Label1
new_labels

0    0
1    1
2    1
3    0
4    1
Name: Label1, dtype: int64

In [109]:
new_test = uberdataset(train_sequences, new_labels)

In [110]:
trainer.predict(new_test)



PredictionOutput(predictions=array([[-0.00071123,  0.06836964],
       [ 0.03975232,  0.01989567],
       [-0.01391758,  0.10309046],
       [-0.02537625,  0.13009623],
       [ 0.09344252, -0.12840906]], dtype=float32), label_ids=array([0, 1, 1, 0, 1]), metrics={'test_loss': 0.7406556606292725, 'test_accuracy': 0.2, 'test_f1': 0.3333333333333333, 'test_precision': 0.3333333333333333, 'test_recall': 0.3333333333333333, 'test_runtime': 0.0348, 'test_samples_per_second': 143.491, 'test_steps_per_second': 28.698})

In [111]:
trainer.predict(new_test).metrics



{'test_loss': 0.7406556606292725,
 'test_accuracy': 0.2,
 'test_f1': 0.3333333333333333,
 'test_precision': 0.3333333333333333,
 'test_recall': 0.3333333333333333,
 'test_runtime': 0.0346,
 'test_samples_per_second': 144.6,
 'test_steps_per_second': 28.92}

In [112]:
import torch
from transformers import DistilBertForSequenceClassification

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
print(model)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'classifier

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [113]:


pred_labels = trainer.predict(test_dataset).predictions.argmax(axis=1)
mse = mean_squared_error(test_labels, pred_labels)
print(f"MSE: {mse:.4f}")



MSE: 0.3521


In [117]:
texts = [
    "It's an economic and convenient service. I like the convenience of the app and being able to track just where my driver is. I've never had any situations that I felt were unsafe or threatening.",
    "The Uber drivers that we used were OK. One was creepy and the other two ladies were nice. The one thing they all had in common, they were SLOW. They drive way below the speed limit and I felt like I was never going to get to my destination.",
    "Uber Driver that ride me yesterday was been rude and his car is dirty"
]


for text in texts:
    
    encoding = tokenizer(text, truncation=True, padding=True, return_tensors='pt')

    
    output = model(encoding['input_ids'], encoding['attention_mask'])
    predicted_label = torch.argmax(output[0]).item()

    
    print(f"Review: {text}")
    if predicted_label == 1:
        print("The comment is unfair\n")
    else:
        print("The comment is fair\n")

Review: It's an economic and convenient service. I like the convenience of the app and being able to track just where my driver is. I've never had any situations that I felt were unsafe or threatening.
The comment is fair

Review: The Uber drivers that we used were OK. One was creepy and the other two ladies were nice. The one thing they all had in common, they were SLOW. They drive way below the speed limit and I felt like I was never going to get to my destination.
The comment is fair

Review: Uber Driver that ride me yesterday was been rude and his car is dirty
The comment is fair

