In [27]:
from datasets import DatasetDict, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
import numpy as np
from transformers import DataCollatorWithPadding
import pandas as pd
from datasets import load_dataset

In [28]:
# import tensorflow as tf
# tf.config.list_physical_devices('GPU')

In [29]:
# import torch
# device = "cuda" if torch.cuda.is_available() else "cpu"
# device

In [30]:
# dataset_dict = load_dataset("shawhin/phishing-site-classification")
dataset_file = "C:\\Users\\Spandan\\Downloads\\Compressed\\Sentence Types - Question, Command and Statement\\Sentence Types - Question, Command and Statement.csv"
model_export_path = "D:\PROJECTS\TensorFlow Model Exports\DistilBERT Question Detection"
lr = 2e-4
batch_size = 8
num_epochs = 5
dataset_size = 1000

# define pre-trained model path
model_path = "distilbert/distilbert-base-uncased"

In [31]:
df_raw = pd.read_csv(dataset_file)

In [32]:
df_raw.columns = ["text", "labels"]

In [33]:
df_raw.labels.unique()

array(['command', 'statement', 'question'], dtype=object)

In [34]:
df_raw.labels.value_counts()

labels
question     130655
statement     78479
command         932
Name: count, dtype: int64

In [35]:
df = df_raw.loc[df_raw["labels"] != "command"]

In [36]:
len(df)

209134

In [37]:
df.labels.value_counts()

labels
question     130655
statement     78479
Name: count, dtype: int64

In [38]:
df["labels"] = df["labels"].map(lambda typ: 0 if typ=="statement" else 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["labels"] = df["labels"].map(lambda typ: 0 if typ=="statement" else 1)


In [39]:
df.head()

Unnamed: 0,text,labels
1,it's from Birmingham to em London Euston please,0
2,the 8th of October,0
3,i'd like to leave on the 7:33 train,0
4,there's the 7:33 from Birmingham New Street,0
5,i'm just going to check to see what's your che...,0


In [40]:
question_df = df.loc[df["labels"] == 1]
statement_df = df.loc[df["labels"] == 0]

In [41]:
min_size = min(len(question_df), len(statement_df))
min_size

78479

In [42]:
question_df = question_df.iloc[:min_size]
statement_df = statement_df.iloc[:min_size]

In [43]:
print(len(question_df))
print(len(statement_df))

78479
78479


In [44]:
merged_df = pd.concat([question_df, statement_df], axis = 0)
merged_df.head()

Unnamed: 0,text,labels
42,is that Birmingham New Street,1
43,do you hold a current debit or credit card,1
44,do you have a rail card,1
45,would you like smoking or non-smoking,1
46,and do you have any seat preference,1


In [45]:
print(question_df.shape)
print(merged_df.shape)

(78479, 2)
(156958, 2)


In [46]:
merged_df = merged_df.sample(frac=1).reset_index(drop=True)

In [47]:
merged_df.labels.value_counts()

labels
0    78479
1    78479
Name: count, dtype: int64

In [48]:
df_small = merged_df.iloc[:dataset_size]
df_small = df_small.reset_index(drop=True)

In [49]:
df_small.labels.value_counts()

labels
1    521
0    479
Name: count, dtype: int64

In [50]:
len(df_small)

1000

In [51]:
df_small.head()

Unnamed: 0,text,labels
0,"For the first time since the late 1980s, Quee...",0
1,Most of Bermuda's black population trace some ...,1
2,The modernist views hold that classical music ...,0
3,In the year 743 the synod in Leptines (Leptine...,0
4,France's major upscale department stores are G...,0


In [53]:
dataset = Dataset.from_pandas(df_small).train_test_split(test_size=0.10)
dataset_train_and_validation = dataset["train"].train_test_split(test_size=0.10)

In [54]:
dataset_dict = DatasetDict({"train": dataset_train_and_validation["train"], "test": dataset["test"], "validation": dataset_train_and_validation["test"]})

In [55]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 810
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 90
    })
})

In [56]:
dataset_dict["train"].to_pandas().labels.value_counts()

labels
1    418
0    392
Name: count, dtype: int64

In [57]:
dataset_dict["test"].to_pandas().labels.value_counts()

labels
1    56
0    44
Name: count, dtype: int64

In [58]:
dataset_dict["train"][:10]

{'text': ['What are rhythms provided by in House music',
  'Early on, American courts, even after the Revolution, often did cite contemporary English cases. When did decisions from English courts begin to be reported?',
  '5 °F), on 7 February 2009',
  'Kit houses were built late in what century?',
  ' Sometimes alloys may exhibit marked differences in behavior even when small amounts of one element are present',
  ' Some of these companies are the Xceed Contact Center, Raya, E Group Connections and C3.',
  'At the Battle of Mars-la-Tours, the Prussian 12th Cavalry Brigade, commanded by General Adalbert von Bredow, conducted a charge against a French artillery battery. The attack was a costly success and came to be known as "von Bredow\'s Death Ride", which was held to prove that cavalry charges could still prevail on the battlefield. Use of traditional cavalry on the battlefields of 1914 proved to be disastrous, due to accurate, long-range rifle fire, machine-guns and artillery. Von B

In [59]:
# load model tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)

# load model with binary classification head
id2label = {0: "statement", 1: "question"}
label2id = {"statement": 0, "question": 1}
model = AutoModelForSequenceClassification.from_pretrained(model_path,
                                                           num_labels=2,
                                                           id2label=id2label,
                                                           label2id=label2id,)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [60]:
model.base_model

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): DistilBertSdpaAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): L

In [61]:
# [name for name, param in model.base_model.named_parameters()]

In [62]:
# freeze all base model parameters
for name, param in model.base_model.named_parameters():
    param.requires_grad = False

# unfreeze base model pooling layers
for name, param in model.base_model.named_parameters():
    if "pooler" in name:
        param.requires_grad = True

In [63]:
# define text preprocessing
def preprocess_function(examples):
    # return tokenized text with truncation
    return tokenizer(examples["text"], truncation=True)

# preprocess all datasets
tokenized_data = dataset_dict.map(preprocess_function, batched=True)


Map: 100%|██████████████████████████████████████████████████████████████████| 810/810 [00:00<00:00, 7880.43 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 5866.57 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████████| 90/90 [00:00<00:00, 30208.66 examples/s]


In [64]:
tokenized_data["train"]

Dataset({
    features: ['text', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 810
})

In [65]:
tokenized_data["train"]

Dataset({
    features: ['text', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 810
})

In [66]:
# create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [67]:
# load metrics
accuracy = evaluate.load("accuracy")
auc_score = evaluate.load("roc_auc")

def compute_metrics(eval_pred):
    # get predictions
    predictions, labels = eval_pred
    
    # apply softmax to get probabilities
    probabilities = np.exp(predictions) / np.exp(predictions).sum(-1, 
                                                                 keepdims=True)
    # use probabilities of the positive class for ROC AUC
    positive_class_probs = probabilities[:, 1]
    # compute auc
    auc = np.round(auc_score.compute(prediction_scores=positive_class_probs, 
                                     references=labels)['roc_auc'],3)
    
    # predict most probable class
    predicted_classes = np.argmax(predictions, axis=1)
    # compute accuracy
    acc = np.round(accuracy.compute(predictions=predicted_classes, 
                                     references=labels)['accuracy'],3)
    
    return {"Accuracy": acc, "AUC": auc}

In [68]:
# hyperparameters

training_args = TrainingArguments(
    output_dir="question_detection",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [69]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Auc
1,0.5383,0.365519,0.89,0.962
2,0.3514,0.266711,0.9,0.98
3,0.2879,0.231819,0.93,0.982
4,0.2666,0.208922,0.92,0.984
5,0.2437,0.196206,0.9,0.983


TrainOutput(global_step=510, training_loss=0.3375548942416322, metrics={'train_runtime': 505.351, 'train_samples_per_second': 8.014, 'train_steps_per_second': 1.009, 'total_flos': 92850332343888.0, 'train_loss': 0.3375548942416322, 'epoch': 5.0})

In [70]:
trainer.save_model(model_export_path)

In [71]:
# apply model to validation dataset
predictions = trainer.predict(tokenized_data["validation"])

# Extract the logits and labels from the predictions object
logits = predictions.predictions
labels = predictions.label_ids

# Use your compute_metrics function
metrics = compute_metrics((logits, labels))
print(metrics)

# BERT {'Accuracy': 0.955, 'AUC': 0.989}
#DistilBERT {'Accuracy': 0.944, 'AUC': 0.994}

{'Accuracy': 0.944, 'AUC': 0.994}


In [72]:
# Load Model
model2 = AutoModelForSequenceClassification.from_pretrained(model_export_path)
tokenizer2 = AutoTokenizer.from_pretrained(model_export_path)

trainer2 = Trainer(
    model=model2,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer2,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer2 = Trainer(


In [73]:
# apply model to validation dataset
predictions = trainer2.predict(tokenized_data["validation"])

# Extract the logits and labels from the predictions object
logits = predictions.predictions
labels = predictions.label_ids

# Use your compute_metrics function
metrics = compute_metrics((logits, labels))
print(metrics)

# >> {'Accuracy': 0.889, 'AUC': 0.946}

{'Accuracy': 0.944, 'AUC': 0.994}


In [74]:
def tokenizer_text_single(text):
    # return tokenized text with truncation
    return tokenizer(text, truncation=True)

In [81]:
sentence = "What developed from the mammalian odor pathways?"
# sentence = "Could you please tell me the direction of the retaurant?"
# sentence = "How do you know this?"
tokenized_question = tokenizer_text_single(sentence)
tokenized_question

{'input_ids': [101, 2054, 2764, 2013, 1996, 26524, 19255, 16910, 1029, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [82]:
res = trainer2.predict([tokenized_question])
res

PredictionOutput(predictions=array([[-2.9417567,  3.1534703]], dtype=float32), label_ids=None, metrics={'test_model_preparation_time': 0.0017, 'test_runtime': 0.0693, 'test_samples_per_second': 14.435, 'test_steps_per_second': 14.435})

In [77]:
sentence = "Burke received a vote of thanks from the Commons for his services in the Hastings Trial and he immediately resigned his seat"
tokenized_question = tokenizer_text_single(sentence)
tokenized_question

{'input_ids': [101, 9894, 2363, 1037, 3789, 1997, 4283, 2013, 1996, 7674, 2005, 2010, 2578, 1999, 1996, 12296, 3979, 1998, 2002, 3202, 5295, 2010, 2835, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [78]:
res = trainer2.predict([tokenized_question])
res

PredictionOutput(predictions=array([[ 1.7417043, -1.3585452]], dtype=float32), label_ids=None, metrics={'test_model_preparation_time': 0.0017, 'test_runtime': 0.0852, 'test_samples_per_second': 11.737, 'test_steps_per_second': 11.737})