In [1]:
from datasets import DatasetDict, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
import numpy as np
from transformers import DataCollatorWithPadding
import pandas as pd
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [28]:
# import tensorflow as tf
# tf.config.list_physical_devices('GPU')

In [29]:
# import torch
# device = "cuda" if torch.cuda.is_available() else "cpu"
# device

In [2]:
# dataset_dict = load_dataset("shawhin/phishing-site-classification")
dataset_file = "C:\\Users\\Spandan\\Downloads\\Compressed\\Sentence Types - Question, Command and Statement\\Sentence Types - Question, Command and Statement.csv"
model_export_path = "D:\PROJECTS\TensorFlow Model Exports\DistilBERT Question Detection"
lr = 2e-4
batch_size = 8
num_epochs = 5
dataset_size = 1000

# define pre-trained model path
model_path = "distilbert/distilbert-base-uncased"

In [3]:
df_raw = pd.read_csv(dataset_file)

In [4]:
df_raw.columns = ["text", "labels"]

In [5]:
df_raw.labels.unique()

array(['command', 'statement', 'question'], dtype=object)

In [6]:
df_raw.labels.value_counts()

labels
question     130655
statement     78479
command         932
Name: count, dtype: int64

In [7]:
df = df_raw.loc[df_raw["labels"] != "command"]

In [8]:
len(df)

209134

In [9]:
df.labels.value_counts()

labels
question     130655
statement     78479
Name: count, dtype: int64

In [10]:
df["labels"] = df["labels"].map(lambda typ: 0 if typ=="statement" else 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["labels"] = df["labels"].map(lambda typ: 0 if typ=="statement" else 1)


In [11]:
df.head()

Unnamed: 0,text,labels
1,it's from Birmingham to em London Euston please,0
2,the 8th of October,0
3,i'd like to leave on the 7:33 train,0
4,there's the 7:33 from Birmingham New Street,0
5,i'm just going to check to see what's your che...,0


In [12]:
question_df = df.loc[df["labels"] == 1]
statement_df = df.loc[df["labels"] == 0]

In [13]:
min_size = min(len(question_df), len(statement_df))
min_size

78479

In [14]:
question_df = question_df.iloc[:min_size]
statement_df = statement_df.iloc[:min_size]

In [15]:
print(len(question_df))
print(len(statement_df))

78479
78479


In [16]:
merged_df = pd.concat([question_df, statement_df], axis = 0)
merged_df.head()

Unnamed: 0,text,labels
42,is that Birmingham New Street,1
43,do you hold a current debit or credit card,1
44,do you have a rail card,1
45,would you like smoking or non-smoking,1
46,and do you have any seat preference,1


In [17]:
print(question_df.shape)
print(merged_df.shape)

(78479, 2)
(156958, 2)


In [18]:
merged_df = merged_df.sample(frac=1).reset_index(drop=True)

In [19]:
merged_df.labels.value_counts()

labels
1    78479
0    78479
Name: count, dtype: int64

In [20]:
df_small = merged_df.iloc[:dataset_size]
df_small = df_small.reset_index(drop=True)

In [21]:
df_small.labels.value_counts()

labels
0    507
1    493
Name: count, dtype: int64

In [22]:
len(df_small)

1000

In [23]:
df_small.head()

Unnamed: 0,text,labels
0,What may have been buried by the same prehisto...,1
1,The Finnish and Baltic invasions began a deter...,1
2,The London Gazette of 17 March 1691 published ...,1
3,4%) although the totals remain relatively smal...,0
4,These microprocessors do not use von Neumann ...,0


In [24]:
dataset = Dataset.from_pandas(df_small).train_test_split(test_size=0.10)
dataset_train_and_validation = dataset["train"].train_test_split(test_size=0.10)

In [25]:
dataset_dict = DatasetDict({"train": dataset_train_and_validation["train"], "test": dataset["test"], "validation": dataset_train_and_validation["test"]})

In [26]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 810
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 90
    })
})

In [27]:
dataset_dict["train"].to_pandas().labels.value_counts()

labels
0    407
1    403
Name: count, dtype: int64

In [28]:
dataset_dict["test"].to_pandas().labels.value_counts()

labels
0    58
1    42
Name: count, dtype: int64

In [29]:
dataset_dict["train"][:10]

{'text': ['" Alexander, on the other hand, was "very plain"',
  ' The energy is recovered when demand is high by releasing the water, with the pump becoming a hydroelectric power generator',
  ' Philostratus takes pains to point out that the celebrated Apollonius of Tyana was definitely not a magus, "despite his special knowledge of the future, his miraculous cures, and his ability to vanish into thin air"',
  'The politics of Zhejiang is structured in a dual party-government system like all other governing institutions in Mainland China.  Who is the Governor not subordinate to',
  ' A 2012 Reddit post written by an anonymous Comcast call center employee eager to share their negative experiences with the public received attention from publications including The Huffington Post',
  '6 million tonne global market',
  'How many of the relief team were from the State Seismological Bureau?',
  ' Experts debate the causes, with some attributing it to speculative flow of money from housing an

In [30]:
# load model tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)

# load model with binary classification head
id2label = {0: "statement", 1: "question"}
label2id = {"statement": 0, "question": 1}
model = AutoModelForSequenceClassification.from_pretrained(model_path,
                                                           num_labels=2,
                                                           id2label=id2label,
                                                           label2id=label2id,)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
model.base_model

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): DistilBertSdpaAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): L

In [32]:
# [name for name, param in model.base_model.named_parameters()]

In [33]:
# freeze all base model parameters
for name, param in model.base_model.named_parameters():
    param.requires_grad = False

# unfreeze base model pooling layers
for name, param in model.base_model.named_parameters():
    if "pooler" in name:
        param.requires_grad = True

In [34]:
# define text preprocessing
def preprocess_function(examples):
    # return tokenized text with truncation
    return tokenizer(examples["text"], truncation=True)

# preprocess all datasets
tokenized_data = dataset_dict.map(preprocess_function, batched=True)


Map: 100%|██████████████████████████████████████████████████████████████████| 810/810 [00:00<00:00, 8672.40 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 4345.03 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████| 90/90 [00:00<00:00, 4546.78 examples/s]


In [35]:
tokenized_data["train"]

Dataset({
    features: ['text', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 810
})

In [36]:
tokenized_data["train"]

Dataset({
    features: ['text', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 810
})

In [37]:
# create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [38]:
# load metrics
accuracy = evaluate.load("accuracy")
auc_score = evaluate.load("roc_auc")

def compute_metrics(eval_pred):
    # get predictions
    predictions, labels = eval_pred
    
    # apply softmax to get probabilities
    probabilities = np.exp(predictions) / np.exp(predictions).sum(-1, 
                                                                 keepdims=True)
    # use probabilities of the positive class for ROC AUC
    positive_class_probs = probabilities[:, 1]
    # compute auc
    auc = np.round(auc_score.compute(prediction_scores=positive_class_probs, 
                                     references=labels)['roc_auc'],3)
    
    # predict most probable class
    predicted_classes = np.argmax(predictions, axis=1)
    # compute accuracy
    acc = np.round(accuracy.compute(predictions=predicted_classes, 
                                     references=labels)['accuracy'],3)
    
    return {"Accuracy": acc, "AUC": auc}

In [39]:
# hyperparameters

training_args = TrainingArguments(
    output_dir="question_detection",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [40]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# trainer.train()

  trainer = Trainer(


In [41]:
# trainer.save_model(model_export_path)

In [42]:
# apply model to validation dataset
predictions = trainer.predict(tokenized_data["validation"])

# Extract the logits and labels from the predictions object
logits = predictions.predictions
labels = predictions.label_ids

# Use your compute_metrics function
metrics = compute_metrics((logits, labels))
print(metrics)

#DistilBERT Raw {'Accuracy': 0.433, 'AUC': 0.524}
# BERT {'Accuracy': 0.955, 'AUC': 0.989}
#DistilBERT {'Accuracy': 0.944, 'AUC': 0.994}

{'Accuracy': 0.433, 'AUC': 0.524}


In [43]:
# Load Model
model2 = AutoModelForSequenceClassification.from_pretrained(model_export_path)
tokenizer2 = AutoTokenizer.from_pretrained(model_export_path)

trainer2 = Trainer(
    model=model2,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer2,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer2 = Trainer(


In [44]:
# apply model to validation dataset
predictions = trainer2.predict(tokenized_data["validation"])

# Extract the logits and labels from the predictions object
logits = predictions.predictions
labels = predictions.label_ids

# Use your compute_metrics function
metrics = compute_metrics((logits, labels))
print(metrics)

# >> {'Accuracy': 0.889, 'AUC': 0.946}

{'Accuracy': 0.889, 'AUC': 0.972}


In [74]:
def tokenizer_text_single(text):
    # return tokenized text with truncation
    return tokenizer(text, truncation=True)

In [81]:
sentence = "What developed from the mammalian odor pathways?"
# sentence = "Could you please tell me the direction of the retaurant?"
# sentence = "How do you know this?"
tokenized_question = tokenizer_text_single(sentence)
tokenized_question

{'input_ids': [101, 2054, 2764, 2013, 1996, 26524, 19255, 16910, 1029, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [82]:
res = trainer2.predict([tokenized_question])
res

PredictionOutput(predictions=array([[-2.9417567,  3.1534703]], dtype=float32), label_ids=None, metrics={'test_model_preparation_time': 0.0017, 'test_runtime': 0.0693, 'test_samples_per_second': 14.435, 'test_steps_per_second': 14.435})

In [77]:
sentence = "Burke received a vote of thanks from the Commons for his services in the Hastings Trial and he immediately resigned his seat"
tokenized_question = tokenizer_text_single(sentence)
tokenized_question

{'input_ids': [101, 9894, 2363, 1037, 3789, 1997, 4283, 2013, 1996, 7674, 2005, 2010, 2578, 1999, 1996, 12296, 3979, 1998, 2002, 3202, 5295, 2010, 2835, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [78]:
res = trainer2.predict([tokenized_question])
res

PredictionOutput(predictions=array([[ 1.7417043, -1.3585452]], dtype=float32), label_ids=None, metrics={'test_model_preparation_time': 0.0017, 'test_runtime': 0.0852, 'test_samples_per_second': 11.737, 'test_steps_per_second': 11.737})

In [45]:
############
#Testing Time Test

In [46]:
merged_df_test = merged_df.sample(frac=1).reset_index(drop=True)
len(merged_df_test)

156958

In [47]:
merged_df_test = merged_df_test.iloc[:10000]

In [48]:
merged_df_test.head()

Unnamed: 0,text,labels
0,At the age of 21 he settled in Paris. What oth...,1
1,"Sichuan has been historically known as the ""Pr...",0
2,Sean Fennessey of The Village Voice writes th...,0
3,"[note 27] The reliability of these sources, an...",0
4,What helps more than one group of predators in...,1


In [49]:
def tokenizer_text_df(text):
    # return tokenized text with truncation
    ebmedding = tokenizer(text, truncation=True)
    return ebmedding
    # print(x)

In [50]:
tokenized_data = merged_df_test["text"].apply(tokenizer_text_df)
# merged_df_test_embeddings = np.asarray(sequence.pad_sequences(tokenized_data, maxlen=embedding_dimension, padding='pre'))

In [51]:
tokenized_data

0       [input_ids, attention_mask]
1       [input_ids, attention_mask]
2       [input_ids, attention_mask]
3       [input_ids, attention_mask]
4       [input_ids, attention_mask]
                   ...             
9995    [input_ids, attention_mask]
9996    [input_ids, attention_mask]
9997    [input_ids, attention_mask]
9998    [input_ids, attention_mask]
9999    [input_ids, attention_mask]
Name: text, Length: 10000, dtype: object

In [52]:
##### Time 
from time import time
start_time = time()
pred_data = trainer2.predict(tokenized_data)
end_time = time()

KeyboardInterrupt: 

In [None]:
time_diff = end_time - start_time
time_diff