In [91]:
from datasets import DatasetDict, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
import numpy as np
from transformers import DataCollatorWithPadding
import pandas as pd
from datasets import load_dataset

In [31]:
# import tensorflow as tf
# tf.config.list_physical_devices('GPU')

In [32]:
# import torch
# device = "cuda" if torch.cuda.is_available() else "cpu"
# device

In [33]:
# dataset_dict = load_dataset("shawhin/phishing-site-classification")
dataset_file = "C:\\Users\\Spandan\\Downloads\\Compressed\\Sentence Types - Question, Command and Statement\\Sentence Types - Question, Command and Statement.csv"
model_export_path = "D:\PROJECTS\TensorFlow Model Exports\BERT Question Detection with validation"
lr = 2e-4
batch_size = 8
num_epochs = 5
dataset_size = 1000

# define pre-trained model path
model_path = "google-bert/bert-base-uncased"

In [34]:
df_raw = pd.read_csv(dataset_file)

In [35]:
df_raw.columns = ["text", "labels"]

In [36]:
df_raw.labels.unique()

array(['command', 'statement', 'question'], dtype=object)

In [37]:
df_raw.labels.value_counts()

labels
question     130655
statement     78479
command         932
Name: count, dtype: int64

In [38]:
df = df_raw.loc[df_raw["labels"] != "command"]

In [39]:
len(df)

209134

In [40]:
df.labels.value_counts()

labels
question     130655
statement     78479
Name: count, dtype: int64

In [41]:
df["labels"] = df["labels"].map(lambda typ: 0 if typ=="statement" else 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["labels"] = df["labels"].map(lambda typ: 0 if typ=="statement" else 1)


In [42]:
df.head()

Unnamed: 0,text,labels
1,it's from Birmingham to em London Euston please,0
2,the 8th of October,0
3,i'd like to leave on the 7:33 train,0
4,there's the 7:33 from Birmingham New Street,0
5,i'm just going to check to see what's your che...,0


In [43]:
question_df = df.loc[df["labels"] == 1]
statement_df = df.loc[df["labels"] == 0]

In [44]:
min_size = min(len(question_df), len(statement_df))
min_size

78479

In [45]:
question_df = question_df.iloc[:min_size]
statement_df = statement_df.iloc[:min_size]

In [46]:
print(len(question_df))
print(len(statement_df))

78479
78479


In [47]:
merged_df = pd.concat([question_df, statement_df], axis = 0)
merged_df.head()

Unnamed: 0,text,labels
42,is that Birmingham New Street,1
43,do you hold a current debit or credit card,1
44,do you have a rail card,1
45,would you like smoking or non-smoking,1
46,and do you have any seat preference,1


In [48]:
print(question_df.shape)
print(merged_df.shape)

(78479, 2)
(156958, 2)


In [49]:
merged_df = merged_df.sample(frac=1).reset_index(drop=True)

In [50]:
merged_df.labels.value_counts()

labels
1    78479
0    78479
Name: count, dtype: int64

In [51]:
df_small = merged_df.iloc[:dataset_size]
df_small = df_small.reset_index(drop=True)

In [52]:
df_small.labels.value_counts()

labels
0    501
1    499
Name: count, dtype: int64

In [53]:
len(df_small)

1000

In [54]:
df_small.head()

Unnamed: 0,text,labels
0,Who is powerless against endoheretics?,1
1,Non-white minorities included Spanish-speakin...,0
2,"Prior to the second world war, birth control w...",0
3,Midtown Raleigh is a residential and commercia...,1
4,Special attention is given to the assertion t...,0


In [56]:
dataset = Dataset.from_pandas(df_small).train_test_split(test_size=0.10)
dataset_train_and_validation = dataset["train"].train_test_split(test_size=0.10)

In [57]:
dataset_dict = DatasetDict({"train": dataset_train_and_validation["train"], "test": dataset["test"], "validation": dataset_train_and_validation["test"]})

In [58]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 810
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 90
    })
})

In [59]:
dataset_dict["train"].to_pandas().labels.value_counts()

labels
1    417
0    393
Name: count, dtype: int64

In [60]:
dataset_dict["test"].to_pandas().labels.value_counts()

labels
0    53
1    47
Name: count, dtype: int64

In [61]:
dataset_dict["train"][:10]

{'text': ['Most sexually reproducing organisms are diploid, with paired chromosomes, but doubling of their chromosome number may occur due to errors in cytokinesis',
  'What causes the ice to break and crack loudly?',
  ' These occurred at 319 million years ago and 192 million years ago.',
  ' Switzerland is a relatively easy place to do business, currently ranking 20th of 189 countries in the Ease of Doing Business Index.',
  'The psychoacoustic masking codec was first proposed in 1979, apparently independently, by Manfred R.',
  'Where do dunlin Calidris alpina migrate to',
  'This means that a 5% reduction in operating voltage will more than double the life of the bulb, at the expense of reducing its light output by about 16%. When did light bulb manufacturers establish a cartel to limit bulb life',
  "As economic and demographic methods were applied to the study of history, the trend was increasingly to see the late Middle Ages as a period of recession and crisis. What wasn't the n

In [62]:
# load model tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)

# load model with binary classification head
id2label = {0: "statement", 1: "question"}
label2id = {"statement": 0, "question": 1}
model = AutoModelForSequenceClassification.from_pretrained(model_path,
                                                           num_labels=2,
                                                           id2label=id2label,
                                                           label2id=label2id,)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [63]:
# model.base_model

In [64]:
# [name for name, param in model.base_model.named_parameters()]

In [65]:
# freeze all base model parameters
for name, param in model.base_model.named_parameters():
    param.requires_grad = False

# unfreeze base model pooling layers
for name, param in model.base_model.named_parameters():
    if "pooler" in name:
        param.requires_grad = True

In [66]:
# define text preprocessing
def preprocess_function(examples):
    # return tokenized text with truncation
    return tokenizer(examples["text"], truncation=True)

# preprocess all datasets
tokenized_data = dataset_dict.map(preprocess_function, batched=True)


Map: 100%|██████████████████████████████████████████████████████████████████| 810/810 [00:00<00:00, 6858.14 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 3439.17 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████| 90/90 [00:00<00:00, 3681.58 examples/s]


In [67]:
tokenized_data["train"]

Dataset({
    features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 810
})

In [68]:
tokenized_data["train"]

Dataset({
    features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 810
})

In [69]:
# create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [70]:
# load metrics
accuracy = evaluate.load("accuracy")
auc_score = evaluate.load("roc_auc")

def compute_metrics(eval_pred):
    # get predictions
    predictions, labels = eval_pred
    
    # apply softmax to get probabilities
    probabilities = np.exp(predictions) / np.exp(predictions).sum(-1, 
                                                                 keepdims=True)
    # use probabilities of the positive class for ROC AUC
    positive_class_probs = probabilities[:, 1]
    # compute auc
    auc = np.round(auc_score.compute(prediction_scores=positive_class_probs, 
                                     references=labels)['roc_auc'],3)
    
    # predict most probable class
    predicted_classes = np.argmax(predictions, axis=1)
    # compute accuracy
    acc = np.round(accuracy.compute(predictions=predicted_classes, 
                                     references=labels)['accuracy'],3)
    
    return {"Accuracy": acc, "AUC": auc}

In [71]:
# hyperparameters

training_args = TrainingArguments(
    output_dir="question_detection",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [72]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# trainer.train()

  trainer = Trainer(


In [73]:
# trainer.save_model(model_export_path)

In [74]:
# apply model to validation dataset
predictions = trainer.predict(tokenized_data["validation"])

# Extract the logits and labels from the predictions object
logits = predictions.predictions
labels = predictions.label_ids

# Use your compute_metrics function
metrics = compute_metrics((logits, labels))
print(metrics)

# Raw - {'Accuracy': 0.378, 'AUC': 0.451}
# >> {'Accuracy': 0.955, 'AUC': 0.989}

{'Accuracy': 0.378, 'AUC': 0.451}


In [75]:
# Load Model
model2 = AutoModelForSequenceClassification.from_pretrained(model_export_path)
tokenizer2 = AutoTokenizer.from_pretrained(model_export_path)

trainer2 = Trainer(
    model=model2,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer2,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer2 = Trainer(


In [76]:
# apply model to validation dataset
predictions = trainer2.predict(tokenized_data["validation"])

# Extract the logits and labels from the predictions object
logits = predictions.predictions
labels = predictions.label_ids

# Use your compute_metrics function
metrics = compute_metrics((logits, labels))
print(metrics)

# >> {'Accuracy': 0.889, 'AUC': 0.946}

{'Accuracy': 0.933, 'AUC': 0.978}


In [183]:
def tokenizer_text_single(text):
    # return tokenized text with truncation
    return tokenizer(text, truncation=True)

In [192]:
sentence = "What developed from the mammalian odor pathways?"
sentence = "Could you please tell me the direction of the retaurant?"
sentence = "How do you know this?"
tokenized_question = tokenizer_text_single(sentence)
tokenized_question

{'input_ids': [101, 2129, 2079, 2017, 2113, 2023, 1029, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [193]:
res = trainer2.predict([tokenized_question])
res

PredictionOutput(predictions=array([[-2.417921 ,  2.8041248]], dtype=float32), label_ids=None, metrics={'test_model_preparation_time': 0.0073, 'test_runtime': 0.1147, 'test_samples_per_second': 8.718, 'test_steps_per_second': 8.718})

In [188]:
sentence = "Burke received a vote of thanks from the Commons for his services in the Hastings Trial and he immediately resigned his seat"
tokenized_question = tokenizer_text_single(sentence)
tokenized_question

{'input_ids': [101, 9894, 2363, 1037, 3789, 1997, 4283, 2013, 1996, 7674, 2005, 2010, 2578, 1999, 1996, 12296, 3979, 1998, 2002, 3202, 5295, 2010, 2835, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [189]:
res = trainer2.predict([tokenized_question])
res

PredictionOutput(predictions=array([[ 2.6326413, -2.712778 ]], dtype=float32), label_ids=None, metrics={'test_model_preparation_time': 0.0073, 'test_runtime': 0.1202, 'test_samples_per_second': 8.317, 'test_steps_per_second': 8.317})

In [78]:
############
#Testing Time Test

In [79]:
merged_df_test = merged_df.sample(frac=1).reset_index(drop=True)
len(merged_df_test)

156958

In [83]:
merged_df_test = merged_df_test.iloc[:10000]

In [84]:
merged_df_test.head()

Unnamed: 0,text,labels
0,What was Darrell Davis' nickname?,1
1,"Even in the late 17th century, after explorer...",0
2,What year did Austria outlaw the death penalty,1
3,Under such influential United States founders...,0
4,What is thought to be the first fictionalized ...,1


In [93]:
def tokenizer_text_df(text):
    # return tokenized text with truncation
    ebmedding = tokenizer(text, truncation=True)
    return ebmedding
    # print(x)

In [94]:
tokenized_data = merged_df_test["text"].apply(tokenizer_text_df)
# merged_df_test_embeddings = np.asarray(sequence.pad_sequences(tokenized_data, maxlen=embedding_dimension, padding='pre'))

In [95]:
tokenized_data

0       [input_ids, token_type_ids, attention_mask]
1       [input_ids, token_type_ids, attention_mask]
2       [input_ids, token_type_ids, attention_mask]
3       [input_ids, token_type_ids, attention_mask]
4       [input_ids, token_type_ids, attention_mask]
                           ...                     
9995    [input_ids, token_type_ids, attention_mask]
9996    [input_ids, token_type_ids, attention_mask]
9997    [input_ids, token_type_ids, attention_mask]
9998    [input_ids, token_type_ids, attention_mask]
9999    [input_ids, token_type_ids, attention_mask]
Name: text, Length: 10000, dtype: object

NameError: name 'embedding_dimension' is not defined

In [None]:
# padded_encoding_test

In [None]:
# merged_df_test.head()

In [None]:
# padded_encoding_test_10k = padded_encoding_test[:10000]
# len(padded_encoding_test_10k)

In [None]:
# padded_encoding_test_10k

In [96]:
##### Time 
from time import time
start_time = time()
pred_data = trainer2.predict(tokenized_data)
end_time = time()

KeyboardInterrupt: 

In [None]:
time_diff = end_time - start_time
time_diff