In [1]:
from datasets import DatasetDict, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
import numpy as np
from transformers import DataCollatorWithPadding
import pandas as pd
from datasets import load_dataset

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import GRU
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
import pickle
from keras.models import load_model
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

  from .autonotebook import tqdm as notebook_tqdm


In [48]:
import tensorflow as tf
physical_devices = tf.config.list_physical_devices('GPU')
physical_devices

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [3]:
# import torch
# device = "cuda" if torch.cuda.is_available() else "cpu"
# device

In [46]:
# dataset_dict = load_dataset("shawhin/phishing-site-classification")
dataset_file = "C:\\Users\\Spandan\\Downloads\\Compressed\\Sentence Types - Question, Command and Statement\\Sentence Types - Question, Command and Statement.csv"
model_export_path = "D:\PROJECTS\TensorFlow Model Exports\GRU Question Detection\\gru_model_768_pre_padding.h5"
lr = 2e-4
batch_size = 8
num_epochs = 5
dataset_size = 1000

model_path = "google-bert/bert-base-uncased"

In [5]:
df_raw = pd.read_csv(dataset_file)

In [6]:
df_raw.columns = ["text", "labels"]

In [7]:
df_raw.labels.unique()

array(['command', 'statement', 'question'], dtype=object)

In [8]:
df_raw.labels.value_counts()

labels
question     130655
statement     78479
command         932
Name: count, dtype: int64

In [9]:
df = df_raw.loc[df_raw["labels"] != "command"]

In [10]:
len(df)

209134

In [11]:
df.labels.value_counts()

labels
question     130655
statement     78479
Name: count, dtype: int64

In [12]:
df["labels"] = df["labels"].map(lambda typ: 0 if typ=="statement" else 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["labels"] = df["labels"].map(lambda typ: 0 if typ=="statement" else 1)


In [13]:
df.head()

Unnamed: 0,text,labels
1,it's from Birmingham to em London Euston please,0
2,the 8th of October,0
3,i'd like to leave on the 7:33 train,0
4,there's the 7:33 from Birmingham New Street,0
5,i'm just going to check to see what's your che...,0


In [14]:
question_df = df.loc[df["labels"] == 1]
statement_df = df.loc[df["labels"] == 0]

In [15]:
min_size = min(len(question_df), len(statement_df))
min_size

78479

In [16]:
question_df = question_df.iloc[:min_size]
statement_df = statement_df.iloc[:min_size]

In [17]:
print(len(question_df))
print(len(statement_df))

78479
78479


In [18]:
merged_df = pd.concat([question_df, statement_df], axis = 0)
merged_df.head()

Unnamed: 0,text,labels
42,is that Birmingham New Street,1
43,do you hold a current debit or credit card,1
44,do you have a rail card,1
45,would you like smoking or non-smoking,1
46,and do you have any seat preference,1


In [19]:
print(question_df.shape)
print(merged_df.shape)

(78479, 2)
(156958, 2)


In [20]:
merged_df = merged_df.sample(frac=1).reset_index(drop=True)

In [21]:
merged_df.labels.value_counts()

labels
0    78479
1    78479
Name: count, dtype: int64

In [22]:
df_small = merged_df.iloc[:dataset_size]
df_small = df_small.reset_index(drop=True)

In [23]:
df_small.labels.value_counts()

labels
1    524
0    476
Name: count, dtype: int64

In [24]:
len(df_small)

1000

In [25]:
df_small.head()

Unnamed: 0,text,labels
0,Similar organizations in other countries follo...,0
1,"The abbot and monks, in proximity to the royal...",1
2,The first Digimon anime introduced the Digimon...,0
3,Where did Jordanes live,1
4,To extend and consolidate the dynasty's contro...,0


In [26]:
dataset = Dataset.from_pandas(df_small).train_test_split(test_size=0.10)
# dataset_train_and_validation = dataset["train"].train_test_split(test_size=0.10)

In [27]:
dataset_dict = DatasetDict({"train": dataset["train"], "validation": dataset["test"]})

In [28]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 900
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 100
    })
})

In [29]:
dataset_dict["train"].to_pandas().labels.value_counts()

labels
1    469
0    431
Name: count, dtype: int64

In [30]:
# dataset_dict["test"].to_pandas().labels.value_counts()

In [31]:
dataset_dict["train"][:10]

{'text': [' Based on his observation of fossils in a geological stratum in a mountain hundreds of miles from the ocean, he deduced that the land was formed by erosion of the mountains and by deposition of silt.',
  'While the EIc took over all of India there were, two exception the first being Punjab, what was the Second',
  ' Seasons came back into effect and the poles got seasonally colder, but dinosaurs still inhabited this area like the Leaellynasaura which inhabited the polar forests year-round, and many dinosaurs migrated there during summer like Muttaburrasaurus',
  'The Canadian Armed Forces have a total reserve force of approximately 50,000 primary and supplementary that can be called upon in times of national emergency or threat. What troops does The reserve force consists of?',
  ' Penn, U.',
  " The Commonwealth Liberal Party was a fusion of the Free Trade Party and the Protectionist Party in 1909 by the second prime minister, Alfred Deakin, in response to Labor's growing e

In [34]:
# load model tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)
vocabulary_size = 30522
embedding_dimension = 768
# min_len = 3
# load model with binary classification head

In [35]:
def create_model():
    model = Sequential()
    model.add(Embedding(vocabulary_size, embedding_dimension, input_length=embedding_dimension))
    model.add(GRU(100)) #Can be same as embedding_dimension, but model will large and inefficient
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [37]:
model = create_model()
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 768, 768)          23440896  
                                                                 
 gru_1 (GRU)                 (None, 100)               261000    
                                                                 
 dense_1 (Dense)             (None, 1)                 101       
                                                                 
Total params: 23,701,997
Trainable params: 23,701,997
Non-trainable params: 0
_________________________________________________________________


In [38]:
# define text preprocessing
def preprocess_function(examples):
    # return tokenized text with truncation
    return tokenizer(examples["text"], truncation=True)
    # return tokenizer(examples["text"], truncation=False)

#padded_dataset = sequence.pad_sequences(X_train, maxlen=min_len)
# preprocess all datasets
tokenized_data_raw = dataset_dict.map(preprocess_function, batched=True)


Map: 100%|██████████████████████████████████████████████████████████████████| 900/900 [00:00<00:00, 5386.74 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 5618.70 examples/s]


In [39]:
tokenized_data_raw

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 900
    })
    validation: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 100
    })
})

In [40]:
df_train = tokenized_data_raw["train"].to_pandas()
df_train.head()

Unnamed: 0,text,labels,input_ids,token_type_ids,attention_mask
0,Based on his observation of fossils in a geol...,0,"[101, 2241, 2006, 2010, 8089, 1997, 11954, 199...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,While the EIc took over all of India there wer...,1,"[101, 2096, 1996, 1041, 2594, 2165, 2058, 2035...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,Seasons came back into effect and the poles g...,0,"[101, 3692, 2234, 2067, 2046, 3466, 1998, 1996...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,The Canadian Armed Forces have a total reserve...,1,"[101, 1996, 3010, 4273, 2749, 2031, 1037, 2561...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,"Penn, U.",0,"[101, 9502, 1010, 1057, 1012, 102]","[0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1]"


In [41]:
train_data = np.asarray(sequence.pad_sequences(df_train["input_ids"], maxlen=embedding_dimension, padding='pre')) # Post gives bad result
target_data = np.asarray(df_train["labels"])

In [42]:
# target_data

In [43]:
# load metrics
accuracy = evaluate.load("accuracy")
auc_score = evaluate.load("roc_auc")

def compute_metrics(eval_pred):
    # get predictions
    predictions, labels = eval_pred
    
    # apply softmax to get probabilities
    probabilities = np.exp(predictions) / np.exp(predictions).sum(-1, 
                                                                 keepdims=True)
    # use probabilities of the positive class for ROC AUC
    positive_class_probs = probabilities[:, 1]
    # compute auc
    auc = np.round(auc_score.compute(prediction_scores=positive_class_probs, 
                                     references=labels)['roc_auc'],3)
    
    # predict most probable class
    predicted_classes = np.argmax(predictions, axis=1)
    # compute accuracy
    acc = np.round(accuracy.compute(predictions=predicted_classes, 
                                     references=labels)['accuracy'],3)
    
    return {"Accuracy": acc, "AUC": auc}

In [44]:
X_train, X_test, y_train, y_test = train_test_split(train_data, target_data, test_size=0.2, random_state=2024)

In [45]:
X_train[:10]

array([[    0,     0,     0, ...,  1055,  5704,   102],
       [    0,     0,     0, ...,  6934,  1007,   102],
       [    0,     0,     0, ...,  3007,  2005,   102],
       ...,
       [    0,     0,     0, ..., 10734,  2597,   102],
       [    0,     0,     0, ...,  2025,  2031,   102],
       [    0,     0,     0, ...,  1997,  4868,   102]])

In [49]:
model.fit(X_train, y_train, epochs=num_epochs, batch_size=batch_size)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1baf3d9df90>

In [50]:
# def save_model(model, model_location):
#     save_classifier = open(model_location, "wb")
#     pickle.dump(model, save_classifier)
#     save_classifier.close()

# def load_trained_model(model_location):
#     loaded_model = load_model(model_location)
#     return loaded_model

In [51]:
# save_model(model, model_export_path)
model.save_weights(model_export_path)

In [52]:
# model_loaded = load_trained_model(model_export_path)
loaded_model = create_model()
loaded_model.load_weights(model_export_path)
# with open(model_export_path, 'rb') as file:
#     model_loaded = pickle.load(file)

In [53]:
X_test[:10]

array([[    0,     0,     0, ...,  1997,  6282,   102],
       [    0,     0,     0, ...,  2000,  2605,   102],
       [    0,     0,     0, ...,  2959,  3648,   102],
       ...,
       [    0,     0,     0, ...,  1997, 13086,   102],
       [    0,     0,     0, ...,  2161,  2698,   102],
       [    0,     0,     0, ...,  2798,  2146,   102]])

In [54]:
# apply model to validation dataset
predictions = loaded_model.predict(X_test)

# Extract the logits and labels fr
pred_0_1 = np.array([[0 if pred<0.5 else 1] for pred in predictions])

print(len(pred_0_1))
print(len(y_test))
# Use your compute_metrics function
accuracy = accuracy_score(y_test, pred_0_1)
# metrics = compute_metrics((logits, labels))
print(accuracy)

# BERT {'Accuracy': 0.955, 'AUC': 0.989}
#DistilBERT {'Accuracy': 0.944, 'AUC': 0.994}

180
180
0.9666666666666667


In [65]:
pred_0_1[:10]

array([[0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [0],
       [1],
       [1],
       [0]])

In [66]:
predictions[:10]

array([[1.4822797e-03],
       [4.0233623e-02],
       [2.0423983e-01],
       [2.6908042e-03],
       [9.9998391e-01],
       [9.9999404e-01],
       [1.3738385e-04],
       [9.9838519e-01],
       [9.9968362e-01],
       [1.1137648e-03]], dtype=float32)

In [55]:
confusion_matrix(pred_0_1, y_test)

array([[80,  5],
       [ 1, 94]], dtype=int64)

In [71]:
def tokenizer_text_single(text):
    # return tokenized text with truncation
    ebmedding = [tokenizer(text, truncation=True)["input_ids"]]
    # print(x)
    return np.asarray(sequence.pad_sequences(ebmedding, maxlen=embedding_dimension, padding='pre'))

In [72]:
sentence = "What developed from the mammalian odor pathways?"
sentence = "Could you please tell me the direction of the retaurant?"
sentence = "How do you know this?"
tokenized_question = tokenizer_text_single(sentence)
tokenized_question[0][:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [73]:
res = loaded_model.predict([tokenized_question])
res



array([[0.99979347]], dtype=float32)

In [74]:
sentence = "Burke received a vote of thanks from the Commons for his services in the Hastings Trial and he immediately resigned his seat"
tokenized_question = tokenizer_text_single(sentence)
tokenized_question[0][:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [75]:
res = loaded_model.predict([tokenized_question])
res



array([[0.00368622]], dtype=float32)

In [76]:
merged_df_test = merged_df.sample(frac=1).reset_index(drop=True)
len(merged_df_test)

156958

In [77]:
merged_df_test.head()

Unnamed: 0,text,labels
0,What is Avicenna's name not needed for?,1
1,Who makes up the cast of the annual play based...,1
2,"A proposal in 2007, estimated the cost of bui...",0
3,Where does ice start accululating in a glacier?,1
4,Christianity came to Tuvalu in 1861 when Eleka...,0


In [79]:
merged_df_test = merged_df_test.iloc[:10000]

In [80]:
# def preprocess_function_df(examples):
#     # return tokenized text with truncation
#     return tokenizer(examples["text"], truncation=True)["input_ids"]

In [81]:
def tokenizer_text_df(text):
    # return tokenized text with truncation
    ebmedding = tokenizer(text, truncation=True)["input_ids"]
    return ebmedding
    # print(x)

In [82]:
tokenized_data = merged_df_test["text"].apply(tokenizer_text_df)
# merged_df_test_embeddings = np.asarray(sequence.pad_sequences(tokenized_data, maxlen=embedding_dimension, padding='pre'))

In [83]:
tokenized_data

0       [101, 2054, 2003, 20704, 6610, 9516, 1005, 105...
1       [101, 2040, 3084, 2039, 1996, 3459, 1997, 1996...
2       [101, 1037, 6378, 1999, 2289, 1010, 4358, 1996...
3       [101, 2073, 2515, 3256, 2707, 16222, 20391, 22...
4       [101, 7988, 2234, 2000, 10722, 10175, 2226, 19...
                              ...                        
9995    [101, 2698, 18804, 9153, 3468, 28846, 2015, 20...
9996    [101, 2054, 17976, 3684, 1998, 18559, 1996, 10...
9997    [101, 2054, 4127, 1997, 12261, 2106, 2834, 249...
9998           [101, 7271, 12559, 14083, 2075, 1057, 102]
9999    [101, 2006, 1015, 2244, 1010, 2762, 10836, 373...
Name: text, Length: 10000, dtype: object

In [84]:
padded_encoding_test = sequence.pad_sequences(tokenized_data, maxlen=embedding_dimension, padding='pre')

In [85]:
padded_encoding_test

array([[    0,     0,     0, ...,  2005,  1029,   102],
       [    0,     0,     0, ...,  3077,  1029,   102],
       [    0,     0,     0, ..., 28182,  1012,   102],
       ...,
       [    0,     0,     0, ...,  2037,  2606,   102],
       [    0,     0,     0, ...,  2075,  1057,   102],
       [    0,     0,     0, ..., 10574,  3655,   102]])

In [86]:
# merged_df_test["embedding1d"] = padded_encoding_test

In [87]:
merged_df_test.head()

Unnamed: 0,text,labels
0,What is Avicenna's name not needed for?,1
1,Who makes up the cast of the annual play based...,1
2,"A proposal in 2007, estimated the cost of bui...",0
3,Where does ice start accululating in a glacier?,1
4,Christianity came to Tuvalu in 1861 when Eleka...,0


In [88]:
padded_encoding_test_10k = padded_encoding_test[:10000]

In [89]:
len(padded_encoding_test_10k)

10000

In [90]:
padded_encoding_test_10k

array([[    0,     0,     0, ...,  2005,  1029,   102],
       [    0,     0,     0, ...,  3077,  1029,   102],
       [    0,     0,     0, ..., 28182,  1012,   102],
       ...,
       [    0,     0,     0, ...,  2037,  2606,   102],
       [    0,     0,     0, ...,  2075,  1057,   102],
       [    0,     0,     0, ..., 10574,  3655,   102]])

In [91]:
# print(np.asarray(merged_df_test_10k["embeddings"][0]).astype('float32'))

In [92]:
# print(X_test)

In [93]:
##### Time 
from time import time
start_time = time()
loaded_model.predict(padded_encoding_test_10k)
end_time = time()



In [94]:
time_diff = end_time - start_time
time_diff

21.008394956588745