In [41]:
from datasets import DatasetDict, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
import numpy as np
from transformers import DataCollatorWithPadding
import pandas as pd
from datasets import load_dataset

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
import pickle
from keras.models import load_model
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [3]:
# import tensorflow as tf
# tf.config.list_physical_devices('GPU')

In [4]:
# import torch
# device = "cuda" if torch.cuda.is_available() else "cpu"
# device

In [35]:
# dataset_dict = load_dataset("shawhin/phishing-site-classification")
dataset_file = "C:\\Users\\Spandan\\Downloads\\Compressed\\Sentence Types - Question, Command and Statement\\Sentence Types - Question, Command and Statement.csv"
model_export_path = "D:\PROJECTS\TensorFlow Model Exports\LSTM Complex Question Detection\lstm_model_complex_768_pre_padding.h5"
lr = 2e-4
batch_size = 8
num_epochs = 5
dataset_size = 1000

model_path = "google-bert/bert-base-uncased"

In [7]:
df_raw = pd.read_csv(dataset_file)

In [8]:
df_raw.columns = ["text", "labels"]

In [9]:
df_raw.labels.unique()

array(['command', 'statement', 'question'], dtype=object)

In [10]:
df_raw.labels.value_counts()

labels
question     130655
statement     78479
command         932
Name: count, dtype: int64

In [11]:
df = df_raw.loc[df_raw["labels"] != "command"]

In [12]:
len(df)

209134

In [13]:
df.labels.value_counts()

labels
question     130655
statement     78479
Name: count, dtype: int64

In [14]:
df["labels"] = df["labels"].map(lambda typ: 0 if typ=="statement" else 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["labels"] = df["labels"].map(lambda typ: 0 if typ=="statement" else 1)


In [15]:
df.head()

Unnamed: 0,text,labels
1,it's from Birmingham to em London Euston please,0
2,the 8th of October,0
3,i'd like to leave on the 7:33 train,0
4,there's the 7:33 from Birmingham New Street,0
5,i'm just going to check to see what's your che...,0


In [16]:
question_df = df.loc[df["labels"] == 1]
statement_df = df.loc[df["labels"] == 0]

In [17]:
min_size = min(len(question_df), len(statement_df))
min_size

78479

In [18]:
question_df = question_df.iloc[:min_size]
statement_df = statement_df.iloc[:min_size]

In [19]:
print(len(question_df))
print(len(statement_df))

78479
78479


In [20]:
merged_df = pd.concat([question_df, statement_df], axis = 0)
merged_df.head()

Unnamed: 0,text,labels
42,is that Birmingham New Street,1
43,do you hold a current debit or credit card,1
44,do you have a rail card,1
45,would you like smoking or non-smoking,1
46,and do you have any seat preference,1


In [21]:
print(question_df.shape)
print(merged_df.shape)

(78479, 2)
(156958, 2)


In [22]:
merged_df = merged_df.sample(frac=1).reset_index(drop=True)

In [23]:
merged_df.labels.value_counts()

labels
0    78479
1    78479
Name: count, dtype: int64

In [24]:
df_small = merged_df.iloc[:dataset_size]
df_small = df_small.reset_index(drop=True)

In [25]:
df_small.labels.value_counts()

labels
0    526
1    474
Name: count, dtype: int64

In [26]:
len(df_small)

1000

In [27]:
df_small.head()

Unnamed: 0,text,labels
0,"Moreover, facing a German military advance, L...",0
1,The RICO Act is still used today for all orga...,0
2,"If all three parameters are used, they are spe...",1
3,The Xbox 360 launched with 14 games in North A...,1
4,"Like the other groups, examples of their musi...",0


In [28]:
dataset = Dataset.from_pandas(df_small).train_test_split(test_size=0.10)
# dataset_train_and_validation = dataset["train"].train_test_split(test_size=0.10)

In [29]:
dataset_dict = DatasetDict({"train": dataset["train"], "validation": dataset["test"]})

In [30]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 900
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 100
    })
})

In [31]:
dataset_dict["train"].to_pandas().labels.value_counts()

labels
0    478
1    422
Name: count, dtype: int64

In [32]:
# dataset_dict["test"].to_pandas().labels.value_counts()

In [33]:
dataset_dict["train"][:10]

{'text': ['What dictated that when a parent could not contract a legal marriage, offspring would follow the father',
  " Additionally, the media occasionally discusses the idea that Scotland's two biggest teams, Celtic and Rangers, should or will take part in the Premier League, but nothing has come of these discussions.",
  ' Other notable duck-producing countries in the Far East include Vietnam, Thailand, Malaysia, Myanmar, Indonesia and South Korea (12% in total).',
  'When did the British claim Queen Elizabeth Land in Antarctica?',
  ' After melting, homogenization and refining (removal of bubbles), the glass is formed.',
  ' They disagreed with one another concerning the presence of Christ and his body and blood in Holy Communion.',
  " These urban foxes are noticeably bolder than their country cousins, sharing the pavement with pedestrians and raising cubs in people's backyards.",
  'At about the same time, Charles Coffin, leading the Thomson-Houston Electric Company, acquired a 

In [36]:
# load model tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)
vocabulary_size = 30522
embedding_dimension = 768
# min_len = 3
# load model with binary classification head

In [42]:
def create_model():
    model = Sequential()
    model.add(Embedding(vocabulary_size, embedding_dimension, input_length=embedding_dimension))
    model.add(LSTM(128,activation='relu',return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(128,activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(32,activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [43]:
model = create_model()
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 768, 768)          23440896  
                                                                 
 lstm_1 (LSTM)               (None, 768, 128)          459264    
                                                                 
 dropout (Dropout)           (None, 768, 128)          0         
                                                                 
 lstm_2 (LSTM)               (None, 128)               131584    
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense (Dense)               (None, 32)                4128      
                                                                 
 dropout_2 (Dropout)         (None, 32)               

In [44]:
# define text preprocessing
def preprocess_function(examples):
    # return tokenized text with truncation
    return tokenizer(examples["text"], truncation=True)
    # return tokenizer(examples["text"], truncation=False)

#padded_dataset = sequence.pad_sequences(X_train, maxlen=min_len)
# preprocess all datasets
tokenized_data_raw = dataset_dict.map(preprocess_function, batched=True)


Map: 100%|██████████████████████████████████████████████████████████████████| 900/900 [00:00<00:00, 6682.01 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 5233.98 examples/s]


In [45]:
tokenized_data_raw

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 900
    })
    validation: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 100
    })
})

In [46]:
df_train = tokenized_data_raw["train"].to_pandas()
df_train.head()

Unnamed: 0,text,labels,input_ids,token_type_ids,attention_mask
0,What dictated that when a parent could not con...,1,"[101, 2054, 23826, 2008, 2043, 1037, 6687, 207...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,"Additionally, the media occasionally discusse...",0,"[101, 5678, 1010, 1996, 2865, 5681, 15841, 199...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,Other notable duck-producing countries in the...,0,"[101, 2060, 3862, 9457, 1011, 5155, 3032, 1999...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,When did the British claim Queen Elizabeth Lan...,1,"[101, 2043, 2106, 1996, 2329, 4366, 3035, 3870...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
4,"After melting, homogenization and refining (r...",0,"[101, 2044, 13721, 1010, 24004, 6914, 3989, 19...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [47]:
train_data = np.asarray(sequence.pad_sequences(df_train["input_ids"], maxlen=embedding_dimension, padding='pre')) # Post gives bad result
target_data = np.asarray(df_train["labels"])

In [290]:
# tokenized_data_padded = DatasetDict(tokenized_data_padded)
# tokenized_data_padded

In [291]:
# training_data = tokenized_data_padded["train"]
# testing_data = tokenized_data_padded["test"]

In [292]:
# len(training_data)

In [293]:
# testing_data[0][:20]

In [50]:
# load metrics
accuracy = evaluate.load("accuracy")
auc_score = evaluate.load("roc_auc")

def compute_metrics(eval_pred):
    # get predictions
    predictions, labels = eval_pred
    
    # apply softmax to get probabilities
    probabilities = np.exp(predictions) / np.exp(predictions).sum(-1, 
                                                                 keepdims=True)
    # use probabilities of the positive class for ROC AUC
    positive_class_probs = probabilities[:, 1]
    # compute auc
    auc = np.round(auc_score.compute(prediction_scores=positive_class_probs, 
                                     references=labels)['roc_auc'],3)
    
    # predict most probable class
    predicted_classes = np.argmax(predictions, axis=1)
    # compute accuracy
    acc = np.round(accuracy.compute(predictions=predicted_classes, 
                                     references=labels)['accuracy'],3)
    
    return {"Accuracy": acc, "AUC": auc}

In [51]:
X_train, X_test, y_train, y_test = train_test_split(train_data, target_data, test_size=0.2, random_state=2024)

In [52]:
X_train[:10]

array([[   0,    0,    0, ..., 3094, 1029,  102],
       [   0,    0,    0, ..., 2124, 2004,  102],
       [   0,    0,    0, ..., 2377, 1029,  102],
       ...,
       [   0,    0,    0, ..., 1000, 1007,  102],
       [   0,    0,    0, ..., 2069, 2051,  102],
       [   0,    0,    0, ..., 2449, 3908,  102]])

In [53]:
model.fit(X_train, y_train, epochs=num_epochs, batch_size=batch_size)

Epoch 1/5
Epoch 2/5

KeyboardInterrupt: 

In [240]:
# def save_model(model, model_location):
#     save_classifier = open(model_location, "wb")
#     pickle.dump(model, save_classifier)
#     save_classifier.close()

# def load_trained_model(model_location):
#     loaded_model = load_model(model_location)
#     return loaded_model

In [302]:
# save_model(model, model_export_path)
model.save_weights(model_export_path)

In [303]:
# model_loaded = load_trained_model(model_export_path)
loaded_model = create_model()
loaded_model.load_weights(model_export_path)
# with open(model_export_path, 'rb') as file:
#     model_loaded = pickle.load(file)

In [304]:
X_test[:10]

array([[    0,     0,     0, ...,  1997, 26315,   102],
       [    0,     0,     0, ...,  2942,  4812,   102],
       [    0,     0,     0, ..., 14000,  1012,   102],
       ...,
       [    0,     0,     0, ...,  1055,  2171,   102],
       [    0,     0,     0, ...,  2320,  1012,   102],
       [    0,     0,     0, ...,  6153,  8223,   102]])

In [244]:
# apply model to validation dataset
predictions = loaded_model.predict(X_test)

# Extract the logits and labels fr
pred_0_1 = np.array([[0 if pred<0.5 else 1] for pred in predictions])

print(len(pred_0_1))
print(len(y_test))
# Use your compute_metrics function
accuracy = accuracy_score(y_test, pred_0_1)
# metrics = compute_metrics((logits, labels))
print(accuracy)

# BERT {'Accuracy': 0.955, 'AUC': 0.989}
#DistilBERT {'Accuracy': 0.944, 'AUC': 0.994}

180
180
0.5777777777777777


In [245]:
confusion_matrix(pred_0_1, y_test)

array([[104,  76],
       [  0,   0]], dtype=int64)

In [124]:
def tokenizer_text_single(text):
    # return tokenized text with truncation
    ebmedding = [tokenizer(text, truncation=True)["input_ids"]]
    # print(x)
    return np.asarray(sequence.pad_sequences(ebmedding, maxlen=embedding_dimension, padding='post'))

In [132]:
sentence = "What developed from the mammalian odor pathways?"
sentence = "Could you please tell me the direction of the retaurant?"
sentence = "How do you know this?"
tokenized_question = tokenizer_text_single(sentence)
tokenized_question[0][:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [133]:
res = loaded_model.predict([tokenized_question])
res



array([[0.9999062]], dtype=float32)

In [134]:
sentence = "Burke received a vote of thanks from the Commons for his services in the Hastings Trial and he immediately resigned his seat"
tokenized_question = tokenizer_text_single(sentence)
tokenized_question[0][:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [136]:
res = loaded_model.predict([tokenized_question])
res



array([[0.00389195]], dtype=float32)

In [305]:
merged_df_test = merged_df.sample(frac=1).reset_index(drop=True)
len(merged_df_test)

156958

In [331]:
merged_df_test.head()

Unnamed: 0,text,labels,embeddings
0,"The following year, it chartered the Royal Ni...",0,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
1,What type of data compression is the converse ...,1,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
2,"The film was another huge box office smash, g...",0,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
3,"Raised in Chicago, West briefly attended art s...",1,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
4,it's a Super Saver return i think i need,0,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."


In [None]:
# def preprocess_function_df(examples):
#     # return tokenized text with truncation
#     return tokenizer(examples["text"], truncation=True)["input_ids"]

In [389]:
def tokenizer_text_df(text):
    # return tokenized text with truncation
    ebmedding = tokenizer(text, truncation=True)["input_ids"]
    return ebmedding
    # print(x)

In [390]:
tokenized_data = merged_df_test["text"].apply(tokenizer_text_df)
# merged_df_test_embeddings = np.asarray(sequence.pad_sequences(tokenized_data, maxlen=embedding_dimension, padding='pre'))

ValueError: `sequences` must be a list of iterables. Found non-iterable: 101

In [391]:
tokenized_data

0         [101, 1996, 2206, 2095, 1010, 2009, 12443, 199...
1         [101, 2054, 2828, 1997, 2951, 13379, 2003, 199...
2         [101, 1996, 2143, 2001, 2178, 4121, 3482, 2436...
3         [101, 2992, 1999, 3190, 1010, 2225, 4780, 3230...
4         [101, 2009, 1005, 1055, 1037, 3565, 3828, 2099...
                                ...                        
156953    [101, 2574, 1010, 2474, 6633, 19968, 2063, 199...
156954    [101, 1996, 2744, 1000, 7680, 27549, 1000, 200...
156955    [101, 2043, 2106, 7072, 2991, 1999, 6956, 1997...
156956    [101, 1999, 3655, 2060, 2084, 4199, 1010, 1996...
156957    [101, 5709, 22975, 2546, 1997, 3019, 3806, 100...
Name: text, Length: 156958, dtype: object

In [401]:
padded_encoding_test = sequence.pad_sequences(tokenized_data, maxlen=embedding_dimension, padding='pre')

In [402]:
padded_encoding_test

array([[    0,     0,     0, ...,  2751,  2666,   102],
       [    0,     0,     0, ..., 11259,  8358,   102],
       [    0,     0,     0, ...,  4969,  1012,   102],
       ...,
       [    0,     0,     0, ...,  3354,  2586,   102],
       [    0,     0,     0, ...,  8170,  1029,   102],
       [    0,     0,     0, ...,  3088,  1012,   102]])

In [403]:
# merged_df_test["embedding1d"] = padded_encoding_test

In [404]:
merged_df_test.head()

Unnamed: 0,text,labels,embeddings,embedding1d
0,"The following year, it chartered the Royal Ni...",0,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0
1,What type of data compression is the converse ...,1,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0
2,"The film was another huge box office smash, g...",0,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0
3,"Raised in Chicago, West briefly attended art s...",1,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0
4,it's a Super Saver return i think i need,0,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0


In [407]:
padded_encoding_test_10k = padded_encoding_test[:10000]

In [408]:
len(padded_encoding_test_10k)

10000

In [410]:
padded_encoding_test_10k

array([[    0,     0,     0, ...,  2751,  2666,   102],
       [    0,     0,     0, ..., 11259,  8358,   102],
       [    0,     0,     0, ...,  4969,  1012,   102],
       ...,
       [    0,     0,     0, ...,  1996,  7095,   102],
       [    0,     0,     0, ...,  1996,  2479,   102],
       [    0,     0,     0, ...,  1997,  3163,   102]])

In [323]:
# print(np.asarray(merged_df_test_10k["embeddings"][0]).astype('float32'))

In [409]:
# print(X_test)

In [411]:
##### Time 
from time import time
start_time = time()
loaded_model.predict(padded_encoding_test_10k)
end_time = time()



In [412]:
time_diff = end_time - start_time
time_diff

21.372161388397217