In [340]:
from datasets import DatasetDict, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
import numpy as np
from transformers import DataCollatorWithPadding
import pandas as pd
from datasets import load_dataset

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
import pickle
from keras.models import load_model
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [341]:
# import tensorflow as tf
# tf.config.list_physical_devices('GPU')

In [342]:
# import torch
# device = "cuda" if torch.cuda.is_available() else "cpu"
# device

In [343]:
# dataset_dict = load_dataset("shawhin/phishing-site-classification")
dataset_file = "C:\\Users\\Spandan\\Downloads\\Compressed\\Sentence Types - Question, Command and Statement\\Sentence Types - Question, Command and Statement.csv"
model_export_path = "D:\PROJECTS\TensorFlow Model Exports\LSTM Simple Question Detection\\lstm_model_768_pre_padding.h5"
lr = 2e-4
batch_size = 8
num_epochs = 5
dataset_size = 1000

model_path = "google-bert/bert-base-uncased"

In [344]:
df_raw = pd.read_csv(dataset_file)

In [345]:
df_raw.columns = ["text", "labels"]

In [346]:
df_raw.labels.unique()

array(['command', 'statement', 'question'], dtype=object)

In [347]:
df_raw.labels.value_counts()

labels
question     130655
statement     78479
command         932
Name: count, dtype: int64

In [348]:
df = df_raw.loc[df_raw["labels"] != "command"]

In [349]:
len(df)

209134

In [350]:
df.labels.value_counts()

labels
question     130655
statement     78479
Name: count, dtype: int64

In [351]:
df["labels"] = df["labels"].map(lambda typ: 0 if typ=="statement" else 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["labels"] = df["labels"].map(lambda typ: 0 if typ=="statement" else 1)


In [352]:
df.head()

Unnamed: 0,text,labels
1,it's from Birmingham to em London Euston please,0
2,the 8th of October,0
3,i'd like to leave on the 7:33 train,0
4,there's the 7:33 from Birmingham New Street,0
5,i'm just going to check to see what's your che...,0


In [353]:
question_df = df.loc[df["labels"] == 1]
statement_df = df.loc[df["labels"] == 0]

In [354]:
min_size = min(len(question_df), len(statement_df))
min_size

78479

In [355]:
question_df = question_df.iloc[:min_size]
statement_df = statement_df.iloc[:min_size]

In [356]:
print(len(question_df))
print(len(statement_df))

78479
78479


In [357]:
merged_df = pd.concat([question_df, statement_df], axis = 0)
merged_df.head()

Unnamed: 0,text,labels
42,is that Birmingham New Street,1
43,do you hold a current debit or credit card,1
44,do you have a rail card,1
45,would you like smoking or non-smoking,1
46,and do you have any seat preference,1


In [358]:
print(question_df.shape)
print(merged_df.shape)

(78479, 2)
(156958, 2)


In [359]:
merged_df = merged_df.sample(frac=1).reset_index(drop=True)

In [360]:
merged_df.labels.value_counts()

labels
0    78479
1    78479
Name: count, dtype: int64

In [361]:
df_small = merged_df.iloc[:dataset_size]
df_small = df_small.reset_index(drop=True)

In [362]:
df_small.labels.value_counts()

labels
0    518
1    482
Name: count, dtype: int64

In [363]:
len(df_small)

1000

In [364]:
df_small.head()

Unnamed: 0,text,labels
0,"Thus, the popular association of infrared rad...",0
1,For Kanye to make an album called College Dro...,0
2,More advanced memory management chips (MMC) s...,0
3,Pilot Mountain and South Mountains are located...,1
4,"Most records at NARA are in the public domain,...",0


In [365]:
dataset = Dataset.from_pandas(df_small).train_test_split(test_size=0.10)
# dataset_train_and_validation = dataset["train"].train_test_split(test_size=0.10)

In [366]:
dataset_dict = DatasetDict({"train": dataset["train"], "validation": dataset["test"]})

In [367]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 900
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 100
    })
})

In [368]:
dataset_dict["train"].to_pandas().labels.value_counts()

labels
0    470
1    430
Name: count, dtype: int64

In [369]:
# dataset_dict["test"].to_pandas().labels.value_counts()

In [370]:
dataset_dict["train"][:10]

{'text': [' The sexual haploid phase of embryophytes, known as the gametophyte, nurtures the developing diploid embryo sporophyte within its tissues for at least part of its life, even in the seed plants, where the gametophyte itself is nurtured by its parent sporophyte',
  'Why were flights delayed and diverted',
  'Which Late Middle Age English kings kept their own troupes of professional actors?',
  ' in 2014, a 260% increase since 2009.',
  " In 1169, as the Kievan Rus' state was full of internal conflict, Andrei Bogolyubsky of Vladimir sacked the city of Kiev",
  ' The city subsidises an organisation for amateur education in arts aimed at all inhabitants (Utrechts Centrum voor de Kunsten), as does the university for its staff and students.',
  "Jackson's TAV Music merged with whom?",
  'In Indian philosophy, Yoga is among other things, the name of one of the six āstika philosophical schools. What concept does yoga accept the differentiates it from Samkhya',
  "The British high-def

In [371]:
# load model tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)
vocabulary_size = 30522
embedding_dimension = 768
# min_len = 3
# load model with binary classification head

In [372]:
def create_model():
    model = Sequential()
    model.add(Embedding(vocabulary_size, embedding_dimension, input_length=embedding_dimension))
    model.add(LSTM(100)) #Can be same as embedding_dimension, but model will large and inefficient
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [373]:
model = create_model()
model.summary()

In [374]:
# model.base_model

In [375]:
# [name for name, param in model.base_model.named_parameters()]

In [376]:
# define text preprocessing
def preprocess_function(examples):
    # return tokenized text with truncation
    return tokenizer(examples["text"], truncation=True)
    # return tokenizer(examples["text"], truncation=False)

#padded_dataset = sequence.pad_sequences(X_train, maxlen=min_len)
# preprocess all datasets
tokenized_data_raw = dataset_dict.map(preprocess_function, batched=True)

Map: 100%|█████████████████████████████████████████████████████████████████| 900/900 [00:00<00:00, 10614.79 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 4492.23 examples/s]


In [377]:
tokenized_data_raw

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 900
    })
    validation: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 100
    })
})

In [378]:
df_train = tokenized_data_raw["train"].to_pandas()
df_train.head()

Unnamed: 0,text,labels,input_ids,attention_mask
0,"The sexual haploid phase of embryophytes, kno...",0,"[101, 1996, 4424, 5292, 24759, 9314, 4403, 199...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,Why were flights delayed and diverted,1,"[101, 2339, 2020, 7599, 8394, 1998, 18356, 102]","[1, 1, 1, 1, 1, 1, 1, 1]"
2,Which Late Middle Age English kings kept their...,1,"[101, 2029, 2397, 2690, 2287, 2394, 5465, 2921...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,"in 2014, a 260% increase since 2009.",0,"[101, 1999, 2297, 1010, 1037, 13539, 1003, 362...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
4,"In 1169, as the Kievan Rus' state was full of...",0,"[101, 1999, 12904, 2683, 1010, 2004, 1996, 121...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [379]:
train_data = np.asarray(sequence.pad_sequences(df_train["input_ids"], maxlen=embedding_dimension, padding='pre')) # Post gives bad result
target_data = np.asarray(df_train["labels"])

In [380]:
# target_data

In [381]:
# def pad_tokens(collection__):
#     # return [sequence.pad_sequences([x["input_ids"]], maxlen=embedding_dimension, padding='post')[0] for x in collection__]
#     return [sequence.pad_sequences([x["input_ids"]], maxlen=embedding_dimension, padding='post')[0] for x in collection__]
# def pad_collection(coll):
#     padded_tokens = {}
#     for key, v in tokenized_data.items():
#         padded_tokens[key] = []
#         # print(key)
#         mapped = pad_tokens(v)
#         for m in mapped:
#             # print(m)
#             padded_tokens[key].append(m)
#     return padded_tokens
# padded = pad_tokens(tokenized_data_raw["train"][0]["input_ids"])
# padded = pad_tokens(tokenized_data_raw["train"])
# padded
    # for x in v:
    #     print(x["input_ids"])
    #     print(sequence.pad_sequences([x["input_ids"]], maxlen=embedding_dimension, padding='post'))
    # print(sequence.pad_sequences([v[0]["input_ids"]], maxlen=embedding_dimension, padding='post'))
# tokenized_data_padded = tokenized_data.map(lambda ds: sequence.pad_sequences(ds["input_ids"], maxlen=embedding_dimension, padding='post'))
# tokenized_data_padded = tokenized_data.map(pad_tokens)
# tokenized_data_padded
# padded_tokens

In [382]:
# tokenized_data_padded = pad_collection(tokenized_data_raw)


In [289]:
# tokenized_data_raw["test"]["labels"]

In [290]:
# tokenized_data_padded = DatasetDict(tokenized_data_padded)
# tokenized_data_padded

In [291]:
# training_data = tokenized_data_padded["train"]
# testing_data = tokenized_data_padded["test"]

In [292]:
# len(training_data)

In [293]:
# testing_data[0][:20]

In [294]:
# load metrics
accuracy = evaluate.load("accuracy")
auc_score = evaluate.load("roc_auc")

def compute_metrics(eval_pred):
    # get predictions
    predictions, labels = eval_pred
    
    # apply softmax to get probabilities
    probabilities = np.exp(predictions) / np.exp(predictions).sum(-1, 
                                                                 keepdims=True)
    # use probabilities of the positive class for ROC AUC
    positive_class_probs = probabilities[:, 1]
    # compute auc
    auc = np.round(auc_score.compute(prediction_scores=positive_class_probs, 
                                     references=labels)['roc_auc'],3)
    
    # predict most probable class
    predicted_classes = np.argmax(predictions, axis=1)
    # compute accuracy
    acc = np.round(accuracy.compute(predictions=predicted_classes, 
                                     references=labels)['accuracy'],3)
    
    return {"Accuracy": acc, "AUC": auc}

In [295]:
X_train, X_test, y_train, y_test = train_test_split(train_data, target_data, test_size=0.2, random_state=2024)

In [296]:
X_train[:10]

array([[    0,     0,     0, ...,  6687,  1012,   102],
       [    0,     0,     0, ...,  8294,  1029,   102],
       [    0,     0,     0, ...,  1997,  4331,   102],
       ...,
       [    0,     0,     0, ...,  1998, 12546,   102],
       [    0,     0,     0, ...,  3117,  1029,   102],
       [    0,     0,     0, ...,  2217,  1012,   102]])

In [297]:
model.fit(X_train, y_train, epochs=num_epochs, batch_size=batch_size)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x23ca428a9e0>

In [240]:
# def save_model(model, model_location):
#     save_classifier = open(model_location, "wb")
#     pickle.dump(model, save_classifier)
#     save_classifier.close()

# def load_trained_model(model_location):
#     loaded_model = load_model(model_location)
#     return loaded_model

In [302]:
# save_model(model, model_export_path)
model.save_weights(model_export_path)

In [303]:
# model_loaded = load_trained_model(model_export_path)
loaded_model = create_model()
loaded_model.load_weights(model_export_path)
# with open(model_export_path, 'rb') as file:
#     model_loaded = pickle.load(file)

In [304]:
X_test[:10]

array([[    0,     0,     0, ...,  1997, 26315,   102],
       [    0,     0,     0, ...,  2942,  4812,   102],
       [    0,     0,     0, ..., 14000,  1012,   102],
       ...,
       [    0,     0,     0, ...,  1055,  2171,   102],
       [    0,     0,     0, ...,  2320,  1012,   102],
       [    0,     0,     0, ...,  6153,  8223,   102]])

In [244]:
# apply model to validation dataset
predictions = loaded_model.predict(X_test)

# Extract the logits and labels fr
pred_0_1 = np.array([[0 if pred<0.5 else 1] for pred in predictions])

print(len(pred_0_1))
print(len(y_test))
# Use your compute_metrics function
accuracy = accuracy_score(y_test, pred_0_1)
# metrics = compute_metrics((logits, labels))
print(accuracy)

# BERT {'Accuracy': 0.955, 'AUC': 0.989}
#DistilBERT {'Accuracy': 0.944, 'AUC': 0.994}

180
180
0.5777777777777777


In [245]:
confusion_matrix(pred_0_1, y_test)

array([[104,  76],
       [  0,   0]], dtype=int64)

In [124]:
def tokenizer_text_single(text):
    # return tokenized text with truncation
    ebmedding = [tokenizer(text, truncation=True)["input_ids"]]
    # print(x)
    return np.asarray(sequence.pad_sequences(ebmedding, maxlen=embedding_dimension, padding='post'))

In [132]:
sentence = "What developed from the mammalian odor pathways?"
sentence = "Could you please tell me the direction of the retaurant?"
sentence = "How do you know this?"
tokenized_question = tokenizer_text_single(sentence)
tokenized_question[0][:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [133]:
res = loaded_model.predict([tokenized_question])
res



array([[0.9999062]], dtype=float32)

In [134]:
sentence = "Burke received a vote of thanks from the Commons for his services in the Hastings Trial and he immediately resigned his seat"
tokenized_question = tokenizer_text_single(sentence)
tokenized_question[0][:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [136]:
res = loaded_model.predict([tokenized_question])
res



array([[0.00389195]], dtype=float32)

In [305]:
merged_df_test = merged_df.sample(frac=1).reset_index(drop=True)
len(merged_df_test)

156958

In [331]:
merged_df_test.head()

Unnamed: 0,text,labels,embeddings
0,"The following year, it chartered the Royal Ni...",0,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
1,What type of data compression is the converse ...,1,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
2,"The film was another huge box office smash, g...",0,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
3,"Raised in Chicago, West briefly attended art s...",1,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
4,it's a Super Saver return i think i need,0,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."


In [None]:
# def preprocess_function_df(examples):
#     # return tokenized text with truncation
#     return tokenizer(examples["text"], truncation=True)["input_ids"]

In [389]:
def tokenizer_text_df(text):
    # return tokenized text with truncation
    ebmedding = tokenizer(text, truncation=True)["input_ids"]
    return ebmedding
    # print(x)

In [390]:
tokenized_data = merged_df_test["text"].apply(tokenizer_text_df)
# merged_df_test_embeddings = np.asarray(sequence.pad_sequences(tokenized_data, maxlen=embedding_dimension, padding='pre'))

ValueError: `sequences` must be a list of iterables. Found non-iterable: 101

In [391]:
tokenized_data

0         [101, 1996, 2206, 2095, 1010, 2009, 12443, 199...
1         [101, 2054, 2828, 1997, 2951, 13379, 2003, 199...
2         [101, 1996, 2143, 2001, 2178, 4121, 3482, 2436...
3         [101, 2992, 1999, 3190, 1010, 2225, 4780, 3230...
4         [101, 2009, 1005, 1055, 1037, 3565, 3828, 2099...
                                ...                        
156953    [101, 2574, 1010, 2474, 6633, 19968, 2063, 199...
156954    [101, 1996, 2744, 1000, 7680, 27549, 1000, 200...
156955    [101, 2043, 2106, 7072, 2991, 1999, 6956, 1997...
156956    [101, 1999, 3655, 2060, 2084, 4199, 1010, 1996...
156957    [101, 5709, 22975, 2546, 1997, 3019, 3806, 100...
Name: text, Length: 156958, dtype: object

In [401]:
padded_encoding_test = sequence.pad_sequences(tokenized_data, maxlen=embedding_dimension, padding='pre')

In [402]:
padded_encoding_test

array([[    0,     0,     0, ...,  2751,  2666,   102],
       [    0,     0,     0, ..., 11259,  8358,   102],
       [    0,     0,     0, ...,  4969,  1012,   102],
       ...,
       [    0,     0,     0, ...,  3354,  2586,   102],
       [    0,     0,     0, ...,  8170,  1029,   102],
       [    0,     0,     0, ...,  3088,  1012,   102]])

In [403]:
# merged_df_test["embedding1d"] = padded_encoding_test

In [404]:
merged_df_test.head()

Unnamed: 0,text,labels,embeddings,embedding1d
0,"The following year, it chartered the Royal Ni...",0,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0
1,What type of data compression is the converse ...,1,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0
2,"The film was another huge box office smash, g...",0,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0
3,"Raised in Chicago, West briefly attended art s...",1,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0
4,it's a Super Saver return i think i need,0,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0


In [407]:
padded_encoding_test_10k = padded_encoding_test[:10000]

In [408]:
len(padded_encoding_test_10k)

10000

In [410]:
padded_encoding_test_10k

array([[    0,     0,     0, ...,  2751,  2666,   102],
       [    0,     0,     0, ..., 11259,  8358,   102],
       [    0,     0,     0, ...,  4969,  1012,   102],
       ...,
       [    0,     0,     0, ...,  1996,  7095,   102],
       [    0,     0,     0, ...,  1996,  2479,   102],
       [    0,     0,     0, ...,  1997,  3163,   102]])

In [323]:
# print(np.asarray(merged_df_test_10k["embeddings"][0]).astype('float32'))

In [409]:
# print(X_test)

In [411]:
##### Time 
from time import time
start_time = time()
loaded_model.predict(padded_encoding_test_10k)
end_time = time()



In [412]:
time_diff = end_time - start_time
time_diff

21.372161388397217