In [97]:
from datasets import DatasetDict, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
import numpy as np
from transformers import DataCollatorWithPadding
import pandas as pd
from datasets import load_dataset

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
import pickle

In [3]:
# import tensorflow as tf
# tf.config.list_physical_devices('GPU')

In [4]:
# import torch
# device = "cuda" if torch.cuda.is_available() else "cpu"
# device

In [95]:
# dataset_dict = load_dataset("shawhin/phishing-site-classification")
dataset_file = "C:\\Users\\Spandan\\Downloads\\Compressed\\Sentence Types - Question, Command and Statement\\Sentence Types - Question, Command and Statement.csv"
model_export_path = "D:\PROJECTS\TensorFlow Model Exports\LSTM Simple Question Detection\\lstm_model_768.h5"
lr = 2e-4
batch_size = 8
num_epochs = 5
dataset_size = 1000

# define pre-trained model path
model_path = "distilbert/distilbert-base-uncased"

In [6]:
df_raw = pd.read_csv(dataset_file)

In [7]:
df_raw.columns = ["text", "labels"]

In [8]:
df_raw.labels.unique()

array(['command', 'statement', 'question'], dtype=object)

In [9]:
df_raw.labels.value_counts()

labels
question     130655
statement     78479
command         932
Name: count, dtype: int64

In [10]:
df = df_raw.loc[df_raw["labels"] != "command"]

In [11]:
len(df)

209134

In [12]:
df.labels.value_counts()

labels
question     130655
statement     78479
Name: count, dtype: int64

In [13]:
df["labels"] = df["labels"].map(lambda typ: 0 if typ=="statement" else 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["labels"] = df["labels"].map(lambda typ: 0 if typ=="statement" else 1)


In [14]:
df.head()

Unnamed: 0,text,labels
1,it's from Birmingham to em London Euston please,0
2,the 8th of October,0
3,i'd like to leave on the 7:33 train,0
4,there's the 7:33 from Birmingham New Street,0
5,i'm just going to check to see what's your che...,0


In [15]:
question_df = df.loc[df["labels"] == 1]
statement_df = df.loc[df["labels"] == 0]

In [16]:
min_size = min(len(question_df), len(statement_df))
min_size

78479

In [17]:
question_df = question_df.iloc[:min_size]
statement_df = statement_df.iloc[:min_size]

In [18]:
print(len(question_df))
print(len(statement_df))

78479
78479


In [19]:
merged_df = pd.concat([question_df, statement_df], axis = 0)
merged_df.head()

Unnamed: 0,text,labels
42,is that Birmingham New Street,1
43,do you hold a current debit or credit card,1
44,do you have a rail card,1
45,would you like smoking or non-smoking,1
46,and do you have any seat preference,1


In [20]:
print(question_df.shape)
print(merged_df.shape)

(78479, 2)
(156958, 2)


In [21]:
merged_df = merged_df.sample(frac=1).reset_index(drop=True)

In [22]:
merged_df.labels.value_counts()

labels
0    78479
1    78479
Name: count, dtype: int64

In [23]:
df_small = merged_df.iloc[:dataset_size]
df_small = df_small.reset_index(drop=True)

In [24]:
df_small.labels.value_counts()

labels
1    529
0    471
Name: count, dtype: int64

In [25]:
len(df_small)

1000

In [26]:
df_small.head()

Unnamed: 0,text,labels
0,", or WUSTL) is a private research university l...",0
1,What does SDTV stand for?,1
2,What type of moraines are formed at the begini...,1
3,There was a constant power struggle between th...,1
4,Two groups of invertebrates have notably compl...,0


In [27]:
dataset = Dataset.from_pandas(df_small).train_test_split(test_size=0.10)
# dataset_train_and_validation = dataset["train"].train_test_split(test_size=0.10)

In [52]:
dataset_dict = DatasetDict({"train": dataset["train"], "validation": dataset["test"]})

In [53]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 900
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 100
    })
})

In [63]:
dataset_dict["train"].to_pandas().labels.value_counts()

labels
1    480
0    420
Name: count, dtype: int64

In [64]:
# dataset_dict["test"].to_pandas().labels.value_counts()

In [65]:
dataset_dict["train"][:10]

{'text': ["Marvel held its own comic book convention, Marvelcon '75, in spring 1975, and promised a Marvelcon '76. What marvel character was announced at the first Marvelcon",
  ' In addition, the club reached the final of the 1999–2000 UEFA Cup (losing on penalties to Galatasaray), were victorious in the 2003 and 2005 FA Cups, and won the Premier League in 2003–04 without losing a single match, an achievement which earned the side the nickname "The Invincibles"',
  'Following the crusades which country was dominated',
  'When did Miss America Sharlene Johnson graduate?',
  'What control can be used while targeting that allows the player to forego manual targeting',
  "In 1718, at the behest of either Rector Samuel Andrew or the colony's Governor Gurdon Saltonstall, Cotton Mather contacted a successful businessman named Elihu Yale, who lived in Wales but had been born in Boston and whose father, David, had been one of the original settlers in New Haven, to ask him for financial help in

In [91]:
# load model tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)
vocabulary_size = 30522
embedding_dimension = 768
# min_len = 3

model = Sequential()
model.add(Embedding(vocabulary_size, embedding_dimension, input_length=embedding_dimension))
model.add(LSTM(100)) #Can be same as embedding_dimension, but model will large and inefficient
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
# load model with binary classification head

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 768, 768)          23440896  
                                                                 
 lstm_3 (LSTM)               (None, 100)               347600    
                                                                 
 dense_3 (Dense)             (None, 1)                 101       
                                                                 
Total params: 23,788,597
Trainable params: 23,788,597
Non-trainable params: 0
_________________________________________________________________
None


In [67]:
# model.base_model

In [68]:
# [name for name, param in model.base_model.named_parameters()]

In [69]:
# define text preprocessing
def preprocess_function(examples):
    # return tokenized text with truncation
    return tokenizer(examples["text"], truncation=True)
    # return tokenizer(examples["text"], truncation=False)

#padded_dataset = sequence.pad_sequences(X_train, maxlen=min_len)
# preprocess all datasets
tokenized_data_raw = dataset_dict.map(preprocess_function, batched=True)


Map: 100%|██████████████████████████████████████████████████████████████████| 900/900 [00:00<00:00, 5521.99 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 2565.53 examples/s]


In [70]:
tokenized_data_raw

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 900
    })
    validation: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 100
    })
})

In [71]:
df_train = tokenized_data_raw["train"].to_pandas()
df_train.head()

Unnamed: 0,text,labels,input_ids,attention_mask
0,"Marvel held its own comic book convention, Mar...",1,"[101, 8348, 2218, 2049, 2219, 5021, 2338, 4680...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,"In addition, the club reached the final of th...",0,"[101, 1999, 2804, 1010, 1996, 2252, 2584, 1996...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,Following the crusades which country was domin...,1,"[101, 2206, 1996, 16282, 2015, 2029, 2406, 200...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
3,When did Miss America Sharlene Johnson graduate?,1,"[101, 2043, 2106, 3335, 2637, 21146, 20927, 26...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
4,What control can be used while targeting that ...,1,"[101, 2054, 2491, 2064, 2022, 2109, 2096, 1412...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [74]:
train_data = np.asarray(sequence.pad_sequences(df_train["input_ids"], maxlen=embedding_dimension))
target_data = np.asarray(df_train["labels"])

In [77]:
# target_data

In [78]:
# def pad_tokens(collection__):
#     # return [sequence.pad_sequences([x["input_ids"]], maxlen=embedding_dimension, padding='post')[0] for x in collection__]
#     return [sequence.pad_sequences([x["input_ids"]], maxlen=embedding_dimension, padding='post')[0] for x in collection__]
# def pad_collection(coll):
#     padded_tokens = {}
#     for key, v in tokenized_data.items():
#         padded_tokens[key] = []
#         # print(key)
#         mapped = pad_tokens(v)
#         for m in mapped:
#             # print(m)
#             padded_tokens[key].append(m)
#     return padded_tokens
# padded = pad_tokens(tokenized_data_raw["train"][0]["input_ids"])
# padded = pad_tokens(tokenized_data_raw["train"])
# padded
    # for x in v:
    #     print(x["input_ids"])
    #     print(sequence.pad_sequences([x["input_ids"]], maxlen=embedding_dimension, padding='post'))
    # print(sequence.pad_sequences([v[0]["input_ids"]], maxlen=embedding_dimension, padding='post'))
# tokenized_data_padded = tokenized_data.map(lambda ds: sequence.pad_sequences(ds["input_ids"], maxlen=embedding_dimension, padding='post'))
# tokenized_data_padded = tokenized_data.map(pad_tokens)
# tokenized_data_padded
# padded_tokens

In [79]:
# tokenized_data_padded = pad_collection(tokenized_data_raw)


In [80]:
# tokenized_data_raw["test"]["labels"]

In [81]:
# tokenized_data_padded = DatasetDict(tokenized_data_padded)
# tokenized_data_padded

In [82]:
# training_data = tokenized_data_padded["train"]
# testing_data = tokenized_data_padded["test"]

In [83]:
# len(training_data)

In [84]:
# testing_data[0][:20]

In [85]:
# load metrics
accuracy = evaluate.load("accuracy")
auc_score = evaluate.load("roc_auc")

def compute_metrics(eval_pred):
    # get predictions
    predictions, labels = eval_pred
    
    # apply softmax to get probabilities
    probabilities = np.exp(predictions) / np.exp(predictions).sum(-1, 
                                                                 keepdims=True)
    # use probabilities of the positive class for ROC AUC
    positive_class_probs = probabilities[:, 1]
    # compute auc
    auc = np.round(auc_score.compute(prediction_scores=positive_class_probs, 
                                     references=labels)['roc_auc'],3)
    
    # predict most probable class
    predicted_classes = np.argmax(predictions, axis=1)
    # compute accuracy
    acc = np.round(accuracy.compute(predictions=predicted_classes, 
                                     references=labels)['accuracy'],3)
    
    return {"Accuracy": acc, "AUC": auc}

In [87]:
X_train, X_test, y_train, y_test = train_test_split(train_data, target_data, test_size=0.2, random_state=2024)

In [89]:
X_train[:10]

array([[    0,     0,     0, ..., 13142,  1012,   102],
       [    0,     0,     0, ...,  2185,  1029,   102],
       [    0,     0,     0, ...,  2012, 10064,   102],
       ...,
       [    0,     0,     0, ...,  2103,  6537,   102],
       [    0,     0,     0, ...,  1999,  4885,   102],
       [    0,     0,     0, ...,  3956,  1029,   102]])

In [92]:
model.fit(X_train, y_train, epochs=num_epochs, batch_size=batch_size)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x180ce541ba0>

In [93]:
def save_model(model, model_location):
    save_classifier = open(model_location, "wb")
    pickle.dump(model, save_classifier)
    save_classifier.close()

def load_model(model_location):
    loaded_model = load_model(model_location)
    return loaded_model

In [98]:
save_model(model, model_export_path)



INFO:tensorflow:Assets written to: ram://845cbd35-be4d-4e9e-b787-64376455d077/assets


INFO:tensorflow:Assets written to: ram://845cbd35-be4d-4e9e-b787-64376455d077/assets


In [71]:
# apply model to validation dataset
predictions = model.predict(tokenized_data["validation"])

# Extract the logits and labels from the predictions object
logits = predictions.predictions
labels = predictions.label_ids

# Use your compute_metrics function
metrics = compute_metrics((logits, labels))
print(metrics)

# BERT {'Accuracy': 0.955, 'AUC': 0.989}
#DistilBERT {'Accuracy': 0.944, 'AUC': 0.994}

{'Accuracy': 0.944, 'AUC': 0.994}


In [102]:
predictions = model.predict(X_test)
predictions[0]



array([0.05179006], dtype=float32)

In [100]:
# apply model to validation dataset
predictions = model.predict(X_test)

# Extract the logits and labels from the predictions object
logits = predictions.predictions
labels = predictions.label_ids

# Use your compute_metrics function
metrics = compute_metrics((logits, labels))
print(metrics)

# >> {'Accuracy': 0.889, 'AUC': 0.946}



AttributeError: 'numpy.ndarray' object has no attribute 'predictions'

In [74]:
def tokenizer_text_single(text):
    # return tokenized text with truncation
    return tokenizer(text, truncation=True)

In [81]:
sentence = "What developed from the mammalian odor pathways?"
# sentence = "Could you please tell me the direction of the retaurant?"
# sentence = "How do you know this?"
tokenized_question = tokenizer_text_single(sentence)
tokenized_question

{'input_ids': [101, 2054, 2764, 2013, 1996, 26524, 19255, 16910, 1029, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [82]:
res = trainer2.predict([tokenized_question])
res

PredictionOutput(predictions=array([[-2.9417567,  3.1534703]], dtype=float32), label_ids=None, metrics={'test_model_preparation_time': 0.0017, 'test_runtime': 0.0693, 'test_samples_per_second': 14.435, 'test_steps_per_second': 14.435})

In [77]:
sentence = "Burke received a vote of thanks from the Commons for his services in the Hastings Trial and he immediately resigned his seat"
tokenized_question = tokenizer_text_single(sentence)
tokenized_question

{'input_ids': [101, 9894, 2363, 1037, 3789, 1997, 4283, 2013, 1996, 7674, 2005, 2010, 2578, 1999, 1996, 12296, 3979, 1998, 2002, 3202, 5295, 2010, 2835, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [78]:
res = trainer2.predict([tokenized_question])
res

PredictionOutput(predictions=array([[ 1.7417043, -1.3585452]], dtype=float32), label_ids=None, metrics={'test_model_preparation_time': 0.0017, 'test_runtime': 0.0852, 'test_samples_per_second': 11.737, 'test_steps_per_second': 11.737})