## Import The Libraries

In [1]:
import pickle
import scipy
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split

## Import The Dataset And Split It Into Training, Validation, And Test Sets

In [3]:
# Only the first 20,000 rows of the CSV are used.
dataset = pd.read_csv('dataset.csv', index_col=False).head(20000)

queries, labels = dataset['Query'], dataset['Label']

# Stage 1: hold out 20% of the rows as the test set, stratified on the label.
x_train_full, x_test, y_train_full, y_test = train_test_split(
    queries,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)

# Stage 2: carve 25% of the remaining rows off as the validation set
# (0.25 * 0.8 = 20% of all rows), which leaves 60% of all rows for training.
x_train, x_val, y_train, y_val = train_test_split(
    x_train_full,
    y_train_full,
    test_size=0.25,
    stratify=y_train_full,
    random_state=42,
)

## Data Preprocessing

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# The TF-IDF vocabulary is fitted on the training queries only; the validation
# and test queries are merely projected onto that already-fitted vocabulary, so
# nothing about them leaks into the features.
vectorizer = TfidfVectorizer()
x_train = vectorizer.fit_transform(x_train)
x_val, x_test = (vectorizer.transform(split) for split in (x_val, x_test))

In [7]:
# TfidfVectorizer returns SciPy sparse matrices; the neural network below is fed
# dense arrays, so each split is densified here.
#
# `toarray()` is used rather than `todense()`: `todense()` returns a
# `numpy.matrix`, a legacy type that NumPy no longer recommends, whereas
# `toarray()` returns a plain `numpy.ndarray`.
#
# The `hasattr` guard leaves already-dense data untouched, so re-running this
# cell does not raise.
#
# NOTE: each dense array has shape (n_samples, vocabulary_size) and can be large.
x_train = x_train.toarray() if hasattr(x_train, "toarray") else x_train
x_val = x_val.toarray() if hasattr(x_val, "toarray") else x_val
x_test = x_test.toarray() if hasattr(x_test, "toarray") else x_test

## Build The Model

In [10]:
# Feed-forward binary classifier over the TF-IDF features:
#   * two ReLU hidden layers (128 and 32 units), each followed by 20% dropout;
#   * one sigmoid output unit, trained with binary cross-entropy.
#
# The input width is declared with an explicit `tf.keras.Input` object instead of
# passing `input_shape=` to the first Dense layer; recent Keras versions emit a
# warning for the latter in Sequential models.
n_features = x_train.shape[1]  # number of columns in the vectorized training data

model = tf.keras.Sequential([
    tf.keras.Input(shape=(n_features,)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


## Train The Model

In [12]:
%%time
history = model.fit(x_train, y_train, epochs=2, validation_data=(x_val, y_val), verbose=1)

Epoch 1/2
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 13ms/step - accuracy: 0.9955 - loss: 0.0134 - val_accuracy: 0.9835 - val_loss: 0.1012
Epoch 2/2
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.9970 - loss: 0.0077 - val_accuracy: 0.7057 - val_loss: 0.3704
CPU times: total: 7.39 s
Wall time: 10.3 s


## Evaluate The Model

In [15]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Probability above which a sample is assigned label 1. Adjust this to trade
# false positives against false negatives.
DECISION_THRESHOLD = 0.5

# The final layer has a single sigmoid unit, so `model.predict` yields one
# probability per sample; threshold it and flatten to a 1-D vector of 0/1 labels.
probabilities = model.predict(x_test)
predictions = (probabilities > DECISION_THRESHOLD).astype(int).ravel()

# Loss and accuracy as computed by Keras itself, alongside the scikit-learn
# accuracy computed below.
model_evaluation = model.evaluate(x_test, y_test)

accuracy = accuracy_score(y_test, predictions)
# The result is stored under its own name so that the imported
# `confusion_matrix` function is not overwritten by its return value.
conf_matrix = confusion_matrix(y_test, predictions)

print("{}: {}%".format("The Accuracy", accuracy * 100))
print(conf_matrix)

[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7007 - loss: 0.3984
The Accuracy: 69.375%
[[ 511 1213]
 [  12 2264]]


## Save The Model

In [None]:
import os

# Create the output directory up front: `open()` below raises FileNotFoundError
# when "./assets" does not exist (and the model save is expected to fail the
# same way). `exist_ok=True` makes the cell safe to re-run.
os.makedirs("./assets", exist_ok=True)

# Save model to a .keras file
model.save("./assets/model.keras")

# Save the fitted vectorizer alongside the model: the same vocabulary must be
# used to vectorize any text passed to the saved model later on.
# NOTE: unpickling can execute arbitrary code, so only load this file from a
# trusted location.
with open("./assets/vectorizer.pickle", "wb") as f:
    pickle.dump(vectorizer, f)