In [2]:
# Mount Google Drive so that files under MyDrive can be read by path below.
from google.colab import drive

mount_point = '/content/drive'
drive.mount(mount_point)


Mounted at /content/drive


In [3]:
import pandas as pd
import tensorflow as tf
import os

In [5]:
# Read clean_data.csv from the mounted Drive into a DataFrame.
data_path = '/content/drive/MyDrive/clean_data.csv'
df = pd.read_csv(data_path)

In [6]:
# Partition the data into training and testing sets (80/20).
# The fixed random_state makes the shuffle, and therefore the split, repeatable.
from sklearn.model_selection import train_test_split

split_options = {"test_size": 0.2, "random_state": 110122}
train_df, test_df = train_test_split(df, **split_options)


In [7]:
# Extract Train Text
# Missing entries become "" and every value is coerced to str, then the column
# is materialised as a plain Python list.
train_text_column = train_df["text_clean"].fillna("")
train_text = train_text_column.astype(str).tolist()

In [8]:
# Convert bclass to 0/1
# Category codes are assigned in the sorted order of the observed classes, so
# they already start at 0 and need no shifting. A missing label, however, is
# given code -1: shifting by the minimum would then silently turn NaN into
# class 0 and push the two real classes to 1 and 2. Fail loudly instead.
bclass_train = train_df["bclass"].astype("category")
if bclass_train.isna().any():
    raise ValueError("train_df['bclass'] contains missing labels")
if len(bclass_train.cat.categories) != 2:
    raise ValueError(
        "expected exactly 2 classes in train_df['bclass'], "
        f"found {list(bclass_train.cat.categories)}"
    )
train_labels = bclass_train.cat.codes.to_numpy()

In [9]:

# 1. Create the preprocessing layer
# standardize=None: the layer applies no text standardization of its own.
# max_tokens=None: the vocabulary size is not capped.
# output_mode="tf_idf": each document becomes one TF-IDF weighted vector.
vectorizer_config = dict(
    standardize=None,
    split="whitespace",
    ngrams=None,
    max_tokens=None,
    output_mode="tf_idf",
)
preprocess_layer = tf.keras.layers.TextVectorization(**vectorizer_config)

# 2. Adapt to the training text (only the training split is used here)
preprocess_layer.adapt(train_text)


In [37]:
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential

# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation, dropout masks and batch shuffling are repeatable between runs.
# The value matches the random_state used for the train/test split.
tf.keras.utils.set_random_seed(110122)

# Build the model: TF-IDF features -> dropout -> 25-unit GELU layer -> dropout -> sigmoid
model = Sequential([
    preprocess_layer,                # TF-IDF vectorization layer adapted on the training text
    Dropout(0.3),                    # input dropout
    Dense(25, activation='gelu'),    # hidden dense layer with 25 units
    Dropout(0.3),                    # dropout after hidden layer
    Dense(1, activation='sigmoid')   # output layer for binary classification
])

# Show model summary
model.summary()

In [38]:
# 1. Compile the model
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['binary_accuracy']
)

train_text_tensor = tf.constant(train_text)
train_labels_tensor = tf.constant(train_labels)


# 2. Train the model
# validation_split=0.3 holds out part of the supplied training samples for
# validation; the test set is not touched until the final evaluation below.
history = model.fit(
    x=train_text_tensor,
    y=train_labels_tensor,
    validation_split=0.3,
    epochs=5
)

# Predictive Accuracy
# Extract text and labels from the test DataFrame
test_text = (
    test_df["text_clean"]
    .fillna("")        # replace NaN with empty string
    .astype(str)       # convert everything to string
    .tolist()
)

# Encode the test labels with the class -> code mapping derived from the
# training labels. Encoding the test split on its own could assign different
# codes to the same class (for example when a class is absent from the split
# or a label is missing), which would make the reported accuracy meaningless.
bclass_categories = train_df["bclass"].astype("category").cat.categories
test_labels = pd.Categorical(test_df["bclass"], categories=bclass_categories).codes
if (test_labels < 0).any():
    # Code -1 marks a value that is NaN or not among the training classes.
    raise ValueError("test_df['bclass'] contains missing or unseen labels")

# Convert to TensorFlow tensors
test_text_tensor = tf.constant(test_text)
test_labels_tensor = tf.constant(test_labels)

# evaluate() returns the loss followed by the compiled metrics, in that order.
test_loss, test_accuracy = model.evaluate(test_text_tensor, test_labels_tensor)
print(f"Test accuracy: {test_accuracy:.4f}")


Epoch 1/5
38/38 ━━━━━━━━━━━━━━━━━━━━ 4s 43ms/step - binary_accuracy: 0.6436 - loss: 0.7459 - val_binary_accuracy: 0.7957 - val_loss: 0.5923
Epoch 2/5
38/38 ━━━━━━━━━━━━━━━━━━━━ 2s 39ms/step - binary_accuracy: 0.8804 - loss: 0.3671 - val_binary_accuracy: 0.8210 - val_loss: 0.5158
Epoch 3/5
38/38 ━━━━━━━━━━━━━━━━━━━━ 1s 38ms/step - binary_accuracy: 0.9130 - loss: 0.2771 - val_binary_accuracy: 0.8268 - val_loss: 0.5412
Epoch 4/5
38/38 ━━━━━━━━━━━━━━━━━━━━ 1s 35ms/step - binary_accuracy: 0.9232 - loss: 0.2042 - val_binary_accuracy: 0.8249 - val_loss: 0.5851
Epoch 5/5
38/38 ━━━━━━━━━━━━━━━━━━━━ 1s 36ms/step - binary_accuracy: 0.9214 - loss: 0.2237 - val_binary_accuracy: 0.8113 - val_loss: 0.6992
14/14 ━━━━━━━━━━━━━━━━━━━━ 0s 11ms/step - binary_accuracy: 0.8258 - loss: 1.1369
Test accuracy: 

In [39]:
# Create the output directory if needed; exist_ok avoids an error on re-runs.
output_dir = "results"
os.makedirs(output_dir, exist_ok=True)

# Write the model to a .keras file inside that directory.
model.save("results/naira-model.keras")