In [1]:
from google.colab import drive

# Mount Google Drive so the dataset under MyDrive is reachable from this runtime.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


Mounted at /content/drive


In [7]:
import pandas as pd
import tensorflow as tf
import os
import numpy as np

In [3]:
# Load the cleaned dataset from the mounted Google Drive folder.
CLEAN_DATA_CSV = '/content/drive/MyDrive/clean_data.csv'
df = pd.read_csv(CLEAN_DATA_CSV)

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical from run to run.
TEST_FRACTION = 0.2
SPLIT_SEED = 110122

train_df, test_df = train_test_split(df, test_size=TEST_FRACTION, random_state=SPLIT_SEED)


In [5]:
# Training documents as a plain Python list of strings.
train_text_column = train_df["text_clean"].fillna("")   # missing text becomes ""
train_text = train_text_column.astype(str).tolist()     # force str, then materialise a list

In [8]:
# Encode the multi-class `mclass` column as integer class ids starting at 0.
mclass_codes = train_df["mclass"].astype("category").cat.codes
# cat.codes uses -1 for missing values; shift so the smallest id is 0.
train_labels = (mclass_codes - mclass_codes.min()).to_numpy()
# Number of distinct ids, used as the width of the softmax output layer.
num_classes = np.unique(train_labels).size

In [9]:

# Step 1: TF-IDF bag-of-words vectoriser. The text is used as-is (no
# standardisation), split on whitespace, unigram only, unlimited vocabulary.
vectorizer_options = dict(
    output_mode="tf_idf",
    max_tokens=None,
    ngrams=None,
    split="whitespace",
    standardize=None,
)
preprocess_layer = tf.keras.layers.TextVectorization(**vectorizer_options)

# Step 2: fit the vectoriser on the training text only.
preprocess_layer.adapt(train_text)


In [12]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Same dropout rate after the vectoriser and after each hidden layer.
DROPOUT_RATE = 0.3

# Raw strings -> TF-IDF vector -> 50 -> 25 -> softmax over num_classes.
network_layers = [
    preprocess_layer,
    Dropout(DROPOUT_RATE),
    Dense(50, activation='relu'),
    Dropout(DROPOUT_RATE),
    Dense(25, activation='relu'),
    Dropout(DROPOUT_RATE),
    Dense(num_classes, activation='softmax'),
]
model = Sequential(network_layers)

# Print the layer-by-layer overview.
model.summary()

In [13]:
# Step 1: configure training. The labels are integer class ids, hence the
# sparse variant of categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Wrap the Python list / NumPy array as tensors for Keras.
train_labels_tensor = tf.constant(train_labels)
train_text_tensor = tf.constant(train_text)

# Step 2: train for 5 epochs, with 30% of the training data held out for validation.
fit_options = dict(validation_split=0.3, epochs=5)
history = model.fit(train_text_tensor, train_labels_tensor, **fit_options)

# Predictive accuracy on the held-out test split
# Extract the test text with the same cleaning steps used for the training text
test_text = (
    test_df["text_clean"]
    .fillna("")        # replace NaN with empty string
    .astype(str)       # convert everything to string
    .tolist()
)

# Encode the test labels with the label vocabulary of the TRAINING split.
# Calling .astype("category") on test_df alone derives the categories from
# the test rows only, so a class (or a missing value) that occurs in just one
# split would silently shift the integer ids and the model would be scored
# against the wrong classes.
train_mclass = train_df["mclass"].astype("category")
label_offset = train_mclass.cat.codes.min()   # same shift the training labels received
test_codes = pd.Categorical(
    test_df["mclass"],
    categories=train_mclass.cat.categories
).codes
test_labels = test_codes - label_offset

# A code of -1 means "not one of the training categories". Fail loudly when a
# real test label was never seen in training, or when any id would be negative,
# instead of evaluating against meaningless targets.
unmapped = (test_codes == -1) & test_df["mclass"].notna().to_numpy()
if unmapped.any() or (test_labels < 0).any():
    raise ValueError(
        "test_df['mclass'] contains labels that cannot be mapped onto the "
        "training label encoding"
    )

# Convert to TensorFlow tensors
test_text_tensor = tf.constant(test_text)
test_labels_tensor = tf.constant(test_labels)

# Score the trained model on the test split and report its accuracy
test_loss, test_accuracy = model.evaluate(test_text_tensor, test_labels_tensor)
print(f"Test accuracy: {test_accuracy:.4f}")


Epoch 1/5
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 57ms/step - accuracy: 0.3724 - loss: 1.6656 - val_accuracy: 0.7432 - val_loss: 1.0583
Epoch 2/5
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 50ms/step - accuracy: 0.6648 - loss: 1.2981 - val_accuracy: 0.7646 - val_loss: 0.9078
Epoch 3/5
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 50ms/step - accuracy: 0.7434 - loss: 0.8775 - val_accuracy: 0.7782 - val_loss: 0.9345
Epoch 4/5
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 72ms/step - accuracy: 0.7877 - loss: 0.8150 - val_accuracy: 0.7860 - val_loss: 0.8526
Epoch 5/5
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 67ms/step - accuracy: 0.8346 - loss: 0.7246 - val_accuracy: 0.7860 - val_loss: 0.8281
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.7775 - loss: 1.0891
Test accuracy: 0.7850


In [14]:
# Persist the trained model in the native Keras format, creating the
# destination folder first if it does not exist yet.
MODEL_PATH = "results/naira-multimodel.keras"
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
model.save(MODEL_PATH)