In [None]:
!mkdir Datasets
!mv test.csv Datasets/test_set.csv
!mv train.csv Datasets/train_set.csv
!mv val.csv Datasets/validation_set.csv

# Train CNN

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.optimizers import Adam 
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense 
from tensorflow.python.client import device_lib 
import tensorflow as tf
import pandas as pd 
import time
import random
import os

 # For replication purposes
# Seed Python's and TensorFlow's random number generators, then ask TensorFlow
# to use deterministic op implementations, so that runs can be repeated.
random.seed(0)
tf.random.set_seed(420)
tf.config.experimental.enable_op_determinism()

class CNNTraining:
    """Train, evaluate and save a 1D-CNN binary text classifier.

    Expected call order: ``load_data`` -> ``preprocess_data`` -> ``build_model``
    -> ``train_model`` -> ``evaluate_model`` / ``save_model``. Each step stores
    its results on the instance for the next step to use.
    """

    def __init__(self, learning_rate, epochs, batch_size, max_len, feature_col, label_col):
        """Store the hyperparameters and the CSV column names.

        Args:
            learning_rate: Learning rate handed to the Adam optimizer.
            epochs: Number of epochs handed to ``model.fit``.
            batch_size: Batch size handed to ``model.fit``.
            max_len: Length every token sequence is padded/truncated to.
            feature_col: Name of the CSV column holding the input text.
            label_col: Name of the CSV column holding the target label.
        """
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.batch_size = batch_size
        self.max_len = max_len
        # Kept on the instance so load_data does not depend on same-named
        # module-level globals being defined by the calling cell.
        self.feature_col = feature_col
        self.label_col = label_col
        self.history = None  # set by train_model

    def load_data(self, train_file_path, val_file_path):
        """Read the training and validation CSVs and pull out texts and labels.

        Args:
            train_file_path: Path of the training CSV.
            val_file_path: Path of the validation CSV.

        Raises:
            KeyError: If ``feature_col`` or ``label_col`` is missing from a CSV.
        """
        train_df = pd.read_csv(train_file_path)
        val_df = pd.read_csv(val_file_path)

        # Extracting text and labels from training data
        self.train_texts = train_df[self.feature_col].tolist()
        self.train_labels = train_df[self.label_col].values

        # Extracting text and labels from validation data
        self.val_texts = val_df[self.feature_col].tolist()
        self.val_labels = val_df[self.label_col].values

    def preprocess_data(self):
        """Fit a tokenizer on the training texts and build padded index arrays.

        The fitted tokenizer is kept as ``self.tokenizer``; the same instance
        must be used to encode any text later fed to the trained model.
        """
        # The vocabulary comes from the training split only.
        self.tokenizer = Tokenizer()
        self.tokenizer.fit_on_texts(self.train_texts)

        # Converting text data to sequences
        train_sequences = self.tokenizer.texts_to_sequences(self.train_texts)
        val_sequences = self.tokenizer.texts_to_sequences(self.val_texts)

        # Padding sequences to a fixed length
        self.train_data = pad_sequences(train_sequences, maxlen=self.max_len, padding='post')
        self.val_data = pad_sequences(val_sequences, maxlen=self.max_len, padding='post')

    def build_model(self):
        """Build and compile the Embedding -> Conv1D -> max-pool -> Dense network."""
        self.model = Sequential()
        # +1 because the Keras tokenizer's word_index starts at 1; index 0 is the padding value.
        self.model.add(Embedding(len(self.tokenizer.word_index) + 1, 128, input_length=self.max_len))
        self.model.add(Conv1D(128, 5, activation='relu'))
        self.model.add(GlobalMaxPooling1D())
        self.model.add(Dense(64, activation='relu'))
        # Single sigmoid unit: the model outputs one probability per sample.
        self.model.add(Dense(1, activation='sigmoid'))

        optimizer = Adam(learning_rate=self.learning_rate)
        self.model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

    def evaluate_model(self):
        """Evaluate the model on the validation data and return its accuracy."""
        loss, accuracy = self.model.evaluate(self.val_data, self.val_labels, verbose=0)
        return accuracy

    def train_model(self):
        """Fit the model on the training data, validating after each epoch.

        The Keras ``History`` object is stored in ``self.history``.
        """
        self.history = self.model.fit(
            self.train_data,
            self.train_labels,
            epochs=self.epochs,
            batch_size=self.batch_size,
            validation_data=(self.val_data, self.val_labels),
        )

    def get_training_loss(self):
        """Return the per-epoch training loss recorded by ``train_model``."""
        return self.history.history['loss']

    def get_validation_loss(self):
        """Return the per-epoch validation loss recorded by ``train_model``."""
        return self.history.history['val_loss']

    def get_validation_accuracy(self):
        """Return the per-epoch validation accuracy recorded by ``train_model``."""
        return self.history.history['val_accuracy']

    def save_model(self, save_dir, model_name):
        """Save the model as ``<save_dir>/<model_name>.keras``, creating ``save_dir`` if needed."""
        os.makedirs(save_dir, exist_ok=True)
        self.model.save(os.path.join(save_dir, model_name + '.keras'))

# Run the CNN pipeline end to end: configure, train, save, then time one
# evaluation pass over the validation set.
start_time = time.time()

# Paths and filenames
absolute_path = "./"
train_file_path = 'Datasets/train_set.csv'
val_file_path = 'Datasets/validation_set.csv'
save_dir = 'TrainedModels/'

# CSV columns holding the input text and the target label
feature_col = 'text'
label_col = 'label'

# Hyperparameters (model and optimizer are only used to build the file name)
model = 'cnn'
optimizer = 'Adam'
learning_rate = 2e-5
epochs = 3
batch_size = 6
max_len = 4096

# File name (without extension) that encodes the run configuration
trained_model = ''.join([
    model,
    '_optimizer_', optimizer,
    '_lr_', str(learning_rate),
    '_epochs_', str(epochs),
    '_bs_', str(batch_size),
    '_maxlen_', str(max_len),
])

# Training and saving a CNN model for spam classification
trainer = CNNTraining(learning_rate, epochs, batch_size, max_len, feature_col, label_col)
trainer.load_data(absolute_path + train_file_path, absolute_path + val_file_path)
trainer.preprocess_data()
trainer.build_model()
trainer.train_model()
trainer.save_model(absolute_path + save_dir, trained_model)

# Wall-clock time from the top of this cell through saving the model
training_time = time.time() - start_time

# "Inference" time is the duration of one evaluation pass over the validation set
inference_start_time = time.time()
validation_accuracy = trainer.evaluate_model()
inference_time = time.time() - inference_start_time

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 10627509576847154832
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 15510929408
locality {
  bus_id: 1
  links {
  }
}
incarnation: 14326477267733473123
physical_device_desc: "device: 0, name: Tesla V100-SXM2-16GB, pci bus id: 0000:00:04.0, compute capability: 7.0"
xla_global_id: 416903419
]
Mounted at /content/gdrive
Epoch 1/3
Epoch 2/3
Epoch 3/3
Training Loss: [0.5288362503051758, 0.38770896196365356, 0.3785526752471924]
Validation Loss: [0.4034520387649536, 0.3853952884674072, 0.37890899181365967]
Validation Accuracy: [0.8653198480606079, 0.8653198480606079, 0.8653198480606079]
Training time: 65.40 seconds


# Predict using the trained CNN

In [None]:
from tensorflow.keras.models import load_model

class CNNPredictions:
    """Run a saved CNN model over a test CSV and store its binary predictions."""

    def __init__(self, max_len, absolute_path, test_file_path, predictions_path,
                 trained_model, feature_col, prediction_col, tokenizer=None):
        """Store the paths, column names and the tokenizer used for encoding.

        Args:
            max_len: Length every token sequence is padded/truncated to.
            absolute_path: Prefix prepended to every other path argument.
            test_file_path: Path (relative to ``absolute_path``) of the test CSV.
            predictions_path: Path (relative to ``absolute_path``) the CSV with
                the added prediction column is written to.
            trained_model: Path (relative to ``absolute_path``) of the saved model.
            feature_col: Name of the CSV column holding the input text.
            prediction_col: Name of the column the predictions are stored in.
            tokenizer: The fitted tokenizer that was used to encode the training
                data. If omitted, a new tokenizer is fitted on the test texts
                (legacy behaviour) and a warning is emitted.
        """
        self.max_len = max_len
        self.test_file_path = test_file_path
        self.predictions_path = predictions_path
        self.absolute_path = absolute_path
        self.trained_model = trained_model
        self.feature_col = feature_col
        self.prediction_col = prediction_col
        self.tokenizer = tokenizer

    def predict(self):
        """Predict a 0/1 label for every test row and write the result to CSV."""
        import warnings  # local import keeps this block self-contained

        # Load test dataset
        test_df = pd.read_csv(self.absolute_path + self.test_file_path)
        test_texts = test_df[self.feature_col].tolist()

        # Encode the text with the tokenizer fitted during training. A tokenizer
        # fitted on other text assigns different integer indices to the same
        # words, so the model's embedding rows would no longer line up with them.
        tokenizer = self.tokenizer
        if tokenizer is None:
            warnings.warn(
                "No fitted tokenizer supplied; fitting a new one on the test texts. "
                "Its word indices will not match those the model was trained on, "
                "so the predictions are unreliable. Pass the training tokenizer "
                "via the 'tokenizer' argument.",
                RuntimeWarning,
                stacklevel=2,
            )
            tokenizer = Tokenizer()
            tokenizer.fit_on_texts(test_texts)
        test_sequences = tokenizer.texts_to_sequences(test_texts)

        test_data = pad_sequences(test_sequences, maxlen=self.max_len, padding='post')

        saved_model = load_model(self.absolute_path + self.trained_model)

        # Make predictions on test data using the loaded model
        predictions = saved_model.predict(test_data)
        # Threshold at 0.5, then flatten so a 1-D array is assigned to the column
        # (assumes the model has a single output unit, giving shape (n, 1)).
        binary_predictions = (predictions > 0.5).astype(int).ravel()
        test_df[self.prediction_col] = binary_predictions
        test_df.to_csv(self.absolute_path + self.predictions_path, index=False)

        print("Predictions done")

max_len = 4096
str_params = 'cnn_optimizer_Adam_lr_2e-05_epochs_3_bs_6_maxlen_4096'

# Paths and filenames
absolute_path = "./"
test_file_path = 'Datasets/test_set.csv'
# Same file as the input: the prediction column is appended to the test CSV in place.
predictions_path = 'Datasets/test_set.csv'
trained_model = 'TrainedModels/' + str_params + '.keras'
feature_col = 'text'
prediction_col = str_params + '_prediction'

# Instantiate the CNNPredictions class, reusing the tokenizer fitted by the
# training cell (`trainer`), so the training cell must have been run first.
cnn_predictions = CNNPredictions(
    max_len,
    absolute_path,
    test_file_path,
    predictions_path,
    trained_model,
    feature_col,
    prediction_col,
    tokenizer=trainer.tokenizer,
)

# Perform predictions
cnn_predictions.predict()

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
Predictions done


# Metrics and results

In [None]:
from sklearn.metrics import (
    f1_score,
    accuracy_score,
    precision_score,
    recall_score,
    matthews_corrcoef,
    roc_auc_score,
    average_precision_score,
    confusion_matrix
)
import numpy as np
import pandas as pd

# Metrics 
def false_positive_rate(y_true, y_pred):
    """Return the false positive rate FP / (FP + TN) for binary 0/1 labels.

    Args:
        y_true: Ground-truth labels, encoded as 0 (negative) and 1 (positive).
        y_pred: Predicted labels, using the same encoding.

    Returns:
        The fraction of true negatives that were predicted positive, or NaN
        when ``y_true`` contains no negatives (the rate is undefined).
    """
    # Pinning labels=[0, 1] always yields a 2x2 matrix. Without it, inputs that
    # contain a single class produce a 1x1 matrix and the 4-way unpack fails.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    negatives = fp + tn
    if negatives == 0:
        return float("nan")
    return fp / negatives

# Metric name -> callable(y_true, y_pred)
SCORING = {
    "F1": f1_score,
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "MCC": matthews_corrcoef,
    "ROC AUC": roc_auc_score,
    "PRC AREA": average_precision_score,
    "FPR": false_positive_rate, # Custom FPR function
}

# Load predictions (the test CSV with the prediction column added by the prediction cell)
predictions_df = pd.read_csv("Datasets/test_set.csv")
y_true = predictions_df["label"]
y_pred = predictions_df[prediction_col]

# Evaluate metrics
# NOTE(review): every metric, including ROC AUC and PRC AREA, is computed from the
# thresholded 0/1 predictions, because that is all the CSV stores. Ranking metrics
# are normally computed from scores/probabilities - confirm this is intended.
metric_values = {name: func(y_true, y_pred) for name, func in SCORING.items()}

# Time (measured in the training cell)
metric_values["training_time"] = training_time
metric_values["inference_time"] = inference_time

columns = list(SCORING.keys()) + ["training_time", "inference_time"]

# Round numeric values to 4 decimals for display
row = {}
for metric in columns:
    val = metric_values[metric]
    row[metric] = round(val, 4) if isinstance(val, (float, int)) else val

# One-row results table, indexed by model name
scores = pd.DataFrame([row], index=["CNN"], columns=columns)

print(scores)