# Import Libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.metrics import classification_report, confusion_matrix



🔌 1. Mount Your Google Drive



In [None]:
# Mount Google Drive into the Colab runtime at /content/drive; the dataset
# path used by the loading cells lives underneath this mount point.
from google.colab import drive
drive.mount('/content/drive')


# Navigate to the Dataset Folder

In [None]:
# Folder on the mounted Drive that the loading cells read the .npy files from.
dataset_path = "/content/drive/MyDrive/New_Data Set/USTC-TFC_dataset"



In [None]:
from google.colab import drive
drive.mount('/content/drive')

import numpy as np

# Verify this path against the actual folder layout in your Google Drive.
dataset_path = "/content/drive/MyDrive/New_Data Set/USTC-TFC_dataset"

# Load the payload arrays and the label arrays for the train/valid/test splits.
# NOTE(security): allow_pickle=True lets np.load unpickle arbitrary Python
# objects, which can execute code; only use it on files from a trusted source.
try:
    x_train = np.load(f"{dataset_path}/x_payload_train.npy", allow_pickle=True)
    x_valid = np.load(f"{dataset_path}/x_payload_valid.npy", allow_pickle=True)
    x_test = np.load(f"{dataset_path}/x_payload_test.npy", allow_pickle=True)

    y_train = np.load(f"{dataset_path}/y_train.npy")
    y_valid = np.load(f"{dataset_path}/y_valid.npy")
    y_test = np.load(f"{dataset_path}/y_test.npy")
except FileNotFoundError:
    print(f"Error: File not found in {dataset_path}. Please check the path and file names.")
    # Re-raise after the hint: swallowing the error would let later cells run
    # with undefined arrays (fresh kernel) or stale ones (reused kernel).
    raise

 # Load All .npy Files

In [None]:
import numpy as np

# Payload arrays for each split. Only these files are loaded with
# allow_pickle=True — NOTE(review): presumably they store Python string
# objects; confirm against the dataset export.
# np.load with allow_pickle=True can execute code embedded in a malicious
# file, so only load files from a trusted source.
x_train = np.load(f"{dataset_path}/x_payload_train.npy", allow_pickle=True)
x_valid = np.load(f"{dataset_path}/x_payload_valid.npy", allow_pickle=True)
x_test = np.load(f"{dataset_path}/x_payload_test.npy", allow_pickle=True)

# Label arrays for each split (loaded without pickle support).
y_train = np.load(f"{dataset_path}/y_train.npy")
y_valid = np.load(f"{dataset_path}/y_valid.npy")
y_test = np.load(f"{dataset_path}/y_test.npy")


# Confirm File Load

In [None]:
# Sanity check after loading: show the shapes of the training payload array
# and the training label array.
print("x_train shape:", x_train.shape)
print("y_train shape:", y_train.shape)


 # Convert Hex Payloads to Byte Arrays

In [None]:
def hex_string_to_byte_array(hex_string, max_len=784):
    """Decode a hex payload string into a fixed-length ``uint8`` vector.

    Spaces are removed from ``hex_string`` before it is decoded. The decoded
    bytes are cut to at most ``max_len`` and the remainder of the vector is
    left as zeros.

    Args:
        hex_string: Hexadecimal text, optionally separated by spaces.
        max_len: Length of the returned vector (784 by default).

    Returns:
        A 1-D ``np.uint8`` array of exactly ``max_len`` elements.

    Raises:
        ValueError: Propagated from ``bytes.fromhex`` when the text is not
            valid hexadecimal.
    """
    raw = bytes.fromhex(hex_string.replace(" ", ""))[:max_len]
    vector = np.zeros(max_len, dtype=np.uint8)
    # View the decoded bytes as uint8 and copy them into the zeroed prefix.
    vector[:len(raw)] = np.frombuffer(raw, dtype=np.uint8)
    return vector

# Decode every payload of each split; each result stacks the fixed-length
# uint8 rows returned by hex_string_to_byte_array.
x_train_proc = np.array(list(map(hex_string_to_byte_array, x_train)))
x_valid_proc = np.array(list(map(hex_string_to_byte_array, x_valid)))
x_test_proc = np.array(list(map(hex_string_to_byte_array, x_test)))


# Normalize and Reshape for CNN Input

In [None]:
# Scale the uint8 byte values into [0, 1], then lay each 784-value row out as
# a 28x28 single-channel image (the input shape the CNN is built with).
x_train_cnn = (x_train_proc / 255.0).reshape(-1, 28, 28, 1)
x_valid_cnn = (x_valid_proc / 255.0).reshape(-1, 28, 28, 1)
x_test_cnn = (x_test_proc / 255.0).reshape(-1, 28, 28, 1)


 # Build CNN Model

In [None]:
from tensorflow import keras
from tensorflow.keras import layers # Import layers explicitly

# Baseline CNN assembled layer by layer: two conv/pool stages followed by a
# dense classifier head.
model = keras.models.Sequential()
model.add(layers.Input(shape=(28, 28, 1)))
model.add(layers.Conv2D(32, (3,3), activation='relu'))
model.add(layers.MaxPooling2D(2, 2))
model.add(layers.Conv2D(64, (3,3), activation='relu'))
model.add(layers.MaxPooling2D(2, 2))
model.add(layers.Flatten())
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dropout(0.5))
# One softmax unit per distinct label value present in y_train.
model.add(layers.Dense(len(np.unique(y_train)), activation='softmax'))

# Integer labels are used directly, hence the sparse cross-entropy loss.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

# Train the Model

In [None]:
# Quick test run: train on the first 20000 training samples only.
# NOTE(review): the slice keeps the stored order; if the arrays are sorted by
# class this subset will not be representative — confirm.
x_train_small, y_train_small = x_train_cnn[:20000], y_train[:20000]

# Validation uses the full validation split every epoch.
history = model.fit(
    x=x_train_small,
    y=y_train_small,
    batch_size=64,
    epochs=3,
    validation_data=(x_valid_cnn, y_valid),
)


# Evaluate & Confusion Matrix

In [None]:
# Loss and accuracy of the trained model on the test split.
test_loss, test_acc = model.evaluate(x_test_cnn, y_test)
print(f"Test Accuracy: {test_acc:.4f}")

# Class probabilities per test sample, reduced to the most likely class.
y_pred_probs = model.predict(x_test_cnn)
y_pred_classes = y_pred_probs.argmax(axis=1)

# Per-class precision / recall / F1 summary.
print(classification_report(y_test, y_pred_classes))

# Heatmap of the confusion matrix with integer cell counts.
cm = confusion_matrix(y_test, y_pred_classes)
fig, ax = plt.subplots(figsize=(10,8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
plt.show()


# Plot Accuracy Over Epochs

In [None]:
def _plot_history_metric(history, metric, title, ylabel, train_color, val_color):
    """Plot one training metric and its validation counterpart per epoch.

    Args:
        history: Object returned by ``model.fit``; its ``history`` dict must
            contain the keys ``metric`` and ``'val_' + metric``.
        metric: Name of the training metric, e.g. ``'accuracy'`` or ``'loss'``.
        title: Figure title.
        ylabel: Y-axis label; also used to build the legend entries
            (``'Train <ylabel>'`` / ``'Validation <ylabel>'``).
        train_color: Line colour for the training curve.
        val_color: Line colour for the validation curve.
    """
    plt.figure(figsize=(10, 6))
    plt.plot(history.history[metric], label=f'Train {ylabel}', color=train_color, marker='o')
    plt.plot(history.history[f'val_{metric}'], label=f'Validation {ylabel}', color=val_color, marker='o')

    plt.title(title)
    plt.xlabel('Epochs')
    plt.ylabel(ylabel)
    plt.legend()

    plt.grid(True)
    plt.tight_layout()
    plt.show()


# `history` is the object returned by the model.fit() call in the training cell.
# Training vs validation accuracy over epochs.
_plot_history_metric(history, 'accuracy', 'Accuracy Over Epochs', 'Accuracy', 'green', 'orange')

# Training vs validation loss over epochs.
_plot_history_metric(history, 'loss', 'Loss Over Epochs', 'Loss', 'blue', 'red')

# Common Setup – Imports, Callbacks, and Base CNN Model

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

def build_cnn_model(input_shape=(28, 28, 1), num_classes=6):
    """Build and compile the base CNN classifier.

    The network is two Conv2D/MaxPooling2D stages (32 then 64 filters)
    followed by a 128-unit dense layer with dropout and a softmax output.
    It is compiled with Adam and ``sparse_categorical_crossentropy``, so it
    is trained on integer class labels.

    Args:
        input_shape: Shape of one input sample.
        num_classes: Number of units in the softmax output layer.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    feature_layers = [
        layers.Input(shape=input_shape),
        layers.Conv2D(32, (3, 3), activation='relu'),
        layers.MaxPooling2D(2, 2),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.MaxPooling2D(2, 2),
    ]
    head_layers = [
        layers.Flatten(),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(num_classes, activation='softmax'),
    ]
    cnn = models.Sequential(feature_layers + head_layers)
    cnn.compile(
        loss='sparse_categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return cnn

# Shared callback: stop training after one epoch without improvement in
# validation loss, and restore the weights from the best epoch seen.
early_stop = EarlyStopping(monitor='val_loss', patience=1, restore_best_weights=True)


# Create Average-Based Augmentation

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Build the "average" variant of the training images: the element-wise mean of
# each image in x_train_cnn and its per-image standardized version.
# x_train_cnn itself is not modified.
x_train_avg = (x_train_cnn + tf.image.per_image_standardization(x_train_cnn)) / 2.0
# Labels are unchanged by this transformation.
y_train_avg = y_train.copy()

# Save next to the source dataset. dataset_path is reused instead of repeating
# the Drive path literal, so the files follow the dataset folder if it moves.
np.save(f"{dataset_path}/x_train_avg.npy", x_train_avg)
np.save(f"{dataset_path}/y_train_avg.npy", y_train_avg)
print("✅ Saved average-augmented data.")

# PLACEHOLDER SCORES: the literals below are example values, not metrics
# computed by this notebook. Replace them with measured results before
# drawing conclusions from the chart.
original_accuracy, augmented_accuracy = 0.85, 0.88
original_precision, augmented_precision = 0.82, 0.85
original_recall, augmented_recall = 0.80, 0.83
original_f1, augmented_f1 = 0.81, 0.84

# Tick labels, in the same order as the two score lists.
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
original_metrics = [original_accuracy, original_precision, original_recall, original_f1]
augmented_metrics = [augmented_accuracy, augmented_precision, augmented_recall, augmented_f1]

# Grouped bars: each pair straddles its tick, offset by half a bar width.
width = 0.35
x = np.arange(len(metrics))

plt.figure(figsize=(10, 6))
bars1 = plt.bar(x - width/2, original_metrics, width, label='Original Data', color='green')
bars2 = plt.bar(x + width/2, augmented_metrics, width, label='Augmented Data', color='orange')

plt.title('Model Metrics Comparison: Original vs Augmented Data')
plt.ylabel('Score')
plt.xticks(x, metrics)
plt.ylim(0, 1.05)
plt.legend()

# Write each bar's value just above its top edge.
for bar in (*bars1, *bars2):
    height = bar.get_height()
    label_x = bar.get_x() + bar.get_width()/2.0
    plt.text(label_x, height + 0.02, f'{height:.2f}', ha='center', va='bottom', fontsize=10)

plt.tight_layout()
plt.show()

# Create MTU-Based Augmentation

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Simulate MTU augmentation: flatten each sample, zero-pad or truncate it to
# `mtu` values, then reshape the result into square single-channel images.
def mtu_augment(original, mtu=784):
    """Pad or truncate every sample to ``mtu`` values and shape it as an image.

    Args:
        original: Array whose first axis indexes samples. Each sample is
            flattened before being copied.
        mtu: Number of values kept per sample. Must be a positive perfect
            square so the values form a ``side x side`` image; the default
            784 yields 28x28.

    Returns:
        A ``float32`` array of shape ``(n_samples, side, side, 1)`` where
        ``side * side == mtu``. Samples shorter than ``mtu`` are zero-padded
        at the end; longer ones are truncated.

    Raises:
        ValueError: If ``mtu`` is not a positive perfect square. The previous
            hard-coded ``reshape(-1, 28, 28, 1)`` could silently regroup
            samples for such sizes whenever the total element count happened
            to be divisible by 784.
    """
    if mtu <= 0:
        raise ValueError(f"mtu must be a positive perfect square, got {mtu}")
    side = int(round(mtu ** 0.5))
    if side * side != mtu:
        raise ValueError(f"mtu must be a positive perfect square, got {mtu}")

    n_samples = original.shape[0]
    padded = np.zeros((n_samples, mtu), dtype=np.float32)
    for i in range(n_samples):
        # Flatten once per sample, keeping at most `mtu` leading values.
        flat = original[i].flatten()[:mtu]
        padded[i, :flat.shape[0]] = flat
    # The sample count is fixed explicitly so a size mismatch cannot be
    # absorbed by a -1 dimension.
    return padded.reshape(n_samples, side, side, 1)

# NOTE: every sample of x_train_cnn already holds exactly 784 values
# (28x28x1), so with the default mtu nothing is padded or truncated here;
# the result is a float32 copy of x_train_cnn.
x_train_mtu = mtu_augment(x_train_cnn)
y_train_mtu = y_train.copy()

# Save next to the source dataset. dataset_path is reused instead of repeating
# the Drive path literal, so the files follow the dataset folder if it moves.
np.save(f"{dataset_path}/x_train_mtu.npy", x_train_mtu)
np.save(f"{dataset_path}/y_train_mtu.npy", y_train_mtu)
print("✅ Saved MTU-augmented data.")


# PLACEHOLDER SCORES: the literals below are example values, not metrics
# computed by this notebook. Replace them with results measured for the
# MTU-augmented run before drawing conclusions from the chart.
original_accuracy, mtu_accuracy = 0.85, 0.87
original_precision, mtu_precision = 0.82, 0.84
original_recall, mtu_recall = 0.80, 0.82
original_f1, mtu_f1 = 0.81, 0.83

# Tick labels, in the same order as the two score lists.
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
original_metrics = [original_accuracy, original_precision, original_recall, original_f1]
mtu_metrics = [mtu_accuracy, mtu_precision, mtu_recall, mtu_f1]

# Grouped bars: each pair straddles its tick, offset by half a bar width.
width = 0.35
x = np.arange(len(metrics))

plt.figure(figsize=(10, 6))
bars1 = plt.bar(x - width/2, original_metrics, width, label='Original Data', color='green')
bars2 = plt.bar(x + width/2, mtu_metrics, width, label='MTU Augmented Data', color='orange')

plt.title('Model Metrics Comparison: Original vs MTU Augmented Data')
plt.ylabel('Score')
plt.xticks(x, metrics)
plt.ylim(0, 1.05)
plt.legend()

# Write each bar's value just above its top edge.
for bar in (*bars1, *bars2):
    height = bar.get_height()
    label_x = bar.get_x() + bar.get_width()/2.0
    plt.text(label_x, height + 0.02, f'{height:.2f}', ha='center', va='bottom', fontsize=10)

plt.tight_layout()
plt.show()

# CNN Trained on Average-Augmented Data

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Load the average-augmented training set written by the augmentation cell.
x_train_avg = np.load(f"{dataset_path}/x_train_avg.npy")
y_train_avg = np.load(f"{dataset_path}/y_train_avg.npy")

# The saved array was derived from x_train_cnn, which is already scaled by
# 1/255. Dividing by 255 a second time would put the training inputs on a
# different scale from x_valid_cnn / x_test_cnn, so only the image shape is
# restored here.
x_train_avg = x_train_avg.reshape(-1, 28, 28, 1)

# Smaller CNN variant: fewer convolution filters and a narrower dense layer
# than the base model defined earlier in the notebook.
def build_cnn_model(input_shape=(28, 28, 1), num_classes=10):
    """Build and compile a compact CNN classifier for one-hot encoded labels.

    NOTE: this definition replaces the earlier ``build_cnn_model`` in the
    notebook namespace. Unlike that version it compiles with
    ``categorical_crossentropy``, so targets must be one-hot encoded.

    Args:
        input_shape: Shape of one input sample.
        num_classes: Number of units in the softmax output layer.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    model = tf.keras.Sequential([
        # Explicit Input layer, matching the other model definitions in this
        # notebook, instead of the `input_shape=` argument on the first layer.
        layers.Input(shape=input_shape),
        # 'same' padding keeps the spatial size through each convolution.
        layers.Conv2D(16, (3, 3), activation='relu', padding='same'),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(32, (3, 3), activation='relu', padding='same'),
        layers.MaxPooling2D((2, 2)),

        # Classifier head.
        layers.Flatten(),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Number of output classes, derived once from the distinct training labels
# and reused for the model and every one-hot encoding below.
num_classes_avg = len(np.unique(y_train_avg))
model_avg = build_cnn_model(num_classes=num_classes_avg)

# build_cnn_model compiles with categorical_crossentropy, so the label arrays
# are one-hot encoded (each exactly once) with the same class count.
# NOTE(review): assumes y_valid / y_test contain no label outside the classes
# present in y_train_avg — confirm.
from tensorflow.keras.utils import to_categorical
y_train_avg_encoded = to_categorical(y_train_avg, num_classes=num_classes_avg)
y_valid_encoded = to_categorical(y_valid, num_classes=num_classes_avg)
y_test_encoded = to_categorical(y_test, num_classes=num_classes_avg)

# Reference point only: the model has not been trained yet, so this is the
# accuracy of randomly initialised weights, not a result of the augmentation.
test_loss, test_acc = model_avg.evaluate(x_test_cnn, y_test_encoded)
print(f"Test Accuracy before training (untrained model): {test_acc:.4f}")

# Train on the average-augmented data; early_stop may end the run before all
# five epochs complete.
history_avg = model_avg.fit(
    x_train_avg, y_train_avg_encoded,
    validation_data=(x_valid_cnn, y_valid_encoded),
    epochs=5,
    batch_size=64,
    callbacks=[early_stop],
    verbose=1
)

# Final evaluation of the trained model on the test split.
test_loss, test_acc = model_avg.evaluate(x_test_cnn, y_test_encoded)
print(f"Test Accuracy (Average Augmentation): {test_acc:.4f}")


# PLACEHOLDER SCORES: both sets of literals below are hard-coded example
# values, not metrics computed by this notebook.
# NOTE(review): the model trained in this cell uses the average-augmented
# data, while this chart is labelled "MTU Augmentation" — confirm which run
# these numbers are meant to represent.
original_accuracy, mtu_accuracy = 0.84, 0.80
original_precision, mtu_precision = 0.80, 0.75
original_recall, mtu_recall = 0.78, 0.74
original_f1, mtu_f1 = 0.79, 0.75

# Tick labels, in the same order as the two score lists.
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
cnn_original_metrics = [original_accuracy, original_precision, original_recall, original_f1]
cnn_mtu_metrics = [mtu_accuracy, mtu_precision, mtu_recall, mtu_f1]

# Grouped bars: each pair straddles its tick, offset by half a bar width.
width = 0.35
x = np.arange(len(metrics))

plt.figure(figsize=(10, 6))
bars1 = plt.bar(x - width/2, cnn_original_metrics, width, label='CNN Without Augmentation', color='blue')
bars2 = plt.bar(x + width/2, cnn_mtu_metrics, width, label='CNN with MTU Augmentation', color='orange')

plt.title('Model Metrics Comparison: CNN Without Augmentation vs CNN with MTU Augmentation')
plt.ylabel('Score')
plt.xticks(x, metrics)
plt.ylim(0, 1.05)
plt.legend()

# Write each bar's value just above its top edge.
for bar in (*bars1, *bars2):
    height = bar.get_height()
    label_x = bar.get_x() + bar.get_width()/2.0
    plt.text(label_x, height + 0.02, f'{height:.2f}', ha='center', va='bottom', fontsize=10)

plt.tight_layout()
plt.show()