In [2]:
from tensorflow.keras import Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

In [3]:
# Loading dataset (pipe-delimited CSV)
data = pd.read_csv("MalwareData.csv", delimiter='|')

# Feature selection and target variable.
# Positional slice: drops the first two columns and the last one.
# NOTE(review): this assumes the first two columns are 'Name' and 'md5' and that
# 'legitimate' is the LAST column; if it is not, the label leaks into X — confirm.
X = data.iloc[:, 2:-1].values  # Raw (unscaled) feature matrix
y = data['legitimate'].values  # Target variable

# Splitting data FIRST: 80% train, then the remaining 20% halved into val/test.
# Splitting before scaling keeps validation/test statistics out of the training
# features. The seeds and row count are unchanged, so the same rows land in each split.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Normalizing features (min-max) with statistics taken from the training split only.
feature_min = X_train.min(axis=0)
feature_range = X_train.max(axis=0) - feature_min
# A column that is constant in the training split has range 0; divide by 1 instead
# so it scales to 0 rather than producing NaN from 0/0.
feature_range = np.where(feature_range == 0, 1, feature_range)

# Same train-derived transform for every split; val/test values may fall slightly
# outside [0, 1] when they exceed the range seen during training.
X_train = (X_train - feature_min) / feature_range
X_val = (X_val - feature_min) / feature_range
X_test = (X_test - feature_min) / feature_range

In [6]:
# Building model: a small fully connected network assembled layer by layer
n_features = X_train.shape[1]  # one input unit per feature column

model = Sequential()
model.add(Input(shape=(n_features,)))      # Input layer
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # Sigmoid for binary classification

# Compile model: Adam optimizer, binary cross-entropy loss, accuracy as the tracked metric
optimizer = Adam(learning_rate=0.001)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

In [8]:
# Training model: 10 epochs, mini-batches of 32, validated on the validation split each epoch
history = model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=10,
    validation_data=(X_val, y_val),
)

# Evaluating model on the held-out test split; evaluate() yields the loss
# followed by the compiled metric (accuracy)
test_metrics = model.evaluate(X_test, y_test)
loss, accuracy = test_metrics
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

Epoch 1/10
[1m3452/3452[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 777us/step - accuracy: 0.9419 - loss: 0.1428 - val_accuracy: 0.9838 - val_loss: 0.0492
Epoch 2/10
[1m3452/3452[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 764us/step - accuracy: 0.9850 - loss: 0.0485 - val_accuracy: 0.9850 - val_loss: 0.0507
Epoch 3/10
[1m3452/3452[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 819us/step - accuracy: 0.9863 - loss: 0.0450 - val_accuracy: 0.9875 - val_loss: 0.0428
Epoch 4/10
[1m3452/3452[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 759us/step - accuracy: 0.9869 - loss: 0.0424 - val_accuracy: 0.9876 - val_loss: 0.0423
Epoch 5/10
[1m3452/3452[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 739us/step - accuracy: 0.9872 - loss: 0.0410 - val_accuracy: 0.9878 - val_loss: 0.0400
Epoch 6/10
[1m3452/3452[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 763us/step - accuracy: 0.9867 - loss: 0.0409 - val_accuracy: 0.9878 - val_loss: 0.0398
Epoc

In [10]:
# Predictions and confusion matrix
# The sigmoid output is thresholded at 0.5: above -> class 1, otherwise class 0
# (class labels follow the values of the 'legitimate' column used as the target).
predicted_probabilities = model.predict(X_test)
y_pred = (predicted_probabilities > 0.5).astype("int32")

matrix = confusion_matrix(y_test, y_pred)
print(matrix)

report = classification_report(y_test, y_pred)
print(report)

[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 677us/step
[[9564  108]
 [  50 4083]]
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      9672
           1       0.97      0.99      0.98      4133

    accuracy                           0.99     13805
   macro avg       0.98      0.99      0.99     13805
weighted avg       0.99      0.99      0.99     13805



In [12]:
# Save model: write the trained network to a single HDF5 file in the working directory
output_path = "ScanMyPC_Training.h5"
model.save(output_path)
print("Model saved as ScanMyPC_Training.h5")



Model saved as ScanMyPC_Training.h5
