# **TRADITIONAL MACHINE LEARNING APPROACH**

In [4]:
import os
import warnings

from imblearn.under_sampling import RandomUnderSampler
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# To ignore any warnings
warnings.filterwarnings('ignore')

In [5]:
# Path to Syn.csv. The SYN_CSV_PATH environment variable, when set, overrides the
# author's local default so the notebook can run on another machine unchanged.
file_path = os.environ.get(
    'SYN_CSV_PATH',
    '/home/shlok/Documents/DDoS-Final-Year-Project/Machine Learning/dataset/Syn.csv',
)
# Read the data in chunks to avoid memory errors
chunk_size = 100000
chunks = []
# The chunked reader is used as a context manager so the underlying file handle
# is released even if processing a chunk raises (e.g. a missing 'Label' column).
with pd.read_csv(file_path, chunksize=chunk_size, low_memory=False) as csv_iterator:
    for chunk in csv_iterator:
        # Strip whitespace from column names so 'Label' can be looked up reliably
        chunk.columns = chunk.columns.str.strip()
        # Keep only the two classes used in this notebook
        filtered_chunk = chunk[chunk['Label'].isin(['BENIGN', 'Syn'])]
        chunks.append(filtered_chunk)
# Concatenate all the filtered chunks into a single DataFrame
df = pd.concat(chunks, ignore_index=True)
print("Original dataset had millions of rows.")
print(f"After filtering, our new dataset has {df.shape[0]} rows and {df.shape[1]} columns.")
print("\nDistribution of traffic types in the new dataset:")
print(df['Label'].value_counts())

Original dataset had millions of rows.
After filtering, our new dataset has 4320541 rows and 88 columns.

Distribution of traffic types in the new dataset:
Label
Syn       4284751
BENIGN      35790
Name: count, dtype: int64


In [6]:
# Columns kept for the model, grouped by kind; the concatenation order below is
# the column order of the resulting DataFrame.
endpoint_features = ['Source Port', 'Destination Port', 'Protocol']
flow_features = [
    'Flow Duration',
    'Total Fwd Packets', 'Total Backward Packets',
    'Fwd Packet Length Mean', 'Fwd Packet Length Std',
    'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Min', 'Fwd IAT Mean',
    'Min Packet Length', 'Max Packet Length', 'Avg Fwd Segment Size',
]
flag_count_features = [
    'SYN Flag Count', 'ACK Flag Count', 'PSH Flag Count', 'RST Flag Count', 'FIN Flag Count',
]
selected_columns = endpoint_features + flow_features + flag_count_features + ['Label']

# Work on an independent copy so later cleaning never touches `df`
df_selected = df.loc[:, selected_columns].copy()
print(f"DataFrame shape after selecting columns: {df_selected.shape}")
print("\nFirst 5 rows of the final dataset:")
df_selected.head()

DataFrame shape after selecting columns: (4320541, 21)

First 5 rows of the final dataset:


Unnamed: 0,Source Port,Destination Port,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Fwd Packet Length Mean,Fwd Packet Length Std,Flow Packets/s,Flow IAT Mean,...,Fwd IAT Mean,Min Packet Length,Max Packet Length,Avg Fwd Segment Size,SYN Flag Count,ACK Flag Count,PSH Flag Count,RST Flag Count,FIN Flag Count,Label
0,9429,9429,6,36063894,7,2,6.0,0.0,0.2495571,4507987.0,...,6010649.0,6.0,6.0,6.0,0,1,0,0,0,Syn
1,60224,60224,6,44851366,8,4,6.0,0.0,0.2675504,4077397.0,...,6407331.0,6.0,6.0,6.0,0,1,0,0,0,Syn
2,11746,33827,6,1,2,0,6.0,0.0,2000000.0,1.0,...,1.0,6.0,6.0,6.0,1,0,0,0,0,Syn
3,33828,1431,6,0,2,0,6.0,0.0,inf,0.0,...,0.0,6.0,6.0,6.0,0,1,0,0,0,Syn
4,5311,5311,6,35731470,8,2,6.0,0.0,0.2798653,3970163.0,...,5104496.0,6.0,6.0,6.0,0,1,0,0,0,Syn


In [7]:
# Remove rows that cannot be fed to the model: +/-inf values are first turned
# into NaN, then every row containing a NaN is dropped.
df_selected = (
    df_selected
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)
print(f"Shape after dropping NaN/infinite values: {df_selected.shape}")

Shape after dropping NaN/infinite values: (4037465, 21)


In [8]:
# Target column first, then every remaining column as the feature matrix
y = df_selected['Label']
X = df_selected.drop(columns=['Label'])

# Turn the string labels into integers (0 for BENIGN, 1 for Syn)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [1]:
# 80/20 split; stratifying on the encoded labels keeps the attack/benign
# proportion similar in the training and testing sets.
split_parts = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)
X_train, X_test, y_train, y_test = split_parts
print(f"\nTraining set has {X_train.shape[0]} samples.")
print(f"Testing set has {X_test.shape[0]} samples.")

# Standardise to zero mean / unit standard deviation. The scaler statistics are
# learned from the training split only and then re-applied to the test split.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

NameError: name 'train_test_split' is not defined

In [None]:
# Baseline Random Forest trained on the scaled (still imbalanced) training set
print("Training the Random Forest model...")
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,  # use all available CPU cores to speed up training
)
rf_model.fit(X_train_scaled, y_train)
print("Model training complete!")

# Predicted class for every row of the scaled test set
y_pred = rf_model.predict(X_test_scaled)

Training the Random Forest model...


In [None]:
# Headline metrics for the baseline model. Each scorer is called with
# scikit-learn's defaults, so precision/recall/F1 describe the class encoded as 1.
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

print("\n --- Model Evaluation ---")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

In [None]:
# Per-class precision / recall / F1 for the baseline predictions.
# The display names are hard-coded and rely on the encoding 0 = BENIGN, 1 = Syn.
print("\nClassification Report:")
baseline_report = classification_report(
    y_test,
    y_pred,
    target_names=['BENIGN (0)', 'Syn (1)'],
)
print(baseline_report)

In [None]:
# Heatmap of the baseline confusion matrix (rows = true class, columns = predicted)
print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['BENIGN', 'Syn'],
    yticklabels=['BENIGN', 'Syn'],
    ax=ax,
)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Confusion Matrix')
plt.show()

# **TESTING VARIATIONS**

In [None]:
# Rebuild the feature matrix and target from the cleaned frame for this experiment
feature_columns = [column for column in df_selected.columns if column != 'Label']
X = df_selected[feature_columns]
y = df_selected['Label']

# Integer-encode the string labels with a fresh encoder
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [None]:
# Stratified 80/20 split. The test portion is left untouched (imbalanced) so it
# can serve as the realistic hold-out set.
holdout_split = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)
X_train, X_test, y_train, y_test = holdout_split

print("--- Before Balancing ---")
print(f"Training set shape: {X_train.shape}")
print(f"Distribution in training set:\n{pd.Series(y_train).value_counts()}")

In [None]:
# Randomly drop majority-class ('Syn') training rows until both classes are the
# same size as the minority class ('BENIGN'). Only the training split is resampled.
rus = RandomUnderSampler(random_state=42)
balanced_split = rus.fit_resample(X_train, y_train)
X_train_balanced, y_train_balanced = balanced_split

print("\n--- After Balancing ---")
print(f"New balanced training set shape: {X_train_balanced.shape}")
print(f"New distribution in training set:\n{pd.Series(y_train_balanced).value_counts()}")

In [None]:
# Scaler statistics come from the balanced training data only, so nothing from
# the test set leaks into preprocessing; the test set is merely transformed.
scaler = StandardScaler()
scaler.fit(X_train_balanced)
X_train_scaled = scaler.transform(X_train_balanced)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Same Random Forest configuration as before, now fitted on the BALANCED data
print("Training the Random Forest model on BALANCED data...")
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
)
rf_model.fit(X_train_scaled, y_train_balanced)
print("Model training complete!")

# Predictions are made on the original, IMBALANCED test set
y_pred = rf_model.predict(X_test_scaled)

In [None]:
# Per-class report for the balanced-data model, followed by fixed commentary text
print(
    "\n--- Model Evaluation on Realistic (Imbalanced) Test Data ---",
    "\nClassification Report:",
    sep="\n",
)
balanced_report = classification_report(
    y_test,
    y_pred,
    target_names=['BENIGN (0)', 'Syn (1)'],
)
print(balanced_report)

# The lines below are static text; they are printed regardless of the numbers above
for commentary_line in (
    "\n--- MENTOR'S ANALYSIS ---",
    "Notice the difference now. The overall accuracy might be slightly lower, but look at the 'BENIGN' row.",
    "Our goal was to improve the RECALL for the 'BENIGN' class, reducing the number of False Positives.",
):
    print(commentary_line)

In [None]:
# Confusion matrix of the balanced-data model on the imbalanced test set
print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Greens',
    xticklabels=['BENIGN', 'Syn'],
    yticklabels=['BENIGN', 'Syn'],
    ax=ax,
)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Confusion Matrix on Realistic Test Data')
plt.show()

# Row 0 / column 1 of the matrix: true BENIGN samples predicted as Syn
false_positives = cm[0, 1]
print(f"\nCRITICAL METRIC: The model misclassified {false_positives} BENIGN samples as attacks (False Positives).")
# Static commentary; it is not derived from a comparison computed in this cell
print(
    "This is the number we want to minimize. Our previous model had a higher error rate for this specific case.",
    "By training on balanced data, the model has learned to better distinguish benign traffic, making it more reliable for real-world deployment.",
    sep="\n",
)

# **POLISHING VARIATIONS**

In [None]:
"""
The previous code for loading, splitting, and balancing the training data is the same. We'll start from the point where the model is already trained. Let's assume 'rf_model' is our trained model and 'X_test_scaled' is our test data. Get the predicted probabilities for the positive class ('Syn') predict_proba returns two columns: [prob_of_0, prob_of_1]. We only need the probabilities for class 1 ('Syn').
"""
y_pred_probs = rf_model.predict_proba(X_test_scaled)[:, 1]
print("Successfully retrieved prediction probabilities for each sample in the test set.")

In [None]:
# Precision and recall at every candidate decision threshold.
# Note: `precision` and `recall` are rebound here from the scalar metrics of the
# earlier evaluation cell to arrays.
# NOTE(review): the threshold is selected on y_test, the same data the final
# report is computed on, so the reported final metrics are likely optimistic.
# Consider tuning on a separate validation split.
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_probs)

# Harmonic mean of precision and recall; the small epsilon in the denominator
# avoids division by zero when both are 0.
f1_numerator = 2 * (precision * recall)
f1_denominator = precision + recall + 1e-9
f1_scores = f1_numerator / f1_denominator

# Position of the highest F1 value and the threshold / score found there
best_f1_index = f1_scores.argmax()
best_threshold, best_f1_score = thresholds[best_f1_index], f1_scores[best_f1_index]

print(f"Best F1-Score: {best_f1_score:.4f}")
print(f"Optimal Threshold: {best_threshold:.4f}")

In [None]:
# Precision, recall and F1 as functions of the decision threshold. The last
# element of each score array is dropped so its length matches `thresholds`.
fig, ax = plt.subplots(figsize=(10, 7))
ax.plot(thresholds, precision[:-1], 'b--', label='Precision')
ax.plot(thresholds, recall[:-1], 'g-', label='Recall')
ax.plot(thresholds, f1_scores[:-1], 'r-', lw=2, label='F1-Score')
# Vertical marker at the threshold chosen in the previous cell
ax.axvline(x=best_threshold, color='k', linestyle='--', label=f'Optimal Threshold ({best_threshold:.2f})')
ax.set_xlabel('Decision Threshold')
ax.set_ylabel('Score')
ax.set_title('Precision, Recall, and F1-Score vs. Decision Threshold')
ax.legend(loc='best')
ax.grid(True)
plt.show()

In [None]:
# A sample is labelled 1 when its class-1 probability reaches the chosen
# threshold, otherwise 0.
meets_threshold = np.greater_equal(y_pred_probs, best_threshold)
y_pred_optimal = meets_threshold.astype(int)
print(f"--- Final Model Evaluation with Optimal Threshold ({best_threshold:.4f}) ---")

In [None]:
print("\nNew Classification Report:")
print(classification_report(y_test, y_pred_optimal, target_names=['BENIGN (0)', 'Syn (1)']))

print("\n--- MENTOR'S FINAL ANALYSIS ---")
print("This is the balanced result we were looking for. We've accepted a tiny, calculated risk")
print("to build a much more reliable and practical security model.")

In [None]:
# Confusion matrix for the predictions made with the tuned threshold
print("\nFinal Confusion Matrix:")
cm = confusion_matrix(y_test, y_pred_optimal)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='YlOrRd',
    xticklabels=['BENIGN', 'Syn'],
    yticklabels=['BENIGN', 'Syn'],
    ax=ax,
)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Final Confusion Matrix with Optimal Threshold')
plt.show()

# Off-diagonal cells: [0, 1] = true BENIGN predicted Syn, [1, 0] = true Syn predicted BENIGN
false_positives, false_negatives = cm[0, 1], cm[1, 0]

In [None]:
print(f"\nFinal Error Count:")
print(f"Benign traffic blocked (False Positives): {false_positives}")
print(f"Attack traffic missed (False Negatives): {false_negatives}")