<a href="https://colab.research.google.com/github/samer-glitch/samerelhajjhassan/blob/main/dataset30allfeatures.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import files
uploaded = files.upload()




Saving dataset30allfeatures.csv to dataset30allfeatures.csv


In [2]:
import pandas as pd
import numpy as np
import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE

# Start the timer
start_time = datetime.datetime.now()

# Load the dataset
df = pd.read_csv('dataset30allfeatures.csv')
print("Dataset Loaded - Time:", datetime.datetime.now() - start_time)

def label_traffic(row):
    # Define hypothetical thresholds for normal behavior
    normal_port_range = range(0, 49152)  # Common port range
    normal_protocols = [6, 17]  # TCP and UDP
    normal_pkt_count_range = range(0, 2000)
    normal_octet_count_range = range(0, 1000000)
    normal_packet_size_range = range(20, 1500)
    normal_flow_duration_range = range(50, 3600000)  # 50 ms to 1 hour
    normal_piat_range = range(0, 10000)  # 0 to 10 seconds
    normal_flow_start_end_range = range(0, int(1e12))  # Assuming timestamp in milliseconds

    minor_violations = 0
    major_violations = 0

    # Major violations
    if (row['proto'] not in normal_protocols or
        row['b_flowStart'] not in normal_flow_start_end_range or
        row['b_flowEnd'] not in normal_flow_start_end_range or
        row['flowEndReason'] == 1):
        major_violations += 1

    # Minor violations
    if (row['src_port'] not in normal_port_range or
        row['dst_port'] not in normal_port_range or
        row['pktTotalCount'] not in normal_pkt_count_range or
        row['octetTotalCount'] not in normal_octet_count_range or
        row['min_ps'] not in normal_packet_size_range or
        row['max_ps'] not in normal_packet_size_range or
        row['avg_ps'] not in normal_packet_size_range or
        row['std_dev_ps'] not in normal_packet_size_range or
        row['flowDuration'] not in normal_flow_duration_range or
        row['min_piat'] not in normal_piat_range or
        row['max_piat'] not in normal_piat_range or
        row['avg_piat'] not in normal_piat_range or
        row['std_dev_piat'] not in normal_piat_range or
        row['f_pktTotalCount'] not in normal_pkt_count_range or
        row['f_octetTotalCount'] not in normal_octet_count_range or
        row['f_min_ps'] not in normal_packet_size_range or
        row['f_max_ps'] not in normal_packet_size_range or
        row['f_avg_ps'] not in normal_packet_size_range or
        row['f_std_dev_ps'] not in normal_packet_size_range or
        row['f_flowDuration'] not in normal_flow_duration_range or
        row['f_min_piat'] not in normal_piat_range or
        row['f_max_piat'] not in normal_piat_range or
        row['f_avg_piat'] not in normal_piat_range or
        row['f_std_dev_piat'] not in normal_piat_range or
        row['b_pktTotalCount'] not in normal_pkt_count_range or
        row['b_octetTotalCount'] not in normal_octet_count_range or
        row['b_min_ps'] not in normal_packet_size_range or
        row['b_max_ps'] not in normal_packet_size_range or
        row['b_avg_ps'] not in normal_packet_size_range or
        row['b_std_dev_ps'] not in normal_packet_size_range or
        row['b_flowDuration'] not in normal_flow_duration_range or
        row['b_min_piat'] not in normal_piat_range or
        row['b_max_piat'] not in normal_piat_range or
        row['b_avg_piat'] not in normal_piat_range or
        row['b_std_dev_piat'] not in normal_piat_range):
        minor_violations += 1

    # Rule for labeling as 'malicious'
    if major_violations > 0 or minor_violations >= 2:
        return 'malicious'
    else:
        return 'benign'

# Apply the labeling function
df['category'] = df.apply(label_traffic, axis=1)
print("Labeling function Applied - Time:", datetime.datetime.now() - start_time)

# Encoding the categorical labels
label_encoder = LabelEncoder()
df['category'] = label_encoder.fit_transform(df['category'])
print("Categorical labels Encoded - Time:", datetime.datetime.now() - start_time)

# Define features and target variable
X = df.drop(columns=['category'])
y = df['category']
print("Features and target variable Defined - Time:", datetime.datetime.now() - start_time)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print("Data Split into Train and Test Sets - Time:", datetime.datetime.now() - start_time)

# Define preprocessing steps for numeric and categorical columns
numeric_features = X.select_dtypes(include=['number']).columns.tolist()
categorical_features = ['application_protocol', 'web_service']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Apply preprocessing to training data
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)
print("Preprocessing Applied - Time:", datetime.datetime.now() - start_time)

# Split a subset for SMOTE
X_train_subset, _, y_train_subset, _ = train_test_split(X_train_preprocessed, y_train, test_size=0.5, random_state=42, stratify=y_train)

# Find the smallest class size in the training subset
smallest_class_size = y_train_subset.value_counts().min()

# Set k_neighbors for SMOTE
k_neighbors = min(smallest_class_size - 1, 5)  # Ensure it's at least 1

smote = SMOTE(sampling_strategy='auto', k_neighbors=k_neighbors, random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_subset, y_train_subset)
print("SMOTE Applied - Time:", datetime.datetime.now() - start_time)

# Define classifiers with simplified parameters
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1, class_weight='balanced_subsample')
ab_clf = AdaBoostClassifier(random_state=42, n_estimators=50)
gb_clf = GradientBoostingClassifier(random_state=42, n_estimators=100)

# Fit the classifiers to the resampled training data
rf_clf.fit(X_train_resampled, y_train_resampled)
ab_clf.fit(X_train_resampled, y_train_resampled)
gb_clf.fit(X_train_resampled, y_train_resampled)
print("Classifiers Fitted - Time:", datetime.datetime.now() - start_time)

# Make predictions on the test set
rf_test_predictions = rf_clf.predict(X_test_preprocessed)
ab_test_predictions = ab_clf.predict(X_test_preprocessed)
gb_test_predictions = gb_clf.predict(X_test_preprocessed)
print("Predictions Made - Time:", datetime.datetime.now() - start_time)

# Calculate and print accuracy and F1-score for each classifier
rf_accuracy = accuracy_score(y_test, rf_test_predictions)
rf_f1_score = f1_score(y_test, rf_test_predictions)
ab_accuracy = accuracy_score(y_test, ab_test_predictions)
ab_f1_score = f1_score(y_test, ab_test_predictions)
gb_accuracy = accuracy_score(y_test, gb_test_predictions)
gb_f1_score = f1_score(y_test, gb_test_predictions)

print("Random Forest Classifier:")
print("Accuracy:", rf_accuracy)
print("F1-Score:", rf_f1_score)

print("\nAdaBoost Classifier:")
print("Accuracy:", ab_accuracy)
print("F1-Score:", ab_f1_score)

print("\nGradient Boosting Classifier:")
print("Accuracy:", gb_accuracy)
print("F1-Score:", gb_f1_score)

print("Accuracy and F1-score Calculated - Time:", datetime.datetime.now() - start_time)


Dataset Loaded - Time: 0:00:00.027431
Labeling function Applied - Time: 1:24:08.686636
Categorical labels Encoded - Time: 1:24:08.688882
Features and target variable Defined - Time: 1:24:08.696614
Data Split into Train and Test Sets - Time: 1:24:08.699962
Preprocessing Applied - Time: 1:24:08.728797
SMOTE Applied - Time: 1:24:08.817894
Classifiers Fitted - Time: 1:24:09.208935
Predictions Made - Time: 1:24:09.243030
Random Forest Classifier:
Accuracy: 0.75
F1-Score: 0.5

AdaBoost Classifier:
Accuracy: 0.625
F1-Score: 0.4

Gradient Boosting Classifier:
Accuracy: 0.375
F1-Score: 0.28571428571428575
Accuracy and F1-score Calculated - Time: 1:24:09.254310
