In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.utils import resample
from sklearn import preprocessing
from warnings import simplefilter
from imblearn.under_sampling import RandomUnderSampler

simplefilter(action='ignore', category=FutureWarning)


start_time = time.time()


In [3]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder

processed_dataframes = []

std_scaler = StandardScaler()


def normalize_dataframe(df, columns_to_normalize):
    df[columns_to_normalize] = std_scaler.fit_transform(df[columns_to_normalize])
    return df


folder_path = '/content/drive/MyDrive/MachineLearningCVE/'


all_files = [
    "TWH.pcap_ISCX",
    "WWH.pcap_ISCX",
    "TWHMW.pcap_ISCX",
    "TWHAI.pcap_ISCX",
    "FWHM.pcap_ISCX",
    "FWHAP.pcap_ISCX",
    "FWHAD.pcap_ISCX"
]


for file_name in all_files:
    file_path = folder_path + file_name + ".csv"

   
    try:
        df = pd.read_csv(file_path, encoding='iso-8859-2', engine='python')
    except UnicodeDecodeError:
        df = pd.read_csv(file_path, encoding='utf-8', engine='python')

    df = pd.DataFrame(df)

   
    df.columns = df.columns.str.strip()

    df = df.drop(df[pd.isnull(df["Flow Duration"])].index)

    df.replace([np.inf, -np.inf], np.nan, inplace=True)


    df.dropna(inplace=True)

    numeric_columns = df.select_dtypes(include='number').columns
    df[numeric_columns] = df[numeric_columns].astype(np.float32)
    df = normalize_dataframe(df.copy(), numeric_columns)


    string_columns = [col for col in df.columns if df[col].dtype == "object"]
    try:
        string_columns.remove('Label')
    except ValueError:
        pass


    label_encoder_X = LabelEncoder()
    for col in string_columns:
        try:
            df[col] = label_encoder_X.fit_transform(df[col])
        except:
            df[col] = df[col].replace('Infinity', -1)

    processed_dataframes.append(df)
    print(f"Preprocessing and undersampling of file {file_name} is done")

combined_dataframe = pd.concat(processed_dataframes, ignore_index=True)

combined_dataframe.to_csv("/content/combined_data.csv", index=False)
print("Concatenation and saving to CSV is done")


Preprocessing and undersampling of file Tuesday-WorkingHours.pcap_ISCX is done
Preprocessing and undersampling of file Wednesday-workingHours.pcap_ISCX is done
Preprocessing and undersampling of file Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX is done
Preprocessing and undersampling of file Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX is done
Preprocessing and undersampling of file Friday-WorkingHours-Morning.pcap_ISCX is done
Preprocessing and undersampling of file Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX is done
Preprocessing and undersampling of file Friday-WorkingHours-Afternoon-DDos.pcap_ISCX is done
Concatenation and saving to CSV is done


In [5]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import numpy as np
import os


attack_types = ["Bot", "DDoS", "DoS GoldenEye", "DoS Hulk", "DoS Slowhttptest", "DoS slowloris", "FTP-Patator",
                "Heartbleed", "Infiltration", "PortScan", "SSH-Patator", "Web Attack - Brute Force",
                "Web Attack - Sql Injection", "Web Attack - XSS"]
benign_type = "BENIGN"


In [7]:

import pandas as pd
import numpy as np
import time
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.utils import resample
from sklearn import preprocessing
from warnings import simplefilter
from imblearn.under_sampling import RandomUnderSampler


simplefilter(action='ignore', category=FutureWarning)


start_time = time.time()

main_dataset = pd.read_csv("combined_data.csv")


attack_types = ["DDoS", "Infiltration", "PortScan", "SSH-Patator", "Web Attack  Brute Force",
                "Web Attack  Sql Injection", "Web Attack  XSS"]
benign_type = "BENIGN"

for attack_type in attack_types:

    attack_data = main_dataset[main_dataset["Label"] == attack_type]


    benign_data = main_dataset[main_dataset["Label"] == benign_type]

    combined_data = pd.concat([attack_data, benign_data], axis=0)

    combined_data = combined_data.sample(frac=1, random_state=42)

    output_filename = f"{attack_type}_vs_{benign_type}.csv"
    combined_data.to_csv(output_filename, index=False)
    print(f"Saved {output_filename}")


end_time = time.time()
execution_time = end_time - start_time
print(f"Execution time: {execution_time:.2f} seconds")


Saved DDoS_vs_BENIGN.csv
Saved Infiltration_vs_BENIGN.csv
Saved PortScan_vs_BENIGN.csv
Saved SSH-Patator_vs_BENIGN.csv
Saved Web Attack  Brute Force_vs_BENIGN.csv
Saved Web Attack  Sql Injection_vs_BENIGN.csv
Saved Web Attack  XSS_vs_BENIGN.csv
Execution time: 1826.76 seconds


In [8]:
import shutil


folder_path = '/content/drive/MyDrive/MLCVE/'


output_files = [
    "DDoS_vs_BENIGN.csv",
    "Infiltration_vs_BENIGN.csv",
    "PortScan_vs_BENIGN.csv",
    "SSH-Patator_vs_BENIGN.csv",
    "Web Attack  Brute Force_vs_BENIGN.csv",
    "Web Attack  Sql Injection_vs_BENIGN.csv",
    "Web Attack  XSS_vs_BENIGN.csv"
]

import os
os.makedirs(folder_path, exist_ok=True)


for file in output_files:
    try:
        
        if os.path.exists(file):
            shutil.move(file, folder_path + file)  # Move the file to Google Drive
            print(f"Moved {file} to Google Drive.")
        else:
            print(f"{file} does not exist locally.")
    except Exception as e:
        print(f"Error moving {file}: {e}")


Moved DDoS_vs_BENIGN.csv to Google Drive.
Moved Infiltration_vs_BENIGN.csv to Google Drive.
Moved PortScan_vs_BENIGN.csv to Google Drive.
Moved SSH-Patator_vs_BENIGN.csv to Google Drive.
Moved Web Attack  Brute Force_vs_BENIGN.csv to Google Drive.
Moved Web Attack  Sql Injection_vs_BENIGN.csv to Google Drive.
Moved Web Attack  XSS_vs_BENIGN.csv to Google Drive.


In [1]:
import pandas as pd
import glob

folder_path = '/content/drive/MyDrive/MLCVE/'


file_names = [
    'DDoS_vs_BENIGN.csv',
    'Infiltration_vs_BENIGN.csv',
    'PortScan_vs_BENIGN.csv',
    'SSH-Patator_vs_BENIGN.csv',
    'Web Attack  Brute Force_vs_BENIGN.csv',
    'Web Attack  Sql Injection_vs_BENIGN.csv',
    'Web Attack  XSS_vs_BENIGN.csv'
]

for file_name in file_names:
  
    file_path = folder_path + file_name

 
    data = pd.read_csv(file_path)

    # Count the number of benign and attack instances
    num_benign = (data['Label'] == 'BENIGN').sum()
    num_attack = (data['Label'] != 'BENIGN').sum()

    print(f"File: {file_name}")
    print(f"Number of Benign instances: {num_benign}")
    print(f"Number of Attack instances: {num_attack}")
    print("Shape of the dataset:", data.shape)
    print("-----------------------------")


File: DDoS_vs_BENIGN.csv
Number of Benign instances: 1741839
Number of Attack instances: 128025
Shape of the dataset: (1869864, 79)
-----------------------------
File: Infiltration_vs_BENIGN.csv
Number of Benign instances: 1741839
Number of Attack instances: 36
Shape of the dataset: (1741875, 79)
-----------------------------
File: PortScan_vs_BENIGN.csv
Number of Benign instances: 1741839
Number of Attack instances: 158804
Shape of the dataset: (1900643, 79)
-----------------------------
File: SSH-Patator_vs_BENIGN.csv
Number of Benign instances: 1741839
Number of Attack instances: 5897
Shape of the dataset: (1747736, 79)
-----------------------------
File: Web Attack  Brute Force_vs_BENIGN.csv
Number of Benign instances: 1741839
Number of Attack instances: 0
Shape of the dataset: (1741839, 79)
-----------------------------
File: Web Attack  Sql Injection_vs_BENIGN.csv
Number of Benign instances: 1741839
Number of Attack instances: 0
Shape of the dataset: (1741839, 79)
-------------

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor

# Folder path where your files are located in Google Drive
folder_path = '/content/drive/MyDrive/MLCVE/'


attack_types = ["DDoS", "Infiltration", "PortScan", "SSH-Patator", "Web Attack - Brute Force",
                "Web Attack - Sql Injection", "Web Attack - XSS"]
benign_type = "BENIGN"


def perform_feature_selection(data):
    X = data.drop(columns=["Label"])
    y = data["Label"].apply(lambda x: 1 if x != benign_type else 0)

    clf = RandomForestRegressor(n_estimators=100, random_state=42)
    clf.fit(X, y)

    importances = clf.feature_importances_
    return importances

for attack_type in attack_types:

    input_filename = f"{folder_path}{attack_type}_vs_{benign_type}.csv"

    try:
    
        attack_data = pd.read_csv(input_filename, low_memory=False)


        importances = perform_feature_selection(attack_data)


        importance_df = pd.DataFrame({"Feature": attack_data.drop(columns=["Label"]).columns,
                                      "Importance": importances})

 
        total_importance = importance_df["Importance"].sum()
        importance_df["Percentage"] = importance_df["Importance"] / total_importance * 100


        importance_df = importance_df.sort_values(by="Importance", ascending=False)


        print(f"\nTop 20 features and their percentages for {attack_type}:")
        print(importance_df.head(20))

        importance_filename = f"{folder_path}{attack_type}_importance.csv"
        importance_df.to_csv(importance_filename, index=False)
        print(f"Saved importance list for {attack_type}")

        
        plt.figure(figsize=(10, 6))
        top_20_df = importance_df.head(20)
        top_20_df.plot(kind="bar", x="Feature", y="Importance", legend=None)
        plt.title(f"Feature Importance for {attack_type}")
        plt.xlabel("Feature")
        plt.ylabel("Importance")
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

    except FileNotFoundError:
        print(f"File {input_filename} not found. Please check the file path.")
    except Exception as e:
        print(f"An error occurred while processing {attack_type}: {e}")

print("Feature selection and visualization completed for all attack types.")
