In [None]:
# import used library
import kagglehub
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from google.colab import drive
import shutil

In [None]:
# Define constant values
# calculate sampling step
final_sample_amount = 15000
sampling_step = 30420086 // final_sample_amount

In [None]:
# Download latest version
path = kagglehub.dataset_download("dhoogla/nfbotiotv2")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/dhoogla/nfbotiotv2?dataset_version_number=2...


100%|██████████| 421M/421M [00:07<00:00, 58.5MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/dhoogla/nfbotiotv2/versions/2


In [None]:
# Define the path to the Parquet file
parquet_file_path = f"{path}/NF-BoT-IoT-V2.parquet"

# Load the Parquet file into a pandas DataFrame
full_df = pd.read_parquet(parquet_file_path)

# Replace inf and -inf with NaN
full_df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Drop rows with any NaN values
full_df.dropna(inplace=True)

In [None]:
print(f"Shape of primary data set is {full_df.shape}")
print(f"The number of samples in main data set is equal to: {len(full_df)}")

Shape of primary data set is (30420086, 43)
The number of samples in main data set is equal to: 30420086


In [None]:
# Assuming full_df is your existing DataFrame
# Create a new DataFrame with rows at multiple indices of sampling_step
shorter_df = full_df.iloc[range(0, len(full_df), sampling_step)]

# Save the new DataFrame to a CSV file
shorter_df.to_csv('shorter dataset.csv', index=False)

In [None]:
print(f"Shape of secondary data set is {shorter_df.shape}")
print(f"The number of samples in shortened data set is equal to: {len(shorter_df)}")

Shape of secondary data set is (15001, 43)
The number of samples in shortened data set is equal to: 15001


In [None]:
# Assuming shorter_df is your DataFrame
# Get the last column name (assuming it's the label column)
label_column = shorter_df.columns[-1]

# Count occurrences of each label
label_counts = shorter_df[label_column].value_counts()

# Calculate percentages
label_percentages = (label_counts / len(shorter_df)) * 100

# Create a summary DataFrame
summary_df = pd.DataFrame({
    'Count': label_counts,
    'Percentage': label_percentages
})

# Print the summary
print(summary_df)

                Count  Percentage
Attack                           
DDoS             7070   47.130191
DoS              6719   44.790347
Reconnaissance   1139    7.592827
Benign             72    0.479968
Theft               1    0.006666


In [None]:
# Print one row of the dataset
print(shorter_df.iloc[658])  # Prints example row of the DataFrame

L4_SRC_PORT                      10504
L4_DST_PORT                         80
PROTOCOL                            17
L7_PROTO                         188.0
IN_BYTES                            56
IN_PKTS                              2
OUT_BYTES                            0
OUT_PKTS                             0
TCP_FLAGS                            0
CLIENT_TCP_FLAGS                     0
SERVER_TCP_FLAGS                     0
FLOW_DURATION_MILLISECONDS     4293951
DURATION_IN                       1016
DURATION_OUT                         0
MIN_TTL                             64
MAX_TTL                             64
LONGEST_FLOW_PKT                    28
SHORTEST_FLOW_PKT                   28
MIN_IP_PKT_LEN                       0
MAX_IP_PKT_LEN                      28
SRC_TO_DST_SECOND_BYTES         2828.0
DST_TO_SRC_SECOND_BYTES            0.0
RETRANSMITTED_IN_BYTES               0
RETRANSMITTED_IN_PKTS                0
RETRANSMITTED_OUT_BYTES              0
RETRANSMITTED_OUT_PKTS   

In [None]:
# Assuming shorter_df is your DataFrame
# Split the DataFrame into training and testing sets
train_df, test_df = train_test_split(shorter_df, test_size=0.2, random_state=42)

# Function to calculate label percentages
def label_percentage(df):
    label_column = df.columns[-1]  # Assuming the last column is the label column
    label_counts = df[label_column].value_counts()
    percentages = (label_counts / len(df)) * 100
    return pd.DataFrame({'Count': label_counts, 'Percentage': percentages})

# Calculate percentages for training and testing sets
train_summary = label_percentage(train_df)
test_summary = label_percentage(test_df)

# Print the summaries
print("Training Set Label Summary:")
print(train_summary)

print("\nTesting Set Label Summary:")
print(test_summary)

Training Set Label Summary:
                Count  Percentage
Attack                           
DDoS             5671   47.258333
DoS              5340   44.500000
Reconnaissance    934    7.783333
Benign             54    0.450000
Theft               1    0.008333

Testing Set Label Summary:
                Count  Percentage
Attack                           
DDoS             1399   46.617794
DoS              1379   45.951350
Reconnaissance    205    6.831056
Benign             18    0.599800


In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

def min_max_normalize(train_df, test_df):
    """
    Normalize the training and testing datasets using Min-Max normalization.

    Parameters:
    - train_df: DataFrame containing the training data.
    - test_df: DataFrame containing the testing data.

    Returns:
    - normalized_train_df: DataFrame with normalized training data.
    - normalized_test_df: DataFrame with normalized testing data.
    """

    # Create a MinMaxScaler object
    scaler = MinMaxScaler()

    # Separate features from labels in the training set
    train_features = train_df.iloc[:, :-2]  # All columns except the last two
    train_labels = train_df.iloc[:, -2:]     # Last two columns (labels)

    # Fit the scaler on the training features and transform them
    normalized_train_features = scaler.fit_transform(train_features)

    # Create a new DataFrame for normalized training data
    normalized_train_df = pd.DataFrame(normalized_train_features, columns=train_features.columns)

    # Add back the labels to the normalized training DataFrame
    normalized_train_df = pd.concat([normalized_train_df, train_labels.reset_index(drop=True)], axis=1)

    # Separate features from labels in the testing set
    test_features = test_df.iloc[:, :-2]  # All columns except the last two
    test_labels = test_df.iloc[:, -2:]     # Last two columns (labels)

    # Transform the testing features using the same scaler
    normalized_test_features = scaler.transform(test_features)

    # Create a new DataFrame for normalized testing data
    normalized_test_df = pd.DataFrame(normalized_test_features, columns=test_features.columns)

    # Add back the labels to the normalized testing DataFrame
    normalized_test_df = pd.concat([normalized_test_df, test_labels.reset_index(drop=True)], axis=1)

    return normalized_train_df, normalized_test_df

# Example usage:
# Assuming you have your train and test DataFrames ready as train_df and test_df
# normalized_train, normalized_test = normalize_datasets(train_df, test_df)

# Normalize both training and testing sets
train_normalized, test_normalized = min_max_normalize(train_df, test_df)

In [14]:
# Save normalized training and testing sets to CSV files
train_normalized.to_csv('normalized_train.csv', index=False)
test_normalized.to_csv('normalized_test.csv', index=False)

# Mount Google Drive
drive.mount('/content/drive')

# Move the CSV files to Google Drive
drive_path = '/content/drive/MyDrive/IDS Dataset 2/'
shutil.move('normalized_train.csv', drive_path + 'NF-BoT-IoT-V2_' + str(final_sample_amount) + ' samples_minmax_normalized_train.csv')
shutil.move('normalized_test.csv', drive_path + 'NF-BoT-IoT-V2_' + str(final_sample_amount) + ' samples_minmax_normalized_test.csv')

print("Files have been uploaded to Google Drive.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Files have been uploaded to Google Drive.


In [15]:
# Print one row of the dataset
print(train_normalized.iloc[658])  # Prints example row of the DataFrame

L4_SRC_PORT                     0.32815
L4_DST_PORT                    0.501252
PROTOCOL                            1.0
L7_PROTO                       0.789916
IN_BYTES                       0.002941
IN_PKTS                        0.021739
OUT_BYTES                           0.0
OUT_PKTS                            0.0
TCP_FLAGS                           0.0
CLIENT_TCP_FLAGS                    0.0
SERVER_TCP_FLAGS                    0.0
FLOW_DURATION_MILLISECONDS     0.999746
DURATION_IN                    0.440241
DURATION_OUT                        0.0
MIN_TTL                             1.0
MAX_TTL                             1.0
LONGEST_FLOW_PKT                    0.0
SHORTEST_FLOW_PKT                   0.0
MIN_IP_PKT_LEN                      0.0
MAX_IP_PKT_LEN                      0.0
SRC_TO_DST_SECOND_BYTES         0.00001
DST_TO_SRC_SECOND_BYTES             0.0
RETRANSMITTED_IN_BYTES              0.0
RETRANSMITTED_IN_PKTS               0.0
RETRANSMITTED_OUT_BYTES             0.0
