In [1]:
# import used library
import os
import shutil

import kagglehub
import numpy as np
import pandas as pd
from google.colab import drive
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

In [2]:
# Fetch the latest version of the dataset through kagglehub; `path` is the
# local directory that holds the downloaded files.
dataset_handle = "dhoogla/nftoniotv2"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/dhoogla/nftoniotv2?dataset_version_number=2...


100%|██████████| 174M/174M [00:02<00:00, 71.6MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/dhoogla/nftoniotv2/versions/2


In [3]:
# Location of the Parquet file inside the downloaded dataset directory
parquet_file_path = f"{path}/NF-ToN-IoT-V2.parquet"

# Load the file, turn +/-inf into NaN, then discard every row that contains
# at least one NaN. The original row index is kept (it is not reset).
full_df = (
    pd.read_parquet(parquet_file_path)
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)

In [4]:
# Sampling configuration
# final_sample_amount: approximate number of rows wanted in the reduced set.
final_sample_amount = 15000

# Keep one row out of every `sampling_step`. The step is derived from the
# actual size of the cleaned frame rather than a hard-coded row count, so it
# stays correct if the dataset version or the cleaning step changes.
# Integer division rounds the step down, so the sample ends up slightly
# larger than final_sample_amount. max(1, ...) keeps the step valid when the
# frame has fewer rows than requested.
sampling_step = max(1, len(full_df) // final_sample_amount)

In [5]:
# Report the dimensions of the cleaned full data set
print(
    f"Shape of primary data set is {full_df.shape}",
    f"The number of samples in main data set is equal to: {len(full_df)}",
    sep="\n",
)

Shape of primary data set is (13135881, 43)
The number of samples in main data set is equal to: 13135881


In [6]:
# Systematic sampling: starting at the first row of full_df, keep every
# sampling_step-th row (selected by position, not by index label).
shorter_df = full_df.iloc[::sampling_step]

# Persist the reduced data set as CSV, without the index column
shorter_df.to_csv('shorter dataset.csv', index=False)

In [7]:
# Report the dimensions of the down-sampled data set
print(
    f"Shape of secondary data set is {shorter_df.shape}",
    f"The number of samples in shortened data set is equal to: {len(shorter_df)}",
    sep="\n",
)

Shape of secondary data set is (15013, 43)
The number of samples in shortened data set is equal to: 15013


In [8]:
# Class distribution of the complete data set (full_df).
# The last column of the frame is treated as the label column.
label_column = full_df.columns[-1]

# Absolute frequency of each label and its share of all rows, in percent
label_counts = full_df[label_column].value_counts()
label_percentages = label_counts / len(full_df) * 100

# One table holding both views, indexed by label
summary_df = pd.DataFrame({'Count': label_counts, 'Percentage': label_percentages})

# Show the result
print("Summary of full data set", summary_df, sep="\n")

Summary of full data set
              Count  Percentage
Attack                         
Benign      3601284   27.415626
scanning    3002169   22.854721
xss         2449955   18.650862
ddos        1746590   13.296329
password     993718    7.564913
injection    660467    5.027961
dos          654359    4.981463
backdoor      16259    0.123775
mitm           7723    0.058793
ransomware     3357    0.025556


In [9]:
# Class distribution of the down-sampled data set (shorter_df).
# The last column of the frame is treated as the label column.
label_column = shorter_df.columns[-1]

# Absolute frequency of each label and its share of all rows, in percent
label_counts = shorter_df[label_column].value_counts()
label_percentages = label_counts / len(shorter_df) * 100

# One table holding both views, indexed by label
summary_df = pd.DataFrame({'Count': label_counts, 'Percentage': label_percentages})

# Show the result
print("Summary of shortened data set", summary_df, sep="\n")

Summary of shortened data set
            Count  Percentage
Attack                       
Benign       4094   27.269700
scanning     3437   22.893492
xss          2803   18.670486
ddos         2035   13.554919
password     1149    7.653367
dos           741    4.935722
injection     726    4.835809
backdoor       17    0.113235
mitm            6    0.039965
ransomware      5    0.033304


In [10]:
# Show a single example row (position 658) of the down-sampled frame
raw_example_row = shorter_df.iloc[658]
print(raw_example_row)

L4_SRC_PORT                     64263
L4_DST_PORT                     61532
PROTOCOL                            6
L7_PROTO                          0.0
IN_BYTES                           44
IN_PKTS                             1
OUT_BYTES                           0
OUT_PKTS                            0
TCP_FLAGS                           2
CLIENT_TCP_FLAGS                    2
SERVER_TCP_FLAGS                    0
FLOW_DURATION_MILLISECONDS          0
DURATION_IN                         0
DURATION_OUT                        0
MIN_TTL                             0
MAX_TTL                             0
LONGEST_FLOW_PKT                   44
SHORTEST_FLOW_PKT                  44
MIN_IP_PKT_LEN                      0
MAX_IP_PKT_LEN                     44
SRC_TO_DST_SECOND_BYTES          44.0
DST_TO_SRC_SECOND_BYTES           0.0
RETRANSMITTED_IN_BYTES              0
RETRANSMITTED_IN_PKTS               0
RETRANSMITTED_OUT_BYTES             0
RETRANSMITTED_OUT_PKTS              0
SRC_TO_DST_A

In [11]:
# Split the down-sampled frame into a training part and a testing part:
# 20% of the rows are held out for testing, with a fixed seed so the split
# is the same on every run.
# NOTE(review): no `stratify` argument is passed, so the very rare classes
# are distributed between the two parts by chance - consider stratifying.
split_options = {"test_size": 0.2, "random_state": 42}
train_df, test_df = train_test_split(shorter_df, **split_options)

# Function to calculate label percentages
def label_percentage(df, label_column=None):
    """
    Summarize how often each label value occurs in a DataFrame.

    Parameters:
    - df: DataFrame that contains the label column.
    - label_column: name of the column to summarize. When None (default),
      the last column of `df` is used.

    Returns:
    - DataFrame indexed by label value with two columns: 'Count' (number of
      rows carrying the label) and 'Percentage' (that count as a percentage
      of len(df)).
    """
    if label_column is None:
        label_column = df.columns[-1]  # Fall back to the last column

    counts = df[label_column].value_counts()
    shares = counts / len(df) * 100
    return pd.DataFrame({'Count': counts, 'Percentage': shares})

# Build the label summary for each part of the split
train_summary, test_summary = (
    label_percentage(part) for part in (train_df, test_df)
)

# Show both summaries, separated by a blank line
print("Training Set Label Summary:", train_summary, sep="\n")
print("\nTesting Set Label Summary:", test_summary, sep="\n")

Training Set Label Summary:
            Count  Percentage
Attack                       
Benign       3270   27.227311
scanning     2722   22.664446
xss          2235   18.609492
ddos         1645   13.696919
password      930    7.743547
injection     601    5.004163
dos           585    4.870941
backdoor       13    0.108243
mitm            5    0.041632
ransomware      4    0.033306

Testing Set Label Summary:
            Count  Percentage
Attack                       
Benign        824   27.439227
scanning      715   23.809524
xss           568   18.914419
ddos          390   12.987013
password      219    7.292707
dos           156    5.194805
injection     125    4.162504
backdoor        4    0.133200
ransomware      1    0.033300
mitm            1    0.033300


In [12]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

def min_max_normalize(train_df, test_df, n_label_columns=2):
    """
    Normalize the training and testing datasets using Min-Max normalization.

    The scaler is fitted on the training features only; the testing features
    are transformed with that same fitted scaler. The trailing label columns
    are passed through unchanged.

    Parameters:
    - train_df: DataFrame containing the training data.
    - test_df: DataFrame containing the testing data, with the same column
      layout as train_df.
    - n_label_columns: number of trailing columns that hold labels and must
      not be scaled (default 2).

    Returns:
    - normalized_train_df: DataFrame with normalized training data.
    - normalized_test_df: DataFrame with normalized testing data.
    Both frames get a fresh 0..n-1 index; the original row index is dropped.

    Raises:
    - ValueError: if n_label_columns is negative or leaves no feature column.
    """
    n_columns = train_df.shape[1]
    if not 0 <= n_label_columns < n_columns:
        raise ValueError(
            f"n_label_columns must be between 0 and {n_columns - 1}, "
            f"got {n_label_columns}"
        )

    # Position of the first label column; computed explicitly so that
    # n_label_columns=0 works (a `:-0` slice would select nothing).
    n_features = n_columns - n_label_columns

    def _reassemble(source_df, scaled_values):
        """Combine scaled feature values with the untouched label columns."""
        scaled_df = pd.DataFrame(scaled_values, columns=source_df.columns[:n_features])
        labels = source_df.iloc[:, n_features:].reset_index(drop=True)
        return pd.concat([scaled_df, labels], axis=1)

    scaler = MinMaxScaler()

    # Fit on the training features, then reuse the fitted scaler for testing
    scaled_train = scaler.fit_transform(train_df.iloc[:, :n_features])
    scaled_test = scaler.transform(test_df.iloc[:, :n_features])

    normalized_train_df = _reassemble(train_df, scaled_train)
    normalized_test_df = _reassemble(test_df, scaled_test)

    return normalized_train_df, normalized_test_df

# Example usage:
# Assuming you have your train and test DataFrames ready as train_df and test_df
# normalized_train, normalized_test = min_max_normalize(train_df, test_df)

# Normalize both training and testing sets
train_normalized, test_normalized = min_max_normalize(train_df, test_df)

In [13]:
# Show a single example row (position 658) of the normalized training frame
normalized_example_row = train_normalized.iloc[658]
print(normalized_example_row)

L4_SRC_PORT                    0.593673
L4_DST_PORT                    0.000809
PROTOCOL                       0.280702
L7_PROTO                       0.021756
IN_BYTES                       0.000112
IN_PKTS                             0.0
OUT_BYTES                      0.005123
OUT_PKTS                       0.006711
TCP_FLAGS                           0.0
CLIENT_TCP_FLAGS                    0.0
SERVER_TCP_FLAGS                    0.0
FLOW_DURATION_MILLISECONDS          0.0
DURATION_IN                         0.0
DURATION_OUT                        0.0
MIN_TTL                             0.0
MAX_TTL                             0.0
LONGEST_FLOW_PKT               0.063594
SHORTEST_FLOW_PKT              0.060185
MIN_IP_PKT_LEN                  0.72093
MAX_IP_PKT_LEN                 0.063594
SRC_TO_DST_SECOND_BYTES             0.0
DST_TO_SRC_SECOND_BYTES             0.0
RETRANSMITTED_IN_BYTES              0.0
RETRANSMITTED_IN_PKTS               0.0
RETRANSMITTED_OUT_BYTES             0.0


In [18]:
# Save normalized training and testing sets to CSV files
train_normalized.to_csv('normalized_train.csv', index=False)
test_normalized.to_csv('normalized_test.csv', index=False)

# Mount Google Drive
drive.mount('/content/drive')

# Make sure the target folder exists; shutil.move fails if it is missing
drive_path = '/content/drive/MyDrive/IDS Dataset 2/'
os.makedirs(drive_path, exist_ok=True)

# Move the CSV files to Google Drive.
# NOTE(review): the file name carries the requested final_sample_amount,
# not the actual number of rows in the sampled set.
target_prefix = drive_path + 'NF-ToN-IoT-V2_' + str(final_sample_amount) + ' samples_minmax_normalized_'
shutil.move('normalized_train.csv', target_prefix + 'train.csv')
shutil.move('normalized_test.csv', target_prefix + 'test.csv')

print("Files have been uploaded to Google Drive.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Files have been uploaded to Google Drive.
