In [1]:
import kagglehub

# Kaggle handle ("owner/dataset") of the dataset this notebook works on.
dataset_handle = "dhoogla/nfbotiotv2"

# Fetch the latest published version and remember where kagglehub placed it;
# the Parquet file is read from this location afterwards.
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/dhoogla/nfbotiotv2/versions/2


In [2]:
import pandas as pd

# Location of the Parquet file inside the downloaded dataset directory.
parquet_file_path = f"{path}/NF-BoT-IoT-V2.parquet"

# Read the complete dataset into memory as a pandas DataFrame.
full_df = pd.read_parquet(path=parquet_file_path)

In [3]:
# Report how many rows (samples) the full dataset contains.
print(f"The number of samples in main data set is equal to: {len(full_df)}")

The number of samples in main data set is equal to: 30420086


In [4]:
# Down-sampling rate: keep every SAMPLING_STRIDE-th row (positions 0, 60, 120, ...).
# Change this single constant to resample the dataset at a different rate.
SAMPLING_STRIDE = 60

# A strided positional slice selects exactly the rows that indexing with
# range(0, len(full_df), SAMPLING_STRIDE) would, without materializing the list
# of positions. The original row labels (0, 60, 120, ...) are preserved.
# .copy() detaches the subset from full_df so it can be modified independently.
shorter_df = full_df.iloc[::SAMPLING_STRIDE].copy()

# Persist the down-sampled frame to CSV, omitting the index column.
shorter_df.to_csv('shorter dataset.csv', index=False)

In [5]:
# Report how many rows remain after down-sampling.
print(f"The number of samples in shortened data set is equal to: {len(shorter_df)}")

The number of samples in shortened data set is equal to: 507002


In [6]:
# Inspect a single example record: the row at position 658 of the
# down-sampled frame, printed as a column-name -> value listing.
example_row = shorter_df.iloc[658]
print(example_row)

L4_SRC_PORT                     -11611
L4_DST_PORT                         80
PROTOCOL                            17
L7_PROTO                         188.0
IN_BYTES                            56
IN_PKTS                              2
OUT_BYTES                            0
OUT_PKTS                             0
TCP_FLAGS                            0
CLIENT_TCP_FLAGS                     0
SERVER_TCP_FLAGS                     0
FLOW_DURATION_MILLISECONDS     4293967
DURATION_IN                       1000
DURATION_OUT                         0
MIN_TTL                             64
MAX_TTL                             64
LONGEST_FLOW_PKT                    28
SHORTEST_FLOW_PKT                   28
MIN_IP_PKT_LEN                       0
MAX_IP_PKT_LEN                      28
SRC_TO_DST_SECOND_BYTES         2828.0
DST_TO_SRC_SECOND_BYTES            0.0
RETRANSMITTED_IN_BYTES               0
RETRANSMITTED_IN_PKTS                0
RETRANSMITTED_OUT_BYTES              0
RETRANSMITTED_OUT_PKTS   

In [7]:
# Feature columns are everything except the last two columns, which hold the
# labels and must not be rescaled.
data_to_normalize = shorter_df.iloc[:, :-2]

# Per-column minimum and value range used for min-max scaling.
column_min = data_to_normalize.min()
column_range = data_to_normalize.max() - column_min

# A constant column has a range of zero; dividing by it would turn the whole
# column into NaN (0 / 0). Substituting 1 maps such columns to 0.0 instead.
column_range = column_range.replace(0, 1)

# Min-max normalization: (x - min) / (max - min), computed column by column.
normalized_data = (data_to_normalize - column_min) / column_range

# Re-attach the two untouched label columns to the right of the scaled features.
min_max_normalized_df = pd.concat([normalized_data, shorter_df.iloc[:, -2:]], axis=1)

# Save the normalized frame to CSV, omitting the index column.
min_max_normalized_df.to_csv('min max normalized.csv', index=False)

In [8]:
# Inspect the same example record (position 658) after min-max normalization.
normalized_example_row = min_max_normalized_df.iloc[658]
print(normalized_example_row)

L4_SRC_PORT                    0.322835
L4_DST_PORT                    0.501312
PROTOCOL                            1.0
L7_PROTO                       0.767347
IN_BYTES                       0.000001
IN_PKTS                        0.000026
OUT_BYTES                           0.0
OUT_PKTS                            0.0
TCP_FLAGS                           0.0
CLIENT_TCP_FLAGS                    0.0
SERVER_TCP_FLAGS                    0.0
FLOW_DURATION_MILLISECONDS     0.999767
DURATION_IN                    0.392619
DURATION_OUT                        0.0
MIN_TTL                             0.5
MAX_TTL                             0.5
LONGEST_FLOW_PKT                    0.0
SHORTEST_FLOW_PKT                   0.0
MIN_IP_PKT_LEN                      0.0
MAX_IP_PKT_LEN                      0.0
SRC_TO_DST_SECOND_BYTES             0.0
DST_TO_SRC_SECOND_BYTES             0.0
RETRANSMITTED_IN_BYTES              0.0
RETRANSMITTED_IN_PKTS               0.0
RETRANSMITTED_OUT_BYTES             0.0


In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler

# Step 1: take the feature columns (everything except the last two columns)
# and turn infinite values into NaN so they can be filtered like missing data.
# replace() returns a new frame, so shorter_df itself is never modified.
raw_features = shorter_df.iloc[:, :-2].replace([np.inf, -np.inf], np.nan)

# Step 2: find the rows whose features are all present. A plain boolean array
# is used so the same positional mask can be applied to features and labels.
valid_rows = raw_features.notna().all(axis=1).to_numpy()

# Step 3: keep only the valid rows, for BOTH features and labels. Filtering the
# features alone would leave the labels with extra rows, and the positional
# concat below would then pair features with the wrong labels.
features = raw_features.loc[valid_rows].astype(np.float64)
labels = shorter_df.iloc[:, -2:].loc[valid_rows]  # last two columns are the labels

# Step 4: fit the scaler and transform the features.
scaler = RobustScaler()
normalized_features = pd.DataFrame(
    scaler.fit_transform(features),
    columns=features.columns,
)

# Step 5: inspect the scaled features. RobustScaler centers and scales using
# robust statistics; it does NOT bound values to [-1, 1], so values can fall
# well outside that interval. This summary is for inspection only.
print(normalized_features.describe())

# Step 6: combine scaled features with their labels. Both sides now have the
# same rows in the same order; the labels' index is reset so that it matches
# the fresh 0..n-1 index of normalized_features.
normalized_df = pd.concat([normalized_features, labels.reset_index(drop=True)], axis=1)

# Step 7: save the normalized data to a CSV file, omitting the index column.
normalized_df.to_csv('NF_BoT_IoT_V2_Normalized.csv', index=False)


         L4_SRC_PORT    L4_DST_PORT       PROTOCOL       L7_PROTO  \
count  507002.000000  507002.000000  507002.000000  507002.000000   
mean        0.024173      35.242429       0.471266       0.469649   
std         0.577717    3834.490959       0.499221       0.500992   
min        -0.952041  -32848.000000      -0.454545      -0.038674   
25%        -0.475330       0.000000       0.000000       0.000000   
50%         0.000000       0.000000       0.000000       0.000000   
75%         0.524670       0.000000       1.000000       1.000000   
max         1.058852   32676.000000       1.000000       1.314917   

            IN_BYTES        IN_PKTS     OUT_BYTES       OUT_PKTS  \
count  507002.000000  507002.000000  5.070020e+05  507002.000000   
mean        2.226651       0.805405  4.787356e+01       0.190181   
std       440.429495     108.355919  5.620018e+03       6.108662   
min        -0.375000      -1.000000  0.000000e+00       0.000000   
25%        -0.250000       0.000000  0

In [10]:
from google.colab import drive

# Mount Google Drive at this path so the notebook can write files to it.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
# Save the final normalized frame to Google Drive as CSV, omitting the index column.
normalized_drive_csv = '/content/drive/MyDrive/NF_BoT_IoT_V2_Normalized.csv'
normalized_df.to_csv(normalized_drive_csv, index=False)

In [12]:
# Preview the first five rows of the normalized frame (head() defaults to 5 rows).
normalized_df.head()

Unnamed: 0,L4_SRC_PORT,L4_DST_PORT,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,TCP_FLAGS,CLIENT_TCP_FLAGS,...,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE,Label,Attack
0,-0.008469,0.0,0.0,0.0,0.125,-1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,DoS
1,0.376404,0.0,1.0,1.0,-0.25,0.0,0.0,0.0,-1.0,-1.0,...,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,DDoS
2,-0.788739,0.0,0.0,0.0,0.928571,1.0,44.0,1.0,10.0,2.0,...,0.0,29200.0,35840.0,140.0,0.0,0.0,0.0,0.0,1,Reconnaissance
3,-0.297054,-5820.0,0.0,-0.038674,-0.303571,-1.0,40.0,1.0,10.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,Reconnaissance
4,0.72823,0.0,0.0,0.0,0.75,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,DoS


In [13]:
# Save the min-max normalized frame to Google Drive as CSV, omitting the index column.
min_max_drive_csv = '/content/drive/MyDrive/min max normalized.csv'
min_max_normalized_df.to_csv(min_max_drive_csv, index=False)

In [14]:
# Preview the first five rows of the min-max normalized frame.
min_max_normalized_df.head(5)

Unnamed: 0,L4_SRC_PORT,L4_DST_PORT,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,TCP_FLAGS,CLIENT_TCP_FLAGS,...,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE,Label,Attack
0,0.46923,0.501312,0.3125,0.028571,3.128673e-06,0.0,0.0,0.0,0.009346,0.010101,...,0.007813,0.0,0.0,0.0,0.0,0.0,0.091511,0.0,1,DoS
60,0.660624,0.501312,1.0,0.767347,7.821683e-07,2.6e-05,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.091511,0.0,1,DDoS
120,0.081209,0.501312,0.3125,0.028571,8.156898e-06,5.3e-05,2e-05,0.000424,0.102804,0.030303,...,0.007813,1.0,0.555556,0.555556,0.0,0.0,0.091511,0.0,1,Reconnaissance
180,0.325719,0.41249,0.3125,0.0,4.469533e-07,0.0,1.8e-05,0.000424,0.102804,0.010101,...,0.015625,0.0,0.0,0.0,0.0,0.0,0.091511,0.0,1,Reconnaissance
240,0.835584,0.501312,0.3125,0.028571,7.039515e-06,2.6e-05,0.0,0.0,0.009346,0.010101,...,0.007813,0.0,0.0,0.0,0.0,0.0,0.091511,0.0,1,DoS
