In [1]:
# import used library
import kagglehub
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from google.colab import drive
import shutil

In [2]:
# Define constant values
# calculate sampling step
final_sample_amount = 5000
sampling_step = 30420086 // final_sample_amount

In [3]:
# Download latest version
path = kagglehub.dataset_download("dhoogla/nfbotiotv2")

print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/dhoogla/nfbotiotv2/versions/2


In [4]:
# Define the path to the Parquet file
parquet_file_path = f"{path}/NF-BoT-IoT-V2.parquet"

# Load the Parquet file into a pandas DataFrame
full_df = pd.read_parquet(parquet_file_path)

# Replace inf and -inf with NaN
full_df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Drop rows with any NaN values
full_df.dropna(inplace=True)

In [5]:
print(f"Shape of primary data set is {full_df.shape}")
print(f"The number of samples in main data set is equal to: {len(full_df)}")

Shape of primary data set is (30420086, 43)
The number of samples in main data set is equal to: 30420086


In [6]:
# Assuming full_df is your existing DataFrame
# Create a new DataFrame with rows at multiple indices of sampling_step
shorter_df = full_df.iloc[range(0, len(full_df), sampling_step)]

# Save the new DataFrame to a CSV file
shorter_df.to_csv('shorter dataset.csv', index=False)

In [7]:
print(f"Shape of secondary data set is {shorter_df.shape}")
print(f"The number of samples in shortened data set is equal to: {len(shorter_df)}")

Shape of secondary data set is (5001, 43)
The number of samples in shortened data set is equal to: 5001


In [8]:
# Assuming shorter_df is your DataFrame
# Get the last column name (assuming it's the label column)
label_column = shorter_df.columns[-1]

# Count occurrences of each label
label_counts = shorter_df[label_column].value_counts()

# Calculate percentages
label_percentages = (label_counts / len(shorter_df)) * 100

# Create a summary DataFrame
summary_df = pd.DataFrame({
    'Count': label_counts,
    'Percentage': label_percentages
})

# Print the summary
print(summary_df)

                Count  Percentage
Attack                           
DDoS             2388   47.750450
DoS              2207   44.131174
Reconnaissance    378    7.558488
Benign             27    0.539892
Theft               1    0.019996


In [9]:
# Print one row of the dataset
print(shorter_df.iloc[658])  # Prints example row of the DataFrame

L4_SRC_PORT                     -25780
L4_DST_PORT                         80
PROTOCOL                            17
L7_PROTO                         188.0
IN_BYTES                            56
IN_PKTS                              2
OUT_BYTES                            0
OUT_PKTS                             0
TCP_FLAGS                            0
CLIENT_TCP_FLAGS                     0
SERVER_TCP_FLAGS                     0
FLOW_DURATION_MILLISECONDS     4294192
DURATION_IN                        775
DURATION_OUT                         0
MIN_TTL                             64
MAX_TTL                             64
LONGEST_FLOW_PKT                    28
SHORTEST_FLOW_PKT                   28
MIN_IP_PKT_LEN                       0
MAX_IP_PKT_LEN                      28
SRC_TO_DST_SECOND_BYTES         2828.0
DST_TO_SRC_SECOND_BYTES            0.0
RETRANSMITTED_IN_BYTES               0
RETRANSMITTED_IN_PKTS                0
RETRANSMITTED_OUT_BYTES              0
RETRANSMITTED_OUT_PKTS   

In [10]:
# Assuming shorter_df is your DataFrame
# Split the DataFrame into training and testing sets
train_df, test_df = train_test_split(shorter_df, test_size=0.2, random_state=42)

# Function to calculate label percentages
def label_percentage(df):
    label_column = df.columns[-1]  # Assuming the last column is the label column
    label_counts = df[label_column].value_counts()
    percentages = (label_counts / len(df)) * 100
    return pd.DataFrame({'Count': label_counts, 'Percentage': percentages})

# Calculate percentages for training and testing sets
train_summary = label_percentage(train_df)
test_summary = label_percentage(test_df)

# Print the summaries
print("Training Set Label Summary:")
print(train_summary)

print("\nTesting Set Label Summary:")
print(test_summary)

Training Set Label Summary:
                Count  Percentage
Attack                           
DDoS             1904      47.600
DoS              1771      44.275
Reconnaissance    303       7.575
Benign             21       0.525
Theft               1       0.025

Testing Set Label Summary:
                Count  Percentage
Attack                           
DDoS              484   48.351648
DoS               436   43.556444
Reconnaissance     75    7.492507
Benign              6    0.599401


**برچسب‌های داده هم نرمال شده است- نمی‌دانم کار درستی است یا خیر**

In [11]:
# Function to perform min-max normalization using training set parameters
def min_max_normalize(train_df, test_df):
    # Select all columns except the last one (label column)
    train_features = train_df.iloc[:, :-1]
    test_features = test_df.iloc[:, :-1]

    # Calculate min and max from the training set
    min_values = train_features.min()
    max_values = train_features.max()

    # Normalize training features
    normalized_train_features = (train_features - min_values) / (max_values - min_values)

    # Normalize testing features using training set min and max
    normalized_test_features = (test_features - min_values) / (max_values - min_values)

    # Combine normalized features with the label column
    normalized_train_df = pd.concat([normalized_train_features, train_df.iloc[:, -1]], axis=1)
    normalized_test_df = pd.concat([normalized_test_features, test_df.iloc[:, -1]], axis=1)

    return normalized_train_df, normalized_test_df

# Normalize both training and testing sets
train_normalized, test_normalized = min_max_normalize(train_df, test_df)

In [12]:
# Save normalized training and testing sets to CSV files
train_normalized.to_csv('normalized_train.csv', index=False)
test_normalized.to_csv('normalized_test.csv', index=False)

# Mount Google Drive
drive.mount('/content/drive')

# Move the CSV files to Google Drive
drive_path = '/content/drive/MyDrive/IDS data sets/'
shutil.move('normalized_train.csv', drive_path + 'NF-BoT-IoT-V2_' + str(final_sample_amount) + ' samples_minmax_normalized_train.csv')
shutil.move('normalized_test.csv', drive_path + 'NF-BoT-IoT-V2_' + str(final_sample_amount) + ' samples_minmax_normalized_test.csv')

print("Files have been uploaded to Google Drive.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Files have been uploaded to Google Drive.


In [13]:
# Print one row of the dataset
print(train_normalized.iloc[658])  # Prints example row of the DataFrame

L4_SRC_PORT                    0.627301
L4_DST_PORT                    0.501229
PROTOCOL                            1.0
L7_PROTO                       0.789916
IN_BYTES                       0.000529
IN_PKTS                        0.017857
OUT_BYTES                           0.0
OUT_PKTS                            0.0
TCP_FLAGS                           0.0
CLIENT_TCP_FLAGS                    0.0
SERVER_TCP_FLAGS                    0.0
FLOW_DURATION_MILLISECONDS     0.999804
DURATION_IN                    0.343928
DURATION_OUT                        0.0
MIN_TTL                        0.503937
MAX_TTL                        0.503937
LONGEST_FLOW_PKT                    0.0
SHORTEST_FLOW_PKT                   0.0
MIN_IP_PKT_LEN                      0.0
MAX_IP_PKT_LEN                      0.0
SRC_TO_DST_SECOND_BYTES             0.0
DST_TO_SRC_SECOND_BYTES             0.0
RETRANSMITTED_IN_BYTES              0.0
RETRANSMITTED_IN_PKTS               0.0
RETRANSMITTED_OUT_BYTES             0.0


In [14]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split

# فرض کنید DataFrame شما در متغیر shorter_df قرار دارد

# مرحله 1: جداسازی ویژگی‌ها و برچسب‌ها
features = shorter_df.iloc[:, :-2]  # همه ستون‌ها به جز دو ستون آخر برای ویژگی‌ها
labels = shorter_df.iloc[:, -2:]    # دو ستون آخر برای برچسب‌ها

# مرحله 2: جایگزینی مقادیر بی‌نهایت با NaN
features.replace([np.inf, -np.inf], np.nan, inplace=True)

# مرحله 3: حذف سطرهای دارای NaN
features.dropna(inplace=True)

# مرحله 4: اطمینان از نوع داده‌ها
features = features.astype(np.float64)

# مرحله 5: جداسازی داده‌ها به مجموعه‌های آموزشی و آزمایشی (80/20)
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# مرحله 6: مقداردهی اولیه RobustScaler
scaler = RobustScaler()

# مرحله 7: فیت کردن مقیاس‌ساز بر روی داده‌های آموزشی و نرمال‌سازی آن‌ها
X_train_normalized = scaler.fit_transform(X_train)
X_train_normalized = pd.DataFrame(X_train_normalized, columns=X_train.columns)

# مرحله 8: نرمال‌سازی داده‌های آزمایشی با استفاده از پارامترهای آموزشی
X_test_normalized = scaler.transform(X_test)
X_test_normalized = pd.DataFrame(X_test_normalized, columns=X_test.columns)

# مرحله 9: ترکیب ویژگی‌های نرمال‌شده با برچسب‌ها برای مجموعه‌های آموزشی و آزمایشی
normalized_train_df = pd.concat([X_train_normalized, y_train.reset_index(drop=True)], axis=1)
normalized_test_df = pd.concat([X_test_normalized, y_test.reset_index(drop=True)], axis=1)

# نمایش آمار توصیفی برای داده‌های نرمال‌شده (اختیاری)
print(normalized_train_df.describe())
print(normalized_test_df.describe())

       L4_SRC_PORT   L4_DST_PORT     PROTOCOL     L7_PROTO     IN_BYTES  \
count  4000.000000   4000.000000  4000.000000  4000.000000  4000.000000   
mean      0.004889     48.523250     0.475500     0.474067     0.486346   
std       0.581148   4047.142942     0.499462     0.501431    11.204755   
min      -0.986043 -32837.000000     0.000000    -0.038674    -0.375000   
25%      -0.498818      0.000000     0.000000     0.000000    -0.250000   
50%       0.000000      0.000000     0.000000     0.000000     0.000000   
75%       0.501182      0.000000     1.000000     1.000000     0.750000   
max       1.031371  32676.000000     1.000000     1.276243   707.906250   

           IN_PKTS    OUT_BYTES     OUT_PKTS    TCP_FLAGS  CLIENT_TCP_FLAGS  \
count  4000.000000  4000.000000  4000.000000  4000.000000       4000.000000   
mean      0.338750    10.019500     0.163750     0.859375         -0.300625   
std       2.759421    83.287051     0.582684     3.679459          1.408788   
min     

In [15]:
# Save normalized training and testing sets to CSV files
train_normalized.to_csv('normalized_train.csv', index=False)
test_normalized.to_csv('normalized_test.csv', index=False)

# Mount Google Drive
drive.mount('/content/drive')

# Move the CSV files to Google Drive
drive_path = '/content/drive/MyDrive/IDS data sets/'
shutil.move('normalized_train.csv', drive_path + 'NF-BoT-IoT-V2_' + str(final_sample_amount) + ' samples_RobustScaler_normalized_train.csv')
shutil.move('normalized_test.csv', drive_path + 'NF-BoT-IoT-V2_' + str(final_sample_amount) + ' samples_RobustScaler_normalized_test.csv')

print("Files have been uploaded to Google Drive.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Files have been uploaded to Google Drive.
