In [None]:
import pandas as pd
import numpy as np
import socket
import struct
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

# Convert IPv4 address to an integer
def ip_to_int(ip):
    try:
        return struct.unpack("!I", socket.inet_aton(ip.strip()))[0]
    except:
        return np.nan  # Handle missing or malformed IPs

# Load dataset (Replace with actual file path)
file_path = "datasets/ddos2/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv"  # Update with your file
df = pd.read_csv(file_path)

# Remove leading spaces in column names
df.columns = df.columns.str.strip()

# Convert IPs to numerical values
df['Source_IP_Int'] = df['Source IP'].apply(ip_to_int)
df['Destination_IP_Int'] = df['Destination IP'].apply(ip_to_int)

# Drop original IP columns
df.drop(['Source IP', 'Source IP'], axis=1, inplace=True)

# Convert Timestamp to numerical format if present
if 'Timestamp' in df.columns:
    #df['Timestamp'] = pd.to_datetime(df['Timestamp'], errors='coerce').astype('int64') // 10**9  # Convert to UNIX timestamp
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], format='%Y-%m-%d %H:%M:%S', errors='coerce').astype('int64')
# **Handle Inf Values**
df.replace([np.inf, -np.inf], np.nan, inplace=True)  # Convert inf values to NaN

# Fill missing values (including former inf values) with the column median
df.fillna(df.median(numeric_only=True), inplace=True)

# Apply Min-Max Scaling to numerical columns
scaler = MinMaxScaler()
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns  # Select only numeric columns
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# Label Encode the 'Label' column (if categorical)
# if df['Label'].dtype == 'object':
#     label_encoder = LabelEncoder()
#     df['Label'] = label_encoder.fit_transform(df['Label'])

# Save the processed dataset
df.to_csv("./datasets/scaled1.csv", index=False)

# Print first few rows
print(df.head())



                                   Flow ID  Source Port Destination IP  \
0  192.168.10.5-104.16.207.165-54865-443-6     0.006760   192.168.10.5   
1    192.168.10.5-104.16.28.216-55054-80-6     0.001221   192.168.10.5   
2    192.168.10.5-104.16.28.216-55055-80-6     0.001221   192.168.10.5   
3  192.168.10.16-104.17.241.25-46236-443-6     0.006760  192.168.10.16   
4  192.168.10.5-104.19.196.102-54863-443-6     0.006760   192.168.10.5   

   Destination Port  Protocol  Timestamp  Flow Duration  Total Fwd Packets  \
0          0.837225  0.352941        0.0   3.333335e-08           0.000518   
1          0.840109  0.352941        0.0   9.166671e-07           0.000000   
2          0.840124  0.352941        0.0   4.416669e-07           0.000000   
3          0.705548  0.352941        0.0   2.916668e-07           0.000000   
4          0.837194  0.352941        0.0   3.333335e-08           0.000518   

   Total Backward Packets  Total Length of Fwd Packets  ...  Active Std  \
0          

In [2]:
import pandas as pd
file_path = "scaled_network_traffic_cleaned.csv"  # Update with your file
df = pd.read_csv(file_path)
df=pd.DataFrame(df)
print(df.columns)

Index(['Flow ID', 'Source Port', 'Destination Port', 'Protocol', 'Timestamp',
       'Flow Duration', 'Total Fwd Packets', 'Total Backward Packets',
       'Total Length of Fwd Packets', 'Total Length of Bwd Packets',
       'Fwd Packet Length Max', 'Fwd Packet Length Min',
       'Fwd Packet Length Mean', 'Fwd Packet Length Std',
       'Bwd Packet Length Max', 'Bwd Packet Length Min',
       'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s',
       'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max',
       'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std',
       'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean',
       'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags',
       'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length',
       'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s',
       'Min Packet Length', 'Max Packet Length', 'Packet Length Mean',
       'Packet Length Std', 'Packet Leng