In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import VarianceThreshold
from imblearn.over_sampling import SMOTE


In [2]:
data = pd.read_csv("CSE-CIC-IDS2018.csv")
print(data.shape)


(1048575, 80)


In [3]:
data = data.drop_duplicates()
print(f"Shape after removing duplicates: {data.shape}")


Shape after removing duplicates: (1031018, 80)


In [4]:
data.columns = data.columns.str.strip().str.replace(' ', '_').str.lower()
data.columns

Index(['dst_port', 'protocol', 'timestamp', 'flow_duration', 'tot_fwd_pkts',
       'tot_bwd_pkts', 'totlen_fwd_pkts', 'totlen_bwd_pkts', 'fwd_pkt_len_max',
       'fwd_pkt_len_min', 'fwd_pkt_len_mean', 'fwd_pkt_len_std',
       'bwd_pkt_len_max', 'bwd_pkt_len_min', 'bwd_pkt_len_mean',
       'bwd_pkt_len_std', 'flow_byts/s', 'flow_pkts/s', 'flow_iat_mean',
       'flow_iat_std', 'flow_iat_max', 'flow_iat_min', 'fwd_iat_tot',
       'fwd_iat_mean', 'fwd_iat_std', 'fwd_iat_max', 'fwd_iat_min',
       'bwd_iat_tot', 'bwd_iat_mean', 'bwd_iat_std', 'bwd_iat_max',
       'bwd_iat_min', 'fwd_psh_flags', 'bwd_psh_flags', 'fwd_urg_flags',
       'bwd_urg_flags', 'fwd_header_len', 'bwd_header_len', 'fwd_pkts/s',
       'bwd_pkts/s', 'pkt_len_min', 'pkt_len_max', 'pkt_len_mean',
       'pkt_len_std', 'pkt_len_var', 'fin_flag_cnt', 'syn_flag_cnt',
       'rst_flag_cnt', 'psh_flag_cnt', 'ack_flag_cnt', 'urg_flag_cnt',
       'cwe_flag_count', 'ece_flag_cnt', 'down/up_ratio', 'pkt_size_avg',
      

In [5]:
# Handle missing values for numerical columns
numerical_cols = data.select_dtypes(include=['float64', 'int64']).columns
data[numerical_cols] = data[numerical_cols].fillna(data[numerical_cols].median())

# Handle missing values for categorical columns
categorical_cols = data.select_dtypes(include=['object', 'category']).columns
data[categorical_cols] = data[categorical_cols].fillna(data[categorical_cols].mode().iloc[0])

print("Missing values handled.")


Missing values handled.


In [6]:
encoder = LabelEncoder()
for col in categorical_cols:
    data[col] = encoder.fit_transform(data[col])


In [7]:
numerical_cols = data.select_dtypes(include=['float64', 'int64']).columns
if numerical_cols.empty:
    print("No numerical columns found!")
else:
    print(f"Numerical columns identified: {numerical_cols}")


Numerical columns identified: Index(['dst_port', 'protocol', 'flow_duration', 'tot_fwd_pkts', 'tot_bwd_pkts',
       'totlen_fwd_pkts', 'totlen_bwd_pkts', 'fwd_pkt_len_max',
       'fwd_pkt_len_min', 'fwd_pkt_len_mean', 'fwd_pkt_len_std',
       'bwd_pkt_len_max', 'bwd_pkt_len_min', 'bwd_pkt_len_mean',
       'bwd_pkt_len_std', 'flow_byts/s', 'flow_pkts/s', 'flow_iat_mean',
       'flow_iat_std', 'flow_iat_max', 'flow_iat_min', 'fwd_iat_tot',
       'fwd_iat_mean', 'fwd_iat_std', 'fwd_iat_max', 'fwd_iat_min',
       'bwd_iat_tot', 'bwd_iat_mean', 'bwd_iat_std', 'bwd_iat_max',
       'bwd_iat_min', 'fwd_psh_flags', 'bwd_psh_flags', 'fwd_urg_flags',
       'bwd_urg_flags', 'fwd_header_len', 'bwd_header_len', 'fwd_pkts/s',
       'bwd_pkts/s', 'pkt_len_min', 'pkt_len_max', 'pkt_len_mean',
       'pkt_len_std', 'pkt_len_var', 'fin_flag_cnt', 'syn_flag_cnt',
       'rst_flag_cnt', 'psh_flag_cnt', 'ack_flag_cnt', 'urg_flag_cnt',
       'cwe_flag_count', 'ece_flag_cnt', 'down/up_ratio', 'pkt_

In [8]:
if 'label' in data.columns:
    X = data.drop('label', axis=1)
    y = data['label']
    
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X, y)
    
    data = pd.concat([pd.DataFrame(X_resampled, columns=X.columns), pd.DataFrame(y_resampled, columns=['label'])], axis=1)

print(f"Shape after SMOTE: {data.shape}")


Shape after SMOTE: (2005383, 80)


In [9]:
for col in numerical_cols:
    z_scores = (data[col] - data[col].mean()) / data[col].std()
    data = data[(z_scores.abs() <= 3)]

print(f"Shape after removing outliers: {data.shape}")


Shape after removing outliers: (0, 80)


In [10]:
numerical_cols = data.select_dtypes(include=['float64', 'int64']).columns
print("Numerical columns identified:", numerical_cols)


Numerical columns identified: Index(['dst_port', 'protocol', 'flow_duration', 'tot_fwd_pkts', 'tot_bwd_pkts',
       'totlen_fwd_pkts', 'totlen_bwd_pkts', 'fwd_pkt_len_max',
       'fwd_pkt_len_min', 'fwd_pkt_len_mean', 'fwd_pkt_len_std',
       'bwd_pkt_len_max', 'bwd_pkt_len_min', 'bwd_pkt_len_mean',
       'bwd_pkt_len_std', 'flow_byts/s', 'flow_pkts/s', 'flow_iat_mean',
       'flow_iat_std', 'flow_iat_max', 'flow_iat_min', 'fwd_iat_tot',
       'fwd_iat_mean', 'fwd_iat_std', 'fwd_iat_max', 'fwd_iat_min',
       'bwd_iat_tot', 'bwd_iat_mean', 'bwd_iat_std', 'bwd_iat_max',
       'bwd_iat_min', 'fwd_psh_flags', 'bwd_psh_flags', 'fwd_urg_flags',
       'bwd_urg_flags', 'fwd_header_len', 'bwd_header_len', 'fwd_pkts/s',
       'bwd_pkts/s', 'pkt_len_min', 'pkt_len_max', 'pkt_len_mean',
       'pkt_len_std', 'pkt_len_var', 'fin_flag_cnt', 'syn_flag_cnt',
       'rst_flag_cnt', 'psh_flag_cnt', 'ack_flag_cnt', 'urg_flag_cnt',
       'cwe_flag_count', 'ece_flag_cnt', 'down/up_ratio', 'pkt_

In [11]:
# Check for missing values in the numerical columns
missing_values = data[numerical_cols].isnull().sum()
print("Missing values per numerical column:")
print(missing_values)


Missing values per numerical column:
dst_port         0
protocol         0
flow_duration    0
tot_fwd_pkts     0
tot_bwd_pkts     0
                ..
active_min       0
idle_mean        0
idle_std         0
idle_max         0
idle_min         0
Length: 78, dtype: int64


In [12]:
# Fill missing values with the mean for each column
data[numerical_cols] = data[numerical_cols].fillna(data[numerical_cols].mean())


In [13]:
# Check the shape of the DataFrame
print("Data shape:", data.shape)


Data shape: (0, 80)


In [14]:
# Check the first few rows of the DataFrame to verify that data is loaded
print(data.head())


Empty DataFrame
Columns: [dst_port, protocol, timestamp, flow_duration, tot_fwd_pkts, tot_bwd_pkts, totlen_fwd_pkts, totlen_bwd_pkts, fwd_pkt_len_max, fwd_pkt_len_min, fwd_pkt_len_mean, fwd_pkt_len_std, bwd_pkt_len_max, bwd_pkt_len_min, bwd_pkt_len_mean, bwd_pkt_len_std, flow_byts/s, flow_pkts/s, flow_iat_mean, flow_iat_std, flow_iat_max, flow_iat_min, fwd_iat_tot, fwd_iat_mean, fwd_iat_std, fwd_iat_max, fwd_iat_min, bwd_iat_tot, bwd_iat_mean, bwd_iat_std, bwd_iat_max, bwd_iat_min, fwd_psh_flags, bwd_psh_flags, fwd_urg_flags, bwd_urg_flags, fwd_header_len, bwd_header_len, fwd_pkts/s, bwd_pkts/s, pkt_len_min, pkt_len_max, pkt_len_mean, pkt_len_std, pkt_len_var, fin_flag_cnt, syn_flag_cnt, rst_flag_cnt, psh_flag_cnt, ack_flag_cnt, urg_flag_cnt, cwe_flag_count, ece_flag_cnt, down/up_ratio, pkt_size_avg, fwd_seg_size_avg, bwd_seg_size_avg, fwd_byts/b_avg, fwd_pkts/b_avg, fwd_blk_rate_avg, bwd_byts/b_avg, bwd_pkts/b_avg, bwd_blk_rate_avg, subflow_fwd_pkts, subflow_fwd_byts, subflow_bwd_pkts

In [15]:
# Assuming you're loading from a CSV file
data = pd.read_csv('CSE-CIC-IDS2018.csv')
print("Data shape after loading:", data.shape)


Data shape after loading: (1048575, 80)


In [16]:
# Check if any filtering is done before this step
print("Data shape before any preprocessing:", data.shape)


Data shape before any preprocessing: (1048575, 80)


In [17]:
print("Numerical columns:", numerical_cols)


Numerical columns: Index(['dst_port', 'protocol', 'flow_duration', 'tot_fwd_pkts', 'tot_bwd_pkts',
       'totlen_fwd_pkts', 'totlen_bwd_pkts', 'fwd_pkt_len_max',
       'fwd_pkt_len_min', 'fwd_pkt_len_mean', 'fwd_pkt_len_std',
       'bwd_pkt_len_max', 'bwd_pkt_len_min', 'bwd_pkt_len_mean',
       'bwd_pkt_len_std', 'flow_byts/s', 'flow_pkts/s', 'flow_iat_mean',
       'flow_iat_std', 'flow_iat_max', 'flow_iat_min', 'fwd_iat_tot',
       'fwd_iat_mean', 'fwd_iat_std', 'fwd_iat_max', 'fwd_iat_min',
       'bwd_iat_tot', 'bwd_iat_mean', 'bwd_iat_std', 'bwd_iat_max',
       'bwd_iat_min', 'fwd_psh_flags', 'bwd_psh_flags', 'fwd_urg_flags',
       'bwd_urg_flags', 'fwd_header_len', 'bwd_header_len', 'fwd_pkts/s',
       'bwd_pkts/s', 'pkt_len_min', 'pkt_len_max', 'pkt_len_mean',
       'pkt_len_std', 'pkt_len_var', 'fin_flag_cnt', 'syn_flag_cnt',
       'rst_flag_cnt', 'psh_flag_cnt', 'ack_flag_cnt', 'urg_flag_cnt',
       'cwe_flag_count', 'ece_flag_cnt', 'down/up_ratio', 'pkt_size_avg',


In [18]:
print("Data before preprocessing:", data.shape)
data = data.dropna()  # Ensure this doesn't remove all rows
print("Data after dropping NaN values:", data.shape)


Data before preprocessing: (1048575, 80)
Data after dropping NaN values: (1048575, 80)


In [19]:
# Check if any filtering condition was applied
print("Data after applying filter (if any):", data.shape)


Data after applying filter (if any): (1048575, 80)


In [21]:
print("Columns in the DataFrame:", data.columns)

Columns in the DataFrame: Index(['Dst Port', 'Protocol', 'Timestamp', 'Flow Duration', 'Tot Fwd Pkts',
       'Tot Bwd Pkts', 'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max',
       'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std',
       'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean',
       'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Mean',
       'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot',
       'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min',
       'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max',
       'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags',
       'Bwd URG Flags', 'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s',
       'Bwd Pkts/s', 'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean',
       'Pkt Len Std', 'Pkt Len Var', 'FIN Flag Cnt', 'SYN Flag Cnt',
       'RST Flag Cnt', 'PSH Flag Cnt', 'ACK Flag Cnt', 'URG Flag Cnt',
       'CWE Flag Count', 'ECE Flag Cnt', 'Down/Up Rati

In [22]:
# Standardize column names by converting to lowercase and removing spaces
data.columns = data.columns.str.lower().str.replace(' ', '_')

# Standardize the numerical column names to match
numerical_cols = [col.lower().replace(' ', '_') for col in numerical_cols]

# Check columns in the DataFrame after modification
print("Columns in the DataFrame after standardization:", data.columns)


Columns in the DataFrame after standardization: Index(['dst_port', 'protocol', 'timestamp', 'flow_duration', 'tot_fwd_pkts',
       'tot_bwd_pkts', 'totlen_fwd_pkts', 'totlen_bwd_pkts', 'fwd_pkt_len_max',
       'fwd_pkt_len_min', 'fwd_pkt_len_mean', 'fwd_pkt_len_std',
       'bwd_pkt_len_max', 'bwd_pkt_len_min', 'bwd_pkt_len_mean',
       'bwd_pkt_len_std', 'flow_byts/s', 'flow_pkts/s', 'flow_iat_mean',
       'flow_iat_std', 'flow_iat_max', 'flow_iat_min', 'fwd_iat_tot',
       'fwd_iat_mean', 'fwd_iat_std', 'fwd_iat_max', 'fwd_iat_min',
       'bwd_iat_tot', 'bwd_iat_mean', 'bwd_iat_std', 'bwd_iat_max',
       'bwd_iat_min', 'fwd_psh_flags', 'bwd_psh_flags', 'fwd_urg_flags',
       'bwd_urg_flags', 'fwd_header_len', 'bwd_header_len', 'fwd_pkts/s',
       'bwd_pkts/s', 'pkt_len_min', 'pkt_len_max', 'pkt_len_mean',
       'pkt_len_std', 'pkt_len_var', 'fin_flag_cnt', 'syn_flag_cnt',
       'rst_flag_cnt', 'psh_flag_cnt', 'ack_flag_cnt', 'urg_flag_cnt',
       'cwe_flag_count', 'ece_fl

In [23]:
# Verify if the standardized numerical columns exist in the DataFrame
numerical_cols = [col for col in numerical_cols if col in data.columns]

# Apply StandardScaler if numerical columns exist
if data.shape[0] > 0 and numerical_cols:
    scaler = StandardScaler()
    data[numerical_cols] = scaler.fit_transform(data[numerical_cols])
    print("Numerical features standardized.")
else:
    print("No rows or numerical columns available for standardization.")


Numerical features standardized.


In [24]:
# Select only numeric columns for variance threshold
numeric_data = data.select_dtypes(include=['number'])

# Remove low-variance features
selector = VarianceThreshold(threshold=0.01)
numeric_data = pd.DataFrame(selector.fit_transform(numeric_data), 
                            columns=[col for col, keep in zip(numeric_data.columns, selector.get_support()) if keep])

# Now, update the original DataFrame with the filtered numeric data
data = data[numeric_data.columns]

# Remove highly correlated features
correlation_matrix = data.corr().abs()
upper_triangle = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper_triangle.columns if any(upper_triangle[column] > 0.9)]
data = data.drop(columns=to_drop)

# Print the new shape after feature removal
print(f"Shape after removing low-variance and correlated features: {data.shape}")


Shape after removing low-variance and correlated features: (1048575, 20)


In [25]:
print("Final dataset summary:")
print(data.info())

Final dataset summary:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 20 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   dst_port         1048575 non-null  float64
 1   protocol         1048575 non-null  float64
 2   tot_fwd_pkts     1048575 non-null  float64
 3   tot_bwd_pkts     1048575 non-null  float64
 4   fwd_pkt_len_min  1048575 non-null  float64
 5   bwd_pkt_len_min  1048575 non-null  float64
 6   flow_byts/s      1048575 non-null  float64
 7   flow_pkts/s      1048575 non-null  float64
 8   flow_iat_mean    1048575 non-null  float64
 9   flow_iat_std     1048575 non-null  float64
 10  flow_iat_min     1048575 non-null  float64
 11  bwd_iat_tot      1048575 non-null  float64
 12  bwd_iat_mean     1048575 non-null  float64
 13  bwd_iat_min      1048575 non-null  float64
 14  fwd_psh_flags    1048575 non-null  float64
 15  bwd_pkts/s       1048575 non-null  float64


In [26]:
print(data.head())

   dst_port   protocol  tot_fwd_pkts  tot_bwd_pkts  fwd_pkt_len_min  \
0 -0.717368  -0.058424     -0.039153      1.144112        -0.028397   
1 -0.701923  16.875478     -0.039561     -0.886451        55.662668   
2 -0.701923  16.875478     -0.039561     -0.886451        55.662668   
3 -0.701923  16.875478     -0.039153     -0.886451        55.662668   
4 -0.701923  16.875478     -0.039153     -0.886451        55.662668   

   bwd_pkt_len_min  flow_byts/s  flow_pkts/s  flow_iat_mean  flow_iat_std  \
0         -0.00247    -0.634935    -0.368136      -0.034505     -0.029354   
1         -0.00247    -0.652227    -0.414674      83.265445     54.617894   
2         -0.00247    -0.652227    -0.414674      83.265445     54.617894   
3         -0.00247    -0.652196    -0.414669      35.236045     78.043462   
4         -0.00247    -0.652196    -0.414669      35.236045     78.043462   

   flow_iat_min  bwd_iat_tot  bwd_iat_mean  bwd_iat_min  fwd_psh_flags  \
0     -0.026600     0.052022      0.