In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the train and test CSVs, then display each frame's (rows, columns).
train_csv = "/home/sharedrive/nafi/trafficp3/cicidis/datasets/CICIDS_clean_train.csv"
test_csv = "/home/sharedrive/nafi/trafficp3/cicidis/datasets/CICIDS_clean_test.csv"

cic_train = pd.read_csv(train_csv)
cic_test = pd.read_csv(test_csv)

(cic_train.shape, cic_test.shape)

((1979513, 25), (848363, 25))

In [3]:
# Features removed from both splits before the negative-value cleanup.
COLUMNS_TO_DROP = ["Init_Win_bytes_forward", "Flow_IAT_Min"]

# Reassign instead of dropping in place, and tolerate already-missing columns:
# re-running this cell on the same kernel is then a no-op rather than a KeyError.
cic_train = cic_train.drop(columns=COLUMNS_TO_DROP, errors="ignore")
cic_test = cic_test.drop(columns=COLUMNS_TO_DROP, errors="ignore")

In [4]:
# Confirm the drop took effect on both splits (each should have two fewer columns than at load).
cic_train.shape, cic_test.shape

((1979513, 23), (848363, 23))

In [5]:
def replace_negatives_with_positive_mean(df):
    """Return a copy of ``df`` with negative numeric values replaced by a column mean.

    For every numeric column except ``"Label"``, each value below zero is
    replaced with the mean of that column's non-negative values. Zeros are
    included in that mean (the filter is ``>= 0``). Missing values are neither
    counted in the mean nor replaced. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean.

    Returns
    -------
    pandas.DataFrame
        Cleaned copy of ``df``. A column whose dtype cannot hold the fill
        value (e.g. an integer column receiving a fractional mean) comes back
        with a wider float dtype; columns without negatives keep their dtype.

    Notes
    -----
    The fill value is computed from ``df`` itself, so calling this separately
    on two splits gives each split its own statistics.
    """
    df_copy = df.copy()

    for col in df_copy.columns:
        if col == "Label":
            continue

        column = df_copy[col]
        if not pd.api.types.is_numeric_dtype(column):
            continue

        # NaN fails the ">= 0" comparison, so missing values never enter the mean.
        non_negative = column[column >= 0]
        if non_negative.empty:
            print(f"Column '{col}' has no positive values. Negatives remain unchanged.")
            continue

        negative_mask = column < 0
        if not negative_mask.any():
            # Nothing to replace: leave the column (and its dtype) untouched.
            continue

        # Build a new column with ``mask`` rather than assigning through ``.loc``:
        # writing a float into an integer column in place is a deprecated silent
        # upcast in recent pandas, whereas ``mask`` returns a properly widened Series.
        df_copy[col] = column.mask(negative_mask, non_negative.mean())

    return df_copy

# Replace negative values in the training split's numeric columns (the "Label" column is skipped)
# with each column's mean over its non-negative values.
cic_train = replace_negatives_with_positive_mean(cic_train)

import pandas as pd
from tqdm import tqdm

def analyze_negative_values(df, show_values=False, sample=10):
    """Count the negative entries in each numeric column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; non-numeric columns are ignored.
    show_values : bool, default False
        When True, columns that contain negatives also report a list of the
        first ``sample`` negative values under ``"Sample Values"``.
    sample : int, default 10
        Maximum number of negative values listed per column.

    Returns
    -------
    pandas.DataFrame
        One row per numeric column with ``"Feature"`` and ``"Negative Count"``
        (plus ``"Sample Values"`` when any were collected). If ``df`` has no
        numeric columns, an empty frame with the two base columns is returned.
    """
    numeric_cols = df.select_dtypes(include=['number']).columns
    summary = []

    # tqdm renders a progress bar while the columns are scanned.
    for col in tqdm(numeric_cols, desc="Processing columns"):
        # Build the mask once; it feeds both the count and the optional sample.
        is_negative = df[col] < 0
        negative_count = is_negative.sum()
        result = {"Feature": col, "Negative Count": negative_count}

        if show_values and negative_count > 0:
            result["Sample Values"] = df[col][is_negative].head(sample).tolist()

        summary.append(result)

    if not summary:
        # Keep the schema stable so callers can index the result even when
        # there was nothing to scan.
        return pd.DataFrame(columns=["Feature", "Negative Count"])

    return pd.DataFrame(summary)

# Sanity check after the replacement: display the per-column negative counts for the training split.
analyze_negative_values(cic_train)

Processing columns: 100%|████████████████████████████████████████████████████████████████████████████████████████████| 22/22 [00:00<00:00, 436.15it/s]


Unnamed: 0,Feature,Negative Count
0,Flow_Duration,0
1,Total_Length_of_Bwd_Packets,0
2,Bwd_Packet_Length_Max,0
3,Bwd_Packet_Length_Mean,0
4,Flow_Bytes/s,0
5,Flow_Packets/s,0
6,Flow_IAT_Mean,0
7,Flow_IAT_Std,0
8,Flow_IAT_Max,0
9,Fwd_IAT_Total,0


In [6]:
# Apply the same negative-value replacement to the test split, then display its per-column negative counts.
# NOTE(review): the fill means here are computed from the test split itself, not reused from
# the training split — confirm this is intended.
cic_test = replace_negatives_with_positive_mean(cic_test)
analyze_negative_values(cic_test)

Processing columns: 100%|████████████████████████████████████████████████████████████████████████████████████████████| 22/22 [00:00<00:00, 928.38it/s]


Unnamed: 0,Feature,Negative Count
0,Flow_Duration,0
1,Total_Length_of_Bwd_Packets,0
2,Bwd_Packet_Length_Max,0
3,Bwd_Packet_Length_Mean,0
4,Flow_Bytes/s,0
5,Flow_Packets/s,0
6,Flow_IAT_Mean,0
7,Flow_IAT_Std,0
8,Flow_IAT_Max,0
9,Fwd_IAT_Total,0


In [7]:
# Write the cleaned training split to CSV; index=False leaves the row index out of the file.
cic_train.to_csv("/home/sharedrive/nafi/trafficp3/cicidis/datasets/clean_cicids_neg_train.csv", index=False)


In [8]:
# Write the cleaned test split to CSV; index=False leaves the row index out of the file.
cic_test.to_csv("/home/sharedrive/nafi/trafficp3/cicidis/datasets/clean_cicids_neg_test.csv", index=False)