In [1]:
import os
import pandas as pd
from pathlib import Path

# Columns requested (via usecols) when reading each .binetflow file and when
# re-reading the filtered dataset; the list includes Sport and Dport.
columns_ds = [
    "StartTime",
    "Dur",
    "Proto",
    "SrcAddr",
    "Sport",
    "DstAddr",
    "Dport",
    "State",
    "TotPkts",
    "TotBytes",
    "SrcBytes",
    "Label",
]

def load_datasets(directory: str, columns_ds: list[str]) -> pd.DataFrame:
    """Load one ``.binetflow`` file per directory under ``directory`` and filter it.

    The tree rooted at ``directory`` is walked recursively. In every directory
    that contains at least one ``*.binetflow`` file, only the alphabetically
    first such file is read. The selected files are concatenated and then
    restricted to rows whose ``Proto`` is one of ``tcp``/``udp``/``icmp`` and
    whose ``State`` is one of ``FSPA_FSPA``/``URP``/``CON`` (exact, case-sensitive
    matches).

    Args:
        directory: Root directory to search.
        columns_ds: Column names to read from each file (passed as ``usecols``).
            Must include ``Proto`` and ``State``, which the filter relies on.

    Returns:
        The concatenated, filtered rows. The index keeps the positions of the
        surviving rows in the concatenated frame (it is not reset).

    Raises:
        ValueError: If no ``.binetflow`` file is found under ``directory``.
    """
    datasets = []

    for root, dirs, files in os.walk(directory):
        # os.walk yields entries in arbitrary, filesystem-dependent order.
        # Sorting the sub-directories in place fixes the traversal order, and
        # sorting the file names fixes which file counts as "first".
        dirs.sort()
        first_capture = next(
            (name for name in sorted(files) if name.endswith(".binetflow")),
            None,
        )
        if first_capture is not None:
            # Only the first .binetflow file of each directory is used.
            datasets.append(Path(root) / first_capture)

    if not datasets:
        # Without this guard pd.concat([]) fails with the far less helpful
        # "No objects to concatenate".
        raise ValueError(f"No .binetflow files found under {directory!r}")

    df = pd.concat(
        [pd.read_csv(path, usecols=columns_ds) for path in datasets],
        ignore_index=True,
    )

    keep = (
        df["Proto"].isin(["tcp", "udp", "icmp"])
        & df["State"].isin(["FSPA_FSPA", "URP", "CON"])
    )
    return df[keep]

# Location of the raw captures and of the cached, filtered dataset.
input_directory = "datasets/CTU-13-Dataset/"
output_file = Path("datasets/all-data-filtered-12col.csv.gz")

if not output_file.is_file():
    print("Filtered dataset not found. Generating it now...")
    df = load_datasets(input_directory, columns_ds)
    # Write to a sibling temporary file and rename it into place afterwards.
    # Writing straight to output_file would leave a truncated archive behind
    # if the run were interrupted, and the is_file() check above would then
    # treat that partial file as a finished cache on every later run.
    output_file.parent.mkdir(parents=True, exist_ok=True)
    partial_file = output_file.with_name(output_file.name + ".partial")
    df.to_csv(partial_file, index=False, compression='gzip', sep='\t')
    partial_file.replace(output_file)
    # Drop the in-memory copy; the data is re-read from the cache below.
    del df
else:
    print("The dataset has already been created.")

# Load sample for analysis
# nrows is an upper bound: at most this many rows are read from the cache.
total_rows_to_read = 100_000_000
df = pd.read_csv(output_file, usecols=columns_ds, compression='gzip', sep='\t', nrows=total_rows_to_read)
print(f"Loaded {len(df)} rows.")


The dataset has already been created.
Loaded 16452474 rows.


In [2]:
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the cell output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16452474 entries, 0 to 16452473
Data columns (total 12 columns):
 #   Column     Dtype  
---  ------     -----  
 0   StartTime  object 
 1   Dur        float64
 2   Proto      object 
 3   SrcAddr    object 
 4   Sport      object 
 5   DstAddr    object 
 6   Dport      object 
 7   State      object 
 8   TotPkts    int64  
 9   TotBytes   int64  
 10  SrcBytes   int64  
 11  Label      object 
dtypes: float64(1), int64(3), object(8)
memory usage: 1.5+ GB
None


In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.feature_selection import SelectKBest, SelectFpr, chi2, RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from collections import Counter

# === Load the dataset ===
df = pd.read_csv("datasets/all-data-filtered-12col.csv.gz", sep='\t', compression='gzip')

# === Encode categorical features ===
protocol_enabled = {"tcp": 0, "udp": 1, "icmp": 2}
state_mapping = {"fspa_fspa": 0, "urp": 1, "con": 2}

df['Proto'] = df['Proto'].str.lower().map(protocol_enabled)
df['State'] = df['State'].str.lower().map(state_mapping)

# .map() silently turns any value missing from the dictionaries into NaN, which
# would only surface later as a NaN error inside chi2. Fail here instead, with
# a message that names the affected column(s).
unmapped_counts = df[['Proto', 'State']].isna().sum()
if unmapped_counts.any():
    raise ValueError(f"Unmapped Proto/State values found: {unmapped_counts.to_dict()}")

# Binary target: 1 when the label text mentions "botnet" (case-insensitive),
# 0 otherwise. Vectorised string matching replaces a per-row Python lambda.
if df['Label'].isna().any():
    raise ValueError("Label column contains missing values")
df['Label'] = df['Label'].str.contains('botnet', case=False, regex=False).astype('int64')

# === Drop non-numeric columns ===
# NOTE(review): every column still stored as object is discarded here. In the
# df.info() output shown earlier, Sport and Dport are object columns, so they
# are dropped together with StartTime, SrcAddr and DstAddr and never reach
# feature selection -- confirm that this is intended.
df = df.select_dtypes(include=[np.number])

# === Separate features and target ===
X = df.drop(columns=['Label'])
y = df['Label']

# === Normalize features for chi2 ===
# chi2 requires non-negative inputs; MinMaxScaler maps each column to [0, 1].
scaler = MinMaxScaler()
# Keep X's index so the boolean mask built from y below aligns by label.
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

# === Filter classes with at least 2 samples for stratification ===
class_counts = y.value_counts()
valid_classes = class_counts[class_counts >= 2].index
mask = y.isin(valid_classes)

X_filtered = X_scaled.loc[mask]
y_filtered = y.loc[mask]

# === Sample sizes to test ===
sample_sizes = [5000, 10000, 50000]
k = 5  # Select top 5 features

# === Store selected features for each method ===
results = {
    'SelectKBest': [],
    'SelectFpr': [],
    'RFE': []
}

n_available = len(y_filtered)

for size in sample_sizes:
    # Adjust sample size if larger than available data
    if size > n_available:
        print(f"Requested sample size {size} is greater than dataset size {len(y_filtered)}. Using full dataset.")
        size = n_available

    if size >= n_available:
        # train_test_split rejects an integer train_size that is not smaller
        # than the number of samples (the test split would be empty), so the
        # full-dataset case must bypass it.
        X_sample, y_sample = X_filtered, y_filtered
    else:
        X_sample, _, y_sample, _ = train_test_split(
            X_filtered, y_filtered, train_size=size, stratify=y_filtered, random_state=42
        )

    # 1. SelectKBest
    skb = SelectKBest(score_func=chi2, k=k).fit(X_sample, y_sample)
    results['SelectKBest'].append(set(X_sample.columns[skb.get_support()]))

    # 2. SelectFpr
    fpr = SelectFpr(score_func=chi2, alpha=0.05).fit(X_sample, y_sample)
    results['SelectFpr'].append(set(X_sample.columns[fpr.get_support()]))

    # 3. RFE with RandomForest
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    rfe = RFE(model, n_features_to_select=k).fit(X_sample, y_sample)
    results['RFE'].append(set(X_sample.columns[rfe.get_support()]))

# === Intersect features across sample sizes for each method ===
intersections = {
    method: set.intersection(*feature_sets) for method, feature_sets in results.items()
}

# === Final feature set: intersection of all methods ===
final_features = set.intersection(*intersections.values())

print("\nIntersection of selected features across methods and sample sizes:")
print(final_features)

# === If you want the best 8 features overall (union of all selections ranked by frequency) ===
# Count feature frequencies from all selections
feature_counter = Counter()
for method_sets in results.values():
    for feature_set in method_sets:
        feature_counter.update(feature_set)

# Pick top 8 most frequently selected features.
# most_common(8) returns at most 8 entries; fewer when fewer distinct features
# were ever selected.
best_8_features = [feat for feat, count in feature_counter.most_common(8)]

print("\nBest 8 features selected overall:")
print(best_8_features)



Intersection of selected features across methods and sample sizes:
{'Dur'}

Best 8 features selected overall:
['Dur', 'Proto', 'TotBytes', 'TotPkts', 'SrcBytes', 'State']


In [4]:
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the cell output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16452474 entries, 0 to 16452473
Data columns (total 7 columns):
 #   Column    Dtype  
---  ------    -----  
 0   Dur       float64
 1   Proto     int64  
 2   State     int64  
 3   TotPkts   int64  
 4   TotBytes  int64  
 5   SrcBytes  int64  
 6   Label     int64  
dtypes: float64(1), int64(6)
memory usage: 878.7 MB
None
