In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif

# Read the cleaned dataset CSV into memory.
dataset_path = '/home/ibibers@ads.iu.edu/IDS_Datasets/Combined_datasets/Simargl_cleaned_dataset.csv'
df = pd.read_csv(dataset_path)

# Report how many rows carry each distinct value of the ALERT column.
print("Target class distribution:")
alert_counts = df['ALERT'].value_counts()
print(alert_counts)


Target class distribution:
ALERT
Normal               15049330
Denial of Service     5138973
Port Scanning         4170194
Malware                   571
Name: count, dtype: int64


In [None]:
# Separate the label column from the predictor columns.
target_column = 'ALERT'
X = df.drop(columns=[target_column])
y = df[target_column]

# Hold out 20% of the rows. Stratifying on y keeps the class proportions the
# same in both partitions, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [None]:
def select_top_features(X, y, k=None, random_state=None):
    """
    Rank features by mutual information (Information Gain) with the target.

    Args:
        X: DataFrame of input features; ``X.columns`` supplies the feature names.
        y: Series of target variable.
        k: Number of top features to select (optional). ``None`` (the default)
            requests the full ranking. ``0`` is honoured and yields an empty
            list.
        random_state: Optional seed forwarded to ``mutual_info_classif`` so the
            scores can be reproduced between runs. ``None`` (the default)
            leaves the scorer unseeded.

    Returns:
        When ``k`` is ``None``: a list of ``(feature, score)`` tuples ordered
        from highest to lowest score.
        Otherwise: a list with the names (no scores) of the ``k`` best-scoring
        features, in the same order.

    Raises:
        ValueError: If ``k`` is negative.
    """
    # Reject a bad k before paying for the (potentially expensive) scoring.
    # A negative k would otherwise slice from the wrong end of the ranking.
    if k is not None and k < 0:
        raise ValueError(f"k must be a non-negative integer or None, got {k!r}")

    ig_scores = mutual_info_classif(X, y, random_state=random_state)

    # Pair names with scores positionally (no dict in between, so duplicate
    # column names are not collapsed). sorted() is stable, so tied scores keep
    # their original column order.
    ranked = sorted(zip(X.columns, ig_scores), key=lambda pair: pair[1], reverse=True)

    # Compare against None explicitly so that k=0 is not mistaken for "no k".
    if k is not None:
        return [feature for feature, _ in ranked[:k]]
    return ranked


# Score every feature on the training partition only. With k left at its
# default the full list of (feature, score) pairs is returned.
sorted_features = select_top_features(X=X_train, y=y_train)
print("\n the all top features are:", sorted_features)



 the all top features are: [('IPV4_SRC_ADDR', 0.9484074028909807), ('IN_BYTES', 0.8572953589108459), ('TOTAL_FLOWS_EXP', 0.8482002133077475), ('FLOW_ID', 0.8481959373596015), ('IPV4_DST_ADDR', 0.8395692342244765), ('LAST_SWITCHED', 0.8050392656537886), ('TCP_FLAGS', 0.7626107967291005), ('FIRST_SWITCHED', 0.737106851877946), ('ANALYSIS_TIMESTAMP', 0.7293230837113438), ('L4_SRC_PORT', 0.6388551386116517), ('TCP_WIN_MAX_IN', 0.5894881960929212), ('L4_DST_PORT', 0.5830188209539671), ('TCP_WIN_MIN_IN', 0.5726510161444471), ('FLOW_DURATION_MILLISECONDS', 0.5592412484104601), ('OUT_BYTES', 0.4895510694402043), ('TCP_WIN_MSS_IN', 0.4253780357595669), ('OUT_PKTS', 0.3418995030081262), ('IN_PKTS', 0.33858377761085867), ('TCP_WIN_SCALE_IN', 0.32283428366581224), ('PROTOCOL', 0.2035181432752886), ('PROTOCOL_MAP', 0.18493254077178922), ('TCP_WIN_MAX_OUT', 0.1361896946237291), ('TCP_WIN_MIN_OUT', 0.1359983533647695), ('TCP_WIN_SCALE_OUT', 0.06438243701562363), ('SRC_TOS', 0.05368797434904349), ('D

In [1]:
import pandas as pd
from sklearn.feature_selection import mutual_info_classif

In [4]:
# Load the preprocessed dataset (per its file name, the one that keeps the
# original labels).
csv_path = '/home/ibibers/IDS Project/IDS_Datasets/Combined_datasets/Simargelpreprocessed_dataset_with_original_labels.csv'
data = pd.read_csv(csv_path)

# Everything except the ALERT column is a predictor; ALERT is the target.
label_column = 'ALERT'
X = data.drop(columns=[label_column])
y = data[label_column]

In [5]:
# Mutual information between each feature column and the label.
# The estimator draws random numbers internally, so without a seed the scores
# (and potentially the ranking of near-tied features) change between re-runs.
# Pin the seed so a fresh run reproduces the same table.
information_gain = mutual_info_classif(X, y, random_state=42)

# One row per feature, ordered from most to least informative.
ig_scores_df = pd.DataFrame({
    'Feature': X.columns,
    'Information Gain': information_gain
}).sort_values(by='Information Gain', ascending=False)

print("Information Gain Scores:")
print(ig_scores_df)

Information Gain Scores:
                       Feature  Information Gain
3                IPV4_SRC_ADDR          0.854286
10                   TCP_FLAGS          0.833675
5                IPV4_DST_ADDR          0.803937
21                    IN_BYTES          0.645404
0                      FLOW_ID          0.627127
20             TOTAL_FLOWS_EXP          0.627119
11              TCP_WIN_MAX_IN          0.621813
16            TCP_WIN_SCALE_IN          0.621714
13              TCP_WIN_MIN_IN          0.621680
7   FLOW_DURATION_MILLISECONDS          0.600924
8                LAST_SWITCHED          0.596458
6               FIRST_SWITCHED          0.520416
25          ANALYSIS_TIMESTAMP          0.515824
4                  L4_DST_PORT          0.507542
15              TCP_WIN_MSS_IN          0.500394
9                     PROTOCOL          0.466596
23                   OUT_BYTES          0.462142
24                    OUT_PKTS          0.451065
1                 PROTOCOL_MAP          0.39

In [6]:
# Number of best-scoring features to keep.
k = 10

# ig_scores_df is already sorted by descending score, so the first k rows
# are the k highest-scoring features.
top_features = ig_scores_df['Feature'].iloc[:k].tolist()

print("Top Features Selected:")
print(top_features)

Top Features Selected:
['IPV4_SRC_ADDR', 'TCP_FLAGS', 'IPV4_DST_ADDR', 'IN_BYTES', 'FLOW_ID', 'TOTAL_FLOWS_EXP', 'TCP_WIN_MAX_IN', 'TCP_WIN_SCALE_IN', 'TCP_WIN_MIN_IN', 'FLOW_DURATION_MILLISECONDS']
