In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_selection import RFE

from sklearn.preprocessing import FunctionTransformer 

In [3]:

# Read the raw attack log into a DataFrame. The path is relative, so it is
# resolved against the kernel's current working directory.
dataset_path = "../dataset/cybersecurity_attacks.csv"
df = pd.read_csv(dataset_path)
df.head()

Unnamed: 0,Timestamp,Source IP Address,Destination IP Address,Source Port,Destination Port,Protocol,Packet Length,Packet Type,Traffic Type,Payload Data,...,Action Taken,Severity Level,User Information,Device Information,Network Segment,Geo-location Data,Proxy Information,Firewall Logs,IDS/IPS Alerts,Log Source
0,2023-05-30 06:33:58,103.216.15.12,84.9.164.252,31225,17616,ICMP,503,Data,HTTP,Qui natus odio asperiores nam. Optio nobis ius...,...,Logged,Low,Reyansh Dugal,Mozilla/5.0 (compatible; MSIE 8.0; Windows NT ...,Segment A,"Jamshedpur, Sikkim",150.9.97.135,Log Data,,Server
1,2020-08-26 07:08:30,78.199.217.198,66.191.137.154,17245,48166,ICMP,1174,Data,HTTP,Aperiam quos modi officiis veritatis rem. Omni...,...,Blocked,Low,Sumer Rana,Mozilla/5.0 (compatible; MSIE 8.0; Windows NT ...,Segment B,"Bilaspur, Nagaland",,Log Data,,Firewall
2,2022-11-13 08:23:25,63.79.210.48,198.219.82.17,16811,53600,UDP,306,Control,HTTP,Perferendis sapiente vitae soluta. Hic delectu...,...,Ignored,Low,Himmat Karpe,Mozilla/5.0 (compatible; MSIE 9.0; Windows NT ...,Segment C,"Bokaro, Rajasthan",114.133.48.179,Log Data,Alert Data,Firewall
3,2023-07-02 10:38:46,163.42.196.10,101.228.192.255,20018,32534,UDP,385,Data,HTTP,Totam maxime beatae expedita explicabo porro l...,...,Blocked,Medium,Fateh Kibe,Mozilla/5.0 (Macintosh; PPC Mac OS X 10_11_5; ...,Segment B,"Jaunpur, Rajasthan",,,Alert Data,Firewall
4,2023-07-16 13:11:07,71.166.185.76,189.243.174.238,6131,26646,TCP,1462,Data,DNS,Odit nesciunt dolorem nisi iste iusto. Animi v...,...,Blocked,Low,Dhanush Chad,Mozilla/5.0 (compatible; MSIE 5.0; Windows NT ...,Segment C,"Anantapur, Tripura",149.6.110.119,,Alert Data,Firewall


In [4]:
# Show every column label of the loaded frame.
df.columns

Index(['Timestamp', 'Source IP Address', 'Destination IP Address',
       'Source Port', 'Destination Port', 'Protocol', 'Packet Length',
       'Packet Type', 'Traffic Type', 'Payload Data', 'Malware Indicators',
       'Action Taken', 'Severity Level', 'User Information',
       'Device Information', 'Network Segment', 'Geo-location Data',
       'Proxy Information', 'Firewall Logs', 'IDS/IPS Alerts', 'Log Source'],
      dtype='object')

In [5]:
# Select the target and the candidate feature matrix.
# Every column is kept as a feature except the target and the columns listed
# in `to_drop_cols` (an earlier revision hand-picked a feature whitelist instead).
target = 'Attack Type'

# Columns excluded from the features: the target itself (listed once, via
# `target`) plus the payload text, user/device/geo/proxy details, the IP
# addresses and the timestamp.
to_drop_cols = [
    target,
    'Payload Data',
    'User Information',
    'Device Information',
    'Geo-location Data',
    'Proxy Information',
    'Source IP Address',
    'Destination IP Address',
    'Timestamp',
]

# `columns=` states the axis explicitly instead of the positional axis=1.
x = df.drop(columns=to_drop_cols)
y = df[target]
x

Unnamed: 0,Source Port,Destination Port,Protocol,Packet Length,Packet Type,Traffic Type,Malware Indicators,Anomaly Scores,Alerts/Warnings,Attack Signature,Action Taken,Severity Level,Network Segment,Firewall Logs,IDS/IPS Alerts,Log Source
0,31225,17616,ICMP,503,Data,HTTP,IoC Detected,28.67,,Known Pattern B,Logged,Low,Segment A,Log Data,,Server
1,17245,48166,ICMP,1174,Data,HTTP,IoC Detected,51.50,,Known Pattern A,Blocked,Low,Segment B,Log Data,,Firewall
2,16811,53600,UDP,306,Control,HTTP,IoC Detected,87.42,Alert Triggered,Known Pattern B,Ignored,Low,Segment C,Log Data,Alert Data,Firewall
3,20018,32534,UDP,385,Data,HTTP,,15.79,Alert Triggered,Known Pattern B,Blocked,Medium,Segment B,,Alert Data,Firewall
4,6131,26646,TCP,1462,Data,DNS,,0.52,Alert Triggered,Known Pattern B,Blocked,Low,Segment C,,Alert Data,Firewall
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39995,31005,6764,UDP,1428,Control,HTTP,IoC Detected,39.28,,Known Pattern A,Logged,Medium,Segment A,Log Data,Alert Data,Firewall
39996,2553,28091,UDP,1184,Control,HTTP,IoC Detected,27.25,,Known Pattern A,Logged,High,Segment C,Log Data,,Firewall
39997,22505,25152,UDP,1043,Data,DNS,IoC Detected,31.01,,Known Pattern B,Blocked,Low,Segment C,Log Data,Alert Data,Server
39998,20013,2703,UDP,483,Data,FTP,IoC Detected,97.85,Alert Triggered,Known Pattern B,Ignored,Low,Segment B,Log Data,,Server


In [6]:
# In the preview above these columns hold either a marker string or NaN, so
# each one is encoded as a 0/1 "value present" flag.
#
# The flags are computed from the untouched `df`, not from `x`: the previous
# form (`x[col] = x[col].notna()`) was not idempotent, because running the cell
# a second time evaluated notna() on the already-converted integers and turned
# every row into 1.
presence_flag_cols = ['Alerts/Warnings', 'IDS/IPS Alerts', 'Malware Indicators', 'Firewall Logs']
x[presence_flag_cols] = df[presence_flag_cols].notna().astype(int)
x

Unnamed: 0,Source Port,Destination Port,Protocol,Packet Length,Packet Type,Traffic Type,Malware Indicators,Anomaly Scores,Alerts/Warnings,Attack Signature,Action Taken,Severity Level,Network Segment,Firewall Logs,IDS/IPS Alerts,Log Source
0,31225,17616,ICMP,503,Data,HTTP,1,28.67,0,Known Pattern B,Logged,Low,Segment A,1,0,Server
1,17245,48166,ICMP,1174,Data,HTTP,1,51.50,0,Known Pattern A,Blocked,Low,Segment B,1,0,Firewall
2,16811,53600,UDP,306,Control,HTTP,1,87.42,1,Known Pattern B,Ignored,Low,Segment C,1,1,Firewall
3,20018,32534,UDP,385,Data,HTTP,0,15.79,1,Known Pattern B,Blocked,Medium,Segment B,0,1,Firewall
4,6131,26646,TCP,1462,Data,DNS,0,0.52,1,Known Pattern B,Blocked,Low,Segment C,0,1,Firewall
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39995,31005,6764,UDP,1428,Control,HTTP,1,39.28,0,Known Pattern A,Logged,Medium,Segment A,1,1,Firewall
39996,2553,28091,UDP,1184,Control,HTTP,1,27.25,0,Known Pattern A,Logged,High,Segment C,1,0,Firewall
39997,22505,25152,UDP,1043,Data,DNS,1,31.01,0,Known Pattern B,Blocked,Low,Segment C,1,1,Server
39998,20013,2703,UDP,483,Data,FTP,1,97.85,1,Known Pattern B,Ignored,Low,Segment B,1,0,Server


In [7]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Split the training columns by dtype so that each group gets its own transformer.
# Columns matching neither selector are dropped by ColumnTransformer's default
# remainder='drop'.
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
# 'number' is the documented selector for all numeric dtypes, so it does not
# depend on which integer/float width the platform or an astype() call produced.
numerical_features = X_train.select_dtypes(include='number').columns.tolist()

# Keep the order (numeric block first, one-hot block second): the RFE cell
# rebuilds the output column names assuming exactly this layout.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        # handle_unknown='ignore': a category that never appeared in the
        # training split is encoded as all zeros instead of raising an error
        # when the fitted pipeline transforms new data.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)

# Temporary preprocessing for RFE
X_train_processed = preprocessor.fit_transform(X_train)

In [9]:

# Recursive feature elimination over the preprocessed training matrix,
# keeping 10 of the transformed columns.
rfe_estimator = LogisticRegression(max_iter=1000)
selector = RFE(rfe_estimator, n_features_to_select=10)
selector.fit(X_train_processed, y_train)

# Boolean mask over the preprocessed columns (True = kept by RFE).
selected_indices = selector.support_

# Rebuild the column labels in the ColumnTransformer's output order:
# the numeric columns first, then the expanded one-hot columns.
onehot_encoder = preprocessor.named_transformers_['cat']
preprocessor_feature_names = numerical_features + list(
    onehot_encoder.get_feature_names_out(categorical_features)
)

# Pair each label with its mask entry and keep the labels RFE retained.
selected_features = [
    feature_name
    for feature_name, is_selected in zip(preprocessor_feature_names, selected_indices)
    if is_selected
]
print("Selected Features by RFE:", selected_features)



In [10]:
def filter_features(X, support_mask=None):
    """Keep only the columns of ``X`` flagged by a boolean feature mask.

    Parameters
    ----------
    X : 2-D array-like supporting ``X[:, mask]`` indexing
        The preprocessed feature matrix.
    support_mask : boolean sequence, optional
        One entry per column of ``X``; True marks a column to keep.
        When omitted, the current ``selector.support_`` is used, which is
        the original behaviour of this function.

    Returns
    -------
    The column subset of ``X`` selected by the mask.
    """
    mask = selector.support_ if support_mask is None else support_mask
    return X[:, mask]

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    # Keep RFE-selected features. The mask is copied into the transformer when
    # the pipeline is built, so re-fitting `selector` later (e.g. with a
    # different n_features_to_select) cannot silently change which columns an
    # already-trained model feeds to its classifier.
    ('feature_selector', FunctionTransformer(
        filter_features,
        kw_args={'support_mask': selector.support_.copy()},
    )),
    ('classifier', LogisticRegression(max_iter=10000, random_state=42))
])

In [11]:


# Fit the full pipeline (preprocessing, feature filter, classifier) on the training split.
model.fit(X_train, y_train)

# Predict on both splits so training and held-out accuracy can be compared.
y_pred_train = model.predict(X_train)
y_pred = model.predict(X_test)

train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy Score on training set:", train_accuracy)
print("Accuracy Score on testing set:", test_accuracy)

Accuracy Score on training set: 0.34715625
Accuracy Score on testing set: 0.3435
