In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, f1_score


In [6]:
# Load Dataset
df = pd.read_csv("/content/02-14-2018.csv")  # Replace with actual dataset
#df2 = pd.read_csv("/content/02-15-2018.csv")
#df = pd.concat([df1, df2], ignore_index=True)

# Display the first few rows
print(df.head())

   Dst Port  Protocol            Timestamp  Flow Duration  Tot Fwd Pkts  \
0         0         0  14/02/2018 08:31:01      112641719             3   
1         0         0  14/02/2018 08:33:50      112641466             3   
2         0         0  14/02/2018 08:36:39      112638623             3   
3        22         6  14/02/2018 08:40:13        6453966            15   
4        22         6  14/02/2018 08:40:23        8804066            14   

   Tot Bwd Pkts  TotLen Fwd Pkts  TotLen Bwd Pkts  Fwd Pkt Len Max  \
0             0                0                0                0   
1             0                0                0                0   
2             0                0                0                0   
3            10             1239             2273              744   
4            11             1143             2209              744   

   Fwd Pkt Len Min  ...  Fwd Seg Size Min  Active Mean  Active Std  \
0                0  ...                 0          0.0    

In [7]:
# Exploratory Data Analysis (EDA)
print("Dataset Shape:", df.shape)
print(df.info())
print(df.describe())
print("Missing Values:")
print(df.isnull().sum())


Dataset Shape: (1048575, 80)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 80 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   Dst Port           1048575 non-null  int64  
 1   Protocol           1048575 non-null  int64  
 2   Timestamp          1048575 non-null  object 
 3   Flow Duration      1048575 non-null  int64  
 4   Tot Fwd Pkts       1048575 non-null  int64  
 5   Tot Bwd Pkts       1048575 non-null  int64  
 6   TotLen Fwd Pkts    1048575 non-null  int64  
 7   TotLen Bwd Pkts    1048575 non-null  int64  
 8   Fwd Pkt Len Max    1048575 non-null  int64  
 9   Fwd Pkt Len Min    1048575 non-null  int64  
 10  Fwd Pkt Len Mean   1048575 non-null  float64
 11  Fwd Pkt Len Std    1048575 non-null  float64
 12  Bwd Pkt Len Max    1048575 non-null  int64  
 13  Bwd Pkt Len Min    1048575 non-null  int64  
 14  Bwd Pkt Len Mean   1048575 non-null  float64
 15  Bwd

In [8]:
# Convert Timestamp to datetime format
df['Timestamp'] = pd.to_datetime(df['Timestamp'], errors='coerce')
print(df.head())


   Dst Port  Protocol           Timestamp  Flow Duration  Tot Fwd Pkts  \
0         0         0 2018-02-14 08:31:01      112641719             3   
1         0         0 2018-02-14 08:33:50      112641466             3   
2         0         0 2018-02-14 08:36:39      112638623             3   
3        22         6 2018-02-14 08:40:13        6453966            15   
4        22         6 2018-02-14 08:40:23        8804066            14   

   Tot Bwd Pkts  TotLen Fwd Pkts  TotLen Bwd Pkts  Fwd Pkt Len Max  \
0             0                0                0                0   
1             0                0                0                0   
2             0                0                0                0   
3            10             1239             2273              744   
4            11             1143             2209              744   

   Fwd Pkt Len Min  ...  Fwd Seg Size Min  Active Mean  Active Std  \
0                0  ...                 0          0.0         0

  df['Timestamp'] = pd.to_datetime(df['Timestamp'], errors='coerce')


In [18]:
# Extract useful time-based features
df['Hour'] = df['Timestamp'].dt.hour
df['Minute'] = df['Timestamp'].dt.minute
df['Second'] = df['Timestamp'].dt.second
df.drop(columns=['Timestamp'], inplace=True)  # Drop original Timestamp column


In [19]:
df.head()

Unnamed: 0,Dst Port,Protocol,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label,Hour,Minute,Second
0,0,0,112641719,3,0,0,0,0,0,0.0,...,0,0,56320859.5,139.300036,56320958,56320761,Benign,8,31,1
1,0,0,112641466,3,0,0,0,0,0,0.0,...,0,0,56320733.0,114.551299,56320814,56320652,Benign,8,33,50
2,0,0,112638623,3,0,0,0,0,0,0.0,...,0,0,56319311.5,301.934596,56319525,56319098,Benign,8,36,39
3,22,6,6453966,15,10,1239,2273,744,0,82.6,...,0,0,0.0,0.0,0,0,Benign,8,40,13
4,22,6,8804066,14,11,1143,2209,744,0,81.642857,...,0,0,0.0,0.0,0,0,Benign,8,40,23


In [20]:
df["Label"]

Unnamed: 0,Label
0,Benign
1,Benign
2,Benign
3,Benign
4,Benign
...,...
1048570,Benign
1048571,Benign
1048572,Benign
1048573,Benign


In [21]:

# Handling Missing Values
num_cols = df.select_dtypes(include=['number']).columns
df[num_cols] = df[num_cols].fillna(df[num_cols].mode())  # Fill numeric columns with median



In [22]:
num_cols

Index(['Dst Port', 'Protocol', 'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts',
       'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max',
       'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std',
       'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean',
       'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Mean',
       'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot',
       'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min',
       'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max',
       'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags',
       'Bwd URG Flags', 'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s',
       'Bwd Pkts/s', 'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean',
       'Pkt Len Std', 'Pkt Len Var', 'FIN Flag Cnt', 'SYN Flag Cnt',
       'RST Flag Cnt', 'PSH Flag Cnt', 'ACK Flag Cnt', 'URG Flag Cnt',
       'CWE Flag Count', 'ECE Flag Cnt', 'Down/Up Ratio', 'Pkt Size Avg',
       'Fwd Seg Siz

In [23]:
df.head()

Unnamed: 0,Dst Port,Protocol,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label,Hour,Minute,Second
0,0,0,112641719,3,0,0,0,0,0,0.0,...,0,0,56320859.5,139.300036,56320958,56320761,Benign,8,31,1
1,0,0,112641466,3,0,0,0,0,0,0.0,...,0,0,56320733.0,114.551299,56320814,56320652,Benign,8,33,50
2,0,0,112638623,3,0,0,0,0,0,0.0,...,0,0,56319311.5,301.934596,56319525,56319098,Benign,8,36,39
3,22,6,6453966,15,10,1239,2273,744,0,82.6,...,0,0,0.0,0.0,0,0,Benign,8,40,13
4,22,6,8804066,14,11,1143,2209,744,0,81.642857,...,0,0,0.0,0.0,0,0,Benign,8,40,23


In [24]:

cat_cols = df.select_dtypes(include=['object']).columns
df[cat_cols] = df[cat_cols].apply(lambda x: x.fillna(x.mode()[0]))  # Fill categorical columns with mode

In [25]:
cat_cols

Index(['Label'], dtype='object')

In [26]:
# Encoding Categorical Variables
le = LabelEncoder()
categorical_cols = ['Protocol', 'Label']  # Update as needed
for col in categorical_cols:
    df[col] = le.fit_transform(df[col])

# Selecting Features for Early Classification
selected_features = ['Flow Duration', 'Protocol', 'Tot Fwd Pkts', 'Tot Bwd Pkts', 'Fwd Pkt Len Max', 'Bwd Pkt Len Max', 'Flow Byts/s', 'Flow Pkts/s', 'Hour', 'Minute', 'Second']
X = df[selected_features]
y = df['Label']  # Target column (multiclass classification)

# Splitting Dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Handling Infinite and Large Values
X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
X_test.replace([np.inf, -np.inf], np.nan, inplace=True)

# Fill remaining NaN values with median (prevents missing values in scaling)
X_train.fillna(X_train.median(), inplace=True)
X_test.fillna(X_train.median(), inplace=True)  # Use training median to prevent data leakage

# Feature Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [27]:

# Model Training
models = {
    "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="logloss"),
    "LogisticRegression": LogisticRegression()
}

best_model = None
best_f1 = 0

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='weighted')  # Handle multiclass classification
    print(f"{name} F1-Score: {f1:.4f}")

    if f1 > best_f1:
        best_f1 = f1
        best_model = model

# Save the Best Model
model_filename = "best_nids_model.pkl"
with open(model_filename, "wb") as file:
    pickle.dump(best_model, file)

print(f"\nBest Model: {type(best_model).__name__} with F1-Score: {best_f1:.4f}")

# Load and Predict on New Data
def predict_intrusion(new_data):
    with open(model_filename, "rb") as file:
        loaded_model = pickle.load(file)

    new_data = scaler.transform(new_data)  # Scale input data
    prediction = loaded_model.predict(new_data)
    return "Attack" if prediction[0] != 0 else "Normal"

# Example New Session Data (Modify accordingly)
new_session = np.array([[5000, 6, 10, 20, 1000, 800, 500000.0, 100, 8, 30, 45]])  # Example input
intrusion_result = predict_intrusion(new_session)
print("Predicted Network Status:", intrusion_result)


RandomForest F1-Score: 0.9945


Parameters: { "use_label_encoder" } are not used.



XGBoost F1-Score: 0.9952
LogisticRegression F1-Score: 0.9516

Best Model: XGBClassifier with F1-Score: 0.9952
Predicted Network Status: Normal


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
