In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib
import os

# Train a RandomForest classifier on a tabular dataset, report its accuracy on
# a held-out split, and persist the model, the fitted encoders and the
# processed train/test data.

# Load the dataset (replace with your actual dataset)
dataset = pd.read_csv('your_data.csv')

# Split the dataset into features (X) and target (y)
X = dataset.drop('label', axis=1)  # assuming 'label' is the target column
y = dataset['label']

# Encode the target labels as integers. The fitted encoder is saved below so
# that predictions can later be mapped back with inverse_transform().
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split the data into training and testing sets (70% / 30%, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Work on explicit copies: the split frames are derived from X, and assigning
# columns to them below can otherwise raise pandas' SettingWithCopyWarning
# (on pandas versions without Copy-on-Write).
X_train = X_train.copy()
X_test = X_test.copy()

# One LabelEncoder per categorical feature column. The individual names are
# kept at module level so later cells can still refer to them directly.
protocol_type_encoder = LabelEncoder()
service_encoder = LabelEncoder()
flag_encoder = LabelEncoder()
feature_encoders = {
    'protocol_type': protocol_type_encoder,
    'service': service_encoder,
    'flag': flag_encoder,
}

for column, encoder in feature_encoders.items():
    # Fit on the training split only, so the test split does not influence
    # the encoding.
    X_train[column] = encoder.fit_transform(X_train[column])
    # Reuse the fitted mapping (no refitting). NOTE: LabelEncoder.transform
    # raises ValueError if the test split contains a category that never
    # appeared in the training split.
    X_test[column] = encoder.transform(X_test[column])

# Initialize the RandomForestClassifier (seeded for reproducible results)
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Predict on the test data
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')
print(classification_report(y_test, y_pred))

# Ensure the 'models' directory exists; exist_ok avoids the check-then-create
# race of testing os.path.exists() first.
os.makedirs('models', exist_ok=True)

# Save the trained model
joblib.dump(model, 'models/network_intrusion_model.pkl')

# Save the feature encoders (one file per column) and the target encoder
for column, encoder in feature_encoders.items():
    joblib.dump(encoder, f'models/{column}_encoder.pkl')
joblib.dump(label_encoder, 'models/label_encoder.pkl')

# Optionally, save the processed data for later use
X_train.to_csv('processed_X_train.csv', index=False)
X_test.to_csv('processed_X_test.csv', index=False)
# y_train / y_test are the integer arrays produced by label_encoder, so wrap
# them in a single-column frame before writing.
pd.DataFrame(y_train, columns=['label']).to_csv('processed_y_train.csv', index=False)
pd.DataFrame(y_test, columns=['label']).to_csv('processed_y_test.csv', index=False)


Accuracy: 19.93%
              precision    recall  f1-score   support

           0       0.20      0.23      0.22      6037
           1       0.21      0.20      0.20      5955
           2       0.20      0.20      0.20      6045
           3       0.20      0.19      0.19      5989
           4       0.19      0.17      0.18      5974

    accuracy                           0.20     30000
   macro avg       0.20      0.20      0.20     30000
weighted avg       0.20      0.20      0.20     30000

