In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import joblib

# Load your dataset
data = pd.read_csv('Train_data.csv')

# Print the available columns so the feature list below can be checked by eye
print("Columns in the dataset:", data.columns)

# Define the feature columns and target.
# The last three feature columns are categorical and are one-hot encoded below.
categorical_columns = ['protocol_type', 'service', 'flag']
feature_columns = ['duration', 'src_bytes', 'dst_bytes', 'logged_in',
                   'wrong_fragment', 'same_srv_rate', 'srv_count',
                   'protocol_type', 'service', 'flag']
target_column = 'class'  # Assuming your target column is named 'class'

# Fail early, naming every absent column, instead of relying on the less
# specific KeyError pandas would raise during the selection below.
missing_columns = [name for name in feature_columns + [target_column]
                   if name not in data.columns]
if missing_columns:
    raise KeyError(f"Train_data.csv is missing required columns: {missing_columns}")

# Prepare features and target
X = data[feature_columns]
y = data[target_column]

# One-hot encode categorical variables (drop_first=True drops the first level of each)
X_encoded = pd.get_dummies(X, columns=categorical_columns, drop_first=True)

# Save the list of columns after encoding for later use
encoded_columns = X_encoded.columns.tolist()

# Split the dataset into training and test sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Scale the features: the scaler is fitted on the training split only and
# then applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train your models
# Logistic Regression (fitted on the standardized features)
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train_scaled, y_train)
joblib.dump(logistic_model, 'logistic_regression_model.pkl')

# Random Forest
# Fitted on the UNSCALED one-hot features, so the saved model must also be
# given unscaled features at prediction time.
# random_state is pinned (same value, 42, as the train/test split) so that
# re-running the notebook rebuilds the same forest and the same accuracy.
random_forest_model = RandomForestClassifier(random_state=42)
random_forest_model.fit(X_train, y_train)
joblib.dump(random_forest_model, 'random_forest_model.pkl')

# Support Vector Machine (fitted on the standardized features)
svm_model = SVC()
svm_model.fit(X_train_scaled, y_train)
joblib.dump(svm_model, 'svm_model.pkl')

# Save the scaler so the same standardization can be re-applied later
joblib.dump(scaler, 'scaler.pkl')

# Persist the post-encoding column names, one per line and in order,
# so the same column layout can be rebuilt for inference
with open('encoded_columns.txt', 'w') as columns_file:
    columns_file.writelines(f"{name}\n" for name in encoded_columns)

# Model Evaluation
def evaluate_model(model, X_test, y_test):
    """Print the model's score on the given test data as a percentage.

    Args:
        model: Any object exposing ``score(X, y)`` that returns a number.
        X_test: Features passed through to ``model.score``.
        y_test: Targets passed through to ``model.score``.

    Returns:
        None. The score, multiplied by 100 and formatted with two decimals,
        is written to stdout as ``Model Accuracy: NN.NN%``.
    """
    percent = model.score(X_test, y_test) * 100
    print("Model Accuracy: {:.2f}%".format(percent))

# Evaluate models
# Each entry pairs a display label with a fitted model and the test matrix it
# must be scored on: the scaled matrix for the models fitted on scaled data,
# the unscaled matrix for the random forest.
evaluation_plan = [
    ("Logistic Regression", logistic_model, X_test_scaled),
    ("Random Forest", random_forest_model, X_test),
    ("Support Vector Machine", svm_model, X_test_scaled),
]
for model_label, fitted_model, eval_features in evaluation_plan:
    # Prefix the line printed by evaluate_model so each result is attributable
    print(f"{model_label} - ", end="")
    evaluate_model(fitted_model, eval_features, y_test)


Columns in the dataset: Index(['duration', 'protocol_type', 'service', 'flag', 'src_bytes',
       'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
       'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
       'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
       'num_access_files', 'num_outbound_cmds', 'is_host_login',
       'is_guest_login', 'count', 'srv_count', 'serror_rate',
       'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
       'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
       'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
       'dst_host_srv_rerror_rate', 'class'],
      dtype='object')
Model Accuracy: 96.69%
Model Accuracy: 99.66%
Model Accuracy: 96.73%
