# 1. Import libraries

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns


# 2. Load raw extracted CSV data

In [None]:
# Path (relative to the working directory) of the extracted CSV to load.
raw_data_path = '../data/extracted_csv/sample_raw_network_data.csv'
df = pd.read_csv(filepath_or_buffer=raw_data_path)

# Preview the first five rows; `display` is not imported here, so this
# relies on it being available in the session (as in a notebook).
print("Raw Data Sample:")
display(df.head(n=5))

# 3. Preprocessing function

In [None]:
def preprocess(df, categorical_cols, label_col='label', normal_label='normal'):
    """Turn a raw table into a standardized feature matrix and binary labels.

    Rows containing any missing value are dropped, every column named in
    ``categorical_cols`` is replaced by integer codes from a
    ``LabelEncoder``, and all feature columns are then standardized with a
    ``StandardScaler``.

    NOTE: the encoders and the scaler are fit on every row passed in and are
    not returned. If the caller splits the result into train/test sets
    afterwards, test-set statistics have already influenced the scaling, and
    the fitted transformers are not available to apply to new data.

    Args:
        df: DataFrame holding the feature columns plus ``label_col``. The
            caller's frame is left unmodified.
        categorical_cols: Names of the columns to label-encode.
        label_col: Name of the target column.
        normal_label: Value of ``label_col`` that maps to 0; every other
            value maps to 1.

    Returns:
        Tuple ``(X_scaled, y)``: the output of
        ``StandardScaler.fit_transform`` on the feature columns, and a
        Series of 0/1 labels for the rows that survived ``dropna``.

    Raises:
        KeyError: If ``label_col`` or any of ``categorical_cols`` is not a
            column of ``df``.
    """
    categorical_cols = list(categorical_cols)
    missing = [c for c in [*categorical_cols, label_col] if c not in df.columns]
    if missing:
        raise KeyError(f"Columns not found in DataFrame: {missing}")

    # Work on an explicit copy: assigning columns on the result of dropna()
    # can trigger pandas' SettingWithCopyWarning.
    df = df.dropna().copy()

    # One fresh encoder per column, so no fitted state is shared between them.
    for col in categorical_cols:
        df[col] = LabelEncoder().fit_transform(df[col])

    X = df.drop(columns=[label_col])
    # Binary target: 0 for the normal class, 1 for anything else.
    y = (df[label_col] != normal_label).astype('int64')

    X_scaled = StandardScaler().fit_transform(X)
    return X_scaled, y

# Columns that get label-encoded before scaling.
categorical_columns = [
    'protocol_type',
    'service',
    'flag',
]
X, y = preprocess(df=df, categorical_cols=categorical_columns)


# 4. Split data

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# 5. Train individual models

In [None]:
# Base classifiers. The estimators with internal randomness (the forest's
# bootstrapping/feature sampling, and the shuffling SVC performs for its
# probability estimates) are seeded with the same value as the train/test
# split so that repeated runs produce the same models and metrics.
lr = LogisticRegression(max_iter=500)
rf = RandomForestClassifier(random_state=42)
# probability=True enables predict_proba, which the soft-voting ensemble
# built from these estimators relies on.
svm = SVC(probability=True, random_state=42)
knn = KNeighborsClassifier()

# Fit each model on its own so it can be evaluated individually.
lr.fit(X_train, y_train)
rf.fit(X_train, y_train)
svm.fit(X_train, y_train)
knn.fit(X_train, y_train)


# 6. Ensemble model

In [None]:
# Soft voting: combine the four base classifiers by their predicted
# class probabilities.
ensemble = VotingClassifier(
    estimators=list(zip(('lr', 'rf', 'svm', 'knn'), (lr, rf, svm, knn))),
    voting='soft',
)
ensemble.fit(X_train, y_train)


# 7. Evaluate models

In [None]:
def evaluate_model(model, X_test, y_test, model_name):
    """Print evaluation metrics for a fitted classifier and plot its confusion matrix.

    Prints the accuracy, the classification report and the confusion matrix
    of ``model`` on the given test data, then shows the confusion matrix as
    a heatmap.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature matrix to predict on.
        y_test: True labels, encoded as 0 (normal) / 1 (attack).
        model_name: Human-readable name used in the printed header and the
            plot title.

    Returns:
        None. Output goes to stdout and to a matplotlib figure.
    """
    y_pred = model.predict(X_test)
    print(f"--- {model_name} ---")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    # Pin the label order so the matrix is always 2x2 (row/col 0 = normal,
    # row/col 1 = attack). Without this, a test set in which only one class
    # appears yields a smaller matrix that no longer matches the two tick
    # labels used below.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    print(cm)

    # Plot confusion matrix heatmap
    class_names = ['Normal', 'Attack']
    plt.figure(figsize=(5, 4))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.title(f'Confusion Matrix - {model_name}')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()
    print("\n")

# Report every base model first, then the ensemble, on the same test split.
for model, name in zip(
    (lr, rf, svm, knn, ensemble),
    (
        "Logistic Regression",
        "Random Forest",
        "SVM",
        "KNN",
        "Ensemble Voting Classifier",
    ),
):
    evaluate_model(model, X_test, y_test, model_name=name)


# 8. Save your ensemble model (optional)

In [None]:
import os

import joblib

# NOTE: only the ensemble is persisted. The label encoders and scaler fitted
# inside preprocess() are not returned by it, so they are not saved here.
model_path = '../models/final_ids_model.pkl'
# Create the target directory if needed; joblib.dump does not create it.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(ensemble, model_path)
print(f"Model saved to {model_path}")
