In [None]:
# Intrusion Detection System using Decision Tree Algorithm on KDD Cup 1999 Dataset

## Step 1: Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler

## Step 2: Load and Preprocess Dataset
# Path to the gzipped 10% KDD Cup 1999 file (adjust to your environment)
DATA_PATH = '/content/drive/MyDrive/Intrusion-Detection-System-master/dataset/kddcup.data_10_percent.gz'

# header=None: the first record is read as data, not as column names
df = pd.read_csv(DATA_PATH, compression='gzip', header=None)

# Assign column names based on the KDD Cup 1999 dataset specification
column_names = [
    "duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes",
    "land", "wrong_fragment", "urgent", "hot", "num_failed_logins",
    "logged_in", "num_compromised", "root_shell", "su_attempted", "num_root",
    "num_file_creations", "num_shells", "num_access_files", "num_outbound_cmds",
    "is_host_login", "is_guest_login", "count", "srv_count", "serror_rate",
    "srv_serror_rate", "rerror_rate", "srv_rerror_rate", "same_srv_rate",
    "diff_srv_rate", "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count",
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate", "dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate", "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate", "label"
]
df.columns = column_names

# Show the first few rows. A bare `df.head()` is only rendered when it is the
# last statement of a cell, so it is printed explicitly here.
print(df.head())

# Handle missing values (if any)
df = df.dropna()

# Encode categorical variables.
# Each column gets its own encoder so that every mapping stays recoverable;
# `label_encoder` is bound explicitly to the target column's encoder instead of
# relying on 'label' happening to be the last column processed.
categorical_cols = ['protocol_type', 'service', 'flag', 'label']
encoders = {col: LabelEncoder() for col in categorical_cols}
for col, encoder in encoders.items():
    df[col] = encoder.fit_transform(df[col])
label_encoder = encoders['label']

# Separate features and target variable
X = df.drop('label', axis=1)
y = df['label']

## Step 3: Split the Dataset, then Standardize
# Split BEFORE scaling so that no test-set statistics leak into preprocessing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit the scaler on the training rows only and reuse it for the test rows
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Keep `X` as the standardized full feature matrix (as before), now using the
# training-set statistics
X = scaler.transform(X)

## Step 4: Train the Decision Tree Model
# Build the classifier with a fixed seed and fit it on the training split in
# one expression (`fit` returns the estimator itself).
dt_classifier = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

## Step 5: Evaluate the Model
# Predict the labels for the test set
y_pred = dt_classifier.predict(X_test)

# Calculate evaluation metrics (support-weighted averages over all classes).
# zero_division=0 keeps the default numeric result for classes without any
# predicted or true samples, but without emitting a warning for each of them.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

# Print evaluation metrics
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')

# Print classification report.
# The label set is the union of true and predicted classes: a class may be
# predicted without occurring in y_test, and classification_report raises a
# ValueError if target_names does not match the labels it infers on its own.
unique_classes = np.union1d(y_test, y_pred)
target_names = label_encoder.inverse_transform(unique_classes)
print(classification_report(
    y_test,
    y_pred,
    labels=unique_classes,
    target_names=target_names,
    zero_division=0,
))

## Step 6: Visualize the Results
# Confusion Matrix
# Fix the label order explicitly (union of true and predicted classes) so the
# matrix rows/columns and the tick labels are guaranteed to line up.
cm_labels = np.union1d(y_test, y_pred)
cm_tick_names = label_encoder.inverse_transform(cm_labels)
conf_matrix = confusion_matrix(y_test, y_pred, labels=cm_labels)

plt.figure(figsize=(10, 7))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=cm_tick_names,
    yticklabels=cm_tick_names,
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# Feature Importance
# Every column except the trailing 'label' is a model input, in training order
feature_importances = dt_classifier.feature_importances_
features = column_names[:-1]
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=importance_df)
plt.title('Feature Importance')
plt.show()


In [None]:
# Intrusion Detection System using Random Forest Algorithm on KDD Cup 1999 Dataset

## Step 1: Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler

## Step 2: Load and Preprocess Dataset
# Path to the gzipped 10% KDD Cup 1999 file (adjust to your environment)
DATA_PATH = '/content/drive/MyDrive/Intrusion-Detection-System-master/dataset/kddcup.data_10_percent.gz'

# header=None: the first record is read as data, not as column names
df = pd.read_csv(DATA_PATH, compression='gzip', header=None)

# Assign column names based on the KDD Cup 1999 dataset specification
column_names = [
    "duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes",
    "land", "wrong_fragment", "urgent", "hot", "num_failed_logins",
    "logged_in", "num_compromised", "root_shell", "su_attempted", "num_root",
    "num_file_creations", "num_shells", "num_access_files", "num_outbound_cmds",
    "is_host_login", "is_guest_login", "count", "srv_count", "serror_rate",
    "srv_serror_rate", "rerror_rate", "srv_rerror_rate", "same_srv_rate",
    "diff_srv_rate", "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count",
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate", "dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate", "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate", "label"
]
df.columns = column_names

# Show the first few rows. A bare `df.head()` is only rendered when it is the
# last statement of a cell, so it is printed explicitly here.
print(df.head())

# Handle missing values (if any)
df = df.dropna()

# Encode categorical variables.
# Each column gets its own encoder so that every mapping stays recoverable;
# `label_encoder` is bound explicitly to the target column's encoder instead of
# relying on 'label' happening to be the last column processed.
categorical_cols = ['protocol_type', 'service', 'flag', 'label']
encoders = {col: LabelEncoder() for col in categorical_cols}
for col, encoder in encoders.items():
    df[col] = encoder.fit_transform(df[col])
label_encoder = encoders['label']

# Separate features and target variable
X = df.drop('label', axis=1)
y = df['label']

## Step 3: Split the Dataset, then Standardize
# Split BEFORE scaling so that no test-set statistics leak into preprocessing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit the scaler on the training rows only and reuse it for the test rows
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Keep `X` as the standardized full feature matrix (as before), now using the
# training-set statistics
X = scaler.transform(X)

## Step 4: Train the Random Forest Model
# Build the classifier with a fixed seed and fit it on the training split in
# one expression (`fit` returns the estimator itself).
rf_classifier = RandomForestClassifier(random_state=42).fit(X_train, y_train)

## Step 5: Evaluate the Model
# Predict the labels for the test set
y_pred = rf_classifier.predict(X_test)

# Calculate evaluation metrics (support-weighted averages over all classes).
# zero_division=0 keeps the default numeric result for classes without any
# predicted or true samples, but without emitting a warning for each of them.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

# Print evaluation metrics
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')

# Print classification report.
# The label set is the union of true and predicted classes: a class may be
# predicted without occurring in y_test, and classification_report raises a
# ValueError if target_names does not match the labels it infers on its own.
unique_classes = np.union1d(y_test, y_pred)
target_names = label_encoder.inverse_transform(unique_classes)
print(classification_report(
    y_test,
    y_pred,
    labels=unique_classes,
    target_names=target_names,
    zero_division=0,
))

## Step 6: Visualize the Results
# Confusion Matrix
# Fix the label order explicitly (union of true and predicted classes) so the
# matrix rows/columns and the tick labels are guaranteed to line up.
cm_labels = np.union1d(y_test, y_pred)
cm_tick_names = label_encoder.inverse_transform(cm_labels)
conf_matrix = confusion_matrix(y_test, y_pred, labels=cm_labels)

plt.figure(figsize=(10, 7))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=cm_tick_names,
    yticklabels=cm_tick_names,
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# Feature Importance
# Every column except the trailing 'label' is a model input, in training order
feature_importances = rf_classifier.feature_importances_
features = column_names[:-1]
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=importance_df)
plt.title('Feature Importance')
plt.show()
