In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from joblib import dump
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
data = pd.read_csv('maindata.csv')

In [4]:
data

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,tcp,http,SF,181,5450,0,0,0,0,...,9,1.0,0.0,0.11,0.00,0.00,0.00,0.0,0.0,normal
1,0,tcp,http,SF,239,486,0,0,0,0,...,19,1.0,0.0,0.05,0.00,0.00,0.00,0.0,0.0,normal
2,0,tcp,http,SF,235,1337,0,0,0,0,...,29,1.0,0.0,0.03,0.00,0.00,0.00,0.0,0.0,normal
3,0,tcp,http,SF,219,1337,0,0,0,0,...,39,1.0,0.0,0.03,0.00,0.00,0.00,0.0,0.0,normal
4,0,tcp,http,SF,217,2032,0,0,0,0,...,49,1.0,0.0,0.02,0.00,0.00,0.00,0.0,0.0,normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494015,0,tcp,http,SF,310,1881,0,0,0,0,...,255,1.0,0.0,0.01,0.05,0.00,0.01,0.0,0.0,normal
494016,0,tcp,http,SF,282,2286,0,0,0,0,...,255,1.0,0.0,0.17,0.05,0.00,0.01,0.0,0.0,normal
494017,0,tcp,http,SF,203,1200,0,0,0,0,...,255,1.0,0.0,0.06,0.05,0.06,0.01,0.0,0.0,normal
494018,0,tcp,http,SF,291,1200,0,0,0,0,...,255,1.0,0.0,0.04,0.05,0.04,0.01,0.0,0.0,normal


In [None]:
print("First few rows of the dataset:")
print(data.head())

In [None]:
print("Summary statistics of numerical features:")
print(data.describe())

In [None]:
print("Data types of each column:")
print(data.dtypes)

In [None]:
print("Missing values in the dataset:")
print(data.isnull().sum())

In [None]:
plt.figure(figsize=(12, 6))
sns.countplot(x='label', data=data, palette='Set2')
plt.title('Distribution of Attack Types')
plt.xlabel('Attack Type')
plt.xticks(rotation=90)
plt.ylabel('Count')
plt.show()

In [None]:
corr_matrix = data.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, cmap='coolwarm', annot=True, fmt=".2f", linewidths=0.5)
plt.title('Correlation Matrix Heatmap')
plt.show()

In [None]:
subset_features = ['duration', 'src_bytes', 'dst_bytes', 'count', 'srv_count']
sns.pairplot(data[subset_features], kind='scatter')
plt.suptitle('Pairplot of Selected Features', y=1.02)
plt.show()

In [5]:
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['label'])

In [6]:
data = pd.get_dummies(data, columns=['protocol_type', 'service', 'flag'])

In [7]:
X = data.drop('label', axis=1)
y = data['label']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
svm_model = SVC(kernel='linear', random_state=2)
svm_model.fit(X_train, y_train)

In [None]:
svm_predictions = svm_model.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_predictions)
svm_classification_report = classification_report(y_test, svm_predictions)

In [None]:
print("SVM Model Evaluation:")
print("Accuracy:", svm_accuracy)
print(svm_classification_report)

In [None]:
dump(svm_model, 'SVM.joblib')

In [None]:
rf_model = RandomForestClassifier(n_estimators=100, max_features=3)
rf_model.fit(X_train, y_train)

In [None]:
rf_predictions = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_predictions)
rf_classification_report = classification_report(y_test, rf_predictions)

In [None]:
print("Random Forest Model Evaluation:")
print("Accuracy:", rf_accuracy)
print(rf_classification_report)

In [None]:
dump(rf_model, 'RF.joblib')