In [1]:
import math
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from joblib import Parallel, delayed
import time
import warnings
# Suppress every warning for the rest of the session, including any emitted by
# the libraries imported above. Remove this line while debugging so that
# deprecation or data-quality warnings are visible again.
warnings.filterwarnings("ignore")

In [2]:
# Read the CSV (path is relative to the current working directory) into a DataFrame.
data = pd.read_csv(filepath_or_buffer="kddcup99_csv.csv")
# Preview the first five rows; as the cell's last expression it is rendered as the output.
data.head(n=5)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,tcp,http,SF,181,5450,0,0,0,0,...,9,1.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0,normal
1,0,tcp,http,SF,239,486,0,0,0,0,...,19,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,normal
2,0,tcp,http,SF,235,1337,0,0,0,0,...,29,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal
3,0,tcp,http,SF,219,1337,0,0,0,0,...,39,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal
4,0,tcp,http,SF,217,2032,0,0,0,0,...,49,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,normal


In [3]:
# Hand-assigned integer codes for the string-valued columns, keyed by column name.
category_codes = {
    'protocol_type': {'tcp':1,'udp':2,'icmp':3},
    'flag': {
        'SF':1,'S1':2,'REJ':3,'S2':4,'S0':5,'S3':6,
        'RSTO':7,'RSTR':8,'RSTOS0':9,'OTH':10,'SH':11,
    },
    'service': {
        'http':1, 'smtp':2, 'finger':3, 'domain_u':4, 'auth':5, 'telnet':6, 'ftp':7,
        'eco_i':8, 'ntp_u':9, 'ecr_i':10, 'other':11, 'private':12, 'pop_3':13, 'ftp_data':14,
        'rje':15, 'time':16, 'mtp':17, 'link':18, 'remote_job':19, 'gopher':20, 'ssh':21,
        'name':22, 'whois':23, 'domain':24, 'login':25, 'imap4':26, 'daytime':27, 'ctf':28,
        'nntp':29, 'shell':30, 'IRC':31, 'nnsp':32, 'http_443':33, 'exec':34, 'printer':35,
        'efs':36, 'courier':37, 'uucp':38, 'klogin':39, 'kshell':40, 'echo':41, 'discard':42,
        'systat':43, 'supdup':44, 'iso_tsap':45, 'hostnames':46, 'csnet_ns':47, 'pop_2':48,
        'sunrpc':49, 'uucp_path':50, 'netbios_ns':51, 'netbios_ssn':52, 'netbios_dgm':53,
        'sql_net':54, 'vmnet':55, 'bgp':56, 'Z39_50':57, 'ldap':58, 'netstat':59, 'urh_i':60,
        'X11':61, 'urp_i':62, 'pm_dump':63, 'tftp_u':64, 'tim_i':65, 'red_i':66,
    },
    'label': {
        'normal':1,'buffer_overflow':2, 'loadmodule':3, 'perl':4, 'neptune':5,
        'smurf':6, 'guess_passwd':7, 'pod':8, 'teardrop':9, 'portsweep':10, 'ipsweep':11,
        'land':12, 'ftp_write':13, 'back':14, 'imap':15, 'satan':16, 'phf':17, 'nmap':18,
        'multihop':19, 'warezmaster':20, 'warezclient':21, 'spy':22, 'rootkit':23,
    },
}

# Overwrite each column with its coded version. `replace` only touches values
# that appear as keys in the table; anything else is left as it was.
for column_name, code_by_value in category_codes.items():
    data[column_name] = data[column_name].replace(code_by_value)

In [4]:
# Columns used as model inputs.
# NOTE(review): 'lnum_access_files  ' carries two trailing spaces; confirm this
# matches the CSV header exactly, otherwise the selection below raises KeyError.
feature_columns = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes',
    'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
    'num_failed_logins', 'logged_in', 'lnum_compromised', 'lroot_shell',
    'lsu_attempted', 'lnum_root', 'lnum_file_creations', 'lnum_shells',
    'lnum_access_files  ', 'lnum_outbound_cmds', 'is_host_login',
    'is_guest_login', 'count', 'srv_count', 'serror_rate',
    'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
    'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
    'dst_host_srv_count', 'dst_host_same_srv_rate',
    'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
    'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate',
]

# Feature matrix and target vector.
X = data[feature_columns]
y = data['label']

# Hold out 40% of the rows for testing; the fixed seed makes the split repeatable.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=32,
)

In [5]:
# The four classifiers compared in this notebook.
nb_classifier = GaussianNB()
knn_model = KNeighborsClassifier(n_neighbors=5)
# Random forests and decision trees consume random numbers while fitting.
# Pin the seed (same value as the train/test split) so that re-running the
# notebook reproduces the same fitted models and therefore the same scores.
rf_model = RandomForestClassifier(random_state=32)
dt_model = DecisionTreeClassifier(random_state=32)

In [7]:
def train_model_and_get_accuracy(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train
        Training features and labels, passed to ``model.fit``.
    X_test, y_test
        Held-out features passed to ``model.predict`` and the true labels the
        predictions are compared against.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)``. Precision, recall and F1 are
        macro-averaged, i.e. every class counts equally regardless of how many
        rows it has. The default ``average='binary'`` is not usable here
        because the target has more than two classes.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average="macro")
    recall = recall_score(y_test, y_pred, average="macro")
    f1 = f1_score(y_test, y_pred, average="macro")
    return accuracy, precision, recall, f1

In [8]:
# One delayed call per classifier. joblib returns results in the same order as
# the tasks are submitted, so `results[i]` belongs to the i-th model listed here.
models_to_evaluate = [nb_classifier, knn_model, rf_model, dt_model]
evaluation_jobs = (
    delayed(train_model_and_get_accuracy)(model, train_X, test_X, train_y, test_y)
    for model in models_to_evaluate
)

# perf_counter is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump if the system clock is adjusted mid-run.
start_time = time.perf_counter()
results = Parallel(n_jobs=4)(evaluation_jobs)  # up to four workers at once
end_time = time.perf_counter()
elapsed_time = end_time - start_time

print(f"Elapsed time: {elapsed_time:.4f} seconds")

Elapsed time: 331.5651 seconds


In [10]:
# Caption for each entry of `results`, listed in the order the models were
# submitted to Parallel (naive Bayes, KNN, random forest, decision tree).
report_rows = (
    ("nb_classifier Accuracy:", results[0]),
    ("KNN Accuracy:", results[1]),
    ("Random Forest Accuracy:", results[2]),
    ("Decision Tree Accuracy:", results[3]),
)

# Each entry is printed as-is after its caption.
for caption, scores in report_rows:
    print(caption, scores)

nb_classifier Accuracy: (0.9354428970487024, 0.3102047759673811, 0.4432741119134973, 0.2976170200683296)
KNN Accuracy: (0.9983907534107931, 0.7101996553979858, 0.6166631605219289, 0.6468311709462763)
Random Forest Accuracy: (0.9997166106635359, 0.801443589201973, 0.7013188545563972, 0.7358237269330026)
Decision Tree Accuracy: (0.9994737055179952, 0.7124146715120723, 0.7189215381949771, 0.7058841726440725)
