In [1]:
import pandas as pd
# Load the training split of the KDD dataset from the working directory.
x = pd.read_csv(filepath_or_buffer="kdd_train.csv")

In [2]:
# List the distinct class names found in the "labels" column.
x["labels"].unique()

array(['normal', 'neptune', 'warezclient', 'ipsweep', 'portsweep',
       'teardrop', 'nmap', 'satan', 'smurf', 'pod', 'back',
       'guess_passwd', 'ftp_write', 'multihop', 'rootkit',
       'buffer_overflow', 'imap', 'warezmaster', 'phf', 'land',
       'loadmodule', 'spy', 'perl'], dtype=object)

In [3]:
# Collapse every non-"normal" traffic class into a single "attack" class.
# The mask is derived from the data instead of a hand-typed list of attack
# names, so an attack name that is not in such a list cannot slip through
# and later turn into NaN when the labels are mapped to integers.
# Missing labels are deliberately left untouched rather than being
# silently counted as attacks.
x["labels"] = x["labels"].mask(
    x["labels"].notna() & x["labels"].ne("normal"),
    "attack",
)

In [4]:
# Re-check the distinct values in "labels" after the attack classes were merged.
x["labels"].unique()

array(['normal', 'attack'], dtype=object)

In [5]:
# Turn the two class names into integer codes: normal -> 0, attack -> 1.
# Any value that is not a key of the mapping becomes NaN.
label_codes = {'normal': 0, 'attack': 1}
x["labels"] = x["labels"].map(label_codes)

In [6]:
# Re-check the distinct values in "labels" after mapping the names to integers.
x["labels"].unique()

array([0, 1], dtype=int64)

In [7]:
# Show every column name in the frame.
x.columns

Index(['duration', 'protocol_type', 'service', 'flag', 'src_bytes',
       'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
       'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
       'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
       'num_access_files', 'num_outbound_cmds', 'is_host_login',
       'is_guest_login', 'count', 'srv_count', 'serror_rate',
       'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
       'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
       'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
       'dst_host_srv_rerror_rate', 'labels'],
      dtype='object')

In [8]:
# Feature matrix: every column except the last one, as a NumPy array.
feature_frame = x.iloc[:, :-1]
X = feature_frame.values

In [9]:
# Target vector: the last column of the frame, as a NumPy array.
# Indexing with -1 (instead of the literal position 41) mirrors the `:-1`
# slice used for the features, so the two stay consistent if the number of
# columns ever changes.
Y = x.iloc[:, -1].values

In [10]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# Replace the text values in feature columns 1, 2 and 3 with integer codes,
# using one fitted encoder per column so each mapping stays available.
# NOTE(review): the codes follow sorted class order, so they impose an
# arbitrary numeric ordering on these columns — confirm that is acceptable
# for the distance-based samplers used later.
labelencoder_x_1, labelencoder_x_2, labelencoder_x_3 = (
    LabelEncoder(),
    LabelEncoder(),
    LabelEncoder(),
)
column_encoders = (labelencoder_x_1, labelencoder_x_2, labelencoder_x_3)
for col_idx, encoder in enumerate(column_encoders, start=1):
    X[:, col_idx] = encoder.fit_transform(X[:, col_idx])

In [11]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
import numpy as np
from collections import Counter

In [12]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.33,
    random_state=42,
)

# Report the class balance before and after the split.
class_counts_all = Counter(Y)
class_counts_train = Counter(Y_train)
print('Original dataset shape {}'.format(class_counts_all))
print('Training dataset shape {}'.format(class_counts_train))

Original dataset shape Counter({0: 67343, 1: 58630})
Training dataset shape Counter({0: 45150, 1: 39251})


In [13]:
from imblearn.datasets import make_imbalance
# Build a deliberately skewed sample of the training data:
# 1000 rows of class 1 and 65 rows of class 0, drawn with a fixed seed.
target_counts = {1: 1000, 0: 65}
X_rs, Y_rs = make_imbalance(
    X_train,
    Y_train,
    sampling_strategy=target_counts,
    random_state=0,
)
print('Random undersampling {}'.format(Counter(Y_rs)))

Random undersampling Counter({1: 1000, 0: 65})


In [14]:
# Show the dimensions of the feature array returned by make_imbalance.
X_rs.shape

(1065, 41)

In [15]:
from imblearn.under_sampling import (RandomUnderSampler, ClusterCentroids, TomekLinks, NeighbourhoodCleaningRule, NearMiss)

In [16]:
# Random undersampling of the training split with the default strategy.
# The rows that are kept are chosen at random, so a fixed seed is passed to
# make the resampled arrays identical on every Restart & Run All.
sampler = RandomUnderSampler(random_state=42)
X_rs1, Y_rs1 = sampler.fit_resample(X_train, Y_train)
print('Random undersampling {}'.format(Counter(Y_rs1)))

Random undersampling Counter({0: 39251, 1: 39251})


In [17]:
# Show the dimensions of the randomly undersampled feature array.
X_rs1.shape

(78502, 41)

In [18]:
# Cluster-centroid undersampling down to 1000 rows per class.
# The clustering step is randomly initialised, so a fixed seed is passed to
# make the generated centroids repeatable across kernel restarts.
sampler = ClusterCentroids(sampling_strategy={1: 1000, 0: 1000}, random_state=42)
X_rs2, Y_rs2 = sampler.fit_resample(X_train, Y_train)
print('Cluster Centroid undersampling {}'.format(Counter(Y_rs2)))

Cluster Centroid undersampling Counter({0: 1000, 1: 1000})


In [19]:
# Show the dimensions of the cluster-centroid feature array.
X_rs2.shape

(2000, 41)

In [20]:
# Tomek-links cleaning of the training split with default settings,
# followed by a report of the resulting class counts.
sampler = TomekLinks()
X_rs3, Y_rs3 = sampler.fit_resample(X=X_train, y=Y_train)
tomek_counts = Counter(Y_rs3)
print('Tomeklinks undersampling {}'.format(tomek_counts))

Tomeklinks undersampling Counter({0: 45118, 1: 39251})


In [21]:
# Show the dimensions of the feature array after Tomek-links cleaning.
X_rs3.shape

(84369, 41)

In [22]:
# Neighbourhood Cleaning Rule applied to the training split with default
# settings, followed by a report of the resulting class counts.
sampler = NeighbourhoodCleaningRule()
X_rs4, Y_rs4 = sampler.fit_resample(X=X_train, y=Y_train)
ncr_counts = Counter(Y_rs4)
print('Neighbourhood Cleaning Rule undersampling {}'.format(ncr_counts))

Neighbourhood Cleaning Rule undersampling Counter({0: 44638, 1: 39251})


In [23]:
# Show the dimensions of the feature array after the Neighbourhood Cleaning Rule.
X_rs4.shape

(83889, 41)

In [24]:
# NearMiss undersampling of the training split with default settings,
# followed by a report of the resulting class counts.
sampler = NearMiss()
X_rs5, Y_rs5 = sampler.fit_resample(X=X_train, y=Y_train)
near_miss_counts = Counter(Y_rs5)
print('NearMiss {}'.format(near_miss_counts))

NearMiss Counter({0: 39251, 1: 39251})


In [25]:
# Show the dimensions of the NearMiss-undersampled feature array.
X_rs5.shape

(78502, 41)

In [26]:
from imblearn.over_sampling import(RandomOverSampler, SMOTE, ADASYN)

In [27]:
# Random oversampling of the training split with the default strategy.
# The duplicated rows are chosen at random, so a fixed seed is passed to
# make the resampled arrays identical on every Restart & Run All.
sampler = RandomOverSampler(random_state=42)
X_rs6, Y_rs6 = sampler.fit_resample(X_train, Y_train)
print('Random Over Sampler {}'.format(Counter(Y_rs6)))

Random Over Sampler Counter({1: 45150, 0: 45150})


In [28]:
# Show the dimensions of the randomly oversampled feature array.
X_rs6.shape

(90300, 41)

In [29]:
# SMOTE oversampling of the training split with the default strategy.
# Synthetic rows are generated from randomly chosen neighbours, so a fixed
# seed is passed to make the output repeatable across kernel restarts.
# NOTE(review): feature columns 1-3 hold integer category codes from the
# label-encoding cell; SMOTE interpolates them like ordinary numbers —
# confirm whether SMOTENC would be a better fit here.
sampler = SMOTE(random_state=42)
X_rs7, Y_rs7 = sampler.fit_resample(X_train, Y_train)
print('SMOTE Over Sampling {}'.format(Counter(Y_rs7)))

SMOTE Over Sampling Counter({1: 45150, 0: 45150})


In [30]:
# Show the dimensions of the SMOTE-oversampled feature array.
X_rs7.shape

(90300, 41)

In [31]:
# ADASYN oversampling of the training split with the default strategy.
# Synthetic rows are generated from randomly chosen neighbours, so a fixed
# seed is passed to make the output repeatable across kernel restarts.
# NOTE(review): feature columns 1-3 hold integer category codes from the
# label-encoding cell; ADASYN interpolates them like ordinary numbers —
# confirm that this is acceptable.
sampler = ADASYN(random_state=42)
X_rs8, Y_rs8 = sampler.fit_resample(X_train, Y_train)
print('ADASYN {}'.format(Counter(Y_rs8)))

ADASYN Counter({1: 45227, 0: 45150})


In [32]:
# Show the dimensions of the ADASYN-oversampled feature array.
X_rs8.shape

(90377, 41)