In [1]:
import pandas as pd
import numpy as np

The original dataset files for the KDDCup99 can be found here: http://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html

The original dataset files for the NSL-KDD can be found here: https://www.unb.ca/cic/datasets/nsl.html

**KDDCup99**

In [5]:
'''Reading in the data and adding the relevant columns '''
just_for_columns = pd.read_pickle('kddcup99dataframe')

KDD_data_10_percent = pd.read_csv('kddcup.data_10_percent.gz') #this is the one we want
KDD_data_10_percent.columns = list(just_for_columns.columns)
KDD_data_10_percent_x_train, KDD_data_10_percent_y_train = KDD_data_10_percent.iloc[:,:KDD_data_10_percent.shape[1]-1], KDD_data_10_percent.iloc[:,KDD_data_10_percent.shape[1]-1]

corrected = pd.read_csv('corrected.gz') #this is the test data
corrected.columns = list(just_for_columns.columns)
corrected_x_train, corrected_y_train = corrected.iloc[:,:corrected.shape[1]-1], corrected.iloc[:,corrected.shape[1]-1]

kddcup_newtestdata_10_percent_unlabeled = pd.read_csv('kddcup.newtestdata_10_percent_unlabeled.gz')
kddcup_newtestdata_10_percent_unlabeled.columns = list(just_for_columns.columns)[0:kddcup_newtestdata_10_percent_unlabeled.shape[1]]
kddcup_newtestdata_10_percent_unlabeled_x_train, kddcup_newtestdata_10_percent_unlabeled_y_train = kddcup_newtestdata_10_percent_unlabeled.iloc[:,:kddcup_newtestdata_10_percent_unlabeled.shape[1]-1], kddcup_newtestdata_10_percent_unlabeled.iloc[:,kddcup_newtestdata_10_percent_unlabeled.shape[1]-1]

In [6]:
frames = [KDD_data_10_percent, corrected]
combined_df = pd.concat(frames)
combined_df_all_numerical = pd.get_dummies(combined_df)

In [7]:
KDD_data_10_percent_dummy,corrected_dummy = combined_df_all_numerical.iloc[0:KDD_data_10_percent.shape[0], :], combined_df_all_numerical.iloc[KDD_data_10_percent.shape[0]:, :]

In [8]:
KDD_data_10_percent_dummy_x_train = KDD_data_10_percent_dummy.iloc[:,0:119]
corrected_dummy_x_test = corrected_dummy.iloc[:,0:119]
''' This data is the 23 dimension one hot encoded vector for each intrusion type'''
KDD_data_10_percent_dummy_y_train = KDD_data_10_percent_dummy.iloc[:,119:]
corrected_dummy_y_test = corrected_dummy.iloc[:,119:] 

In [9]:
KDD_data_10_percent_y_train.head()

0    normal.
1    normal.
2    normal.
3    normal.
4    normal.
Name: label, dtype: object

In [10]:
''' creation of the binary classification and 4-way classification datasets '''

intrusion_name_to_type = {
    'apache2' : 'dos',
    'normal' : 'normal',
    'back' : 'dos',
    'buffer_overflow':'u2r',
    'ftp_write' :'r2l',
    'guess_passwd': 'r2l',
    'httptunnel' : 'r2l',
    'imap': 'r2l',
    'ipsweep' :'probe',
    'land': 'dos',
    'loadmodule': 'u2r',
    'mailbomb' : 'dos',
    'mscan' : 'probe',
    'multihop': 'r2l',
    'named' : 'r2l',
    'neptune': 'dos',
    'nmap': 'probe',
    'ps' : 'u2r',
    'perl': 'u2r',
    'phf' :'r2l',
    'pod':'dos',
    'processtable' : 'dos',
    'portsweep': 'probe',
    'rootkit': 'u2r',
    'saint' : 'probe',
    'satan' :'probe',
    'sendmail' : 'r2l',
    'smurf': 'dos',
    'snmpguess' : 'r2l',
    'snmpgetattack' : 'r2l',
    'sqlattack' : 'u2r',
    'spy': 'r2l',
    'teardrop': 'dos',
    'udpstorm' : 'dos',
    'worm' : 'dos',
    'warezclient': 'r2l',
    'warezmaster': 'r2l',
    'xlock' : 'r2l',
    'xsnoop' : 'r2l',
    'xterm' : 'u2r',
    'snmpgetattack': 'r2l',
    'named' : 'r2l',
    
}

intrusion_type_to_one_hot_index = {
    'normal' : 0,
    'dos' : 1,
    'r2l' : 2,
    'u2r' : 3,
    'probe' : 4
}

def create_binary_data(original_y_labels):
    """ input should be the list of the y_labels ie. "normal, neptune, etc.
        returns a dataframe. \""""
    
    y_data_binary = []
    
    for row in range(original_y_labels.shape[0]):
        if "normal" in original_y_labels.iloc[row]:
            y_data_binary.append(0)
        else:
            y_data_binary.append(1)
    
    y_data_binary = np.asarray(y_data_binary)
    y_data_binary_df = pd.DataFrame(y_data_binary)
    y_data_binary_df.columns = ["label"]
    
    return y_data_binary_df

def create_attack_type_data(original_y_labels):
    """ input should be the list of the y_labels ie. "normal, neptune, etc."
        returns a dataframe. """
    
    y_data_attack_type = []
    
    for row in range(original_y_labels.shape[0]):
        one_hot = [0, 0, 0, 0, 0]
        intrusion_type = intrusion_name_to_type[original_y_labels[row][:-1]]
        one_hot_index = intrusion_type_to_one_hot_index[intrusion_type]
        one_hot[one_hot_index] = 1
        y_data_attack_type.append(one_hot)
    
    y_data_attack_type = np.asarray(y_data_attack_type)
    y_data_attack_type_df = pd.DataFrame(y_data_attack_type)
    y_data_attack_type_df.columns = ["normal", "dos", "r2l", "u2r", "probe"]
    
    return y_data_attack_type_df

In [11]:
KDD_data_10_percent_dummy_y_train_binary = create_binary_data(KDD_data_10_percent_y_train)
corrected_dummy_y_test_binary = create_binary_data(corrected_y_train)

In [13]:
KDD_data_10_percent_dummy_y_train_attacktype = create_attack_type_data(KDD_data_10_percent_y_train)
corrected_dummy_y_test_attacktype = create_attack_type_data(corrected_y_train)

In [14]:
list(KDD_data_10_percent_dummy_y_train.columns)

['label_apache2.',
 'label_back.',
 'label_buffer_overflow.',
 'label_ftp_write.',
 'label_guess_passwd.',
 'label_httptunnel.',
 'label_imap.',
 'label_ipsweep.',
 'label_land.',
 'label_loadmodule.',
 'label_mailbomb.',
 'label_mscan.',
 'label_multihop.',
 'label_named.',
 'label_neptune.',
 'label_nmap.',
 'label_normal.',
 'label_perl.',
 'label_phf.',
 'label_pod.',
 'label_portsweep.',
 'label_processtable.',
 'label_ps.',
 'label_rootkit.',
 'label_saint.',
 'label_satan.',
 'label_sendmail.',
 'label_smurf.',
 'label_snmpgetattack.',
 'label_snmpguess.',
 'label_spy.',
 'label_sqlattack.',
 'label_teardrop.',
 'label_udpstorm.',
 'label_warezclient.',
 'label_warezmaster.',
 'label_worm.',
 'label_xlock.',
 'label_xsnoop.',
 'label_xterm.']

In [15]:
''' time to pickle all the data files '''

#by train I mean like the entire dataset, but i think I will use this as train??
KDD_data_10_percent_dummy_x_train.to_pickle('KDD_x_train_dummy.pkl')
KDD_data_10_percent_dummy_y_train.to_pickle('KDD_y_train_col_40.pkl')
KDD_data_10_percent_dummy_y_train_binary.to_pickle('KDD_y_train_col_1.pkl')
KDD_data_10_percent_dummy_y_train_attacktype.to_pickle('KDD_y_train_col_5.pkl')

corrected_dummy_x_test.to_pickle('KDD_x_test_dummy.pkl')
corrected_dummy_y_test.to_pickle('KDD_y_test_col_40.pkl')
corrected_dummy_y_test_binary.to_pickle('KDD_y_test_col_1.pkl')
corrected_dummy_y_test_attacktype.to_pickle('KDD_y_test_col_5.pkl')

**NSL-KDD**

In [18]:
NSL_train = pd.read_csv('KDDTrain+.txt')
NSL_train.head()

Unnamed: 0,0,tcp,ftp_data,SF,491,0.1,0.2,0.3,0.4,0.5,...,0.17.1,0.03,0.17.2,0.00.6,0.00.7,0.00.8,0.05,0.00.9,normal,20
0,0,udp,other,SF,146,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15
1,0,tcp,private,S0,0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19
2,0,tcp,http,SF,232,8153,0,0,0,0,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21
3,0,tcp,http,SF,199,420,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21
4,0,tcp,private,REJ,0,0,0,0,0,0,...,0.07,0.07,0.0,0.0,0.0,0.0,1.0,1.0,neptune,21


In [2]:
'''Reading in the data and adding the relevant columns '''
just_for_columns = pd.read_pickle('kddcup99dataframe')

NSL_train = pd.read_csv('KDDTrain+.txt')
difficulty_train = NSL_train.iloc[:, NSL_train.shape[1]-1]
NSL_train = NSL_train.drop(list(NSL_train.columns)[NSL_train.shape[1]-1], axis=1)
NSL_train.columns = list(just_for_columns.columns)
NSL_x_train, NSL_y_train = NSL_train.iloc[:, :NSL_train.shape[1]-1], NSL_train.iloc[:, NSL_train.shape[1]-1]

NSL_test = pd.read_csv('KDDTest+.txt')
difficulty_test = NSL_test.iloc[:, NSL_test.shape[1]-1]
NSL_test = NSL_test.drop(list(NSL_test.columns)[NSL_test.shape[1]-1], axis=1)
NSL_test.columns = list(just_for_columns.columns)
NSL_x_test, NSL_y_test = NSL_test.iloc[:, :NSL_test.shape[1]-1], NSL_test.iloc[:, NSL_test.shape[1]-1]


NSL_test_21 = pd.read_csv('KDDTest-21.txt')
difficulty_test_21 = NSL_test_21.iloc[:, NSL_test_21.shape[1]-1]
NSL_test_21 = NSL_test_21.drop(list(NSL_test_21.columns)[NSL_test_21.shape[1]-1], axis=1)
NSL_test_21.columns = list(just_for_columns.columns)
NSL_x_test_21, NSL_y_test_21 = NSL_test_21.iloc[:, :NSL_test_21.shape[1]-1], NSL_test_21.iloc[:, NSL_test_21.shape[1]-1]

In [5]:
NSL_y_test_21.shape

(11849,)

In [6]:
frames = [NSL_train, NSL_test, NSL_test_21]
combined_df = pd.concat(frames)
combined_df_all_numerical = pd.get_dummies(combined_df)

In [7]:
NSL_train_dummy, NSL_test_dummy, NSL_test_21_dummy = combined_df_all_numerical.iloc[0:NSL_train.shape[0], :], combined_df_all_numerical.iloc[NSL_train.shape[0]:NSL_train.shape[0]+NSL_test.shape[0], :], combined_df_all_numerical.iloc[NSL_train.shape[0]+NSL_test.shape[0]:, :]

In [8]:
list(NSL_train_dummy.columns).index('flag_SH')

121

In [9]:
NSL_x_train_dummy = NSL_train_dummy.iloc[:,0:122]
NSL_x_test_dummy = NSL_test_dummy.iloc[:,0:122]
NSL_x_test_21_dummy = NSL_test_21_dummy.iloc[:,0:122]
''' This data is the 23 dimension one hot encoded vector for each intrusion type'''
NSL_y_train_dummy = NSL_train_dummy.iloc[:,122:]
NSL_y_test_dummy = NSL_test_dummy.iloc[:,122:]
NSL_y_test_21_dummy = NSL_test_21_dummy.iloc[:,122:] 

In [11]:
NSL_x_test_21_dummy.shape

(11849, 122)

In [12]:
NSL_y_test_21_dummy.shape

(11849, 40)

In [14]:
''' creation of the binary classification and 4-way classification datasets '''

intrusion_name_to_type = {
    'apache2' : 'dos',
    'normal' : 'normal',
    'back' : 'dos',
    'buffer_overflow':'u2r',
    'ftp_write' :'r2l',
    'guess_passwd': 'r2l',
    'httptunnel' : 'r2l',
    'imap': 'r2l',
    'ipsweep' :'probe',
    'land': 'dos',
    'loadmodule': 'u2r',
    'mailbomb' : 'dos',
    'mscan' : 'probe',
    'multihop': 'r2l',
    'named' : 'r2l',
    'neptune': 'dos',
    'nmap': 'probe',
    'ps' : 'u2r',
    'perl': 'u2r',
    'phf' :'r2l',
    'pod':'dos',
    'processtable' : 'dos',
    'portsweep': 'probe',
    'rootkit': 'u2r',
    'saint' : 'probe',
    'satan' :'probe',
    'sendmail' : 'r2l',
    'smurf': 'dos',
    'snmpguess' : 'r2l',
    'snmpgetattack' : 'r2l',
    'sqlattack' : 'u2r',
    'spy': 'r2l',
    'teardrop': 'dos',
    'udpstorm' : 'dos',
    'worm' : 'dos',
    'warezclient': 'r2l',
    'warezmaster': 'r2l',
    'xlock' : 'r2l',
    'xsnoop' : 'r2l',
    'xterm' : 'u2r',
    'snmpgetattack': 'r2l',
    'named' : 'r2l',
    
}

intrusion_type_to_one_hot_index = {
    'normal' : 0,
    'dos' : 1,
    'r2l' : 2,
    'u2r' : 3,
    'probe' : 4
}

def create_binary_data(original_y_labels):
    ''' input should be the list of the y_labels ie. "normal, neptune, etc. 
        returns a dataframe. "'''
    
    y_data_binary = []
    
    for row in range(original_y_labels.shape[0]):
        if "normal" in original_y_labels.iloc[row]:
            y_data_binary.append(0)
        else:
            y_data_binary.append(1)
    
    y_data_binary = np.asarray(y_data_binary)
    y_data_binary_df = pd.DataFrame(y_data_binary)
    y_data_binary_df.columns = ["label"]
    
    return y_data_binary_df

def create_attack_type_data(original_y_labels):
    ''' input should be the list of the y_labels ie. "normal, neptune, etc." 
        returns a dataframe. '''
    
    y_data_attack_type = []
    
    for row in range(original_y_labels.shape[0]):
        one_hot = [0, 0, 0, 0, 0]
        intrusion_type = intrusion_name_to_type[original_y_labels[row]] #format this such that you capture the entire label, so do [:-1] for the KDD data
        one_hot_index = intrusion_type_to_one_hot_index[intrusion_type] # and do the entire thing for the NSL data.
        one_hot[one_hot_index] = 1
        y_data_attack_type.append(one_hot)
    
    y_data_attack_type = np.asarray(y_data_attack_type)
    y_data_attack_type_df = pd.DataFrame(y_data_attack_type)
    y_data_attack_type_df.columns = ["normal", "dos", "r2l", "u2r", "probe"]
    
    return y_data_attack_type_df

In [15]:
NSL_y_train_binary = create_binary_data(NSL_y_train)
NSL_y_test_binary = create_binary_data(NSL_y_test)
NSL_y_test_21_binary = create_binary_data(NSL_y_test_21)


NSL_y_train_attacktype = create_attack_type_data(NSL_y_train)
NSL_y_test_attacktype = create_attack_type_data(NSL_y_test)
NSL_y_test_21_attacktype = create_attack_type_data(NSL_y_test_21)

In [16]:
NSL_y_test_21_attacktype.shape

(11849, 5)

In [37]:
''' time to pickle all the data files '''

#by train I mean like the entire dataset, but i think I will use this as train??
NSL_x_train_dummy.to_pickle('NSL_x_train_dummy.pkl')
NSL_y_train_dummy.to_pickle('NSL_y_train_col_40.pkl')
NSL_y_train_binary.to_pickle('NSL_y_train_col_1.pkl')
NSL_y_train_attacktype.to_pickle('NSL_y_train_col_5.pkl')

NSL_x_test_dummy.to_pickle('NSL_x_test_dummy.pkl')
NSL_y_test_dummy.to_pickle('NSL_y_test_col_40.pkl')
NSL_y_test_binary.to_pickle('NSL_y_test_col_1.pkl')
NSL_y_test_attacktype.to_pickle('NSL_y_test_col_5.pkl')

NSL_x_test_21_dummy.to_pickle('NSL_x_test_21_dummy.pkl')
NSL_y_test_21_dummy.to_pickle('NSL_y_test_21_col_40.pkl')
NSL_y_test_21_binary.to_pickle('NSL_y_test_21_col_1.pkl')
NSL_y_test_21_attacktype.to_pickle('NSL_y_test_21_col_5.pkl')