In [None]:
# read the train and test CSV files into DataFrames

train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/nslkdd-dataset/KDDTrain.csv',
        '../input/nslkdd-dataset/KDDTest.csv',
    )
)

In [None]:
# print a concise summary (columns, dtypes, non-null counts) of the training frame
train.info()

In [None]:
# print a concise summary (columns, dtypes, non-null counts) of the test frame
test.info()

In [None]:
# one binary (one-vs-rest) target column per attack class:
# 1 where the row's `attack_class` equals that class, 0 otherwise

attack_classes = [
    'back', 'buffer_overflow', 'ftp_write', 'guess_passwd', 'imap',
    'ipsweep', 'land', 'loadmodule', 'multihop', 'neptune', 'nmap',
    'normal', 'perl', 'phf', 'pod', 'portsweep', 'rootkit', 'satan',
    'smurf', 'spy', 'teardrop', 'warezclient', 'warezmaster',
]

# compare the whole column at once instead of calling a lambda per row;
# the dict preserves the order of `attack_classes` as the column order
train_label = pd.DataFrame({
    class_name: (train['attack_class'] == class_name).astype('int64')
    for class_name in attack_classes
})
test_label = pd.DataFrame({
    class_name: (test['attack_class'] == class_name).astype('int64')
    for class_name in attack_classes
})

In [None]:
# extracting numerical labels from categorical data

# Each categorical column gets ONE encoder, fitted on the values that occur in
# either split, and that same mapping is applied to both train and test.
# Calling `fit_transform` on the test column separately would re-number the
# categories whenever the two splits do not contain exactly the same set of
# values, so the same string would mean different integers to the models.
# Fitting on the union (rather than on train only) also keeps `transform`
# from raising on a category that appears in just one of the splits.
for column in ['protocol_type', 'service', 'flag']:
    encoder = LabelEncoder()
    encoder.fit(pd.concat([train[column], test[column]], ignore_index=True))

    train[column + '_label'] = encoder.transform(train[column])
    test[column + '_label'] = encoder.transform(test[column])

In [None]:
# removing useless columns (dropped in place from both splits)

unused_columns = ['attack_class', 'num_learners']

for frame in (train, test):
    frame.drop(columns=unused_columns, inplace=True)

In [None]:
# creating dataframes for storing training data for stacked model:
# one initially empty frame per attack class, keyed by the class name

stacked_train_df = {class_name: pd.DataFrame() for class_name in attack_classes}
stacked_test_df = {class_name: pd.DataFrame() for class_name in attack_classes}

In [None]:
# preparing data for training on models:
# the feature matrices are the frames without the raw string columns

raw_categorical_columns = ['protocol_type', 'service', 'flag']

# `drop` without `inplace` returns a new frame, so `train` / `test` are untouched
x_train = train.drop(columns=raw_categorical_columns)
x_test = test.drop(columns=raw_categorical_columns)

In [None]:
# logistic regression classifier

def getLRClf():
    """Build a fresh, unfitted logistic-regression classifier.

    Returns:
        A new ``LogisticRegression`` instance configured with ``C=0.2`` and
        the ``liblinear`` solver.
    """
    return LogisticRegression(solver='liblinear', C=0.2)

In [None]:
# training on logistic regression classifier (one binary model per attack class)

lr_accuracy = []

for attack_type in attack_classes:
    clf = getLRClf()
    clf.fit(x_train, train_label[attack_type])

    # predictions on both splits are stored as inputs for the stacked model
    stacked_train_df[attack_type]['logistic_regression'] = clf.predict(x_train)
    y_pred = clf.predict(x_test)
    stacked_test_df[attack_type]['logistic_regression'] = y_pred

    # accuracy of this class's model on the test split
    lr_accuracy.append(accuracy_score(test_label[attack_type], y_pred))

# unweighted mean over the per-class accuracies
mean_lr_accuracy = np.mean(lr_accuracy)

print("Logistic Regression Classifier...")
print("Mean Accuracy score : " + str(mean_lr_accuracy))

In [None]:
# XGBoost classifier

import xgboost as xgb

def getxgbclf(d_train, eval_list):
    """Train an XGBoost gradient-boosted tree model for binary classification.

    Args:
        d_train: ``xgb.DMatrix`` holding the training features and labels.
        eval_list: list of ``(DMatrix, name)`` pairs passed to ``xgb.train``
            as ``evals``; they are evaluated during training and are what
            early stopping monitors.

    Returns:
        The booster returned by ``xgb.train``.
    """
    params = {
        'booster': 'gbtree',
        'nthread': 4,
        'eta': 0.2,
        'max_depth': 6,
        'min_child_weight': 4,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'objective': 'binary:logistic',
    }

    # Pass the `eval_list` ARGUMENT through. Reading a notebook-level global
    # here instead would make the function depend on whichever cell last
    # assigned that global, and fail with NameError on a fresh kernel if no
    # cell had done so yet.
    return xgb.train(
        params,
        d_train,
        num_boost_round=300,
        early_stopping_rounds=100,
        evals=eval_list,
        verbose_eval=False,
    )

In [None]:
# training on XGBoost classifier (one binary model per attack class)

xgb_accuracy = []

for attack_type in attack_classes:
    d_train = xgb.DMatrix(x_train, label=train_label[attack_type])
    d_test = xgb.DMatrix(x_test, label=test_label[attack_type])

    # `evallist` is a notebook-level name; keep it spelled exactly like this
    # in case code outside this cell reads it
    evallist = [(d_train, 'train'), (d_test, 'valid')]
    clf = getxgbclf(d_train, evallist)

    # the booster outputs probabilities; threshold at 0.5 to get 0/1 labels
    train_probability = clf.predict(d_train)
    test_probability = clf.predict(d_test)
    y_pred = (test_probability >= 0.5).astype(int)

    # predictions on both splits are stored as inputs for the stacked model
    stacked_train_df[attack_type]['xgb'] = (train_probability >= 0.5).astype(int)
    stacked_test_df[attack_type]['xgb'] = y_pred

    # accuracy of this class's model on the test split
    xgb_accuracy.append(accuracy_score(test_label[attack_type], y_pred))

# unweighted mean over the per-class accuracies
mean_xgb_accuracy = np.mean(xgb_accuracy)

print("XGBoost Classifier...")
print("Mean Accuracy score : " + str(mean_xgb_accuracy))