In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score as acc
from mlxtend.feature_selection import SequentialFeatureSelector as sfs



In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
train = pd.read_csv('/content/drive/My Drive/Data/KDDtrain_updated.csv')
test = pd.read_csv('/content/drive/My Drive/Data/KDDtest_updated.csv')

In [4]:
train = train.drop(columns="Unnamed: 0")
test = test.drop(columns='Unnamed: 0')

In [5]:
X_train=train[train.columns.difference(['attack'])]
y_train=train['attack']
X_test=test[test.columns.difference(['attack'])]
y_test=test['attack']

In [6]:
# Build RF classifier to use in feature selection
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1)

# Build step forward feature selection
sfs1 = sfs(clf,
           k_features=5,
           forward=True,
           floating=False,
           verbose=2,
           scoring='accuracy',
           cv=5)

# Perform SFFS
sfs1 = sfs1.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   26.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:  7.3min finished

[2020-10-06 11:22:55] Features: 1/5 -- score: 0.9258571057161618[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   42.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  39 out of  39 | elapsed: 16.4min finished

[2020-10-06 11:39:17] Features: 2/5 -- score: 0.9857350393374483[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   28.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  38 out of  38 | elapsed: 15.9min finished

[2020-10-06 11:55:13] Features: 3/5 -- score: 0.9919268385642555[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 

In [7]:
feat_cols = list(sfs1.k_feature_idx_)
print(feat_cols)

[4, 5, 8, 31, 32]


In [15]:
new_train = X_train.iloc[:,feat_cols]
new_train.shape
new_test = X_test.iloc[:,feat_cols]

In [14]:
clf = RandomForestClassifier(n_estimators=1000, random_state=42, max_depth=4)
clf.fit(new_train, y_train)


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=4, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [17]:
y_train_pred = clf.predict(new_train)
print('Training accuracy on selected features: %.3f' % acc(y_train, y_train_pred))

y_test_pred = clf.predict(new_test)
print('Testing accuracy on selected features: %.3f' % acc(y_test, y_test_pred))

Training accuracy on selected features: 0.958
Testing accuracy on selected features: 0.696


In [18]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [26]:
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

In [20]:
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X_train, y_train)



In [21]:
model = SelectFromModel(lsvc, prefit=True)

In [28]:
train_new = model.transform(X_train)
test_new = model.transform(X_test)

In [27]:
pipe_lr = Pipeline([('scl', StandardScaler()),('clf', LogisticRegression(penalty='l2', random_state=0))])

In [29]:
pipe_lr = pipe_lr.fit(train_new, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [30]:
from sklearn.metrics import accuracy_score
y_pred_train = pipe_lr.predict(train_new)
y_pred_test = pipe_lr.predict(test_new)

In [31]:
accuracy_score(y_train, y_pred_train), accuracy_score(y_test, y_pred_test)

(0.9660324037690616, 0.7645493257629524)

In [32]:
from sklearn.metrics import classification_report

In [33]:
target_names = ['Normal', 'DoS','Probe','R2L', 'U2R']
print(classification_report(y_train, y_pred_train, target_names=target_names))

              precision    recall  f1-score   support

      Normal       0.97      0.97      0.97     67343
         DoS       0.99      0.99      0.99     45927
       Probe       0.89      0.84      0.87     11656
         R2L       0.73      0.65      0.69       995
         U2R       0.82      0.35      0.49        52

    accuracy                           0.97    125973
   macro avg       0.88      0.76      0.80    125973
weighted avg       0.97      0.97      0.97    125973



In [34]:
print(classification_report(y_test, y_pred_test, target_names=target_names))

              precision    recall  f1-score   support

      Normal       0.67      0.97      0.79      9900
         DoS       0.97      0.82      0.88      7165
       Probe       0.86      0.73      0.79      2421
         R2L       0.47      0.00      0.01      2991
         U2R       0.82      0.13      0.23        67

    accuracy                           0.76     22544
   macro avg       0.76      0.53      0.54     22544
weighted avg       0.76      0.76      0.71     22544

