In [41]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn import preprocessing
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import cross_val_score
from sklearn import metrics
import warnings
import shap
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectPercentile, f_classif
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides library deprecation warnings — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [42]:
# Column names applied to the header-less CSV files, in file order.
# The last entry, "label", is the target column.
col_names = [
    "duration", "protocol_type", "service", "flag",
    "src_bytes", "dst_bytes", "land", "wrong_fragment",
    "urgent", "hot", "num_failed_logins", "logged_in",
    "num_compromised", "root_shell", "su_attempted", "num_root",
    "num_file_creations", "num_shells", "num_access_files", "num_outbound_cmds",
    "is_host_login", "is_guest_login", "count", "srv_count",
    "serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate",
    "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate",
    "dst_host_count", "dst_host_srv_count",
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate",
    "label",
]

In [43]:
# Read the training and test CSV files. Both are read with header=None,
# so the column labels are supplied explicitly from col_names.
dataset_train, dataset_test = (
    pd.read_csv(csv_path, names=col_names, header=None)
    for csv_path in ("KDDTrain+_2.csv", "KDDTest+_2.csv")
)

In [None]:
# Explore the categorical features of the training set.
# The categorical, not-yet-binary columns are protocol_type (column 2),
# service (column 3) and flag (column 4).
print('Training set:')
for col_name, column in dataset_train.items():
    # Only object-dtype (string) columns are reported.
    if column.dtypes != 'object':
        continue
    unique_cat = len(column.unique())
    print("Feature '{col_name}' has {unique_cat} categories".format(col_name=col_name, unique_cat=unique_cat))

# Show the five most frequent values of the service feature; every service
# value gets its own dummy column later on.
print()
print('Distribution of categories in service:')
print(
    dataset_train['service']
    .value_counts()
    .sort_values(ascending=False)
    .head()
)

In [None]:
# Same category count as for the training set, now on the test set.
print('Test set:')
for col_name, column in dataset_test.items():
    # Only object-dtype (string) columns are reported.
    if column.dtypes != 'object':
        continue
    unique_cat = len(column.unique())
    print("Feature '{col_name}' has {unique_cat} categories".format(col_name=col_name, unique_cat=unique_cat))

In [46]:
# Names of the categorical (string-valued) columns that need encoding.
# (The original cell assigned this identical list twice; one assignment is enough.)
categorical_columns = ['protocol_type', 'service', 'flag']

# Slice the categorical columns out of each dataset. The results are
# DataFrames (not numpy arrays) holding only those three columns.
dataset_train_categorical_values = dataset_train[categorical_columns]
dataset_test_categorical_values = dataset_test[categorical_columns]

In [None]:
def _with_prefix(prefix, categories):
    """Return `categories` with `prefix` prepended to every entry."""
    return [prefix + category for category in categories]


# Build the dummy-column names for the training set: for each categorical
# feature, the sorted distinct values prefixed with the feature name.
# protocol type
string1 = 'Protocol_type_'
unique_protocol = sorted(dataset_train['protocol_type'].unique())
unique_protocol2 = _with_prefix(string1, unique_protocol)
# service
string2 = 'service_'
unique_service = sorted(dataset_train['service'].unique())
unique_service2 = _with_prefix(string2, unique_service)
# flag
string3 = 'flag_'
unique_flag = sorted(dataset_train['flag'].unique())
unique_flag2 = _with_prefix(string3, unique_flag)
# All dummy-column names, in protocol / service / flag order.
dumcols = unique_protocol2 + unique_service2 + unique_flag2
print(dumcols)

# Test set: only the service names are recomputed from the test data; the
# protocol and flag names are reused from the training set.
unique_service_test = sorted(dataset_test['service'].unique())
unique_service2_test = _with_prefix(string2, unique_service_test)
testdumcols = unique_protocol2 + unique_service2_test + unique_flag2

In [None]:
# Turn each categorical column into integer codes with LabelEncoder.
# Every column is encoded independently (the encoder is refitted per column).
dataset_train_categorical_values_enc = dataset_train_categorical_values.apply(
    lambda column: LabelEncoder().fit_transform(column)
)
print(dataset_train_categorical_values_enc.head())
# Test set.
# NOTE(review): the test columns are encoded with encoders fitted on the
# test data, so an integer code only means the same category as in the
# training set when both sets contain the same values — confirm.
dataset_test_categorical_values_enc = dataset_test_categorical_values.apply(
    lambda column: LabelEncoder().fit_transform(column)
)

In [49]:
# One-hot encode the integer-coded categorical columns and wrap the result
# in DataFrames labelled with the dummy-column names.
enc = OneHotEncoder()
enc.fit(dataset_train_categorical_values_enc)
dataset_train_categorical_values_encenc = enc.transform(dataset_train_categorical_values_enc)
dataset_train_cat_data = pd.DataFrame(
    data=dataset_train_categorical_values_encenc.toarray(),
    columns=dumcols,
)
# Test set: the same encoder object is refitted on the test codes, so after
# this cell `enc` reflects the test data, not the training data.
enc.fit(dataset_test_categorical_values_enc)
dataset_test_categorical_values_encenc = enc.transform(dataset_test_categorical_values_enc)
dataset_test_cat_data = pd.DataFrame(
    data=dataset_test_categorical_values_encenc.toarray(),
    columns=testdumcols,
)

In [None]:
# Find the service values that occur in the training set but not in the
# test set, and turn them into their dummy-column names.
trainservice = dataset_train['service'].tolist()
testservice = dataset_test['service'].tolist()
# Sort the set difference: a bare set has no defined order, which would make
# the order of the columns added to the test frame vary between kernel runs.
difference = sorted(set(trainservice) - set(testservice))
string = 'service_'
difference = [string + x for x in difference]
difference

In [None]:
# Give the test dummies exactly the training layout.
# reindex() adds an all-zero column for every training dummy the test set
# lacks (the columns listed in `difference`) AND puts the columns in the
# training order. Appending the missing columns at the end, as before, left
# train and test with different column orders, so every later positional step
# (scaling, feature selection, prediction) compared mismatched features.
dataset_test_cat_data = dataset_test_cat_data.reindex(
    columns=dataset_train_cat_data.columns, fill_value=0
)
dataset_test_cat_data.shape

In [None]:
# Attach the one-hot columns to each dataset (index-aligned join) and remove
# the three original categorical columns they replace.
newdf = (
    dataset_train
    .join(dataset_train_cat_data)
    .drop(columns=['flag', 'protocol_type', 'service'])
)
# Test data.
newdf_test = (
    dataset_test
    .join(dataset_test_cat_data)
    .drop(columns=['flag', 'protocol_type', 'service'])
)
print(newdf.shape)
print(newdf_test.shape)

In [53]:
# Attack names grouped by the integer code they are replaced with.
# Code 0 is 'normal'; the variable names used later in the notebook associate
# 1 with DoS, 2 with Probe, 3 with R2L and 4 with U2R.
_names_by_code = {
    0: ['normal'],
    1: ['neptune', 'back', 'land', 'pod', 'smurf', 'teardrop', 'mailbomb',
        'apache2', 'processtable', 'udpstorm', 'worm'],
    2: ['ipsweep', 'nmap', 'portsweep', 'satan', 'mscan', 'saint'],
    3: ['ftp_write', 'guess_passwd', 'imap', 'multihop', 'phf', 'spy',
        'warezclient', 'warezmaster', 'sendmail', 'named', 'snmpgetattack',
        'snmpguess', 'xlock', 'xsnoop', 'httptunnel'],
    4: ['buffer_overflow', 'loadmodule', 'perl', 'rootkit', 'ps',
        'sqlattack', 'xterm'],
}
# Flatten to a {name: code} dict for Series.replace.
_code_by_name = {
    name: code
    for code, names in _names_by_code.items()
    for name in names
}

# Take the label column of each dataset ...
labeldf = newdf['label']
labeldf_test = newdf_test['label']
# ... replace the attack names with their codes (names that are not in the
# mapping are left unchanged by replace) ...
newlabeldf = labeldf.replace(_code_by_name)
newlabeldf_test = labeldf_test.replace(_code_by_name)
# ... and write the recoded column back.
newdf['label'] = newlabeldf
newdf_test['label'] = newlabeldf_test


In [None]:
# Build one dataset per attack category. For each category, the rows whose
# label is one of the three OTHER attack codes are removed, which leaves the
# rows labelled 0 plus the rows of that category.
to_drop_DoS = [2, 3, 4]
to_drop_Probe = [1, 3, 4]
to_drop_R2L = [1, 2, 4]
to_drop_U2R = [1, 2, 3]

# Training data.
DoS_df = newdf.loc[~newdf['label'].isin(to_drop_DoS)]
Probe_df = newdf.loc[~newdf['label'].isin(to_drop_Probe)]
R2L_df = newdf.loc[~newdf['label'].isin(to_drop_R2L)]
U2R_df = newdf.loc[~newdf['label'].isin(to_drop_U2R)]

# Test data.
DoS_df_test = newdf_test.loc[~newdf_test['label'].isin(to_drop_DoS)]
Probe_df_test = newdf_test.loc[~newdf_test['label'].isin(to_drop_Probe)]
R2L_df_test = newdf_test.loc[~newdf_test['label'].isin(to_drop_R2L)]
U2R_df_test = newdf_test.loc[~newdf_test['label'].isin(to_drop_U2R)]

# Report the (rows, columns) shape of every subset.
print('Train:')
print('Dimensions of DoS:', DoS_df.shape)
print('Dimensions of Probe:', Probe_df.shape)
print('Dimensions of R2L:', R2L_df.shape)
print('Dimensions of U2R:', U2R_df.shape)
print('Test:')
print('Dimensions of DoS:', DoS_df_test.shape)
print('Dimensions of Probe:', Probe_df_test.shape)
print('Dimensions of R2L:', R2L_df_test.shape)
print('Dimensions of U2R:', U2R_df_test.shape)

In [55]:
# Separate every per-category frame into X (all columns except 'label',
# as a DataFrame) and Y (the 'label' column, as a Series).
X_DoS, Y_DoS = DoS_df.drop('label', axis=1), DoS_df.loc[:, 'label']
X_Probe, Y_Probe = Probe_df.drop('label', axis=1), Probe_df.loc[:, 'label']
X_R2L, Y_R2L = R2L_df.drop('label', axis=1), R2L_df.loc[:, 'label']
X_U2R, Y_U2R = U2R_df.drop('label', axis=1), U2R_df.loc[:, 'label']

# Test set.
X_DoS_test, Y_DoS_test = DoS_df_test.drop('label', axis=1), DoS_df_test.loc[:, 'label']
X_Probe_test, Y_Probe_test = Probe_df_test.drop('label', axis=1), Probe_df_test.loc[:, 'label']
X_R2L_test, Y_R2L_test = R2L_df_test.drop('label', axis=1), R2L_df_test.loc[:, 'label']
X_U2R_test, Y_U2R_test = U2R_df_test.drop('label', axis=1), U2R_df_test.loc[:, 'label']

In [56]:
# Remember the feature names before scaling turns the frames into arrays.
# Captured from the DoS frames only.
colNames = X_DoS.columns.tolist()
colNames_test = X_DoS_test.columns.tolist()

In [57]:
# Standardise the feature matrices (StandardScaler: zero mean, unit variance).
#
# Each scaler is fitted on the TRAINING frame only and then applied to both
# the training frame and the matching test frame. The original cell fitted
# separate scalers (scaler5..scaler8) on the test frames, so train and test
# were mapped onto different scales and the test statistics leaked into
# preprocessing; those extra scalers are no longer created.
#
# Before scaling, every test frame is reindexed to the training column order
# (colNames) so that column i means the same feature in train and test; a
# training column missing from the test frame is filled with 0.
# After this cell all X_* variables are numpy arrays, not DataFrames.
scaler1 = preprocessing.StandardScaler().fit(X_DoS)
X_DoS_test = scaler1.transform(X_DoS_test.reindex(columns=colNames, fill_value=0))
X_DoS = scaler1.transform(X_DoS)

scaler2 = preprocessing.StandardScaler().fit(X_Probe)
X_Probe_test = scaler2.transform(X_Probe_test.reindex(columns=colNames, fill_value=0))
X_Probe = scaler2.transform(X_Probe)

scaler3 = preprocessing.StandardScaler().fit(X_R2L)
X_R2L_test = scaler3.transform(X_R2L_test.reindex(columns=colNames, fill_value=0))
X_R2L = scaler3.transform(X_R2L)

scaler4 = preprocessing.StandardScaler().fit(X_U2R)
X_U2R_test = scaler4.transform(X_U2R_test.reindex(columns=colNames, fill_value=0))
X_U2R = scaler4.transform(X_U2R)


In [None]:
# Per-column standard deviation of each scaled training matrix.
# Only the last expression (U2R) is displayed by the notebook; the first
# three results are computed and discarded.
np.std(X_DoS, axis=0)
np.std(X_Probe, axis=0)
np.std(X_R2L, axis=0)
np.std(X_U2R, axis=0)

In [59]:
# Tell numpy not to warn on divide-by-zero or invalid floating-point results.
np.seterr(invalid='ignore', divide='ignore')
# Univariate selector: keep the top 10 percent of features ranked by the
# ANOVA F-score (f_classif).
selector = SelectPercentile(score_func=f_classif, percentile=10)

In [60]:
# Univariate feature selection for DoS.
# The selector is fitted on the TRAINING data only and the same column mask
# is then applied to the test data. The original refitted the selector on the
# test set, which (a) chose test features using the test labels and (b) made
# get_support() below describe the test fit rather than the training fit.
selector.fit(X_DoS, Y_DoS)
X_newDoS = selector.transform(X_DoS)
X_newDoS_test = selector.transform(X_DoS_test)
# Boolean mask of the kept columns -> their positions -> their names.
true = selector.get_support()
newcolindex_DoS = [i for i, x in enumerate(true) if x]
newcolname_DoS = [colNames[i] for i in newcolindex_DoS]

In [72]:
# Univariate feature selection for Probe.
# The selector is fitted on the TRAINING data only and the same column mask
# is then applied to the test data. The original refitted the selector on the
# test set, which (a) chose test features using the test labels and (b) made
# get_support() below describe the test fit rather than the training fit.
selector.fit(X_Probe, Y_Probe)
X_newProbe = selector.transform(X_Probe)
X_newProbe_test = selector.transform(X_Probe_test)
# Boolean mask of the kept columns -> their positions -> their names.
true = selector.get_support()
newcolindex_Probe = [i for i, x in enumerate(true) if x]
newcolname_Probe = [colNames[i] for i in newcolindex_Probe]

In [73]:
# Univariate feature selection for R2L.
# The selector is fitted on the TRAINING data only and the same column mask
# is then applied to the test data. The original refitted the selector on the
# test set, which (a) chose test features using the test labels and (b) made
# get_support() below describe the test fit rather than the training fit.
selector.fit(X_R2L, Y_R2L)
X_newR2L = selector.transform(X_R2L)
X_newR2L_test = selector.transform(X_R2L_test)
# Boolean mask of the kept columns -> their positions -> their names.
true = selector.get_support()
newcolindex_R2L = [i for i, x in enumerate(true) if x]
newcolname_R2L = [colNames[i] for i in newcolindex_R2L]

In [74]:
# Univariate feature selection for U2R.
# The selector is fitted on the TRAINING data only and the same column mask
# is then applied to the test data. The original refitted the selector on the
# test set, which (a) chose test features using the test labels and (b) made
# get_support() below describe the test fit rather than the training fit.
selector.fit(X_U2R, Y_U2R)
X_newU2R = selector.transform(X_U2R)
X_newU2R_test = selector.transform(X_U2R_test)
# Boolean mask of the kept columns -> their positions -> their names.
true = selector.get_support()
newcolindex_U2R = [i for i, x in enumerate(true) if x]
newcolname_U2R = [colNames[i] for i in newcolindex_U2R]

In [None]:
# List the feature names kept by the univariate selector for each category,
# with a blank line between categories.
print('Features selected for DoS:', newcolname_DoS, end='\n\n')
print('Features selected for Probe:', newcolname_Probe, end='\n\n')
print('Features selected for R2L:', newcolname_R2L, end='\n\n')
print('Features selected for U2R:', newcolname_U2R)

In [65]:
# Unfitted Gaussian Naive Bayes estimator; passed to RFE as its base estimator below.
clf = GaussianNB()

In [66]:
# Recursive feature elimination for DoS, asking for 13 features, fitted on
# the training data and applied to both training and test matrices.
# NOTE(review): RFE ranks features via the estimator's coef_ or
# feature_importances_, and GaussianNB documents neither. This presumably only
# runs because the percentile selector already leaves no more than 13 columns,
# so no elimination step is ever performed — confirm.
rfe = RFE(estimator=clf, n_features_to_select=13).fit(X_newDoS, Y_DoS.astype(int))
X_rfeDoS, X_rfeDoS_test = (
    rfe.transform(matrix) for matrix in (X_newDoS, X_newDoS_test)
)

In [75]:
# Recursive feature elimination for Probe, asking for 13 features, fitted on
# the training data and applied to both training and test matrices.
# NOTE(review): RFE ranks features via the estimator's coef_ or
# feature_importances_, and GaussianNB documents neither. This presumably only
# runs because the percentile selector already leaves no more than 13 columns,
# so no elimination step is ever performed — confirm.
rfe = RFE(estimator=clf, n_features_to_select=13).fit(X_newProbe, Y_Probe.astype(int))
X_rfeProbe, X_rfeProbe_test = (
    rfe.transform(matrix) for matrix in (X_newProbe, X_newProbe_test)
)

In [76]:
# Recursive feature elimination for R2L, asking for 13 features, fitted on
# the training data and applied to both training and test matrices.
# NOTE(review): RFE ranks features via the estimator's coef_ or
# feature_importances_, and GaussianNB documents neither. This presumably only
# runs because the percentile selector already leaves no more than 13 columns,
# so no elimination step is ever performed — confirm.
rfe = RFE(estimator=clf, n_features_to_select=13).fit(X_newR2L, Y_R2L.astype(int))
X_rfeR2L, X_rfeR2L_test = (
    rfe.transform(matrix) for matrix in (X_newR2L, X_newR2L_test)
)

In [77]:
# Recursive feature elimination for U2R, asking for 13 features, fitted on
# the training data and applied to both training and test matrices.
# NOTE(review): RFE ranks features via the estimator's coef_ or
# feature_importances_, and GaussianNB documents neither. This presumably only
# runs because the percentile selector already leaves no more than 13 columns,
# so no elimination step is ever performed — confirm.
rfe = RFE(estimator=clf, n_features_to_select=13).fit(X_newU2R, Y_U2R.astype(int))
X_rfeU2R, X_rfeU2R_test = (
    rfe.transform(matrix) for matrix in (X_newU2R, X_newU2R_test)
)

In [None]:
# One Gaussian Naive Bayes model per attack category, trained on ALL scaled
# features. Labels are cast to int before fitting.
clf_DoS = GaussianNB().fit(X_DoS, Y_DoS.astype(int))
clf_Probe = GaussianNB().fit(X_Probe, Y_Probe.astype(int))
clf_R2L = GaussianNB().fit(X_R2L, Y_R2L.astype(int))
clf_U2R = GaussianNB()
# Kept as the cell's last expression so the notebook still shows the fitted estimator.
clf_U2R.fit(X_U2R, Y_U2R.astype(int))

In [None]:
# One Gaussian Naive Bayes model per attack category, trained on the
# SELECTED feature subset (the RFE output). Labels are cast to int.
clf_rfeDoS = GaussianNB().fit(X_rfeDoS, Y_DoS.astype(int))
clf_rfeProbe = GaussianNB().fit(X_rfeProbe, Y_Probe.astype(int))
clf_rfeR2L = GaussianNB().fit(X_rfeR2L, Y_R2L.astype(int))
clf_rfeU2R = GaussianNB()
# Kept as the cell's last expression so the notebook still shows the fitted estimator.
clf_rfeU2R.fit(X_rfeU2R, Y_U2R.astype(int))

In [81]:
# Predict the test set with the all-feature models trained above
# (one model and one test matrix per attack category).
Y_pred_Dos, Y_pred_Probe, Y_pred_R2L, Y_pred_U2R = (
    model.predict(test_matrix)
    for model, test_matrix in (
        (clf_DoS, X_DoS_test),
        (clf_Probe, X_Probe_test),
        (clf_R2L, X_R2L_test),
        (clf_U2R, X_U2R_test),
    )
)

In [82]:
# Predict the test set with the selected-feature models trained above
# (one model and one reduced test matrix per attack category).
Y_pred_rfeDos, Y_pred_rfeProbe, Y_pred_rfeR2L, Y_pred_rfeU2R = (
    model.predict(test_matrix)
    for model, test_matrix in (
        (clf_rfeDoS, X_rfeDoS_test),
        (clf_rfeProbe, X_rfeProbe_test),
        (clf_rfeR2L, X_rfeR2L_test),
        (clf_rfeU2R, X_rfeU2R_test),
    )
)

In [None]:
# Confusion table for DoS, all-feature model: actual labels as rows,
# predicted labels as columns.
print("(All features)")
pd.crosstab(
    index=Y_DoS_test,
    columns=Y_pred_Dos,
    rownames=['Actual attacks'],
    colnames=['Predicted attacks'],
)

In [None]:
# Confusion table for DoS, selected-feature model: actual labels as rows,
# predicted labels as columns.
print("(Selected features)")
pd.crosstab(
    index=Y_DoS_test,
    columns=Y_pred_rfeDos,
    rownames=['Actual attacks'],
    colnames=['Predicted attacks'],
)

In [None]:
# Confusion table for Probe, all-feature model: actual labels as rows,
# predicted labels as columns.
print("(All features)")
pd.crosstab(
    index=Y_Probe_test,
    columns=Y_pred_Probe,
    rownames=['Actual attacks'],
    colnames=['Predicted attacks'],
)

In [None]:
# Confusion table for Probe, selected-feature model: actual labels as rows,
# predicted labels as columns.
print("(Selected features)")
pd.crosstab(
    index=Y_Probe_test,
    columns=Y_pred_rfeProbe,
    rownames=['Actual attacks'],
    colnames=['Predicted attacks'],
)

In [None]:
# Confusion table for R2L, all-feature model: actual labels as rows,
# predicted labels as columns.
print("(All features)")
pd.crosstab(
    index=Y_R2L_test,
    columns=Y_pred_R2L,
    rownames=['Actual attacks'],
    colnames=['Predicted attacks'],
)

In [None]:
# Confusion table for R2L, selected-feature model: actual labels as rows,
# predicted labels as columns.
print("(Selected features)")
pd.crosstab(
    index=Y_R2L_test,
    columns=Y_pred_rfeR2L,
    rownames=['Actual attacks'],
    colnames=['Predicted attacks'],
)

In [None]:
# Confusion table for U2R, all-feature model: actual labels as rows,
# predicted labels as columns.
print("(All features)")
pd.crosstab(
    index=Y_U2R_test,
    columns=Y_pred_U2R,
    rownames=['Actual attacks'],
    colnames=['Predicted attacks'],
)

In [None]:
# Confusion table for U2R, selected-feature model: actual labels as rows,
# predicted labels as columns.
print("(Selected features)")
pd.crosstab(
    index=Y_U2R_test,
    columns=Y_pred_rfeU2R,
    rownames=['Actual attacks'],
    colnames=['Predicted attacks'],
)

In [None]:
# 10-fold cross-validation scores for DoS with all features; each line shows
# the mean and twice the standard deviation over the folds.
# NOTE(review): cross_val_score clones and refits the estimator on folds of
# the data it is given, so these figures describe models trained on test-set
# folds, not the already-fitted clf_DoS — confirm this is intended.
print("(All Features)")
accuracy = cross_val_score(estimator=clf_DoS, X=X_DoS_test, y=Y_DoS_test, scoring='accuracy', cv=10)
print("Accuracy: %0.5f (+/- %0.5f)" % (np.mean(accuracy), 2 * np.std(accuracy)))
precision = cross_val_score(estimator=clf_DoS, X=X_DoS_test, y=Y_DoS_test, scoring='precision', cv=10)
print("Precision: %0.5f (+/- %0.5f)" % (np.mean(precision), 2 * np.std(precision)))
recall = cross_val_score(estimator=clf_DoS, X=X_DoS_test, y=Y_DoS_test, scoring='recall', cv=10)
print("Recall: %0.5f (+/- %0.5f)" % (np.mean(recall), 2 * np.std(recall)))
f = cross_val_score(estimator=clf_DoS, X=X_DoS_test, y=Y_DoS_test, scoring='f1', cv=10)
print("F-measure: %0.5f (+/- %0.5f)" % (np.mean(f), 2 * np.std(f)))

In [None]:
# 10-fold cross-validation scores for DoS with the selected features; each
# line shows the mean and twice the standard deviation over the folds.
# NOTE(review): cross_val_score clones and refits the estimator on folds of
# the data it is given, so these figures describe models trained on test-set
# folds, not the already-fitted clf_rfeDoS — confirm this is intended.
print("(Selected Features)")
accuracy = cross_val_score(estimator=clf_rfeDoS, X=X_rfeDoS_test, y=Y_DoS_test, scoring='accuracy', cv=10)
print("Accuracy: %0.5f (+/- %0.5f)" % (np.mean(accuracy), 2 * np.std(accuracy)))
precision = cross_val_score(estimator=clf_rfeDoS, X=X_rfeDoS_test, y=Y_DoS_test, scoring='precision', cv=10)
print("Precision: %0.5f (+/- %0.5f)" % (np.mean(precision), 2 * np.std(precision)))
recall = cross_val_score(estimator=clf_rfeDoS, X=X_rfeDoS_test, y=Y_DoS_test, scoring='recall', cv=10)
print("Recall: %0.5f (+/- %0.5f)" % (np.mean(recall), 2 * np.std(recall)))
f = cross_val_score(estimator=clf_rfeDoS, X=X_rfeDoS_test, y=Y_DoS_test, scoring='f1', cv=10)
print("F-measure: %0.5f (+/- %0.5f)" % (np.mean(f), 2 * np.std(f)))

In [None]:
# 10-fold cross-validation scores for Probe with all features (macro-averaged
# precision/recall/F1); each line shows the mean and twice the standard
# deviation over the folds.
# NOTE(review): cross_val_score clones and refits the estimator on folds of
# the data it is given, so these figures describe models trained on test-set
# folds, not the already-fitted clf_Probe — confirm this is intended.
print("(All Features)")
accuracy = cross_val_score(estimator=clf_Probe, X=X_Probe_test, y=Y_Probe_test, scoring='accuracy', cv=10)
print("Accuracy: %0.5f (+/- %0.5f)" % (np.mean(accuracy), 2 * np.std(accuracy)))
precision = cross_val_score(estimator=clf_Probe, X=X_Probe_test, y=Y_Probe_test, scoring='precision_macro', cv=10)
print("Precision: %0.5f (+/- %0.5f)" % (np.mean(precision), 2 * np.std(precision)))
recall = cross_val_score(estimator=clf_Probe, X=X_Probe_test, y=Y_Probe_test, scoring='recall_macro', cv=10)
print("Recall: %0.5f (+/- %0.5f)" % (np.mean(recall), 2 * np.std(recall)))
f = cross_val_score(estimator=clf_Probe, X=X_Probe_test, y=Y_Probe_test, scoring='f1_macro', cv=10)
print("F-measure: %0.5f (+/- %0.5f)" % (np.mean(f), 2 * np.std(f)))

In [None]:
# 10-fold cross-validation scores for Probe with the selected features
# (macro-averaged precision/recall/F1); each line shows the mean and twice
# the standard deviation over the folds.
# NOTE(review): cross_val_score clones and refits the estimator on folds of
# the data it is given, so these figures describe models trained on test-set
# folds, not the already-fitted clf_rfeProbe — confirm this is intended.
print("(Selected Features)")
accuracy = cross_val_score(estimator=clf_rfeProbe, X=X_rfeProbe_test, y=Y_Probe_test, scoring='accuracy', cv=10)
print("Accuracy: %0.5f (+/- %0.5f)" % (np.mean(accuracy), 2 * np.std(accuracy)))
precision = cross_val_score(estimator=clf_rfeProbe, X=X_rfeProbe_test, y=Y_Probe_test, scoring='precision_macro', cv=10)
print("Precision: %0.5f (+/- %0.5f)" % (np.mean(precision), 2 * np.std(precision)))
recall = cross_val_score(estimator=clf_rfeProbe, X=X_rfeProbe_test, y=Y_Probe_test, scoring='recall_macro', cv=10)
print("Recall: %0.5f (+/- %0.5f)" % (np.mean(recall), 2 * np.std(recall)))
f = cross_val_score(estimator=clf_rfeProbe, X=X_rfeProbe_test, y=Y_Probe_test, scoring='f1_macro', cv=10)
print("F-measure: %0.5f (+/- %0.5f)" % (np.mean(f), 2 * np.std(f)))

In [None]:
# 10-fold cross-validation scores for R2L with all features (macro-averaged
# precision/recall/F1); each line shows the mean and twice the standard
# deviation over the folds.
# NOTE(review): cross_val_score clones and refits the estimator on folds of
# the data it is given, so these figures describe models trained on test-set
# folds, not the already-fitted clf_R2L — confirm this is intended.
print("(All Features)")
accuracy = cross_val_score(estimator=clf_R2L, X=X_R2L_test, y=Y_R2L_test, scoring='accuracy', cv=10)
print("Accuracy: %0.5f (+/- %0.5f)" % (np.mean(accuracy), 2 * np.std(accuracy)))
precision = cross_val_score(estimator=clf_R2L, X=X_R2L_test, y=Y_R2L_test, scoring='precision_macro', cv=10)
print("Precision: %0.5f (+/- %0.5f)" % (np.mean(precision), 2 * np.std(precision)))
recall = cross_val_score(estimator=clf_R2L, X=X_R2L_test, y=Y_R2L_test, scoring='recall_macro', cv=10)
print("Recall: %0.5f (+/- %0.5f)" % (np.mean(recall), 2 * np.std(recall)))
f = cross_val_score(estimator=clf_R2L, X=X_R2L_test, y=Y_R2L_test, scoring='f1_macro', cv=10)
print("F-measure: %0.5f (+/- %0.5f)" % (np.mean(f), 2 * np.std(f)))

In [None]:
# 10-fold cross-validation scores for R2L with the selected features
# (macro-averaged precision/recall/F1); each line shows the mean and twice
# the standard deviation over the folds.
# NOTE(review): cross_val_score clones and refits the estimator on folds of
# the data it is given, so these figures describe models trained on test-set
# folds, not the already-fitted clf_rfeR2L — confirm this is intended.
print("(Selected Features)")
accuracy = cross_val_score(estimator=clf_rfeR2L, X=X_rfeR2L_test, y=Y_R2L_test, scoring='accuracy', cv=10)
print("Accuracy: %0.5f (+/- %0.5f)" % (np.mean(accuracy), 2 * np.std(accuracy)))
precision = cross_val_score(estimator=clf_rfeR2L, X=X_rfeR2L_test, y=Y_R2L_test, scoring='precision_macro', cv=10)
print("Precision: %0.5f (+/- %0.5f)" % (np.mean(precision), 2 * np.std(precision)))
recall = cross_val_score(estimator=clf_rfeR2L, X=X_rfeR2L_test, y=Y_R2L_test, scoring='recall_macro', cv=10)
print("Recall: %0.5f (+/- %0.5f)" % (np.mean(recall), 2 * np.std(recall)))
f = cross_val_score(estimator=clf_rfeR2L, X=X_rfeR2L_test, y=Y_R2L_test, scoring='f1_macro', cv=10)
print("F-measure: %0.5f (+/- %0.5f)" % (np.mean(f), 2 * np.std(f)))

In [None]:
# 10-fold cross-validation scores for U2R with all features (macro-averaged
# precision/recall/F1); each line shows the mean and twice the standard
# deviation over the folds.
# NOTE(review): cross_val_score clones and refits the estimator on folds of
# the data it is given, so these figures describe models trained on test-set
# folds, not the already-fitted clf_U2R — confirm this is intended.
print("(All Features)")
accuracy = cross_val_score(estimator=clf_U2R, X=X_U2R_test, y=Y_U2R_test, scoring='accuracy', cv=10)
print("Accuracy: %0.5f (+/- %0.5f)" % (np.mean(accuracy), 2 * np.std(accuracy)))
precision = cross_val_score(estimator=clf_U2R, X=X_U2R_test, y=Y_U2R_test, scoring='precision_macro', cv=10)
print("Precision: %0.5f (+/- %0.5f)" % (np.mean(precision), 2 * np.std(precision)))
recall = cross_val_score(estimator=clf_U2R, X=X_U2R_test, y=Y_U2R_test, scoring='recall_macro', cv=10)
print("Recall: %0.5f (+/- %0.5f)" % (np.mean(recall), 2 * np.std(recall)))
f = cross_val_score(estimator=clf_U2R, X=X_U2R_test, y=Y_U2R_test, scoring='f1_macro', cv=10)
print("F-measure: %0.5f (+/- %0.5f)" % (np.mean(f), 2 * np.std(f)))

In [None]:
# 10-fold cross-validation scores for U2R with the selected features
# (macro-averaged precision/recall/F1); each line shows the mean and twice
# the standard deviation over the folds.
# NOTE(review): cross_val_score clones and refits the estimator on folds of
# the data it is given, so these figures describe models trained on test-set
# folds, not the already-fitted clf_rfeU2R — confirm this is intended.
print("(Selected Features)")
accuracy = cross_val_score(estimator=clf_rfeU2R, X=X_rfeU2R_test, y=Y_U2R_test, scoring='accuracy', cv=10)
print("Accuracy: %0.5f (+/- %0.5f)" % (np.mean(accuracy), 2 * np.std(accuracy)))
precision = cross_val_score(estimator=clf_rfeU2R, X=X_rfeU2R_test, y=Y_U2R_test, scoring='precision_macro', cv=10)
print("Precision: %0.5f (+/- %0.5f)" % (np.mean(precision), 2 * np.std(precision)))
recall = cross_val_score(estimator=clf_rfeU2R, X=X_rfeU2R_test, y=Y_U2R_test, scoring='recall_macro', cv=10)
print("Recall: %0.5f (+/- %0.5f)" % (np.mean(recall), 2 * np.std(recall)))
f = cross_val_score(estimator=clf_rfeU2R, X=X_rfeU2R_test, y=Y_U2R_test, scoring='f1_macro', cv=10)
print("F-measure: %0.5f (+/- %0.5f)" % (np.mean(f), 2 * np.std(f)))


In [None]:
# SHAP explanation of the selected-feature DoS model on a random subset of
# the test rows.
# The draw uses a fixed-seed generator so the plots are reproducible across
# kernel restarts, and the sample size is capped at the number of available
# rows (choice(..., replace=False) raises if asked for more rows than exist).
sample_indices = np.random.default_rng(42).choice(
    X_rfeDoS_test.shape[0],
    size=min(500, X_rfeDoS_test.shape[0]),
    replace=False,
)
X_DoS_test_sample = X_rfeDoS_test[sample_indices]
# Feature names as an array, for the summary plots.
newcolname_DoS = np.array(newcolname_DoS)
# The sampled rows serve both as the explainer's background data and as the
# rows being explained.
explainer_DoS = shap.KernelExplainer(clf_rfeDoS.predict_proba, X_DoS_test_sample)
shap_values_DoS = explainer_DoS.shap_values(X_DoS_test_sample)

In [None]:
# Take index 0 along the last axis of the SHAP values and draw the summary plot.
# NOTE(review): presumably axis 2 enumerates predict_proba's output classes,
# so index 0 is the first class — confirm against the installed shap version.
res = shap_values_DoS[..., 0]
shap.summary_plot(
    shap_values=res,
    features=X_DoS_test_sample,
    feature_names=newcolname_DoS,
)

In [None]:
# Bar-chart variant of the DoS SHAP summary; `res` comes from the previous cell.
shap.summary_plot(
    shap_values=res,
    features=X_DoS_test_sample,
    feature_names=newcolname_DoS,
    plot_type="bar",
)

In [None]:
# SHAP explanation of the selected-feature Probe model on a random subset of
# the test rows.
# The draw uses a fixed-seed generator so the plots are reproducible across
# kernel restarts, and the sample size is capped at the number of available
# rows (choice(..., replace=False) raises if asked for more rows than exist).
sample_indices = np.random.default_rng(42).choice(
    X_rfeProbe_test.shape[0],
    size=min(500, X_rfeProbe_test.shape[0]),
    replace=False,
)
X_Probe_test_sample = X_rfeProbe_test[sample_indices]
# Feature names as an array, for the summary plots.
newcolname_Probe = np.array(newcolname_Probe)
# The sampled rows serve both as the explainer's background data and as the
# rows being explained.
explainer_Probe = shap.KernelExplainer(clf_rfeProbe.predict_proba, X_Probe_test_sample)
shap_values_Probe = explainer_Probe.shap_values(X_Probe_test_sample)
# NOTE(review): presumably axis 2 enumerates predict_proba's output classes,
# so index 0 is the first class — confirm against the installed shap version.
res = shap_values_Probe[:, :, 0]
shap.summary_plot(res, X_Probe_test_sample, feature_names=newcolname_Probe)

In [None]:
# Bar-chart variant of the Probe SHAP summary; `res` comes from the previous cell.
shap.summary_plot(
    shap_values=res,
    features=X_Probe_test_sample,
    feature_names=newcolname_Probe,
    plot_type="bar",
)

In [None]:
# SHAP explanation of the selected-feature R2L model on a random subset of
# the test rows.
# The draw uses a fixed-seed generator so the plots are reproducible across
# kernel restarts, and the sample size is capped at the number of available
# rows (choice(..., replace=False) raises if asked for more rows than exist).
sample_indices = np.random.default_rng(42).choice(
    X_rfeR2L_test.shape[0],
    size=min(500, X_rfeR2L_test.shape[0]),
    replace=False,
)
X_R2L_test_sample = X_rfeR2L_test[sample_indices]
# Feature names as an array, for the summary plots.
newcolname_R2L = np.array(newcolname_R2L)
# The sampled rows serve both as the explainer's background data and as the
# rows being explained.
explainer_R2L = shap.KernelExplainer(clf_rfeR2L.predict_proba, X_R2L_test_sample)
shap_values_R2L = explainer_R2L.shap_values(X_R2L_test_sample)
# NOTE(review): presumably axis 2 enumerates predict_proba's output classes,
# so index 0 is the first class — confirm against the installed shap version.
res = shap_values_R2L[:, :, 0]
shap.summary_plot(res, X_R2L_test_sample, feature_names=newcolname_R2L)

In [None]:
# Bar-chart variant of the R2L SHAP summary; `res` comes from the previous cell.
shap.summary_plot(
    shap_values=res,
    features=X_R2L_test_sample,
    feature_names=newcolname_R2L,
    plot_type="bar",
)

In [None]:
# SHAP explanation of the selected-feature U2R model on a random subset of
# the test rows.
# The draw uses a fixed-seed generator so the plots are reproducible across
# kernel restarts, and the sample size is capped at the number of available
# rows (choice(..., replace=False) raises if asked for more rows than exist).
sample_indices = np.random.default_rng(42).choice(
    X_rfeU2R_test.shape[0],
    size=min(500, X_rfeU2R_test.shape[0]),
    replace=False,
)
X_U2R_test_sample = X_rfeU2R_test[sample_indices]
# Feature names as an array, for the summary plots.
newcolname_U2R = np.array(newcolname_U2R)
# The sampled rows serve both as the explainer's background data and as the
# rows being explained.
explainer_U2R = shap.KernelExplainer(clf_rfeU2R.predict_proba, X_U2R_test_sample)
shap_values_U2R = explainer_U2R.shap_values(X_U2R_test_sample)
# NOTE(review): presumably axis 2 enumerates predict_proba's output classes,
# so index 0 is the first class — confirm against the installed shap version.
res = shap_values_U2R[:, :, 0]
shap.summary_plot(res, X_U2R_test_sample, feature_names=newcolname_U2R)