In [1]:
import pandas as pd
import numpy as np
pd.set_option("display.max_rows",15)
%matplotlib inline

In [2]:
class dataset:
    kdd_train_2labels = pd.read_pickle("dataset/kdd_train_2labels_20percent.pkl")
    kdd_train_2labels_y = pd.read_pickle("dataset/kdd_train_2labels_y_20percent.pkl")
    
    kdd_test_2labels = pd.read_pickle("dataset/kdd_test_2labels_20percent.pkl")
    kdd_test_2labels_y = pd.read_pickle("dataset/kdd_test_2labels_y_20percent.pkl")
    
    kdd_train_5labels = pd.read_pickle("dataset/kdd_train_5labels_20percent.pkl")
    kdd_train_5labels_y = pd.read_pickle("dataset/kdd_train_5labels_y_20percent.pkl")
    
    kdd_test_5labels = pd.read_pickle("dataset/kdd_test_5labels_20percent.pkl")
    kdd_test_5labels_y = pd.read_pickle("dataset/kdd_test_5labels_y_20percent.pkl")
    

In [3]:
import collections
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import NuSVC, SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import ShuffleSplit
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing
from sklearn import metrics

names_noscaling = ["RBF SVM", "Decision Tree", "Random Forest", "Naive Bayes"]
classifiers_noscaling = [
                SVC(),
                DecisionTreeClassifier(max_depth=5),
                RandomForestClassifier(max_depth=5, n_estimators=10),
                GaussianNB()]

names_withscaling = ["Non - Linear SVM", "AdaBoost"]
classifiers_withscaling = [NuSVC(),
                AdaBoostClassifier()]

score = collections.namedtuple("score", ["name", "valid_score" ,"test_score"])
scores = []
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)

for name, clf in zip(names_noscaling, classifiers_noscaling):
    print("Classifier: {}".format(name))
    
    x_train, y_train = dataset.kdd_train_2labels.iloc[:,:-2], dataset.kdd_train_2labels_y
    x_test, y_test = dataset.kdd_test_2labels.iloc[:,:-2], dataset.kdd_test_2labels_y
    
    clf_p = make_pipeline(clf)    
    valid_score = cross_val_score(clf_p, x_train, y_train, cv=cv)
    
    scaler = preprocessing.StandardScaler().fit(x_train)
    clf.fit(scaler.transform(x_train), y_train)
    y_pred = clf.predict(x_test)
    test_acc = metrics.accuracy_score(y_test, y_pred) 
    
    scores.append(score(name, valid_score.mean(), test_acc))
    print("Score: {} \n".format(test_acc))

for name, clf in zip(names_withscaling, classifiers_withscaling):
    print("Classifier: {}".format(name))
    
    x_train, y_train = dataset.kdd_train_2labels.iloc[:,:-2], dataset.kdd_train_2labels_y
    x_test, y_test = dataset.kdd_test_2labels.iloc[:,:-2], dataset.kdd_test_2labels_y
    
    clf_p = make_pipeline(preprocessing.StandardScaler(), clf)    
    valid_score = cross_val_score(clf_p, x_train, y_train, cv=cv)
    
    scaler = preprocessing.StandardScaler().fit(x_train)
    clf.fit(scaler.transform(x_train), y_train)
    y_pred = clf.predict(x_test)
    test_acc = metrics.accuracy_score(y_test, y_pred) 
    
    scores.append(score(name, valid_score.mean(), test_acc))
    print("Score: {} \n".format(test_acc))


Classifier: RBF SVM
Score: 0.8171308016877638 

Classifier: Decision Tree
Score: 0.2569620253164557 

Classifier: Random Forest
Score: 0.17248945147679326 

Classifier: Naive Bayes
Score: 0.799071729957806 

Classifier: Non - Linear SVM
Score: 0.8180590717299578 

Classifier: AdaBoost
Score: 0.8188185654008439 



In [4]:
pd.DataFrame(scores)

Unnamed: 0,name,valid_score,test_score
0,RBF SVM,0.971542,0.817131
1,Decision Tree,0.988708,0.256962
2,Random Forest,0.97825,0.172489
3,Naive Bayes,0.616094,0.799072
4,Non - Linear SVM,0.917623,0.818059
5,AdaBoost,0.990097,0.818819


In [5]:
#Naive Bayes