In [1]:
import os

import numpy
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, zero_one_loss
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [18]:
# Column names for the headerless KDD Cup CSV: 41 feature columns followed by
# the class column ("label").
col_names = ["duration","protocol_type","service","flag","src_bytes",
    "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
    "logged_in","num_compromised","root_shell","su_attempted","num_root",
    "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
    "is_host_login","is_guest_login","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate","label"]

# Location of the gzipped CSV. Set the KDD_DATA_PATH environment variable to
# point at a local copy; the default is the path this notebook was written
# against, so existing setups keep working unchanged.
DATA_PATH = os.environ.get(
    "KDD_DATA_PATH",
    "/Users/samanthawise/Documents/Bristol/MASTERS/Data Science Toolbox/Workshops/data/kddcup.data_10_percent.gz",
)

# The file has no header row, so the names above are supplied explicitly.
raw_data = pd.read_csv(DATA_PATH, header=None, names=col_names)

In [19]:
# Encode the string-valued columns as integer codes. pd.factorize returns
# (codes, uniques); the uniques are kept so a code can be mapped back to its
# original name (e.g. attacks[code]).
# NOTE: the columns of raw_data are overwritten in place. Re-running this cell
# without re-running the load cell would factorize the integer codes again and
# replace the name lookups below with integers.
raw_data['protocol_type'], protocols = pd.factorize(raw_data['protocol_type'])
raw_data['service'], services = pd.factorize(raw_data['service'])
raw_data['flag'], flags = pd.factorize(raw_data['flag'])
raw_data['label'], attacks = pd.factorize(raw_data['label'])

# Separate features (every column but the last, i.e. 41 columns) from the
# label (last column).
features = raw_data.iloc[:, :-1]
labels = raw_data.iloc[:, -1].values  # 1-D array of integer class codes

# Names of the feature columns and of the target column.
feature_names = features.columns.tolist()
target_name = raw_data.columns[-1]

# `df` is kept as an alias of the feature frame for any cell that refers to it.
df = features

# Fixed seed so the split (and every metric computed from it) is reproducible
# across kernel restarts.
SPLIT_SEED = 42

# Create training and testing sets.
# Note: train_size + test_size < 1.0 means we are subsampling.
# Use small numbers for slow classifiers such as KNN, Radius, SVC, ...
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, train_size=0.8, test_size=0.2, random_state=SPLIT_SEED
)
print("X_train, y_train:", X_train.shape, y_train.shape)
print("X_test, y_test:", X_test.shape, y_test.shape)


X_train, y_train: (395216, 41) (395216,)
X_test, y_test: (98805, 41) (98805,)


In [20]:
# Fit the classifier. To try a different model, assign another estimator to
# `clf` (for example the DecisionTreeClassifier imported at the top).
print("Training model")
clf = RandomForestClassifier(
    n_estimators=102,
    random_state=3,
    n_jobs=-1,
)

# fit() hands back the fitted estimator, which is kept under its own name.
trained_model = clf.fit(X_train, y_train)

# This score is computed on the training data itself, not on held-out data.
train_score = trained_model.score(X_train, y_train)
print("Score: ", train_score)

Training model
Score:  0.9999974697380672


In [23]:
# Run the fitted classifier on the held-out test features.
y_pred = trained_model.predict(X_test)
# Bare expression so the notebook displays the predicted class codes.
y_pred

array([0, 4, 4, ..., 4, 5, 0])

In [24]:
# Overall misclassification rate on the test split.
error = zero_one_loss(y_test, y_pred)
# Per-class breakdown of the test predictions.
results = confusion_matrix(y_test, y_pred)

print("Confusion matrix:\n", results)

Confusion matrix:
 [[19465     0     0     0     0     0     0     1     0     0     1     0
      0     0     0     0     0     0     1]
 [    5     6     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0]
 [    1     0     1     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0]
 [    2     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0]
 [    0     0     0     0 21554     0     0     0     0     0     0     0
      0     0     0     0     0     0     0]
 [    0     0     0     0     0 55986     0     0     0     0     0     0
      0     0     0     0     0     0     0]
 [    0     0     0     0     0     0    10     0     0     0     0     0
      0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0    50     0     0     0     0
      0     0     0     0     0     0     0]
 [    0     0     0     0    

In [25]:
# `error` holds the zero_one_loss of y_pred against y_test, i.e. the share of
# test samples that were misclassified.
print("Error: ", error)

Error:  0.00024290268711102403
