# Assignment 3. Exercise 1: Best possible tree

In [29]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from sklearn.metrics import precision_score,make_scorer,accuracy_score
import pickle

## Loading Data

In [30]:
# Column names for the header-less CSV files: 41 feature columns followed by
# the class label. Written as whitespace-separated text and split into a list.
col_name = """
    duration protocol_type service flag src_bytes dst_bytes land wrong_fragment
    urgent hot num_failed_logins logged_in num_compromised root_shell su_attempted
    num_root num_file_creations num_shells num_access_files num_outbound_cmds
    is_host_login is_guest_login count srv_count serror_rate
    srv_serror_rate rerror_rate srv_rerror_rate same_srv_rate
    diff_srv_rate srv_diff_host_rate dst_host_count
    dst_host_srv_count dst_host_same_srv_rate
    dst_host_diff_srv_rate dst_host_same_src_port_rate
    dst_host_srv_diff_host_rate
    dst_host_serror_rate
    dst_host_srv_serror_rate
    dst_host_rerror_rate
    dst_host_srv_rerror_rate
    label
""".split()

In [31]:
# The training file has no header row; column names come from col_name.
f = pd.read_csv("trainingDecisionTree.csv", header=None, names=col_name)

In [32]:
# Binary target: 1 when the raw label is 'normal.', 0 for every other label.
label = f['label'].map(lambda raw: int(raw == 'normal.'))
f['label'] = label

In [33]:
# Class balance of the binary target.
f.loc[:, "label"].value_counts()

0    401061
1     98939
Name: label, dtype: int64

In [34]:
# Feature matrix: every column of f except the last one.
# .copy() gives X its own data, so the column assignments made during
# preprocessing neither raise SettingWithCopyWarning nor write through to f.
X = f.iloc[:, :-1].copy()
# Target vector: the binary label column.
y = f['label']

## Preprocessing 

I use "LabelEncoder" on three categorical features (protocol_type, service and flag) to convert them to numerical values that can be fed into the DecisionTreeClassifier.

In [35]:
# One encoder per categorical column; each is kept under its own name so the
# same fitted mapping can be applied to other data later.
le1 = preprocessing.LabelEncoder()
le2 = preprocessing.LabelEncoder()
le3 = preprocessing.LabelEncoder()

for encoder, column in ((le1, 'protocol_type'), (le2, 'service'), (le3, 'flag')):
    # Fit on the column and replace it with its integer codes.
    X[column] = encoder.fit_transform(X[column])


I decided to use some feature-selection techniques in order to find as good a classifier as possible, as you can see below:

## Removing features with low variance

I used the VarianceThreshold class in sklearn for feature selection/dimensionality reduction on the KDD Cup dataset, to improve the estimator's accuracy score and to boost performance on the KDD dataset. It removes all features whose variance doesn't meet a given threshold (0.1 here).

In [36]:
from sklearn.feature_selection import VarianceThreshold

# Show floats with four decimal places in pandas output.
pd.set_option('display.float_format', '{:.4f}'.format)

In [37]:
# Per-column variance of the encoded feature matrix.
X.var(axis=0)

duration                           546798.4367
protocol_type                           0.3271
service                               182.1743
flag                                    5.0843
src_bytes                     95029238872.6983
dst_bytes                       329348075.3361
land                                    0.0000
wrong_fragment                          0.0019
urgent                                  0.0000
hot                                     0.2020
num_failed_logins                       0.0001
logged_in                               0.1228
num_compromised                         3.5123
root_shell                              0.0001
su_attempted                            0.0000
num_root                                3.8349
num_file_creations                      0.0173
num_shells                              0.0001
num_access_files                        0.0013
num_outbound_cmds                       0.0000
is_host_login                           0.0000
is_guest_logi

In [38]:
# Low-variance filter with threshold 0.1; `sel` stays fitted so the same
# column selection can be applied to other data later.
sel = VarianceThreshold(threshold=0.1)
X2 = sel.fit_transform(X)

## SelectKBest

I select the 10 features with the highest scores. According to the sklearn documentation, "chi2" computes chi-squared statistics of non-negative features and is intended for classification tasks, so I chose it as the scoring function.

In [39]:
from sklearn.feature_selection import SelectKBest, chi2


In [40]:
# Keep the 10 features with the highest chi-squared score against y.
sel2 = SelectKBest(score_func=chi2, k=10)
X3 = sel2.fit_transform(X2, y)

In [41]:
# (rows, selected features)
np.shape(X3)

(500000, 10)

In [42]:
# Selected features as columns 0..9, with the target appended as 'label'.
df = pd.DataFrame(X3, columns=range(10)).assign(label=y)

In [51]:
# Persist the selected features plus label. The filename previously read
# 'featurs10.csv' (typo); it now matches the documented 'features10.csv'.
df.to_csv('features10.csv', index=False)

This generates one .csv file called 'features10.csv', which includes the 10 best features extracted by SelectKBest in sklearn, plus the label column.

# Hyperparameter tuning

For the decision tree classifier, hyperparameters are tuned with a grid search that uses K-fold cross-validation with K = 5. To measure the quality of a split, DecisionTreeClassifier() uses criterion='gini' (Gini impurity) by default, and that default is kept. Since the grid search refits the estimator with the best-scoring parameters, the resulting model is the best decision tree among the candidates in the grid.

In [43]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer,precision_score

In [44]:
# Base estimator with default settings; the grid below overrides three of them.
dtree = DecisionTreeClassifier()

# Candidate values for each tuned hyperparameter.
depths = np.arange(1, 21)            # 1, 2, ..., 20
num_splits = np.arange(10, 500, 20)  # 10, 30, ..., 490
num_leafs = [1, 5, 10, 20, 50, 100]

parameters = dict(
    min_samples_split=num_splits,
    max_depth=depths,
    min_samples_leaf=num_leafs,
)

The grid search below takes almost 1 hour and 20 minutes to run :(. Note that you don't need to run it; just load the saved model (see "load model" below).

In [45]:
# Exhaustive search over `parameters`, scored by accuracy with 5-fold
# cross-validation on three parallel workers.
gs = GridSearchCV(
    estimator=dtree,
    param_grid=parameters,
    scoring='accuracy',
    cv=5,
    n_jobs=3,
)
gs.fit(X3, y)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=3,
       param_grid={'min_samples_split': array([ 10,  30,  50,  70,  90, 110, 130, 150, 170, 190, 210, 230, 250,
       270, 290, 310, 330, 350, 370, 390, 410, 430, 450, 470, 490]), 'max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20]), 'min_samples_leaf': [1, 5, 10, 20, 50, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

### Model persistence

I used pickle's dump and load functions to persist the model.

In [46]:
# Serialise the fitted grid search so it can be reloaded without re-fitting.
name = 'model.dat'

# The handle gets its own name: binding it to `f` would overwrite the
# training DataFrame `f` with a closed file object and break any re-run of
# the earlier cells. pickle.dump returns None, so its result is not kept.
with open(name, 'wb') as model_file:
    pickle.dump(gs, model_file)

## load model

In [47]:
def load_ml(path='model.dat'):
    """Load a pickled model from disk.

    Parameters
    ----------
    path : str, optional
        File to read. Defaults to 'model.dat', the file written by the
        persistence cell, so existing ``load_ml()`` calls behave as before.

    Returns
    -------
    The object stored in the pickle file.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not exist.
    """
    # WARNING: unpickling can execute arbitrary code; only load files you
    # created yourself or otherwise trust.
    with open(path, 'rb') as model_file:
        return pickle.load(model_file)

# Test function

The testing function below takes a test data set (test points and labels, loaded from a .csv file) and returns the classification error of the model, expressed as the number of misclassified test points.

In [48]:
def predict_test(test_data):
    """Count the misclassified rows of a labelled test set.

    Applies the already-fitted label encoders (``le1``, ``le2``, ``le3``) and
    feature selectors (``sel``, ``sel2``) to the test features, loads the
    persisted model through ``load_ml()`` and compares its predictions with
    the true labels.

    Parameters
    ----------
    test_data : pandas.DataFrame
        All columns except the last are used as features (they must include
        'protocol_type', 'service' and 'flag'); the 'label' column holds the
        raw string labels. The frame is not modified.

    Returns
    -------
    Number of rows whose prediction differs from the true label (row count
    minus the number of correct predictions).
    """
    # Work on a private copy: assigning the encoded columns into a bare slice
    # of test_data raises SettingWithCopyWarning and can write through to the
    # caller's frame, which makes a second call on the same frame fail.
    X_test = test_data.iloc[:, :-1].copy()

    # Same binary target as in training: 1 for 'normal.', 0 otherwise.
    y_test = test_data['label'].apply(lambda x: 1 if x == 'normal.' else 0)

    # NOTE(review): transform presumably raises on categories the encoders
    # did not see during fitting; confirm against the test file.
    X_test['protocol_type'] = le1.transform(X_test['protocol_type'])
    X_test['service'] = le2.transform(X_test['service'])
    X_test['flag'] = le3.transform(X_test['flag'])

    # Reapply the two fitted feature-selection steps in training order.
    X_test = sel.transform(X_test)
    X_test = sel2.transform(X_test)

    model = load_ml()
    y_pred = model.predict(X_test)

    # normalize=False asks for the number of correct predictions rather than
    # the fraction.
    n_correct = accuracy_score(y_test, y_pred, normalize=False)

    error = X_test.shape[0] - n_correct

    return error

### add your test file path

Please replace "???.csv" below with the name of your test .csv file.

In [49]:
# Read the test file with the same header-less layout as the training file.
testdata = pd.read_csv("???.csv", header=None, names=col_name)

In [50]:
# Number of misclassified rows in the test file.
predict_test(test_data=testdata)

0