In [136]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import  MinMaxScaler
from sklearn.metrics import precision_recall_fscore_support, make_scorer, recall_score, f1_score, confusion_matrix, precision_score, balanced_accuracy_score
                           


## Prepare Data (Malicious hacks)

In [13]:
df = pd.read_csv("data/malicious_01.csv")


In [144]:
Xdf = df.drop(columns=["INCIDENT_ID","DATE", "MALICIOUS_OFFENSE"])
y = df["MALICIOUS_OFFENSE"]
Xdf['X_12'] = np.nan_to_num(Xdf['X_12'])
X, y = Xdf.values, y.values

## Decsion Tree Classifier

In [46]:
dt =  DecisionTreeClassifier()
# ??dt.score

In [181]:
dt_grid = GridSearchCV(
    estimator=dt, 
    param_grid={"max_depth": [3,5,7,10,15, 20]},
    scoring={'recall':make_scorer(recall_score), 'precision':make_scorer(precision_score), 'bal_ac': make_scorer(balanced_accuracy_score),
            'f1':make_scorer(f1_score, pos_label=0)},
    refit='f1',
    return_train_score=True,
    cv=5
)

In [55]:
np.where(np.isnan(X))


(array([], dtype=int64), array([], dtype=int64))

In [182]:
dt_grid.fit(X,y)
dfDtRes = pd.DataFrame(dt_grid.cv_results_)
dfDtRes

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,params,split0_test_recall,split1_test_recall,split2_test_recall,split3_test_recall,...,mean_test_f1,std_test_f1,rank_test_f1,split0_train_f1,split1_train_f1,split2_train_f1,split3_train_f1,split4_train_f1,mean_train_f1,std_train_f1
0,0.015403,0.001199,0.007799,0.001402,3,{'max_depth': 3},0.999122,1.0,0.999561,0.999342,...,0.667145,0.022749,6,0.674865,0.668716,0.659426,0.662539,0.671791,0.667467,0.005726
1,0.015118,0.000925,0.006239,0.000385,5,{'max_depth': 5},0.998464,0.999342,0.998903,0.999561,...,0.85012,0.014433,5,0.859211,0.852825,0.847682,0.85056,0.853755,0.852806,0.003827
2,0.014623,0.00041,0.006488,0.000343,7,{'max_depth': 7},0.998903,0.998903,0.998464,0.999561,...,0.959352,0.008378,4,0.971974,0.966547,0.976303,0.97323,0.969479,0.971507,0.003314
3,0.015407,0.000833,0.006145,0.00061,10,{'max_depth': 10},0.999342,0.999122,0.999122,0.999561,...,0.980183,0.009282,2,0.996475,0.998829,0.997064,0.996491,0.997658,0.997303,0.000878
4,0.015715,0.000754,0.006028,0.000344,15,{'max_depth': 15},0.999122,0.999122,0.999342,0.999342,...,0.981176,0.007206,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0
5,0.015698,0.000595,0.006245,0.000333,20,{'max_depth': 20},0.999342,0.999342,0.999561,0.999122,...,0.979753,0.004957,3,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [180]:
dfDtRes.filter(['mean_test_recall','mean_test_precision','mean_test_bal_ac','mean_test_f1'  ], axis=1)

Unnamed: 0,mean_test_recall,mean_test_precision,mean_test_bal_ac,mean_test_f1
0,0.999473,0.977386,0.753017,0.50656
1,0.999078,0.988624,0.876886,0.754693
2,0.999166,0.997198,0.969615,0.940064
3,0.999342,0.998816,0.987036,0.974731
4,0.99943,0.998816,0.98708,0.974731
5,0.999386,0.998948,0.988467,0.977548


In [160]:
precision_recall_fscore_support(y, dt_grid.predict(X))

(array([1., 1.]), array([1., 1.]), array([1., 1.]), array([ 1068, 22788]))

In [159]:
confusion_matrix(y, dt_grid.best_estimator_.predict(X))

array([[ 1068,     0],
       [    0, 22788]])

In [154]:
# dt_grid.


DecisionTreeClassifier()

## Neural Network Classifier

In [70]:
from sklearn.neural_network import MLPClassifier

In [139]:


mlp =  MLPClassifier()

mlp_grid = GridSearchCV(
    estimator=mlp, 
    param_grid={'hidden_layer_sizes':[(10,),(5,2)]},
    scoring={'recall':make_scorer(recall_score), 'precision':make_scorer(precision_score)},
    refit='precision',
    cv=5,
    n_jobs=-1

)

In [155]:
Xs = MinMaxScaler().fit_transform(Xdf)



mlp_grid.fit(Xs,y)
# (pd.DataFrame(mlp_grid.cv_results_))




GridSearchCV(cv=5, estimator=MLPClassifier(),
             param_grid={'hidden_layer_sizes': [(10,), (5, 2)]},
             refit='precision',
             scoring={'precision': make_scorer(precision_score),
                      'recall': make_scorer(recall_score)})

In [141]:
precision_recall_fscore_support(y, mlp_grid.predict(X))

(array([0.90031153, 0.97889205]),
 array([0.5411985, 0.9971915]),
 array([0.67602339, 0.98795705]),
 array([ 1068, 22788]))

In [107]:
confusion_matrix(y, mlp_grid.predict(X))

array([[  475,   593],
       [  101, 22687]])

## SVM Classifier

In [91]:
from sklearn.svm import SVC

svc = SVC(C=0.025)


svc_grid = GridSearchCV(
    estimator=svc, 
    param_grid={'kernel':['linear', 'poly', 'rbf', 'sigmoid']},
    scoring={'recall':make_scorer(recall_score), 'f1':make_scorer(f1_score)},
    refit='f1',
    cv=5
)



In [92]:
svc_grid.fit(X,y)

GridSearchCV(cv=5, estimator=SVC(C=0.025),
             param_grid={'kernel': ['linear', 'poly', 'rbf', 'sigmoid']},
             refit='f1',
             scoring={'f1': make_scorer(f1_score),
                      'recall': make_scorer(recall_score)})

In [93]:
pd.DataFrame(svc_grid.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_kernel,params,split0_test_recall,split1_test_recall,split2_test_recall,split3_test_recall,...,std_test_recall,rank_test_recall,split0_test_f1,split1_test_f1,split2_test_f1,split3_test_f1,split4_test_f1,mean_test_f1,std_test_f1,rank_test_f1
0,5.295788,0.635463,0.203046,0.006722,linear,{'kernel': 'linear'},1.0,1.0,1.0,1.0,...,0.0,1,0.977063,0.977058,0.977058,0.977168,0.977168,0.977103,5.3e-05,1
1,1.103775,0.050303,0.179461,0.006863,poly,{'kernel': 'poly'},1.0,1.0,1.0,1.0,...,0.0,1,0.977063,0.977058,0.977058,0.977168,0.977168,0.977103,5.3e-05,1
2,1.583398,0.056372,0.656272,0.01213,rbf,{'kernel': 'rbf'},1.0,1.0,1.0,1.0,...,0.0,1,0.977063,0.977058,0.977058,0.977168,0.977168,0.977103,5.3e-05,1
3,1.475554,0.023757,0.258448,0.005557,sigmoid,{'kernel': 'sigmoid'},1.0,1.0,1.0,1.0,...,0.0,1,0.977063,0.977058,0.977058,0.977168,0.977168,0.977103,5.3e-05,1


In [127]:
confusion_matrix(y, svc_grid.best_estimator_.predict(X))

array([[    0,  1068],
       [    0, 22788]])

In [128]:
svc_grid.best_params_

{'kernel': 'linear'}