# Intrusion detection learning (KDD99)
http://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html

In [3]:
%pylab inline
import pandas as pd
import logging

Populating the interactive namespace from numpy and matplotlib


In [4]:
with open('../data/kddcup.names', 'r') as fd:
    line = fd.readline().strip()
    attack_types = line.split(',')
    feature_names = list()
    symbolic_feat = list()
    continuous_feat = list()
    for line in fd.readlines():
        feat_name, feat_type = line.strip().split(': ', 2)
        feature_names.append(feat_name)
        if feat_type == 'symbolic.':
            symbolic_feat.append(feat_name)
        elif feat_type == 'continuous.':
            continuous_feat.append(feat_name)
        else:
            logging.warning("Unsupported feature type: %s", feat_type)

In [5]:
ds = pd.read_csv('../data/kddcup.data_10_percent.gz', names=feature_names + ['class'])

In [6]:
ds.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class
0,0,tcp,http,SF,181,5450,0,0,0,0,...,9,1.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0,normal.
1,0,tcp,http,SF,239,486,0,0,0,0,...,19,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,normal.
2,0,tcp,http,SF,235,1337,0,0,0,0,...,29,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
3,0,tcp,http,SF,219,1337,0,0,0,0,...,39,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
4,0,tcp,http,SF,217,2032,0,0,0,0,...,49,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,normal.


In [7]:
ds['class'].value_counts()

smurf.              280790
neptune.            107201
normal.              97278
back.                 2203
satan.                1589
ipsweep.              1247
portsweep.            1040
warezclient.          1020
teardrop.              979
pod.                   264
nmap.                  231
guess_passwd.           53
buffer_overflow.        30
land.                   21
warezmaster.            20
imap.                   12
rootkit.                10
loadmodule.              9
ftp_write.               8
multihop.                7
phf.                     4
perl.                    3
spy.                     2
Name: class, dtype: int64

In [8]:
ds['target'] = ds['class'].map(lambda x: 1 if x == 'normal.' else 0)

In [9]:
X = ds[continuous_feat]
y = ds['target']

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [12]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression().fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:
logreg.score(X_test, y_test)

0.9838267294165275

In [14]:
from sklearn.metrics import classification_report, f1_score

y_pred = logreg.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99     79548
           1       0.93      0.99      0.96     19257

    accuracy                           0.98     98805
   macro avg       0.96      0.99      0.97     98805
weighted avg       0.98      0.98      0.98     98805



In [15]:
f1_score(y_test, y_pred) 

0.9597014172592928

## Сохраним коэффициенты LogReg

In [26]:
coef = np.append(logreg.intercept_, logreg.coef_)
np.savetxt("logreg_coef.txt", coef)

In [27]:
!head logreg_coef.txt

3.503387611756539257e-04
4.081782709356486256e-04
-5.079810653116837148e-05
1.907934983748948367e-08
-1.955590469010541014e-04
-2.877615012702586427e-07
-1.555647162978123898e-03
-5.521095825288340983e-06
-2.808624328179518841e-04
-3.929177498590480974e-06


## Генерируем тестовые данные для LogReg

In [25]:
X_sample = X_test.sample(10).values
y_pred_proba_logreg = logreg.predict_proba(X_sample)[:, 1].reshape(-1, 1)

np.savetxt("test_data_logreg.txt", np.hstack((y_pred_proba_logreg, X_sample)), fmt="%g")

# Обучаем модель на основе CatBoost

In [19]:
from catboost import CatBoostClassifier

catbst = CatBoostClassifier(iterations=10).fit(X_train, y_train, eval_set=(X_test, y_test))

Learning rate set to 0.5
0:	learn: 0.0367262	test: 0.0369478	best: 0.0369478 (0)	total: 429ms	remaining: 3.86s
1:	learn: 0.0177141	test: 0.0182160	best: 0.0182160 (1)	total: 539ms	remaining: 2.15s
2:	learn: 0.0133632	test: 0.0139314	best: 0.0139314 (2)	total: 658ms	remaining: 1.54s
3:	learn: 0.0078670	test: 0.0080113	best: 0.0080113 (3)	total: 798ms	remaining: 1.2s
4:	learn: 0.0069044	test: 0.0070291	best: 0.0070291 (4)	total: 914ms	remaining: 914ms
5:	learn: 0.0043795	test: 0.0042546	best: 0.0042546 (5)	total: 1.1s	remaining: 732ms
6:	learn: 0.0041072	test: 0.0040024	best: 0.0040024 (6)	total: 1.25s	remaining: 536ms
7:	learn: 0.0037091	test: 0.0035889	best: 0.0035889 (7)	total: 1.36s	remaining: 340ms
8:	learn: 0.0032038	test: 0.0031155	best: 0.0031155 (8)	total: 1.49s	remaining: 166ms
9:	learn: 0.0025375	test: 0.0024277	best: 0.0024277 (9)	total: 1.65s	remaining: 0us

bestTest = 0.002427740038
bestIteration = 9



In [21]:
y_pred_cb = catbst.predict(X_test)
f1_score(y_test, y_pred_cb)

0.9986508924865088

## Сохраняем модель catboost

In [28]:
catbst.save_model('cbmodel.cbm', format="cpp")

## Генерируем тестовые данные для CatBoost

In [29]:
X_sample = X_test.sample(10).values
y_pred_proba_logreg = catbst.predict_proba(X_sample)[:, 1].reshape(-1, 1)

np.savetxt("test_data_catboost.txt", np.hstack((y_pred_proba_logreg, X_sample)), fmt="%g")

In [31]:
y_pred_proba_logreg

array([[1.85612013e-04],
       [1.85612013e-04],
       [1.85612013e-04],
       [9.97083918e-01],
       [9.99662847e-01],
       [1.85612013e-04],
       [9.99248204e-01],
       [2.41811690e-05],
       [9.99551188e-01],
       [9.99551188e-01]])