In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
#from sklearn.model_selection import cross_val_score, train_test_split, StratifiedKFold, GridSearchCV
from sklearn.model_selection import cross_val_score, train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_auc_score
from collections import Counter
from sklearn.neural_network import MLPClassifier
from dask_ml.model_selection import HyperbandSearchCV

In [2]:
folder = "D:/Documents/datasets/entrenamiento/"
df1_file = folder + "trafico_normal.csv"
df2_file = folder + "anomalia_01_sesion_persistente_puerto_no_estandar.csv"
df3_file = folder + "anomalia_02_ping_malformado.csv"
df4_file = folder + "anomalia_03_escaneo_de_puertos_tcp.csv"
df1 = pd.read_csv(df1_file, na_values="?")
df2 = pd.read_csv(df2_file, na_values="?")
df3 = pd.read_csv(df3_file, na_values="?")
df4 = pd.read_csv(df4_file, na_values="?")

df1['traffic'] = 1
df2['traffic'] = 0
df3['traffic'] = 0
df4['traffic'] = 0
print(df1.shape, df2.shape, df3.shape, df4.shape)
frames = [df1, df2, df3, df4]

(4265386, 10) (116, 10) (18649, 10) (986, 10)


In [3]:
df = pd.concat(frames)

In [4]:
df = pd.DataFrame(df).fillna(0)

In [5]:
first_packet_dt = pd.to_datetime(df['First Packet'],  format = '%d-%m-%Y:%H:%M:%S',  errors = 'coerce')
last_packet_dt = pd.to_datetime(df['Last Packet'],  format = '%d-%m-%Y:%H:%M:%S',  errors = 'coerce')

In [6]:
time_lapse = last_packet_dt - first_packet_dt

In [7]:
df['Time Lapse'] = time_lapse.dt.total_seconds()

In [8]:
df.drop('First Packet', axis=1, inplace=True)
df.drop('Last Packet', axis=1, inplace=True)

In [9]:
# Get dummies
df = pd.get_dummies(df, columns=["Protocol", "Destination IP"], prefix={"Protocol":"protocol", "Destination IP":"ip"})

In [10]:
df.drop('Source IP', axis=1, inplace=True)

In [11]:
df['traffic'].value_counts()

1    4265386
0      19751
Name: traffic, dtype: int64

In [12]:
df['traffic'] = df['traffic'].astype('uint8')

In [13]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4285137 entries, 0 to 985
Data columns (total 11 columns):
 #   Column                               Dtype  
---  ------                               -----  
 0   Source Port                          float64
 1   Destination Port                     float64
 2   Source Bytes                         float64
 3   Destination Bytes                    float64
 4   traffic                              uint8  
 5   Time Lapse                           float64
 6   protocol_ICMP                        uint8  
 7   protocol_TCP                         uint8  
 8   protocol_UDP                         uint8  
 9   ip_c4ca4238a0b923820dcc509a6f75849b  uint8  
 10  ip_eccbc87e4b5ce2fe28308fd9f2a7baf3  uint8  
dtypes: float64(5), uint8(6)
memory usage: 220.7 MB


In [14]:
df

Unnamed: 0,Source Port,Destination Port,Source Bytes,Destination Bytes,traffic,Time Lapse,protocol_ICMP,protocol_TCP,protocol_UDP,ip_c4ca4238a0b923820dcc509a6f75849b,ip_eccbc87e4b5ce2fe28308fd9f2a7baf3
0,15000.0,22041.0,0.0,3468.0,1,129.0,0,1,0,1,0
1,15000.0,22041.0,0.0,840.0,1,93.0,0,1,0,1,0
2,15000.0,22041.0,312.0,9404.0,1,50.0,0,1,0,1,0
3,2443.0,32433.0,0.0,166.0,1,0.0,0,1,0,0,1
4,0.0,0.0,84.0,0.0,1,0.0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
981,64629.0,995.0,44.0,0.0,0,1.0,0,1,0,0,1
982,49420.0,9968.0,44.0,0.0,0,1.0,0,1,0,0,1
983,58591.0,999.0,44.0,44.0,0,1.0,0,1,0,0,1
984,53508.0,9998.0,44.0,0.0,0,1.0,0,1,0,0,1


In [15]:
X = df.drop('traffic', axis=1)
y = df['traffic']

X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0,test_size=0.25,stratify=y)

In [16]:
y_train.value_counts()

1    3199039
0      14813
Name: traffic, dtype: int64

In [17]:
models = []
#models.append(('LR', LogisticRegression(solver='liblinear', multi_class='ovr')))
#models.append(('LDA', LinearDiscriminantAnalysis()))
#models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
#models.append(('NB', GaussianNB()))
#models.append(('SVM', SVC(gamma='auto')))
#models.append(('NN', MLPClassifier(random_state=1)))

In [18]:
#hypterparameters for LogisticRegression
LR_param_dict = {
    'penalty': ['l1', 'l2'],
    'C': [0.001, 0.01, 0.01, 10, 100, 1000],
    'solver': ['liblinear'],
    'multi_class': ['ovr']
}

#hypterparameters for LinearDiscriminantAnalysis
LDA_param_dict = {
    "solver": ['svf', 'lsqr', 'eigen']
}

#hypterparameters for KNeighborsClassifier
KNN_param_dict = {
    "n_neighbors": [2,4,8,16],
    "p": [1,2]
}

#hypterparameters for DecisionTreeClassifier
CART_param_dict = {
    "criterion": ['gini', 'entropy'],
    "max_depth": range(1,12),
    "min_samples_split": range(2,10),
    "min_samples_leaf": range(1,9)
}

#hypterparameters for GaussianNB
NB_param_dict = {
    'var_smoothing': np.logspace(0,-9, num=100)
}

#hypterparameters for SVM
SVM_param_dict = {
    "C": [50, 10, 1.0, 0.1, 0.01],
    "kernel": ['poly', 'rbf', 'sigmoid'],
    "gamma": ['scale']
}


#hypterparameters for MLPClassifier
NN_param_dict = {
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    
    'learning_rate': ['constant','adaptive']
}

In [19]:
"""
results = []
names = []
for name, model in models:
    kfold = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)
    param_dict = name + '_param_dict'
    grid = GridSearchCV(model,
                       param_grid=eval(param_dict),
                       cv=kfold,
                       verbose=1,
                       n_jobs=-1,
                       scoring='roc_auc')
    grid.fit(X_train, y_train)
    print('%s: %s %f' % (name,grid.best_estimator_, grid.best_score_))
    grid_predictions = grid.predict(X_test)
    roc = roc_auc_score(y_test, grid_predictions)
    print('roc', roc)
"""

"\nresults = []\nnames = []\nfor name, model in models:\n    kfold = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)\n    param_dict = name + '_param_dict'\n    grid = GridSearchCV(model,\n                       param_grid=eval(param_dict),\n                       cv=kfold,\n                       verbose=1,\n                       n_jobs=-1,\n                       scoring='roc_auc')\n    grid.fit(X_train, y_train)\n    print('%s: %s %f' % (name,grid.best_estimator_, grid.best_score_))\n    grid_predictions = grid.predict(X_test)\n    roc = roc_auc_score(y_test, grid_predictions)\n    print('roc', roc)\n"

In [20]:

results = []
names = []
for name, model in models:
    kfold = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)
    param_dict = name + '_param_dict'
    grid = RandomizedSearchCV(model,
                       param_distributions=eval(param_dict),
                       cv=kfold,
                       verbose=1,
                       n_jobs=-1,
                       scoring='roc_auc')
    grid.fit(X_train, y_train)
    print('%s: %s %f' % (name, grid.best_estimator_, grid.best_score_))
    grid_predictions = grid.predict(X_test)
    roc = roc_auc_score(y_test, grid_predictions)
    print('roc', roc)


Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   53.9s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.8min finished


CART: DecisionTreeClassifier(max_depth=7, min_samples_leaf=7, min_samples_split=9) 0.999728
roc 0.9970565551395579


In [None]:
# load test data
import datetime
test_folder = "D:/Documents/datasets/evaluacion/"
normal = []
anormal = []

In [23]:
for i in range(1, 50):
    dft_file = test_folder + str(i) + ".csv"
    dft = pd.read_csv(dft_file, na_values="?")

    dft = pd.DataFrame(dft).fillna(0)
    columns = ['protocol_ICMP', 'protocol_TCP', 'protocol_UDP']

    # Get dummies
    dft = pd.get_dummies(dft, columns=["Protocol", "Destination IP"], prefix={"Protocol":"protocol", "Destination IP":"ip"})

    first_date = dft['First Packet'][0]
    try: 
        datetime.datetime.strptime(first_date, '%d-%m-%Y:%H:%M:%S')
        dataset_date_format = '%d-%m-%Y:%H:%M:%S'
    except ValueError:
        dataset_date_format = '%Y-%m-%d %H:%M:%S'

    dft['First Packet'] = pd.to_datetime(dft['First Packet'],  format=dataset_date_format,  errors = 'coerce')
    dft['Last Packet'] = pd.to_datetime(dft['Last Packet'],  format=dataset_date_format,  errors = 'coerce')

    time_lapse_t = dft['Last Packet'] - dft['First Packet']

    dft['Time Lapse'] = time_lapse_t.dt.total_seconds()

    dft.drop('Source IP', axis=1, inplace=True)

    dft.drop('First Packet', axis=1, inplace=True)
    dft.drop('Last Packet', axis=1, inplace=True)

    missing_cols = set( df.columns ) - set( dft.columns )
    for c in missing_cols:
        dft[c] = 0
    dft = dft[X_train.columns]

    X_test2 = dft
    
    # Make predictions on validation dataset
    #model = KNeighborsClassifier(n_neighbors=16, p=1)
    # best option: model = DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=1)
    model = DecisionTreeClassifier(max_depth=7, min_samples_leaf=7, min_samples_split=9)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test2)

    predic_mean = round(np.mean(predictions), 2)
    trafico = "\033[92mNORMAL\033[0m"
    if predic_mean < 0.5:
        trafico = "\033[93mANORMAL\033[0m"
        anormal.append(i)
    else:
        normal.append(i)

    print(i, predic_mean, f"trafico {trafico}")
    
print('finished')

1 1.0 trafico [92mNORMAL[0m
2 1.0 trafico [92mNORMAL[0m
3 1.0 trafico [92mNORMAL[0m
4 1.0 trafico [92mNORMAL[0m
5 1.0 trafico [92mNORMAL[0m
6 1.0 trafico [92mNORMAL[0m
7 1.0 trafico [92mNORMAL[0m
8 1.0 trafico [92mNORMAL[0m
9 1.0 trafico [92mNORMAL[0m
10 1.0 trafico [92mNORMAL[0m
11 1.0 trafico [92mNORMAL[0m
12 0.0 trafico [93mANORMAL[0m
13 1.0 trafico [92mNORMAL[0m
14 1.0 trafico [92mNORMAL[0m
15 1.0 trafico [92mNORMAL[0m
16 1.0 trafico [92mNORMAL[0m
17 1.0 trafico [92mNORMAL[0m
18 1.0 trafico [92mNORMAL[0m
19 0.0 trafico [93mANORMAL[0m
20 0.0 trafico [93mANORMAL[0m
21 1.0 trafico [92mNORMAL[0m
22 1.0 trafico [92mNORMAL[0m
23 1.0 trafico [92mNORMAL[0m
24 1.0 trafico [92mNORMAL[0m
25 1.0 trafico [92mNORMAL[0m
26 1.0 trafico [92mNORMAL[0m
27 1.0 trafico [92mNORMAL[0m
28 1.0 trafico [92mNORMAL[0m
29 1.0 trafico [92mNORMAL[0m
30 0.37 trafico [93mANORMAL[0m
31 1.0 trafico [92mNORMAL[0m
32 1.0 trafico [92mNORMAL[0m
33 1.0 trafi

In [24]:
anormal

[12, 19, 20, 30, 34, 38, 46]