In [1]:
import pandas as pd
from datetime import datetime
import numpy as np
import itertools
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import pickle

from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM

from sklearn.feature_extraction.text import HashingVectorizer

from sklearn.metrics import confusion_matrix

import logging
logging.basicConfig(
    format='%(asctime)s %(levelname)-8s %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.WARNING,  # set 3rd party logs to warning (for hiding it)
)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

plt.style.use('ggplot')

# 1. Define Variables and Functions

## 1.1 Train Model Variables

In [4]:
model_details_django = {
    'isolation_forest': [],
    'local_outilier': [],
    'elliptic_envelope': [],
}

model_details_nginx = {
    'isolation_forest': [],
    'local_outilier': [],
    'elliptic_envelope': [],
}

model_details_hash = {
    'isolation_forest': [],
    'local_outilier': [],
    'elliptic_envelope': [],
}

In [6]:
model_names = [
    'elliptic_envelope',
    'isolation_forest', 
    'local_outilier'
]

model_classes = {
    'isolation_forest': IsolationForest,
    'local_outilier': LocalOutlierFactor,
    'elliptic_envelope': EllipticEnvelope,
}

parameters = {
    'isolation_forest': {
        'contamination': np.linspace(0.01, 0.5, num=10),         
    },
    'local_outilier': {
        'n_neighbors': np.linspace(1, 200, num=5, dtype='int'),
        'novelty': [True]
    },
    'elliptic_envelope': {
        'contamination': np.linspace(0.01, 0.5, num=3),  
    },
}



## 1.2 Define Train Model Function

In [3]:
def calculate_metric(y_true, y_pred):
    confusion_matrix_data = confusion_matrix(y_true, y_pred) # tn, fp, fn, tp
    TN, FP, FN, TP = confusion_matrix_data.ravel()
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    specificity = TN/ (TN + FP)
    accuracy = (TP + TN) / (TP + FP + TN + FN)
    
    if precision + recall == 0:
        F1 = 'devide by zero'
    else:
        F1 = 2 * precision * recall / (precision + recall)
        F1 = f'{F1:.4f}'
        
    result = {
        'confusion_matrix': confusion_matrix_data,
        'TN': TN,
        'FP': FP,
        'FN': FN,
        'TP': TP,
        'precision': precision,
        'recall': recall,
        'specificity': specificity,
        'accuracy': accuracy,
        'F1': F1,        
    }
    
    return result


In [14]:
def get_model_result(model_name, df_train, df_test, select_column):
    logger.info(f"{model_name}") #print(f"{model_name}")                
    model_detail = [] 
    keys = list(parameters[model_name].keys())
    values = parameters[model_name].values()
    all_combinations = list(itertools.product(*values))
    for parameter_value in all_combinations:
        parameter_result = {}
        for i_parameter in range(len(parameter_value)):
            parameter_result[keys[i_parameter]] = parameter_value[i_parameter]

        logger.info(f'           parameter_result {parameter_result}')    
        model =  model_classes[model_name](**parameter_result)   
        model.fit(df_train[select_column].values)
        predict_result = model.predict(df_test[select_column].values)
        test_set_tmp = df_test.copy(deep=True)
        test_set_tmp['predict'] = pd.Series(predict_result).apply(lambda x: 1 if (x == -1) else 0 )        

        y_true = test_set_tmp['abnormal']
        y_pred = test_set_tmp['predict']
        metric_result = calculate_metric(y_true, y_pred)

        result = {
            "parameter": parameter_result,
            "metric_result": metric_result,
            "predict": test_set_tmp,
#             "model": model,    # uncomment this line to save trained model in pickle
        }

        model_detail.append(result)
    return model_detail

## 1.3 Define Prepare Data Function

In [5]:
error_word = ['404', 'not found', 'internal server error', 'error', 'timed out'] 

def count_error_word(df, error_word):
    for item in error_word:
        df[item] = df['log'].str.contains(item).astype(int)
    return df

def set_train_data(df, error_word):
    for item in error_word:
        df[item] = np.random.choice([0,1,2], df.shape[0], p=[0.7, 0.2, 0.1]) # 0
    return df

def group_time(df):
    df = df[error_word].groupby(pd.Grouper(freq='10S', base=0, label='right')).sum()
    df.loc[df.sum(axis=1) > 0, 'abnormal'] = 1
    df['abnormal'] = df['abnormal'].fillna(0)
    df = df.reset_index()
    return df

def print_F1(parameter_results):
    F1_list = []
    for asset_result in parameter_results:
        try:
            F1 = float(asset_result['metric_result']['F1'])
        except Exception as e:
            F1 = 0

        F1_list.append(F1)
    print(f'     F1 max : {max(F1_list)}, F1 of each hyper: {F1_list}')

## 1.4 Define Plot Function

In [16]:
def plot_dataset(df_data):
    plt.rcParams["figure.figsize"] = (15,15)
    fig = plt.figure()
    axs = fig.subplots(2, 1)

    data = df_data['predict']

    predict_normal = data[data['predict'] == 0]
    predict_abnormal = data[data['predict'] == 1]
    axs[0].set_title(f'Predict', ha='left', va='center',y=0.5,x=1.009)
    axs[0].set_ylim([0, 70])
    axs[0].plot(predict_normal['datetime'], predict_normal['error'],'b-',label='$predicted as normal')
    for item in error_word: 
        axs[0].plot(predict_abnormal['datetime'], predict_abnormal[item], 'r.', label=f'$predicted as abnormal')
    axs[0].legend()

    ground_truth_normal = data[data['abnormal'] == 0]
    ground_truth_abnormal = data[data['abnormal'] == 1]
    axs[1].set_title(f'Ground Truth', ha='left', va='center',y=0.5,x=1.009)
    axs[1].set_ylim([0, 70])
    axs[1].plot(ground_truth_normal['datetime'], ground_truth_normal['error'],'b-',label='$ground_truth as normal')
    for item in error_word: 
        axs[1].plot(ground_truth_abnormal['datetime'], ground_truth_abnormal[item], '.', label=f'$ground_truth as {item}')
    axs[1].legend()

    plt.show()

# 2. Bag-of-word

## 2.1 process django backend logs

### 2.1.1 parse logs file

In [2]:
log_data  = open('input/backend.log', 'r')
data_list = []
last_date = datetime.strptime('09/Aug/2022 23:50:16', '%d/%b/%Y %H:%M:%S')
for i, line in enumerate(log_data):
    date_str = line[32:52]
    try:
        datetime_data = datetime.strptime(date_str, '%d/%b/%Y %H:%M:%S')
        last_date = datetime_data
        log = line[54:]
    except Exception as e:
        datetime_data = last_date
        log = line[31:]

    data_list.append({
        'datetime': datetime_data, 
        'log': log,
    })    

In [3]:
df_django = pd.DataFrame(data_list)
df_django['log'] = df_django['log'].str.lower()
df_django['datetime'] = pd.to_datetime(df_django['datetime'])
df_django = df_django.set_index('datetime')
df_django_original = df_django.copy(deep=True)
df_django

Unnamed: 0_level_0,log
datetime,Unnamed: 1_level_1
2022-08-10 00:29:00,"""get /api/v1/profile/30/ http/1.0"" 200 105\n"
2022-08-10 00:29:00,"""patch /api/v1/profile/43/ http/1.0"" 200 105\n"
2022-08-10 00:29:00,"""patch /api/v1/profile/48/ http/1.0"" 200 105\n"
2022-08-10 00:29:00,"""patch /api/v1/profile/36/ http/1.0"" 200 105\n"
2022-08-10 00:29:00,"""patch /api/v1/profile/43/ http/1.0"" 200 105\n"
...,...
2022-08-10 01:29:01,"""get /api/v1/profile/48/ http/1.0"" 200 105\n"
2022-08-10 01:29:01,"""get /api/v1/profile/1/ http/1.0"" 200 100\n"
2022-08-10 01:29:01,"""patch /api/v1/profile/39/ http/1.0"" 200 105\n"
2022-08-10 01:29:01,"""patch /api/v1/profile/11/ http/1.0"" 200 105\n"


### 2.1.2 count and summarize error logs in every 10 seconds

In [6]:
df_django = count_error_word(df_django, error_word)
df_django_group_test = group_time(df_django)
df_django_group_test

df_django_group_train = set_train_data(df_django_group_test.copy(deep=True), error_word)
df_django_group_train['abnormal'] = 0
df_django_group_train = df_django_group_train.reset_index()
df_django_group_train

The new arguments that you should use are 'offset' or 'origin'.

>>> df.resample(freq="3s", base=2)

becomes:

>>> df.resample(freq="3s", offset="2s")

  df = df[error_word].groupby(pd.Grouper(freq='10S', base=0, label='right')).sum()


Unnamed: 0,index,datetime,404,not found,internal server error,error,timed out,abnormal
0,0,2022-08-10 00:29:10,0,0,0,0,0,0
1,1,2022-08-10 00:29:20,1,2,0,1,0,0
2,2,2022-08-10 00:29:30,0,0,0,0,0,0
3,3,2022-08-10 00:29:40,0,2,0,0,0,0
4,4,2022-08-10 00:29:50,0,0,1,0,2,0
...,...,...,...,...,...,...,...,...
356,356,2022-08-10 01:28:30,0,0,0,0,0,0
357,357,2022-08-10 01:28:40,0,1,0,2,0,0
358,358,2022-08-10 01:28:50,0,0,0,0,0,0
359,359,2022-08-10 01:29:00,0,0,0,0,1,0


In [7]:
df_django_group_test[df_django_group_test['abnormal'] == 1]

Unnamed: 0,datetime,404,not found,internal server error,error,timed out,abnormal
121,2022-08-10 00:49:20,50,50,0,0,0,1.0
240,2022-08-10 01:09:10,0,0,6,18,0,1.0
241,2022-08-10 01:09:20,0,0,14,42,0,1.0
242,2022-08-10 01:09:30,0,0,13,39,0,1.0
243,2022-08-10 01:09:40,0,0,6,18,0,1.0
244,2022-08-10 01:09:50,0,0,13,39,0,1.0
245,2022-08-10 01:10:00,0,0,14,42,0,1.0
246,2022-08-10 01:10:10,0,0,12,36,0,1.0


### 2.1.3 Train

#### Train isolation_forest

In [11]:
model_name = 'isolation_forest'
model_details_django[model_name] = get_model_result(model_name, df_django_group_train, df_django_group_test, error_word)

2022-08-18 16:10:58 INFO     isolation_forest
2022-08-18 16:10:58 INFO                parameter_result {'contamination': 0.01}
  precision = TP / (TP + FP)
2022-08-18 16:10:58 INFO                parameter_result {'contamination': 0.06444444444444444}
2022-08-18 16:10:58 INFO                parameter_result {'contamination': 0.11888888888888888}
2022-08-18 16:10:58 INFO                parameter_result {'contamination': 0.17333333333333334}
2022-08-18 16:10:58 INFO                parameter_result {'contamination': 0.22777777777777777}
2022-08-18 16:10:58 INFO                parameter_result {'contamination': 0.2822222222222222}
2022-08-18 16:10:58 INFO                parameter_result {'contamination': 0.33666666666666667}
2022-08-18 16:10:59 INFO                parameter_result {'contamination': 0.3911111111111111}
2022-08-18 16:10:59 INFO                parameter_result {'contamination': 0.44555555555555554}
2022-08-18 16:10:59 INFO                parameter_result {'contamination': 0.5

#### Train local_outilier

In [12]:
model_name = 'local_outilier'
model_details_django[model_name] = get_model_result(model_name, df_django_group_train, df_django_group_test, error_word)

2022-08-18 16:11:00 INFO     local_outilier
2022-08-18 16:11:00 INFO                parameter_result {'n_neighbors': 1, 'novelty': True}
2022-08-18 16:11:00 INFO                parameter_result {'n_neighbors': 50, 'novelty': True}
2022-08-18 16:11:00 INFO                parameter_result {'n_neighbors': 100, 'novelty': True}
2022-08-18 16:11:00 INFO                parameter_result {'n_neighbors': 150, 'novelty': True}
2022-08-18 16:11:00 INFO                parameter_result {'n_neighbors': 200, 'novelty': True}


#### Train Elliptic_envelope

In [13]:
model_name = 'elliptic_envelope'
model_details_django[model_name] = get_model_result(model_name, df_django_group_train, df_django_group_test, error_word)

2022-08-18 16:11:00 INFO     elliptic_envelope
2022-08-18 16:11:00 INFO                parameter_result {'contamination': 0.01}
2022-08-18 16:11:00 INFO                parameter_result {'contamination': 0.255}
2022-08-18 16:11:00 INFO                parameter_result {'contamination': 0.5}


In [14]:
df_django_group_test[df_django_group_test['abnormal'] > 0]

Unnamed: 0,datetime,404,not found,internal server error,error,timed out,abnormal
121,2022-08-10 00:49:20,50,50,0,0,0,1.0
240,2022-08-10 01:09:10,0,0,6,18,0,1.0
241,2022-08-10 01:09:20,0,0,14,42,0,1.0
242,2022-08-10 01:09:30,0,0,13,39,0,1.0
243,2022-08-10 01:09:40,0,0,6,18,0,1.0
244,2022-08-10 01:09:50,0,0,13,39,0,1.0
245,2022-08-10 01:10:00,0,0,14,42,0,1.0
246,2022-08-10 01:10:10,0,0,12,36,0,1.0


### 2.1.4 Max F1 Result

In [15]:
print_F1(model_details_django['isolation_forest'])

     F1 max : nan, F1 of each hyper: [nan, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]


In [17]:
print_F1(model_details_django['local_outilier'])

     F1 max : 1.0, F1 of each hyper: [1.0, 1.0, 1.0, 1.0, 1.0]


In [19]:
print_F1(model_details_django['elliptic_envelope'])

     F1 max : 1.0, F1 of each hyper: [1.0, 1.0, 1.0]


In [20]:
model_details_django['elliptic_envelope']

[{'parameter': {'contamination': 0.01},
  'metric_result': {'confusion_matrix': array([[353,   0],
          [  0,   8]]),
   'TN': 353,
   'FP': 0,
   'FN': 0,
   'TP': 8,
   'precision': 1.0,
   'recall': 1.0,
   'specificity': 1.0,
   'accuracy': 1.0,
   'F1': '1.0000'},
  'predict':                datetime  404  not found  internal server error  error  \
  0   2022-08-10 00:29:10    0          0                      0      0   
  1   2022-08-10 00:29:20    0          0                      0      0   
  2   2022-08-10 00:29:30    0          0                      0      0   
  3   2022-08-10 00:29:40    0          0                      0      0   
  4   2022-08-10 00:29:50    0          0                      0      0   
  ..                  ...  ...        ...                    ...    ...   
  356 2022-08-10 01:28:30    0          0                      0      0   
  357 2022-08-10 01:28:40    0          0                      0      0   
  358 2022-08-10 01:28:50    0         

In [21]:
model_details_django['local_outilier']

[{'parameter': {'n_neighbors': 1, 'novelty': True},
  'metric_result': {'confusion_matrix': array([[353,   0],
          [  0,   8]]),
   'TN': 353,
   'FP': 0,
   'FN': 0,
   'TP': 8,
   'precision': 1.0,
   'recall': 1.0,
   'specificity': 1.0,
   'accuracy': 1.0,
   'F1': '1.0000'},
  'predict':                datetime  404  not found  internal server error  error  \
  0   2022-08-10 00:29:10    0          0                      0      0   
  1   2022-08-10 00:29:20    0          0                      0      0   
  2   2022-08-10 00:29:30    0          0                      0      0   
  3   2022-08-10 00:29:40    0          0                      0      0   
  4   2022-08-10 00:29:50    0          0                      0      0   
  ..                  ...  ...        ...                    ...    ...   
  356 2022-08-10 01:28:30    0          0                      0      0   
  357 2022-08-10 01:28:40    0          0                      0      0   
  358 2022-08-10 01:28:50  

In [1]:
# plot_dataset(model_details_django['local_outilier'][0])  # uncomment to see visualization

## 2.2. process Nginx logs

### 2.2.1  parse logs data

In [8]:
log_data  = open('input/nginx.log', 'r')
data_list = []
last_date = datetime.strptime('09/Aug/2022:23:50:16', '%d/%b/%Y:%H:%M:%S')
for i, line in enumerate(log_data):
    date_str = line[45:65]
    try:
        datetime_data = datetime.strptime(date_str, '%d/%b/%Y:%H:%M:%S')
        last_date = datetime_data
        log = text[72:]
    except Exception as e:
        datetime_data = last_date
        log = line[29:]

    data_list.append({
        'datetime': datetime_data, 
        'log': log,
    })

In [9]:
df_nginx = pd.DataFrame(data_list)
df_nginx['log'] = df_nginx['log'].str.lower()
df_nginx = df_nginx.set_index('datetime')
df_nginx_original = df_nginx.copy(deep=True)
df_nginx

Unnamed: 0_level_0,log
datetime,Unnamed: 1_level_1
2022-08-10 00:29:00,"172.22.0.1 - - [10/aug/2022:00:29:00 +0000] ""p..."
2022-08-10 00:29:00,"172.22.0.1 - - [10/aug/2022:00:29:00 +0000] ""p..."
2022-08-10 00:29:00,"172.22.0.1 - - [10/aug/2022:00:29:00 +0000] ""g..."
2022-08-10 00:29:00,"172.22.0.1 - - [10/aug/2022:00:29:00 +0000] ""g..."
2022-08-10 00:29:00,"172.22.0.1 - - [10/aug/2022:00:29:00 +0000] ""g..."
...,...
2022-08-10 01:29:01,"172.22.0.1 - - [10/aug/2022:01:29:01 +0000] ""g..."
2022-08-10 01:29:01,"172.22.0.1 - - [10/aug/2022:01:29:01 +0000] ""g..."
2022-08-10 01:29:01,"172.22.0.1 - - [10/aug/2022:01:29:01 +0000] ""p..."
2022-08-10 01:29:01,"172.22.0.1 - - [10/aug/2022:01:29:01 +0000] ""p..."


### 2.2.2 count and summarize error logs in every 10 seconds

In [10]:
df_nginx = count_error_word(df_nginx, error_word)
df_nginx_group_test = group_time(df_nginx)
df_nginx_group_test

df_nginx_group_train = set_train_data(df_nginx_group_test.copy(deep=True), error_word)
df_nginx_group_train['abnormal'] = 0
df_nginx_group_train = df_nginx_group_train.reset_index()
df_nginx_group_train

The new arguments that you should use are 'offset' or 'origin'.

>>> df.resample(freq="3s", base=2)

becomes:

>>> df.resample(freq="3s", offset="2s")

  df = df[error_word].groupby(pd.Grouper(freq='10S', base=0, label='right')).sum()


Unnamed: 0,index,datetime,404,not found,internal server error,error,timed out,abnormal
0,0,2022-08-10 00:29:10,1,0,0,0,0,0
1,1,2022-08-10 00:29:20,0,0,1,0,0,0
2,2,2022-08-10 00:29:30,0,0,0,0,0,0
3,3,2022-08-10 00:29:40,0,1,2,2,0,0
4,4,2022-08-10 00:29:50,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
356,356,2022-08-10 01:28:30,0,0,0,0,0,0
357,357,2022-08-10 01:28:40,0,1,2,1,0,0
358,358,2022-08-10 01:28:50,0,0,0,0,1,0
359,359,2022-08-10 01:29:00,0,0,0,0,1,0


In [11]:
df_nginx_group_test[df_nginx_group_test['abnormal'] > 0]

Unnamed: 0,datetime,404,not found,internal server error,error,timed out,abnormal
121,2022-08-10 00:49:20,50,0,0,0,0,1.0
240,2022-08-10 01:09:10,0,0,0,6,0,1.0
241,2022-08-10 01:09:20,0,0,0,14,0,1.0
242,2022-08-10 01:09:30,0,0,0,13,0,1.0
243,2022-08-10 01:09:40,0,0,0,6,0,1.0
244,2022-08-10 01:09:50,0,0,0,13,0,1.0
245,2022-08-10 01:10:00,0,0,0,14,0,1.0
246,2022-08-10 01:10:10,0,0,0,12,0,1.0
309,2022-08-10 01:20:40,0,0,0,10,10,1.0
316,2022-08-10 01:21:50,0,0,0,10,10,1.0


### 2.2.3 Train

#### Train isolation_forest

In [27]:
model_name = 'isolation_forest'
model_details_nginx[model_name] = get_model_result(model_name, df_nginx_group_train, df_nginx_group_test, error_word)

2022-08-18 16:11:05 INFO     isolation_forest
2022-08-18 16:11:05 INFO                parameter_result {'contamination': 0.01}
  precision = TP / (TP + FP)
2022-08-18 16:11:05 INFO                parameter_result {'contamination': 0.06444444444444444}
2022-08-18 16:11:05 INFO                parameter_result {'contamination': 0.11888888888888888}
2022-08-18 16:11:05 INFO                parameter_result {'contamination': 0.17333333333333334}
2022-08-18 16:11:05 INFO                parameter_result {'contamination': 0.22777777777777777}
2022-08-18 16:11:05 INFO                parameter_result {'contamination': 0.2822222222222222}
2022-08-18 16:11:05 INFO                parameter_result {'contamination': 0.33666666666666667}
2022-08-18 16:11:05 INFO                parameter_result {'contamination': 0.3911111111111111}
2022-08-18 16:11:05 INFO                parameter_result {'contamination': 0.44555555555555554}
2022-08-18 16:11:06 INFO                parameter_result {'contamination': 0.5

#### Train local_outilier

In [28]:
model_name = 'local_outilier'
model_details_nginx[model_name] = get_model_result(model_name, df_nginx_group_train, df_nginx_group_test, error_word)

2022-08-18 16:11:06 INFO     local_outilier
2022-08-18 16:11:06 INFO                parameter_result {'n_neighbors': 1, 'novelty': True}
2022-08-18 16:11:06 INFO                parameter_result {'n_neighbors': 50, 'novelty': True}
2022-08-18 16:11:06 INFO                parameter_result {'n_neighbors': 100, 'novelty': True}
2022-08-18 16:11:06 INFO                parameter_result {'n_neighbors': 150, 'novelty': True}
2022-08-18 16:11:06 INFO                parameter_result {'n_neighbors': 200, 'novelty': True}


#### Train elliptic_envelope

In [29]:
model_name = 'elliptic_envelope'
model_details_nginx[model_name] = get_model_result(model_name, df_nginx_group_train, df_nginx_group_test, error_word)

2022-08-18 16:11:06 INFO     elliptic_envelope
2022-08-18 16:11:06 INFO                parameter_result {'contamination': 0.01}
2022-08-18 16:11:06 INFO                parameter_result {'contamination': 0.255}
2022-08-18 16:11:06 INFO                parameter_result {'contamination': 0.5}


### 2.2.4 Find Max F1

In [30]:
print_F1(model_details_nginx['isolation_forest'])

     F1 max : nan, F1 of each hyper: [nan, 0.4286, 0.4286, 0.4286, 0.4286, 0.4286, 0.4286, 0.4286, 0.5333, 1.0]


In [32]:
print_F1(model_details_nginx['local_outilier'])

     F1 max : 1.0, F1 of each hyper: [1.0, 1.0, 1.0, 1.0, 1.0]


In [33]:
print_F1(model_details_nginx['elliptic_envelope'])

     F1 max : 1.0, F1 of each hyper: [1.0, 1.0, 1.0]


In [34]:
model_details_nginx['isolation_forest'][8]

{'parameter': {'contamination': 0.44555555555555554},
 'metric_result': {'confusion_matrix': array([[350,   0],
         [  7,   4]]),
  'TN': 350,
  'FP': 0,
  'FN': 7,
  'TP': 4,
  'precision': 1.0,
  'recall': 0.36363636363636365,
  'specificity': 1.0,
  'accuracy': 0.9806094182825484,
  'F1': '0.5333'},
 'predict':                datetime  404  not found  internal server error  error  \
 0   2022-08-10 00:29:10    0          0                      0      0   
 1   2022-08-10 00:29:20    0          0                      0      0   
 2   2022-08-10 00:29:30    0          0                      0      0   
 3   2022-08-10 00:29:40    0          0                      0      0   
 4   2022-08-10 00:29:50    0          0                      0      0   
 ..                  ...  ...        ...                    ...    ...   
 356 2022-08-10 01:28:30    0          0                      0      0   
 357 2022-08-10 01:28:40    0          0                      0      0   
 358 2022-08-1

## 2.3 Save Result

In [62]:
with open('output/model_details_count_django.pickle', 'wb') as handle:
    pickle.dump(model_details_django, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [63]:
with open('output/model_details_count_nginx.pickle', 'wb') as handle:
    pickle.dump(model_details_nginx, handle, protocol=pickle.HIGHEST_PROTOCOL)

# 3. Hash Vectorizer

In [37]:
model_details_hash = {
    'django': {},
    'nginx': {}
}

## 3.1 Define train hash function

In [38]:
def train_hash(df_data, df_data_group_test):
    df_2 = df_data.reset_index()
    model_detail = {}
    for i in range(7,11):
        print(i)
        vectorizer = HashingVectorizer(n_features=2**i)
        vectorizer = vectorizer.fit(df_data['log'])
        spar_data = vectorizer.transform(df_data['log'])


        df = pd.DataFrame(spar_data.toarray())
        df_result = df_2.join(df)
        df_result = df_result.drop(columns=['log']).set_index('datetime')
        df_result_group = df_result.groupby(pd.Grouper(freq='10S', base=0, label='right')).sum()
        select_column = df_result_group.columns.tolist()

        df_train = df_result_group.reset_index()
        df_train.loc[df_data_group_test['abnormal'] > 0, 'abnormal'] = 1
        df_train['abnormal'] = df_train['abnormal'].fillna(0)

        if i not in model_detail:
            model_detail[i] = {}

        model_name = 'isolation_forest'
        model_detail[i][model_name] = get_model_result(model_name, df_train, df_train, select_column)

        model_name = 'local_outilier'
        model_detail[i][model_name] = get_model_result(model_name, df_train, df_train, select_column)

        model_name = 'elliptic_envelope'
        model_detail[i][model_name] = get_model_result(model_name, df_train, df_train, select_column)

        print( print_F1(model_detail[i]['isolation_forest']) )
        print( print_F1(model_detail[i]['local_outilier']) )
        print( print_F1(model_detail[i]['elliptic_envelope']) ) 
        
    return model_detail

## 3.2 Train Django logs with Hash Vectorizer

In [47]:
model_details_hash['django'] = train_hash(df_django_original, df_django_group_test)

7


The new arguments that you should use are 'offset' or 'origin'.

>>> df.resample(freq="3s", base=2)

becomes:

>>> df.resample(freq="3s", offset="2s")

  df_result_group = df_result.groupby(pd.Grouper(freq='10S', base=0, label='right')).sum()
2022-08-18 16:24:26 INFO     isolation_forest
2022-08-18 16:24:26 INFO                parameter_result {'contamination': 0.01}
2022-08-18 16:24:26 INFO                parameter_result {'contamination': 0.06444444444444444}
2022-08-18 16:24:26 INFO                parameter_result {'contamination': 0.11888888888888888}
2022-08-18 16:24:26 INFO                parameter_result {'contamination': 0.17333333333333334}
2022-08-18 16:24:26 INFO                parameter_result {'contamination': 0.22777777777777777}
2022-08-18 16:24:26 INFO                parameter_result {'contamination': 0.2822222222222222}
2022-08-18 16:24:26 INFO                parameter_result {'contamination': 0.33666666666666667}
2022-08-18 16:24:27 INFO                parameter_resul

     F1 max : 0.6667, F1 of each hyper: [0.6667, 0.5, 0.3137, 0.2254, 0.1778, 0.1455, 0.1231, 0.1074, 0.0947, 0.0851]
None
     F1 max : nan, F1 of each hyper: [nan, 0.6667, 0.2, 0.1311, 0.1356]
None
     F1 max : 0.6667, F1 of each hyper: [0.6667, 0.16, 0.0851]
None
8


The new arguments that you should use are 'offset' or 'origin'.

>>> df.resample(freq="3s", base=2)

becomes:

>>> df.resample(freq="3s", offset="2s")

  df_result_group = df_result.groupby(pd.Grouper(freq='10S', base=0, label='right')).sum()
2022-08-18 16:24:31 INFO     isolation_forest
2022-08-18 16:24:31 INFO                parameter_result {'contamination': 0.01}
2022-08-18 16:24:31 INFO                parameter_result {'contamination': 0.06444444444444444}
2022-08-18 16:24:31 INFO                parameter_result {'contamination': 0.11888888888888888}
2022-08-18 16:24:31 INFO                parameter_result {'contamination': 0.17333333333333334}
2022-08-18 16:24:32 INFO                parameter_result {'contamination': 0.22777777777777777}
2022-08-18 16:24:32 INFO                parameter_result {'contamination': 0.2822222222222222}
2022-08-18 16:24:32 INFO                parameter_result {'contamination': 0.33666666666666667}
2022-08-18 16:24:32 INFO                parameter_resul

     F1 max : 0.6667, F1 of each hyper: [0.6667, 0.5, 0.3137, 0.2254, 0.1778, 0.1455, 0.1231, 0.1074, 0.0947, 0.0851]
None
     F1 max : nan, F1 of each hyper: [nan, 0.6667, 0.2051, 0.1311, 0.1356]
None
     F1 max : 0.6667, F1 of each hyper: [0.6667, 0.16, 0.0847]
None
9


The new arguments that you should use are 'offset' or 'origin'.

>>> df.resample(freq="3s", base=2)

becomes:

>>> df.resample(freq="3s", offset="2s")

  df_result_group = df_result.groupby(pd.Grouper(freq='10S', base=0, label='right')).sum()
2022-08-18 16:24:47 INFO     isolation_forest
2022-08-18 16:24:47 INFO                parameter_result {'contamination': 0.01}
2022-08-18 16:24:47 INFO                parameter_result {'contamination': 0.06444444444444444}
2022-08-18 16:24:47 INFO                parameter_result {'contamination': 0.11888888888888888}
2022-08-18 16:24:48 INFO                parameter_result {'contamination': 0.17333333333333334}
2022-08-18 16:24:48 INFO                parameter_result {'contamination': 0.22777777777777777}
2022-08-18 16:24:48 INFO                parameter_result {'contamination': 0.2822222222222222}
2022-08-18 16:24:48 INFO                parameter_result {'contamination': 0.33666666666666667}
2022-08-18 16:24:49 INFO                parameter_resul

     F1 max : 0.6667, F1 of each hyper: [0.6667, 0.5, 0.3137, 0.2254, 0.1778, 0.1455, 0.1231, 0.1074, 0.0947, 0.0851]
None
     F1 max : nan, F1 of each hyper: [nan, 0.6667, 0.2025, 0.1311, 0.1356]
None
     F1 max : 0.6667, F1 of each hyper: [0.6667, 0.16, 0.0851]
None
10


The new arguments that you should use are 'offset' or 'origin'.

>>> df.resample(freq="3s", base=2)

becomes:

>>> df.resample(freq="3s", offset="2s")

  df_result_group = df_result.groupby(pd.Grouper(freq='10S', base=0, label='right')).sum()
2022-08-18 16:25:28 INFO     isolation_forest
2022-08-18 16:25:28 INFO                parameter_result {'contamination': 0.01}
2022-08-18 16:25:28 INFO                parameter_result {'contamination': 0.06444444444444444}
2022-08-18 16:25:28 INFO                parameter_result {'contamination': 0.11888888888888888}
2022-08-18 16:25:29 INFO                parameter_result {'contamination': 0.17333333333333334}
2022-08-18 16:25:29 INFO                parameter_result {'contamination': 0.22777777777777777}
2022-08-18 16:25:29 INFO                parameter_result {'contamination': 0.2822222222222222}
2022-08-18 16:25:30 INFO                parameter_result {'contamination': 0.33666666666666667}
2022-08-18 16:25:30 INFO                parameter_resul

     F1 max : 0.6667, F1 of each hyper: [0.6667, 0.5, 0.3137, 0.2254, 0.1778, 0.1455, 0.1231, 0.1074, 0.0947, 0.0851]
None
     F1 max : nan, F1 of each hyper: [nan, 0.6667, 0.2025, 0.1311, 0.1356]
None
     F1 max : 0.6667, F1 of each hyper: [0.6667, 0.16, 0.0847]
None


In [54]:
with open('output/model_details_hash_django.pickle', 'wb') as handle:
    pickle.dump(model_details_hash['django'], handle, protocol=pickle.HIGHEST_PROTOCOL)

## 3.3 Train Django logs with Hash Vectorizer

In [40]:
model_details_hash['nginx'] = train_hash(df_nginx_original, df_nginx_group_test)

7


The new arguments that you should use are 'offset' or 'origin'.

>>> df.resample(freq="3s", base=2)

becomes:

>>> df.resample(freq="3s", offset="2s")

  df_result_group = df_result.groupby(pd.Grouper(freq='10S', base=0, label='right')).sum()
2022-08-18 16:11:17 INFO     isolation_forest
2022-08-18 16:11:17 INFO                parameter_result {'contamination': 0.01}
2022-08-18 16:11:17 INFO                parameter_result {'contamination': 0.06444444444444444}
2022-08-18 16:11:17 INFO                parameter_result {'contamination': 0.11888888888888888}
2022-08-18 16:11:17 INFO                parameter_result {'contamination': 0.17333333333333334}
2022-08-18 16:11:17 INFO                parameter_result {'contamination': 0.22777777777777777}
2022-08-18 16:11:17 INFO                parameter_result {'contamination': 0.2822222222222222}
2022-08-18 16:11:17 INFO                parameter_result {'contamination': 0.33666666666666667}
2022-08-18 16:11:17 INFO                parameter_resul

     F1 max : 0.4, F1 of each hyper: [0.4, 0.2286, 0.1481, 0.1622, 0.1935, 0.1593, 0.0902, 0.0921, 0.0698, 0.1152]
None
     F1 max : nan, F1 of each hyper: [nan, 0, 0, 0, 0]
None
     F1 max : 0.1047, F1 of each hyper: [0, 0.0583, 0.1047]
None
8


The new arguments that you should use are 'offset' or 'origin'.

>>> df.resample(freq="3s", base=2)

becomes:

>>> df.resample(freq="3s", offset="2s")

  df_result_group = df_result.groupby(pd.Grouper(freq='10S', base=0, label='right')).sum()
2022-08-18 16:11:27 INFO     isolation_forest
2022-08-18 16:11:27 INFO                parameter_result {'contamination': 0.01}
2022-08-18 16:11:28 INFO                parameter_result {'contamination': 0.06444444444444444}
2022-08-18 16:11:28 INFO                parameter_result {'contamination': 0.11888888888888888}
2022-08-18 16:11:29 INFO                parameter_result {'contamination': 0.17333333333333334}
2022-08-18 16:11:31 INFO                parameter_result {'contamination': 0.22777777777777777}
2022-08-18 16:11:31 INFO                parameter_result {'contamination': 0.2822222222222222}
2022-08-18 16:11:31 INFO                parameter_result {'contamination': 0.33666666666666667}
2022-08-18 16:11:32 INFO                parameter_resul

     F1 max : 0.5333, F1 of each hyper: [0.5333, 0.2857, 0.1852, 0.1622, 0.1505, 0.1593, 0.1353, 0.0526, 0.0814, 0.0838]
None
     F1 max : nan, F1 of each hyper: [nan, 0, 0, 0, 0]
None
     F1 max : 0.2136, F1 of each hyper: [0, 0.2136, 0.1146]
None
9


The new arguments that you should use are 'offset' or 'origin'.

>>> df.resample(freq="3s", base=2)

becomes:

>>> df.resample(freq="3s", offset="2s")

  df_result_group = df_result.groupby(pd.Grouper(freq='10S', base=0, label='right')).sum()
2022-08-18 16:11:50 INFO     isolation_forest
2022-08-18 16:11:50 INFO                parameter_result {'contamination': 0.01}
2022-08-18 16:11:50 INFO                parameter_result {'contamination': 0.06444444444444444}
2022-08-18 16:11:50 INFO                parameter_result {'contamination': 0.11888888888888888}
2022-08-18 16:11:51 INFO                parameter_result {'contamination': 0.17333333333333334}
2022-08-18 16:11:52 INFO                parameter_result {'contamination': 0.22777777777777777}
2022-08-18 16:11:52 INFO                parameter_result {'contamination': 0.2822222222222222}
2022-08-18 16:11:53 INFO                parameter_result {'contamination': 0.33666666666666667}
2022-08-18 16:11:53 INFO                parameter_resul

     F1 max : 0.5333, F1 of each hyper: [0.5333, 0.5143, 0.2593, 0.2703, 0.2366, 0.177, 0.1654, 0.1447, 0.1279, 0.1152]
None
     F1 max : nan, F1 of each hyper: [nan, 0, 0, 0, 0]
None
     F1 max : 0.5333, F1 of each hyper: [0.5333, 0.2136, 0.1152]
None
10


The new arguments that you should use are 'offset' or 'origin'.

>>> df.resample(freq="3s", base=2)

becomes:

>>> df.resample(freq="3s", offset="2s")

  df_result_group = df_result.groupby(pd.Grouper(freq='10S', base=0, label='right')).sum()
2022-08-18 16:12:33 INFO     isolation_forest
2022-08-18 16:12:33 INFO                parameter_result {'contamination': 0.01}
2022-08-18 16:12:33 INFO                parameter_result {'contamination': 0.06444444444444444}
2022-08-18 16:12:34 INFO                parameter_result {'contamination': 0.11888888888888888}
2022-08-18 16:12:34 INFO                parameter_result {'contamination': 0.17333333333333334}
2022-08-18 16:12:35 INFO                parameter_result {'contamination': 0.22777777777777777}
2022-08-18 16:12:35 INFO                parameter_result {'contamination': 0.2822222222222222}
2022-08-18 16:12:37 INFO                parameter_result {'contamination': 0.33666666666666667}
2022-08-18 16:12:39 INFO                parameter_resul

     F1 max : 0.5333, F1 of each hyper: [0.5333, 0.4571, 0.2963, 0.2703, 0.2366, 0.1947, 0.1654, 0.1447, 0.1279, 0.1152]
None
     F1 max : nan, F1 of each hyper: [nan, 0, 0, 0, 0]
None
     F1 max : 0.5333, F1 of each hyper: [0.5333, 0.2136, 0.1152]
None


In [43]:
with open('count/model_details_hash_nginx.pickle', 'wb') as handle:
    pickle.dump(model_details_hash['nginx'], handle, protocol=pickle.HIGHEST_PROTOCOL)

## 3.4  Find Max F1

In [21]:
print_F1(model_details_hash_nginx[9]['isolation_forest'])

     F1 max : 0.5333, F1 of each hyper: [0.5333, 0.5143, 0.2593, 0.2703, 0.2366, 0.177, 0.1654, 0.1447, 0.1279, 0.1152]


In [22]:
print_F1(model_details_hash_nginx[9]['elliptic_envelope'])

     F1 max : 0.5333, F1 of each hyper: [0.5333, 0.2136, 0.1152]


## Example Result

In [19]:
model_details_hash_nginx[9]['isolation_forest'][0]

{'parameter': {'contamination': 0.01},
 'metric_result': {'confusion_matrix': array([[350,   0],
         [  7,   4]]),
  'TN': 350,
  'FP': 0,
  'FN': 7,
  'TP': 4,
  'precision': 1.0,
  'recall': 0.36363636363636365,
  'specificity': 1.0,
  'accuracy': 0.9806094182825484,
  'F1': '0.5333'},
 'predict':                datetime           0    1    2    3    4    5    6          7  \
 0   2022-08-10 00:29:10   11.202614  0.0  0.0  0.0  0.0  0.0  0.0   1.800950   
 1   2022-08-10 00:29:20    0.000000  0.0  0.0  0.0  0.0  0.0  0.0   2.683282   
 2   2022-08-10 00:29:30    0.000000  0.0  0.0  0.0  0.0  0.0  0.0   1.747230   
 3   2022-08-10 00:29:40    0.000000  0.0  0.0  0.0  0.0  0.0  0.0  13.816438   
 4   2022-08-10 00:29:50    0.000000  0.0  0.0  0.0  0.0  0.0  0.0   2.683282   
 ..                  ...         ...  ...  ...  ...  ...  ...  ...        ...   
 356 2022-08-10 01:28:30  104.525431  0.0  0.0  0.0  0.0  0.0  0.0   3.785291   
 357 2022-08-10 01:28:40   76.912735  0.0  0.0 

In [20]:
model_details_hash_nginx[9]['elliptic_envelope'][0]

{'parameter': {'contamination': 0.01},
 'metric_result': {'confusion_matrix': array([[350,   0],
         [  7,   4]]),
  'TN': 350,
  'FP': 0,
  'FN': 7,
  'TP': 4,
  'precision': 1.0,
  'recall': 0.36363636363636365,
  'specificity': 1.0,
  'accuracy': 0.9806094182825484,
  'F1': '0.5333'},
 'predict':                datetime           0    1    2    3    4    5    6          7  \
 0   2022-08-10 00:29:10   11.202614  0.0  0.0  0.0  0.0  0.0  0.0   1.800950   
 1   2022-08-10 00:29:20    0.000000  0.0  0.0  0.0  0.0  0.0  0.0   2.683282   
 2   2022-08-10 00:29:30    0.000000  0.0  0.0  0.0  0.0  0.0  0.0   1.747230   
 3   2022-08-10 00:29:40    0.000000  0.0  0.0  0.0  0.0  0.0  0.0  13.816438   
 4   2022-08-10 00:29:50    0.000000  0.0  0.0  0.0  0.0  0.0  0.0   2.683282   
 ..                  ...         ...  ...  ...  ...  ...  ...  ...        ...   
 356 2022-08-10 01:28:30  104.525431  0.0  0.0  0.0  0.0  0.0  0.0   3.785291   
 357 2022-08-10 01:28:40   76.912735  0.0  0.0 