In [5]:
import pandas as pd
import os
import numpy as np
import re
from sklearn.utils import shuffle
from collections import OrderedDict, Counter
from sklearn import tree, svm
from sklearn.metrics import precision_recall_fscore_support
from sklearn.linear_model import LogisticRegression

# Reading Data

In [7]:
# Read the structured HDFS log file
struct_log = pd.read_csv('../data/HDFS_100k.log_structured.csv', engine='c', na_filter=False, memory_map=True)

# Convert it into a dictionary containing a set of events happening in a block (of file system)
# with blockId as the key and set of events as value
data_dict = OrderedDict()
for idx, row in struct_log.iterrows():
    # Finding block ids in every log using regular expression
    blkId_list = re.findall(r'(blk_-?\d+)', row['Content'])
    blkId_set = set(blkId_list)
    for blk_Id in blkId_set:
        if not blk_Id in data_dict:
            data_dict[blk_Id] = []
        # Creating a sequence of events happening over an HDFS block
        data_dict[blk_Id].append(row['EventId'])

# Creating the final DataFrame that we will work on
data_df = pd.DataFrame(list(data_dict.items()), columns=['BlockId', 'EventSequence'])

In [8]:
data_df.head(5)

Unnamed: 0,BlockId,EventSequence
0,blk_-1608999687919862906,"[E5, E22, E5, E5, E11, E11, E9, E9, E11, E9, E..."
1,blk_7503483334202473044,"[E5, E5, E22, E5, E11, E9, E11, E9, E11, E9, E..."
2,blk_-3544583377289625738,"[E5, E22, E5, E5, E11, E9, E11, E9, E11, E9, E..."
3,blk_-9073992586687739851,"[E5, E22, E5, E5, E11, E9, E11, E9, E11, E9, E..."
4,blk_7854771516489510256,"[E5, E5, E22, E5, E11, E9, E11, E9, E11, E9, E..."


In [9]:
# Reading anomaly labels
label_data = pd.read_csv('../data/anomaly_label.csv', engine='c', na_filter=False, memory_map=True)

# Using BlockIds as keys
label_data = label_data.set_index('BlockId')

# Creating a new attribute Label in the data frame (dependent variable)
label_dict = label_data['Label'].to_dict()
data_df['Label'] = data_df['BlockId'].apply(lambda x: 1 if label_dict[x] == 'Anomaly' else 0)

In [10]:
data_df.head(5)

Unnamed: 0,BlockId,EventSequence,Label
0,blk_-1608999687919862906,"[E5, E22, E5, E5, E11, E11, E9, E9, E11, E9, E...",0
1,blk_7503483334202473044,"[E5, E5, E22, E5, E11, E9, E11, E9, E11, E9, E...",0
2,blk_-3544583377289625738,"[E5, E22, E5, E5, E11, E9, E11, E9, E11, E9, E...",1
3,blk_-9073992586687739851,"[E5, E22, E5, E5, E11, E9, E11, E9, E11, E9, E...",0
4,blk_7854771516489510256,"[E5, E5, E22, E5, E11, E9, E11, E9, E11, E9, E...",0


# Splitting into train and test

In [12]:
# Splitting the data into train and test set {train: 70%, test: 30%}
yValues = data_df['Label'].values
xValues = data_df['EventSequence'].values

# Splitting normal and anomalous blocks
# Indices of normal logs
pos_idx = yValues > 0
x_pos = xValues[pos_idx]
y_pos = yValues[pos_idx]
# DataFrames with events of block with anomaly
x_neg = xValues[~pos_idx]
y_neg = yValues[~pos_idx]
train_pos = int(0.7 * x_pos.shape[0])
train_neg = int(0.7 * x_neg.shape[0])

# Splitting into train and test set
x_train = np.hstack([x_pos[0:train_pos], x_neg[0:train_neg]])
y_train = np.hstack([y_pos[0:train_pos], y_neg[0:train_neg]])
x_test = np.hstack([x_pos[train_pos:], x_neg[train_neg:]])
y_test = np.hstack([y_pos[train_pos:], y_neg[train_neg:]])

# Random shuffle since DataFrame has normal logs and anomalous logs separated
indexes = shuffle(np.arange(x_train.shape[0]))
x_train = x_train[indexes]
y_train = y_train[indexes]

In [13]:
print(x_train, y_train)

[list(['E5', 'E22', 'E5', 'E5', 'E11', 'E11', 'E9', 'E9', 'E11', 'E9', 'E26', 'E26', 'E26'])
 list(['E22', 'E5', 'E5', 'E5', 'E26', 'E26', 'E11', 'E9', 'E11', 'E9', 'E26', 'E11', 'E9'])
 list(['E22', 'E5', 'E5', 'E5', 'E11', 'E9', 'E11', 'E9', 'E26', 'E26', 'E26', 'E11', 'E9'])
 ...
 list(['E5', 'E22', 'E5', 'E5', 'E11', 'E9', 'E11', 'E9', 'E11', 'E9', 'E26', 'E26', 'E26', 'E2', 'E2'])
 list(['E22', 'E5', 'E5', 'E5', 'E26', 'E26', 'E26', 'E11', 'E9', 'E11', 'E9', 'E11', 'E9', 'E2'])
 list(['E22', 'E5', 'E5', 'E5', 'E26', 'E26', 'E26', 'E11', 'E9', 'E11', 'E9', 'E11', 'E9'])] [0 0 0 ... 0 0 0]


# Preprocessing and Feature Extraction

In [14]:
# Transforming x_train into Tf-Idf vectors
X_counts = []
# For every block, count each event
for i in range(x_train.shape[0]):
    event_counts = Counter(x_train[i])
    X_counts.append(event_counts)

# Create a DataFrame with event count dictionary
X_df = pd.DataFrame(X_counts)
# Fill events that did not occur on the block with 0
X_df = X_df.fillna(0)
events = X_df.columns
X = X_df.values

# Vectorizing using TF-IDF vectors
num_instance, num_event = X.shape
df_vec = np.sum(X > 0, axis=0)
idf_vec = np.log(num_instance / (df_vec + 1e-8))
x_train = X * np.tile(idf_vec, (num_instance, 1)) 

print('Train data shape: {}-by-{}\n'.format(x_train.shape[0], x_train.shape[1]))

Train data shape: 5557-by-16



In [15]:
# Similar steps followed for Testing set
X_counts = []
for i in range(x_test.shape[0]):
    event_counts = Counter(x_test[i])
    X_counts.append(event_counts)
X_df = pd.DataFrame(X_counts)
X_df = X_df.fillna(0)

empty_events = set(events) - set(X_df.columns)
for event in empty_events:
    X_df[event] = [0] * len(X_df)
X = X_df[events].values

num_instance, num_event = X.shape
x_test = X * np.tile(idf_vec, (num_instance, 1)) 


print('Test data shape: {}-by-{}\n'.format(x_test.shape[0], x_test.shape[1]))

Test data shape: 2383-by-16



In [16]:
print(x_test)

[[-7.19824200e-12 -1.79956050e-12  4.73393336e-02 ...  7.01337576e+00
   7.23651931e+00  0.00000000e+00]
 [-7.19824200e-12 -1.79956050e-12  4.73393336e-02 ...  7.01337576e+00
   7.23651931e+00  0.00000000e+00]
 [-7.19824200e-12 -1.79956050e-12  4.73393336e-02 ...  7.01337576e+00
   7.23651931e+00  0.00000000e+00]
 ...
 [-5.39868150e-12 -1.79956050e-12  4.73393336e-02 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [-5.39868150e-12 -1.79956050e-12  4.73393336e-02 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [-5.39868150e-12 -1.79956050e-12  4.73393336e-02 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]]


# Decision Tree Classification

In [17]:
classifier = tree.DecisionTreeClassifier()
classifier.fit(x_train, y_train)
y_pred = classifier.predict(X)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')
print('Precision: {:.3f}, recall: {:.3f}, F1-measure: {:.3f}\n'.format(precision, recall, f1))

Precision: 0.972, recall: 0.372, F1-measure: 0.538



# Logistic Regression

In [18]:
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')
print('Precision: {:.3f}, recall: {:.3f}, F1-measure: {:.3f}\n'.format(precision, recall, f1))

Precision: 0.953, recall: 0.436, F1-measure: 0.599



# SVM


In [19]:
classifier = svm.LinearSVC()
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')
print('Precision: {:.3f}, recall: {:.3f}, F1-measure: {:.3f}\n'.format(precision, recall, f1))

Precision: 0.953, recall: 0.436, F1-measure: 0.599



