All the datasets were downloaded from [LogHub's GitHub page](https://github.com/logpai/loghub).

In [None]:
import pandas as pd
import re
import os
import random

In [None]:
def log_to_dataframe(log_file, regex, headers):
    """Parse a log file into a DataFrame with one row per matching line.

    Args:
        log_file: Path of the log file to read.
        regex: Compiled pattern exposing one named group per entry of
            ``headers``.
        headers: Names of the groups to extract; they become the DataFrame
            columns, in this order.

    Returns:
        A DataFrame with columns ``headers``. Lines that ``regex`` does not
        match are skipped.

    Raises:
        IndexError: If a name in ``headers`` is not a group of ``regex``.
    """
    log_messages = []
    # errors='ignore' drops undecodable bytes instead of aborting the read.
    with open(log_file, 'r', errors='ignore') as fin:
        # Iterate the file object directly so the whole log is never held in
        # memory as a list of lines.
        for line in fin:
            match = regex.search(line.strip())
            if match is None:
                # Line does not follow the expected format: skip it.
                continue
            log_messages.append([match.group(header) for header in headers])
    return pd.DataFrame(log_messages, columns=headers)

In [None]:
def generate_logformat_regex(logformat):
    """Build the column names and the line-matching regex for a log format.

    ``logformat`` is a template such as ``'<Date> <Time> <Level>: <Payload>'``.
    Each ``<Name>`` token becomes a lazy named group ``(?P<Name>.*?)``; the
    text between tokens is inserted into the pattern as-is (it is not
    escaped), except that every run of spaces is replaced by ``\\s+``.

    Args:
        logformat: Format template containing ``<Name>`` tokens.

    Returns:
        A tuple ``(headers, regex)`` where ``headers`` lists the token names
        in order of appearance and ``regex`` is the compiled pattern anchored
        with ``^`` and ``$``.
    """
    headers = []
    regex = ''
    # The capturing group makes re.split keep the <Name> tokens, so even
    # positions hold the text between tokens and odd positions hold tokens.
    for k, part in enumerate(re.split(r'(<[^<>]+>)', logformat)):
        if k % 2 == 0:
            # Raw replacement template: r'\\s+' emits the two characters
            # backslash + 's' followed by '+', i.e. the regex \s+.
            regex += re.sub(' +', r'\\s+', part)
        else:
            header = part.strip('<').strip('>')
            regex += f'(?P<{header}>.*?)'
            headers.append(header)
    return headers, re.compile('^' + regex + '$')

In [None]:
def load_data(path, logName, log_format):
    """Parse the log file ``logName`` found in ``path`` into a DataFrame.

    The column names and the line regex are both derived from ``log_format``
    and then applied to the file at ``os.path.join(path, logName)``.
    """
    columns, line_pattern = generate_logformat_regex(log_format)
    log_file = os.path.join(path, logName)
    return log_to_dataframe(log_file, line_pattern, columns)

In [None]:
def sample(df: pd.DataFrame, k: int) -> pd.DataFrame:
    """Return ``k`` consecutive rows of ``df`` starting at a random position.

    Args:
        df: Frame to draw the contiguous window from.
        k: Number of consecutive rows to keep. When ``k`` exceeds the number
            of rows, the whole frame is returned.

    Returns:
        A new DataFrame holding the selected rows, with the index reset to
        ``0..len-1``.

    Raises:
        ValueError: If ``k`` is negative.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")
    n_rows = df.shape[0]
    # Clamp the window to the frame length; otherwise randint would be asked
    # for a value in an empty range and raise.
    k = min(k, n_rows)
    # randint is inclusive on both ends, so the last valid start is n_rows - k.
    start = random.randint(0, n_rows - k)
    return df.iloc[start:start + k, :].reset_index(drop=True)

In [None]:
# Path of the input *.log files and of the output *.csv files. It is combined
# with file names through os.path.join, so the empty string resolves them
# relative to the current working directory.
data_path = ''

# BGL

In [None]:
# Field layout of a raw BGL line; each <Name> token becomes a DataFrame column.
bgl_format = '<Label> <Id> <Date> <Code1> <timestamp> <Code2> <Component1> <Component2> <Level> <Payload>'
bgl_df = load_data(path=data_path, logName='BGL.log', log_format=bgl_format)

# Keep one randomly positioned run of 2000 consecutive rows.
bgl_df = sample(df=bgl_df, k=2000)
bgl_df.head()

In [None]:
# Transform timestamps
# Convert the timestamp strings into datetimes using the explicit format below.
bgl_ts_format = "%Y-%m-%d-%H.%M.%S.%f"
bgl_df['timestamp'] = pd.to_datetime(arg=bgl_df['timestamp'], format=bgl_ts_format)

# Binarize the labels: '-' becomes '0', any other value becomes '1'.
bgl_df['Label'] = bgl_df['Label'].apply(lambda raw: '1' if raw != '-' else '0')
bgl_df.head()

In [None]:
# Number of rows per label, shown with readable class names.
(
    bgl_df
    .groupby(by=['Label'])['Label']
    .count()
    .rename({'1': 'Anomaly', '0': 'Normal'})
)

In [None]:
# Write the sampled, labelled BGL rows to data_path, without the index column.
bgl_df.to_csv(
    path_or_buf=os.path.join(data_path, 'BGL_2k.csv'),
    index=False,
)

# HDFS

In [None]:
# Field layout of a raw HDFS line; each <Name> token becomes a DataFrame column.
hdfs_format = '<Date> <Time> <Pid> <Level> <Component>: <Payload>'
hdfs_df = load_data(path=data_path, logName='HDFS.log', log_format=hdfs_format)

# Keep one randomly positioned run of 2000 consecutive rows.
hdfs_df = sample(df=hdfs_df, k=2000)
hdfs_df.head()

In [None]:
# Date and time are parsed as separate fields, so join them into one string
# before converting it to a datetime.
hdfs_df['timestamp'] = hdfs_df['Date'] + '-' + hdfs_df['Time']
hdfs_ts_format = '%y%m%d-%H%M%S'
hdfs_df['timestamp'] = pd.to_datetime(hdfs_df['timestamp'], format=hdfs_ts_format)

# The log lines carry no label, so labels are looked up per block id in
# anomaly_label.csv (columns 'BlockId' and 'Label').
anomaly_labels = pd.read_csv(os.path.join(data_path, 'anomaly_label.csv'))

# Build the BlockId -> Label lookup once, so labelling a row is a dict access
# instead of a scan over the whole label table. setdefault keeps the first
# label seen for a duplicated block id.
block_labels = {}
for block_id, label in zip(anomaly_labels['BlockId'], anomaly_labels['Label']):
    block_labels.setdefault(block_id, label)


def anomaly_classification(payload):
    """Return '1' if the block referenced by ``payload`` is an anomaly, else '0'.

    Args:
        payload: Message text of one HDFS log line.

    Returns:
        '1' when the block's entry in the label table equals 'Anomaly',
        otherwise '0'.

    Raises:
        ValueError: If ``payload`` does not reference exactly one distinct
            block id.
        KeyError: If the block id has no entry in the label table.
    """
    blk_ids = set(re.findall(r'(blk_-?\d+)', payload))
    if len(blk_ids) != 1:  # This shouldn't happen
        raise ValueError(
            f"Row {payload!r} has {len(blk_ids)} blkIds. Cannot determine if anomaly or not"
        )
    (blk_id,) = blk_ids
    if blk_id not in block_labels:
        raise KeyError(f"{blk_id} is not listed in anomaly_label.csv")
    return '1' if block_labels[blk_id] == 'Anomaly' else '0'


hdfs_df['Label'] = hdfs_df['Payload'].apply(anomaly_classification)
hdfs_df.head()

In [None]:
# Number of rows per label, shown with readable class names.
(
    hdfs_df
    .groupby(by=['Label'])['Label']
    .count()
    .rename({'1': 'Anomaly', '0': 'Normal'})
)

In [None]:
# Write the sampled, labelled HDFS rows to data_path, without the index column.
hdfs_df.to_csv(
    path_or_buf=os.path.join(data_path, 'HDFS_2k.csv'),
    index=False,
)

# Mozilla Thunderbird

In [None]:
# Field layout of a raw Thunderbird line; each <Name> token becomes a column.
tbird_format = '<Label> <Id> <Date> <Admin> <Month> <Day> <Time> <AdminAddr> <Payload>'

# The full Thunderbird log file is huge, so the 2k-line sample published on
# GitHub is loaded instead.
tbird_df = load_data(path=data_path, logName='Thunderbird_2k.log', log_format=tbird_format)

# No random sub-sampling step here: the input file is already the 2k sample.
tbird_df.head()

In [None]:
# Transform timestamps
# Join the separate Date and Time fields, then convert the result to datetimes.
tbird_df['timestamp'] = tbird_df['Date'] + '-' + tbird_df['Time']
tbird_ts_format = '%Y.%m.%d-%H:%M:%S'
tbird_df['timestamp'] = pd.to_datetime(arg=tbird_df['timestamp'], format=tbird_ts_format)

# Binarize the labels: '-' becomes '0', any other value becomes '1'.
tbird_df['Label'] = tbird_df['Label'].apply(lambda raw: '1' if raw != '-' else '0')
tbird_df.head()

In [None]:
# Number of rows per label, shown with readable class names.
(
    tbird_df
    .groupby(by=['Label'])['Label']
    .count()
    .rename({'1': 'Anomaly', '0': 'Normal'})
)

In [None]:
# Write the labelled Thunderbird rows to data_path, without the index column.
tbird_df.to_csv(
    path_or_buf=os.path.join(data_path, 'Thunderbird_2k.csv'),
    index=False,
)