In [61]:
import numpy as np
import pandas as pd

NumExpr defaulting to 8 threads.


### mining log template model

In [60]:
from drain3 import TemplateMiner
from drain3.template_miner_config import TemplateMinerConfig
import json
import logging
import os
import sys
import time
from os.path import dirname

def mineModel(f, cfg):
    """Train a Drain3 template miner on every line of a log.

    Each line is right-stripped and everything up to and including the first
    ``": "`` is discarded; the remainder is what the miner learns from. A line
    that contains no ``": "`` is therefore fed to the miner as an empty string.

    Progress is logged to stdout every ``batch_size`` lines, every result whose
    ``change_type`` is not ``"none"`` is logged, and the final clusters are
    logged in descending size order.

    Parameters
    ----------
    f : iterable of str
        An open text file (or any other iterable of lines). Lines are consumed
        lazily, so the whole log is never held in memory. The caller owns the
        handle and is responsible for closing it.
    cfg : str
        Path of the Drain3 ``.ini`` file handed to ``TemplateMinerConfig.load``.

    Returns
    -------
    TemplateMiner
        The miner after all lines have been added.
    """
    config = TemplateMinerConfig()
    config.load(cfg)
    config.profiling_enabled = True
    template_miner = TemplateMiner(config=config)

    logger = logging.getLogger(__name__)
    logging.basicConfig(stream=sys.stdout, level=logging.INFO, format='%(message)s')

    batch_size = 10000
    line_count = 0
    start_time = time.time()
    batch_start_time = start_time

    # Iterate the source directly instead of f.readlines() so large logs are streamed.
    for line in f:
        line = line.rstrip()
        line = line.partition(": ")[2]
        result = template_miner.add_log_message(line)
        line_count += 1
        if line_count % batch_size == 0:
            time_took = time.time() - batch_start_time
            # A coarse clock can report zero elapsed time; avoid ZeroDivisionError.
            rate = batch_size / time_took if time_took > 0 else float("inf")
            logger.info(f"Processing line: {line_count}, rate {rate:.1f} lines/sec, "
                        f"{len(template_miner.drain.clusters)} clusters so far.")
            batch_start_time = time.time()
        if result["change_type"] != "none":
            result_json = json.dumps(result)
            logger.info(f"Input ({line_count}): " + line)
            logger.info("Result: " + result_json)

    time_took = time.time() - start_time
    # Same guard as above; also covers an empty input.
    rate = line_count / time_took if time_took > 0 else float("inf")
    logger.info(f"--- Done processing file in {time_took:.2f} sec. Total of {line_count} lines, rate {rate:.1f} lines/sec, "
                f"{len(template_miner.drain.clusters)} clusters")

    sorted_clusters = sorted(template_miner.drain.clusters, key=lambda it: it.size, reverse=True)
    for cluster in sorted_clusters:
        logger.info(cluster)

    return template_miner

### Parse log files from Hadoop

#### Move all files to a single directory

In [2]:
import os
import shutil

# Flatten the raw Hadoop log tree into a single directory.
source_path = os.path.abspath(r'/Users/shuming/Downloads/Hadoop')     # source directory
target_path = os.path.abspath(r'/Users/shuming/Downloads/merged_hadoop/')    # target directory

# Create the target directory if it doesn't exist (no error when it already does).
os.makedirs(target_path, exist_ok=True)

if os.path.isdir(source_path):
    for root, dirs, files in os.walk(source_path):
        for file in files:
            src_file = os.path.join(root, file)
            # Every file lands directly in target_path, so two files with the
            # same basename in different sub-directories overwrite each other.
            shutil.copy(src_file, target_path)
            print(src_file)
    print('copy complete')
else:
    # Do not claim success when there was nothing to copy from.
    print(f'source directory not found: {source_path}')

/Users/shuming/Downloads/Hadoop/abnormal_label.txt
/Users/shuming/Downloads/Hadoop/application_1445087491445_0005/container_1445087491445_0005_01_000007.log
/Users/shuming/Downloads/Hadoop/application_1445087491445_0005/container_1445087491445_0005_01_000013.log
/Users/shuming/Downloads/Hadoop/application_1445087491445_0005/container_1445087491445_0005_01_000012.log
/Users/shuming/Downloads/Hadoop/application_1445087491445_0005/container_1445087491445_0005_01_000006.log
/Users/shuming/Downloads/Hadoop/application_1445087491445_0005/container_1445087491445_0005_01_000010.log
/Users/shuming/Downloads/Hadoop/application_1445087491445_0005/container_1445087491445_0005_01_000004.log
/Users/shuming/Downloads/Hadoop/application_1445087491445_0005/container_1445087491445_0005_01_000005.log
/Users/shuming/Downloads/Hadoop/application_1445087491445_0005/container_1445087491445_0005_01_000011.log
/Users/shuming/Downloads/Hadoop/application_1445087491445_0005/container_1445087491445_0005_01_000015

#### Merge all files into one log file

In [4]:
#os module consists of many functions about file and directory processing
import os  
# Concatenate every file in the directory into merged.log, one blank line after each file.
meragefiledir = '/Users/shuming/Downloads/merged_hadoop/'
merged_name = 'merged.log'
# Listed before the output file is opened; the name check below still skips a
# merged.log left over from a previous run.
filenames = os.listdir(meragefiledir)

# Context managers close the output and every input file, even if a read fails.
with open(os.path.join(meragefiledir, merged_name), 'w') as merged_file:
    for filename in filenames:
        if filename == merged_name:
            continue
        filepath = os.path.join(meragefiledir, filename)
        with open(filepath) as part:
            for line in part:
                merged_file.write(line)
        merged_file.write('\n')

#### Train Drain3 model with existing logs

In [18]:
# Train on the merged Hadoop log; the context manager closes the file once training is done.
with open("/Users/shuming/Downloads/merged_hadoop/merged.log") as merged_log:
    template_miner = mineModel(f=merged_log, cfg='../data/drain3.ini')

Input (1): loaded properties from hadoop-metrics2.properties
Result: {"change_type": "cluster_created", "cluster_id": 1, "cluster_size": 1, "template_mined": "loaded properties from hadoop-metrics2.properties", "cluster_count": 1}
Input (2): Scheduled snapshot period at 10 second(s).
Result: {"change_type": "cluster_created", "cluster_id": 2, "cluster_size": 1, "template_mined": "Scheduled snapshot period at <:NUM:> second(s).", "cluster_count": 2}
Input (3): MapTask metrics system started
Result: {"change_type": "cluster_created", "cluster_id": 3, "cluster_size": 1, "template_mined": "MapTask metrics system started", "cluster_count": 3}
Input (4): Executing with tokens:
Result: {"change_type": "cluster_created", "cluster_id": 4, "cluster_size": 1, "template_mined": "Executing with tokens:", "cluster_count": 4}
Input (5): Kind: mapreduce.job, Service: job_1445076437777_0005, Ident: (org.apache.hadoop.mapreduce.security.token.JobTokenIdentifier@666adef3)
Result: {"change_type": "cluster

True

#### Get templates

In [34]:
# IDs that the labelling cell treats as normal (label 0); it compares them
# against characters [10:28] of each log file name.
# Every entry needs its own trailing comma: adjacent string literals without a
# comma are silently concatenated into one longer string.
normal_log = [
    '1445087491445_0005',
    '1445087491445_0007',
    '1445175094696_0005',
    '1445062781478_0011',
    '1445062781478_0016',
    '1445062781478_0019',
    '1445076437777_0002',
    '1445076437777_0005',
    '1445144423722_0021',
    '1445144423722_0024',
    '1445182159119_0012',
]

In [37]:
# Parallel lists, one entry per container log file:
#   label      - 0 if the file's application ID is in normal_log, else 1
#   events     - the file's sequence of event IDs ('E<cluster_id>')
#   identifier - the file name
label = []
events = []
identifier = []

# open files one by one
meragefiledir = '/Users/shuming/Downloads/merged_hadoop/'
filenames = os.listdir(meragefiledir)

for filename in filenames:
    # The [10:28] slice below only yields an application ID for names that
    # start with 'container_'; this also skips merged.log and abnormal_label.txt.
    if not filename.startswith('container_'):
        continue

    sequence = []
    filepath = os.path.join(meragefiledir, filename)
    with open(filepath) as log_file:
        for line in log_file:
            # Apply the same preprocessing mineModel used for training:
            # strip the line and keep only the text after the first ": ".
            message = line.rstrip().partition(": ")[2]
            match = template_miner.match(message)
            if match is None:
                # No known template for this line; leave it out of the sequence.
                continue
            # cluster_id is an int, so it must be converted before concatenation.
            sequence.append('E' + str(match.cluster_id))

    label.append(0 if filename[10:28] in normal_log else 1)
    events.append(sequence)
    identifier.append(filename)



Read from history task task <:NUM:> <:NUM:> m <:NUM:>
47
1445076437777_0005
1445087491445_0005
1445094324383_0003
1445087491445_0005
1445076437777_0005
1445087491445_0007
1445094324383_0002
1445087491445_0004
1445087491445_0004
1445087491445_0010
1445094324383_0001
1445087491445_0004
1445094324383_0001
1445182159119_0011
1445062781478_0019
1445087491445_0006
1445182159119_0011
1445087491445_0001
1445144423722_0022
1445182159119_0016
1445076437777_0001
1445076437777_0001
1445144423722_0022
1445087491445_0001
1445182159119_0002
1445182159119_0002
1445182159119_0014
1445144423722_0020
1445094324383_0005
1445144423722_0023
1445144423722_0023
1445094324383_0005
1445087491445_0003
1445087491445_0002
1445076437777_0002
1445182159119_0015
1445182159119_0015
1445094324383_0004
1445087491445_0002
1445062781478_0020
1445182159119_0001
1445062781478_0020
1445062781478_0020
1445076437777_0002
1445144423722_0021
1445182159119_0015
1445087491445_0002
1445094324383_0004
1445087491445_0002
144518215911

### Parse Log files from HDFS
#### Train model

In [58]:
# Train on the HDFS log; the context manager closes the file once training is done.
with open("../DATA/HDFS.log") as hdfs_log:
    mine_template = mineModel(f=hdfs_log, cfg='../data/drain3_hdfs.ini')

Starting Drain3 template miner
Input (1): Receiving block blk_-1608999687919862906 src: /10.250.19.102:54106 dest: /10.250.19.102:50010
Result: {"change_type": "cluster_created", "cluster_id": 1, "cluster_size": 1, "template_mined": "Receiving block <:BLOCKID:> src: /10.250.19.102:54106 dest: /10.250.19.102:50010", "cluster_count": 1}
Input (2): BLOCK* NameSystem.allocateBlock: /mnt/hadoop/mapred/system/job_200811092030_0001/job.jar. blk_-1608999687919862906
Result: {"change_type": "cluster_created", "cluster_id": 2, "cluster_size": 1, "template_mined": "BLOCK* NameSystem.allocateBlock: /mnt/hadoop/mapred/system/job 200811092030 0001/job.jar. <:BLOCKID:>", "cluster_count": 2}
Input (3): Receiving block blk_-1608999687919862906 src: /10.250.10.6:40524 dest: /10.250.10.6:50010
Result: {"change_type": "cluster_template_changed", "cluster_id": 1, "cluster_size": 2, "template_mined": "Receiving block <:BLOCKID:> src: <:*:> dest: <:*:>", "cluster_count": 2}
Input (5): PacketResponder 1 for b

#### Get Templates

In [62]:
# Parallel lists with exactly one entry per log line, so they always have equal
# length when combined into the DataFrame below.
event = []
templates = []
identifier = []

with open("../DATA/HDFS.log") as hdfs_log:
    for line in hdfs_log:
        # Same preprocessing as mineModel: strip, then keep the text after the first ": ".
        line = line.rstrip().partition(': ')[2]
        match = mine_template.match(line)
        if match is None:
            # Keep the row so the DataFrame index still lines up with the log lines.
            event.append(None)
            templates.append(None)
            identifier.append(None)
            continue

        template = match.get_template()
        event.append('E' + str(match.cluster_id))
        templates.append(template)

        # Extract with the miner that produced the match (mine_template), not an
        # unrelated one; `or []` guards against extraction returning None.
        ps = mine_template.extract_parameters(template, line, False) or []
        # First BLOCKID value on the line, or None when the line has none.
        identifier.append(next((p.value for p in ps if p.mask_name == 'BLOCKID'), None))

df = pd.DataFrame({'event': event, 'template': templates, 'identifier': identifier})
df.to_csv('../data/hdfs_templates.csv')
   

In [39]:
# Sample log line used to try out matching and parameter extraction.
log_line = "Receiving block blk_-1608999687919862906 src: /10.250.19.102:54106 dest: /10.250.19.102:50010"

# Build a fresh miner from the ini file; profiling is switched on after loading.
# NOTE: this rebinds `template_miner`, the name that held the Hadoop-trained model.
config = TemplateMinerConfig()
config.load("../data/drain3.ini")
config.profiling_enabled = True
template_miner = TemplateMiner(config=config)

# Add the sample line to the miner, then look the same line up again.
result = template_miner.add_log_message(log_line)
match = template_miner.match(log_line)

Starting Drain3 template miner


In [47]:
print(match)
# get_parameter_list() is deprecated in Drain3; this is the equivalent built on
# extract_parameters() with non-exact matching, yielding the plain values.
[p.value for p in (template_miner.extract_parameters(match.get_template(), log_line, False) or [])]

ID=1     : size=1         : Receiving block <:BLOCKID:> src: /<:IP:>:<:NUM:> dest: /<:IP:>:<:NUM:>


['blk -1608999687919862906',
 '10.250.19.102',
 '54106',
 '10.250.19.102',
 '50010']

In [59]:
# Print the value of the first parameter masked as BLOCKID; print nothing if there is none.
ps = template_miner.extract_parameters(match.get_template(), log_line, False)
block_id_values = [param.value for param in ps if param.mask_name == 'BLOCKID']
if block_id_values:
    print(block_id_values[0])

blk -1608999687919862906
