In [72]:
"""
Description : This file implements the Spell algorithm for log parsing
Author      : LogPAI team
License     : MIT
"""

import re
import os
import sys
import numpy as np
import pandas as pd
import hashlib
from datetime import datetime


class LCSObject:
    """ Class object to store a log group with the same template

    Attributes
    ----------
        logTemplate : token sequence shared by the group
        logIDL : list of the line IDs assigned to the group
    """
    def __init__(self, logTemplate='', logIDL=None):
        self.logTemplate = logTemplate
        # Build a fresh list per instance: a `[]` default would be one single
        # list shared by every LCSObject created without an explicit logIDL.
        # A list passed by the caller is stored as-is (same object).
        self.logIDL = [] if logIDL is None else logIDL


class Node:
    """ One node of the prefix tree, keyed by a single template token

    Attributes
    ----------
        token : the token this node stands for ('' for the root)
        templateNo : counter passed in by the creator; the tree-maintenance
                     methods of LogParser increment and decrement it
        logClust : log cluster attached to this node, None until one is set
        childD : child nodes, keyed by token
        makeList : empty dict; not used by the code in this file
    """
    def __init__(self, token='', templateNo=0):
        self.token = token
        self.templateNo = templateNo
        self.logClust = None
        self.childD = {}
        self.makeList = {}


class LogParser:
    """ LogParser class

    Attributes
    ----------
        path : the path of the input file
        logName : the file name of the input file
        savePath : the path of the output file
        tau : how much percentage of tokens matched to merge a log message
    """
    def __init__(self, indir='./', outdir='./result/', log_format=None, tau=0.5, rex=[], makeParamList=[]):
        #print('3')
        self.path = indir
        self.logName = None
        self.savePath = outdir
        self.tau = tau
        self.logformat = log_format
        self.df_log = None
        self.rex = rex
        self.makeParamList= makeParamList
    
    def LCS(self, seq1, seq2):
        #print('4')
        lengths = [[0 for j in range(len(seq2)+1)] for i in range(len(seq1)+1)]
        # row 0 and column 0 are initialized to 0 already
        for i in range(len(seq1)):
            for j in range(len(seq2)):
                if seq1[i] == seq2[j]:
                    lengths[i+1][j+1] = lengths[i][j] + 1
                else:
                    lengths[i+1][j+1] = max(lengths[i+1][j], lengths[i][j+1])

        # read the substring out from the matrix
        result = []
        lenOfSeq1, lenOfSeq2 = len(seq1), len(seq2)
        while lenOfSeq1!=0 and lenOfSeq2 != 0:
            if lengths[lenOfSeq1][lenOfSeq2] == lengths[lenOfSeq1-1][lenOfSeq2]:
                lenOfSeq1 -= 1
            elif lengths[lenOfSeq1][lenOfSeq2] == lengths[lenOfSeq1][lenOfSeq2-1]:
                lenOfSeq2 -= 1
            else:
                assert seq1[lenOfSeq1-1] == seq2[lenOfSeq2-1]
                result.insert(0,seq1[lenOfSeq1-1])
                lenOfSeq1 -= 1
                lenOfSeq2 -= 1
        #print('LCS Result',result)

        #print('back to A')
        return result


    def getTemplate(self, lcs, seq):
        #print('8')
        print('lcs',lcs, '\n','seq',seq)
        retVal = []
        param = []
        if not lcs:
            return retVal

        lcs = lcs[::-1]
        i = 0
        for token in seq:
            i += 1
            if token == lcs[-1]:
                retVal.append(token)
                #print('token',token, '--', 'retVal',retVal)
                lcs.pop()
            else:
                param.append(token)
                retVal.append('*')
            if not lcs:
                break
        if i < len(seq):
            retVal.append('*')
        #print('back to A')
        print('parameters:', param)
        print('retVal',retVal)
       
        return retVal, param
    

        


    def removeSeqFromPrefixTree(self, rootn, newCluster):
        #print('from 8')
        #print('10')
        parentn = rootn
        seq = newCluster.logTemplate
        seq = [w for w in seq if w != '*']

        for tokenInSeq in seq:
            if tokenInSeq in parentn.childD:
                matchedNode = parentn.childD[tokenInSeq]
                if matchedNode.templateNo == 1:
                    del parentn.childD[tokenInSeq]
                    break
                else:
                    matchedNode.templateNo -= 1
                    parentn = matchedNode




    def printTree(self, node, dep):
        #print('12')
        pStr = ''
        for i in range(dep):
            pStr += '\t'

        if node.token == '':
            pStr += 'Root'
        else:
            pStr += node.token
            if node.logClust is not None:
                pStr += '-->' + ' '.join(node.logClust.logTemplate)
        #print((pStr +' ('+ str(node.templateNo) + ')'))

        for child in node.childD:
            self.printTree(node.childD[child], dep + 1)


    def parse(self, logname):
        """ Parse one log file and write the result files

        Loads the file into self.df_log, assigns every line to a log cluster
        (prefix-tree match first, then simple loop match, then LCS match, and
        a new cluster when nothing matches), refines the cluster template
        after an LCS match, and finally calls outputResult.

        Parameters
        ----------
            logname : file name of the log file inside self.path
        """
        starttime = datetime.now()
        self.logname = logname
        self.load_data()
        rootNode = Node()
        logCluL = []
        total = len(self.df_log)

        count = 0
        for _, line in self.df_log.iterrows():
            logID = line['LineId']
            logmessageL = [x for x in re.split(r'[\s=:,]', self.preprocess(line['Content'])) if x != '']
            constLogMessL = [w for w in logmessageL if w != '*']

            # Find an existing matched log cluster
            matchCluster = self.PrefixTreeMatch(rootNode, constLogMessL, 0)

            if matchCluster is None:
                matchCluster = self.SimpleLoopMatch(logCluL, constLogMessL)

                if matchCluster is None:
                    matchCluster = self.LCSMatch(logCluL, logmessageL)

                    # Match no existing log cluster
                    if matchCluster is None:
                        newCluster = LCSObject(logTemplate=logmessageL, logIDL=[logID])
                        logCluL.append(newCluster)
                        self.addSeqToPrefixTree(rootNode, newCluster)
                    # Add the new log message to the existing cluster
                    else:
                        newTemplate, param = self.getTemplate(self.LCS(logmessageL, matchCluster.logTemplate),
                                                       matchCluster.logTemplate)
                        # Use the list held by this parser; the bare name
                        # `makeParamList` only resolved when a global of that
                        # name happened to exist.
                        self.makeParamList.append(param)
                        if ' '.join(newTemplate) != ' '.join(matchCluster.logTemplate):
                            self.removeSeqFromPrefixTree(rootNode, matchCluster)
                            matchCluster.logTemplate = newTemplate
                            self.addSeqToPrefixTree(rootNode, matchCluster)
            # A newly created cluster already holds logID and leaves
            # matchCluster as None, so only matched clusters are extended here.
            if matchCluster:
                matchCluster.logIDL.append(logID)
            count += 1
            if count % 1000 == 0 or count == total:
                print(('Processed {0:.1f}% of log lines.'.format(count * 100.0 / total)))
        if not os.path.exists(self.savePath):
            os.makedirs(self.savePath)

        self.outputResult(logCluL, self.makeParamList)
        print(('Parsing done. [Time taken: {!s}]'.format(datetime.now() - starttime)))

    def load_data(self):
        """ Read the log file named by self.logname into self.df_log

        Builds the headers and line regex from self.logformat, then loads
        the file found under self.path into a dataframe.
        """
        headers, regex = self.generate_logformat_regex(self.logformat)
        log_path = os.path.join(self.path, self.logname)
        self.df_log = self.log_to_dataframe(log_path, regex, headers, self.logformat)

    def generate_logformat_regex(self, logformat):
        #print('C')
        """ Function to generate regular expression to split log messages
        """
        headers = []
        splitters = re.split(r'(<[^<>]+>)', logformat)
        regex = ''
        for k in range(len(splitters)):
            if k % 2 == 0:
                splitter = re.sub(' +', '\s+', splitters[k])
                regex += splitter
            else:
                header = splitters[k].strip('<').strip('>')
                regex += '(?P<%s>.*?)' % header
                headers.append(header)
        regex = re.compile('^' + regex + '$')
        return headers, regex

    def log_to_dataframe(self, log_file, regex, headers, logformat):
        #print('D')
        """ Function to transform log file to dataframe
        """
        log_messages = []
        linecount = 0
        with open(log_file, 'r') as fin:
            for line in fin.readlines():
                line = re.sub(r'[^\x00-\x7F]+', '<NASCII>', line)
                try:
                    match = regex.search(line.strip())
                    message = [match.group(header) for header in headers]
                    log_messages.append(message)
                    linecount += 1
                except Exception as e:
                    pass
        logdf = pd.DataFrame(log_messages, columns=headers)
        logdf.insert(0, 'LineId', None)
        logdf['LineId'] = [i + 1 for i in range(linecount)]
        #print('back to B')
        #print('logdf',logdf)
        return logdf

    def preprocess(self, line):
        #print('F')
        for currentRex in self.rex:
            #print('currentRex', currentRex)
            line = re.sub(currentRex, '*', line)
            #print('line', line)
        #print('Back to A')
        return line


    def PrefixTreeMatch(self, parentn, seq, idx):
        #print('G')
        retLogClust = None
        length = len(seq)
        for i in range(idx, length):
            if seq[i] in parentn.childD:
                childn = parentn.childD[seq[i]]
                #print("Childn:", childn)
                if (childn.logClust is not None):
                    constLM = [w for w in childn.logClust.logTemplate if w != '*']
                    #print("ConstLM:",constLM)
                    if float(len(constLM)) >= self.tau * length:
                        #print('childn.logClust', childn.logClust)
                        return childn.logClust
                else:
                    #print(self.PrefixTreeMatch(childn, seq, i + 1))
                    return self.PrefixTreeMatch(childn, seq, i + 1)
        #print('Back to A')
        #print('retLogClust',retLogClust)
        return retLogClust


    def SimpleLoopMatch(self, logClustL, seq):
        #print('H')
        retLogClust = None

        for logClust in logClustL:
            if float(len(logClust.logTemplate)) < 0.5 * len(seq):
                continue

            #If the template is a subsequence of seq
            it = iter(seq)
            if all(token in seq or token == '*' for token in logClust.logTemplate):
                return logClust
        #print('Back to A')
        return retLogClust


    def LCSMatch(self, logClustL, seq):
        #print('I')
        retLogClust = None

        maxLen = -1
        maxlcs = []
        maxClust = None
        set_seq = set(seq)
        size_seq = len(seq)
        for logClust in logClustL:
            set_template = set(logClust.logTemplate)
            if len(set_seq & set_template) < 0.5 * size_seq:
                continue
            lcs = self.LCS(seq, logClust.logTemplate)
            if len(lcs) > maxLen or (len(lcs) == maxLen and len(logClust.logTemplate) < len(maxClust.logTemplate)):
                maxLen = len(lcs)
                maxlcs = lcs
                maxClust = logClust

        # LCS should be large then tau * len(itself)
        if float(maxLen) >= self.tau * size_seq:
            retLogClust = maxClust
        #print('Back to A')
        return retLogClust

    def addSeqToPrefixTree(self, rootn, newCluster):
        #print('K')
        parentn = rootn
        seq = newCluster.logTemplate
        seq = [w for w in seq if w != '*']

        for i in range(len(seq)):
            tokenInSeq = seq[i]
            # Match
            if tokenInSeq in parentn.childD:
                parentn.childD[tokenInSeq].templateNo += 1
            # Do not Match
            else:
                parentn.childD[tokenInSeq] = Node(token=tokenInSeq, templateNo=1)
            parentn = parentn.childD[tokenInSeq]

        if parentn.logClust is None:
            parentn.logClust = newCluster

    
    def outputResult(self, logClustL, makeParamList):
        #print('Z')
        #print('Parameters:',makeParamList)
        templates = [0] * self.df_log.shape[0]
        ids = [0] * self.df_log.shape[0]
        df_event = []

        for logclust in logClustL:
            template_str = ' '.join(logclust.logTemplate)
            eid = hashlib.md5(template_str.encode('utf-8')).hexdigest()[0:8]
            for logid in logclust.logIDL:
                templates[logid - 1] = template_str
                ids[logid - 1] = eid
            df_event.append([eid, template_str, len(logclust.logIDL)])

        df_event = pd.DataFrame(df_event, columns=['EventId', 'EventTemplate', 'Occurrences'])

        self.df_log['EventId'] = ids
        self.df_log['EventTemplate'] = templates
        self.df_log.to_csv(os.path.join(self.savePath, self.logname + '_structured.csv'), index=False)
        df_event.to_csv(os.path.join(self.savePath, self.logname + '_templates.csv'), index=False)
        #dfP= pd.DataFrame(makeParamList)
        #dfP.to_csv(os.path.join(self.savePath, self.makeParamList ), index=False)
        #dfP.to_pickle('makeParamList.pkl') 



In [73]:
input_dir = '/Users/vikrant/jupyternotebooks/'  # directory that contains the input log file
output_dir = 'Spell_result/11/'  # directory that receives the parsing results
log_file = '1k.csv'  # name of the input log file inside input_dir
log_format = '<Time> <Machine> <Daemon> <Content>'  # fields of one log line, in order
tau = 0.6  # message type threshold (LogParser default: 0.5)
regex = []  # regular expressions for optional preprocessing (default: [])
makeParamList = []  # filled with the parameter lists found while parsing

In [74]:
#import Spell
import sys
sys.path.append('../')
# Build the parser from the settings of the configuration cell; the list
# passed as makeParamList is the same object as the notebook-level name.
parser = LogParser(
    indir=input_dir,
    outdir=output_dir,
    log_format=log_format,
    tau=tau,
    rex=regex,
    makeParamList=makeParamList,
)


In [75]:
# Parse log_file; the 'lcs' / 'seq' / 'parameters:' / 'retVal' lines in the
# output below are printed by LogParser.getTemplate.
parser.parse(log_file)

lcs ['a4', 'ba', 'db', 'via', 'network', 'no', 'free', 'leases"'] 
 seq ['a4', 'ba', 'db', '84', '1d', '63', 'via', 'eth0', 'network', '206.76.192/20', 'no', 'free', 'leases"']
parameters: ['84', '1d', '63', 'eth0', '206.76.192/20']
retVal ['a4', 'ba', 'db', '*', '*', '*', 'via', '*', 'network', '*', 'no', 'free', 'leases"']
lcs ['repeated', 'times"'] 
 seq ['repeated', '2', 'times"']
parameters: ['2']
retVal ['repeated', '*', 'times"']
lcs ['206.76.199.180', '00', '26', 'b9', 'fe', '52', '30', 'via', 'eth0"'] 
 seq ['206.76.199.180', 'from', '00', '26', 'b9', 'fe', '52', '30', 'via', 'eth0"']
parameters: ['from']
retVal ['206.76.199.180', '*', '00', '26', 'b9', 'fe', '52', '30', 'via', 'eth0"']
lcs ['eth0', 'to', '10.4.0.1', 'port', '67', '(xid'] 
 seq ['eth0', 'to', '10.4.0.1', 'port', '67', '(xid', '0x6e318336)"']
parameters: []
retVal ['eth0', 'to', '10.4.0.1', 'port', '67', '(xid', '*']
lcs ['10.4.192.252', 'a4', 'ba', 'db', '01', '67', 'a2', 'via', 'eth2"'] 
 seq ['10.4.192.252',

In [None]:
# Repeat of the parse step; no output is recorded for this cell.
parser.parse(log_file)

In [25]:
# Repeat of the parse step; the output recorded below names 10.csv as the
# parsed file, not the 1k.csv set in the configuration cell.
parser.parse(log_file)

Parsing file: /Users/vikrant/jupyternotebooks/10.csv
headers ['Time', 'Machine', 'Daemon', 'Content'] regex re.compile('^(?P<Time>.*?)\\s+(?P<Machine>.*?)\\s+(?P<Daemon>.*?)\\s+(?P<Content>.*?)$')
self.df_log    LineId             Time                                   Machine Daemon  \
0       1  "0","2013-03-10  00:00:00","master","dhcpd","DHCPDISCOVER   from   
1       2  "1","2013-03-10  00:00:00","master","dhcpd","DHCPDISCOVER   from   
2       3  "2","2013-03-10  00:00:00","master","dhcpd","DHCPDISCOVER   from   
3       4  "3","2013-03-10  00:00:00","master","dhcpd","DHCPDISCOVER   from   
4       5  "4","2013-03-10  00:00:00","master","dhcpd","DHCPDISCOVER   from   
5       6  "5","2013-03-10      00:00:00","oss10","multipathd","sdq:    tur   
6       7  "6","2013-03-10      00:00:00","oss11","multipathd","sdu:    tur   
7       8  "7","2013-03-10     00:00:00","oss11","multipathd","sdam:    tur   
8       9  "8","2013-03-10     00:00:00","oss11","multipathd","sdax:    tur   

