In [33]:
import copy
import hashlib
import re
from attr import dataclass
import pandas as pd
from logai.algorithms.algo_interfaces import ParsingAlgo


#IPLoM

@dataclass
class IPLoMParams:
    """Para class

    Attributes
    ----------
        maxEventLen : the length of the longest log/event, which is used in step 1 to split logs into partitions
            according to their length
        path : the path of the input file
        support : the support threshold to create a new partition, partitions which contains less than
            support logs will not go through step 2
        PST : Partition support ratio threshold
        CT : Cluster goodness threshold used in DetermineP1P2 in step3. If the columns with unique term more
            than CT, we skip step 3
    """
    maxEventLen:int = 200
    support:float = 0
    PST:float = 0
    CT:float = 0.35
    lowerBound:float = 0.25
    upperBound:float = 0.9
    rex:list = []


@dataclass
class Partition:
    """Wrap around the logs and the step number"""
    logLL:list = []
    stepNo:int = 0
    valid:bool = True
    numOfLogs:int = 0
    lenOfLogs:int = 0


class Event:
    def __init__(self, event_str):
        self.eventStr = event_str
        self.eventId = hashlib.md5(" ".join(event_str).encode("utf-8")).hexdigest()[0:8]
        self.eventCount = 0


class IPLoM(ParsingAlgo):
    def __init__(
            self,
            params=IPLoMParams,
            keep_params=True
    ):
        self.params = params
        self.partitionsL = []
        self.eventsL = []
        self.output = []
        self.keep_params = keep_params
        # Initialize some partitions which contain logs with different length
        for logLen in range(self.params.maxEventLen + 1):
            self.partitionsL.append(Partition(stepNo=1, numOfLogs=0, lenOfLogs=logLen))
        return

    def step1(self, loglines):

        lineCounts = 1
        for idx, line in loglines.iteritems():
            if not line.strip():
                continue
            if self.params.rex:
                for currentRex in self.params.rex:
                    line = re.sub(currentRex, " ", line)

            tokens = line.split(" ")
            if not tokens:
                tokens = [" "]

            tokens.append(str(lineCounts))
            lineCounts += 1

            self.partitionsL[len(tokens) - 1].logLL.append(tokens)
            self.partitionsL[len(tokens) - 1].numOfLogs += 1

        for partition in self.partitionsL:
            if partition.numOfLogs <= 0:
                partition.valid = False

            elif self.params.PST !=0 and 1.0 * partition.numOfLogs / len(line) < self.params.PST:
                for logL in partition.logLL:
                    self.partitionsL[0].logLL.append(logL)
                    self.partitionsL[0].numOfLogs +=1
                partition.valid = False

    def step2(self):

        for partition in self.partitionsL:

            if not partition.valid:
                continue

            if partition.numOfLogs <= self.params.support:
                continue

            # Avoid going through newly generated partitions
            if partition.stepNo == 2:
                break

            # For each column, create a set to hold the unique tokens in that column.
            # And finally, calculate the number of the unique tokens in each column
            uniqueTokensCountLS = []
            for columnIdx in range(partition.lenOfLogs):
                uniqueTokensCountLS.append(set())

            for logL in partition.logLL:
                for columnIdx in range(partition.lenOfLogs):
                    uniqueTokensCountLS[columnIdx].add(logL[columnIdx])

            # Find the column with minimum unique tokens
            minColumnIdx = 0
            minColumnCount = len(uniqueTokensCountLS[0])

            for columnIdx in range(partition.lenOfLogs):
                if minColumnCount > len(uniqueTokensCountLS[columnIdx]):
                    minColumnCount = len(uniqueTokensCountLS[columnIdx])
                    minColumnIdx = columnIdx

            # If there is one column with one unique term, do not split this partition
            if minColumnCount == 1:
                continue

            # From split-token to log list
            logDLL = {}
            for logL in partition.logLL:
                if logL[minColumnIdx] not in logDLL:
                    logDLL[logL[minColumnIdx]] = []
                logDLL[logL[minColumnIdx]].append(logL)

            for key in logDLL:
                if (
                    self.params.PST != 0
                    and 1.0 * len(logDLL[key]) / partition.numOfLogs < self.params.PST
                ):
                    self.partitionsL[0].logLL += logDLL[key]
                    self.partitionsL[0].numOfLogs += len(logDLL[key])
                else:
                    newPartition = Partition(
                        stepNo=2,
                        numOfLogs=len(logDLL[key]),
                        lenOfLogs=partition.lenOfLogs,
                    )
                    newPartition.logLL = logDLL[key]
                    self.partitionsL.append(newPartition)

            partition.valid = False
        for partition in self.partitionsL:

            if not partition.valid:
                continue

            if partition.numOfLogs <= self.params.support:
                continue

            # Avoid going through newly generated partitions
            if partition.stepNo == 2:
                break

            # For each column, create a set to hold the unique tokens in that column.
            # And finally, calculate the number of the unique tokens in each column
            uniqueTokensCountLS = []
            for columnIdx in range(partition.lenOfLogs):
                uniqueTokensCountLS.append(set())

            for logL in partition.logLL:
                for columnIdx in range(partition.lenOfLogs):
                    uniqueTokensCountLS[columnIdx].add(logL[columnIdx])

            # Find the column with minimum unique tokens
            minColumnIdx = 0
            minColumnCount = len(uniqueTokensCountLS[0])

            for columnIdx in range(partition.lenOfLogs):
                if minColumnCount > len(uniqueTokensCountLS[columnIdx]):
                    minColumnCount = len(uniqueTokensCountLS[columnIdx])
                    minColumnIdx = columnIdx

            # If there is one column with one unique term, do not split this partition
            if minColumnCount == 1:
                continue

            # From split-token to log list
            logDLL = {}
            for logL in partition.logLL:
                if logL[minColumnIdx] not in logDLL:
                    logDLL[logL[minColumnIdx]] = []
                logDLL[logL[minColumnIdx]].append(logL)

            for key in logDLL:
                if (
                    self.params.PST != 0
                    and 1.0 * len(logDLL[key]) / partition.numOfLogs < self.params.PST
                ):
                    self.partitionsL[0].logLL += logDLL[key]
                    self.partitionsL[0].numOfLogs += len(logDLL[key])
                else:
                    newPartition = Partition(
                        stepNo=2,
                        numOfLogs=len(logDLL[key]),
                        lenOfLogs=partition.lenOfLogs,
                    )
                    newPartition.logLL = logDLL[key]
                    self.partitionsL.append(newPartition)

            partition.valid = False

    def step3(self):

        for partition in self.partitionsL:

            if not partition.valid:
                continue

            if partition.stepNo == 3:
                break

            # Find two columns that my cause split in this step
            p1, p2 = self.DetermineP1P2(partition)

            if p1 == -1 or p2 == -1:
                continue

            try:

                p1Set = set()
                p2Set = set()
                mapRelation1DS = {}
                mapRelation2DS = {}

                # Construct token sets for p1 and p2, dictionary to record the mapping relations between p1 and p2
                for logL in partition.logLL:
                    p1Set.add(logL[p1])
                    p2Set.add(logL[p2])

                    if logL[p1] == logL[p2]:
                        print("Warning: p1 may be equal to p2")

                    if logL[p1] not in mapRelation1DS:
                        mapRelation1DS[logL[p1]] = set()
                    mapRelation1DS[logL[p1]].add(logL[p2])

                    if logL[p2] not in mapRelation2DS:
                        mapRelation2DS[logL[p2]] = set()
                    mapRelation2DS[logL[p2]].add(logL[p1])

                # Construct sets to record the tokens in 1-1, 1-M, M-1 relationships, the left-tokens in p1Set & p2Set
                # are in M-M relationships
                oneToOneS = set()
                oneToMP1D = {}
                oneToMP2D = {}

                # select 1-1 and 1-M relationships
                for p1Token in p1Set:
                    if len(mapRelation1DS[p1Token]) == 1:
                        if len(mapRelation2DS[list(mapRelation1DS[p1Token])[0]]) == 1:
                            oneToOneS.add(p1Token)

                    else:
                        isOneToM = True

                        for p2Token in mapRelation1DS[p1Token]:
                            if len(mapRelation2DS[p2Token]) != 1:
                                isOneToM = False
                                break
                        if isOneToM:
                            oneToMP1D[p1Token] = 0

                # delete the tokens which are picked to 1-1 and 1-M relationships from p1Set, so that the left are M-M
                for deleteToken in oneToOneS:
                    p1Set.remove(deleteToken)
                    p2Set.remove(list(mapRelation1DS[deleteToken])[0])

                for deleteToken in oneToMP1D:
                    for deleteTokenP2 in mapRelation1DS[deleteToken]:
                        p2Set.remove(deleteTokenP2)
                    p1Set.remove(deleteToken)

                # select M-1 relationships
                for p2Token in p2Set:
                    if len(mapRelation2DS[p2Token]) != 1:
                        isOneToM = True
                        for p1Token in mapRelation2DS[p2Token]:
                            if len(mapRelation1DS[p1Token]) != 1:
                                isOneToM = False
                                break
                        if isOneToM:
                            oneToMP2D[p2Token] = 0

                # delete the tokens which are picked to M-1 relationships from p2Set, so that the left are M-M
                for deleteToken in oneToMP2D:
                    p2Set.remove(deleteToken)
                    for deleteTokenP1 in mapRelation2DS[deleteToken]:
                        p1Set.remove(deleteTokenP1)

                # calculate the #Lines_that_match_S
                for logL in partition.logLL:
                    if logL[p1] in oneToMP1D:
                        oneToMP1D[logL[p1]] += 1

                    if logL[p2] in oneToMP2D:
                        oneToMP2D[logL[p2]] += 1

            except KeyError as er:
                print(er)
                print("error: " + str(p1) + "\t" + str(p2))

            newPartitionsD = {}
            if partition.stepNo == 2:
                newPartitionsD["dumpKeyforMMrelationInStep2__"] = Partition(
                    stepNo=3, numOfLogs=0, lenOfLogs=partition.lenOfLogs
                )
            # Split partition
            for logL in partition.logLL:
                # If is 1-1
                if logL[p1] in oneToOneS:
                    if logL[p1] not in newPartitionsD:
                        newPartitionsD[logL[p1]] = Partition(
                            stepNo=3, numOfLogs=0, lenOfLogs=partition.lenOfLogs
                        )
                    newPartitionsD[logL[p1]].logLL.append(logL)
                    newPartitionsD[logL[p1]].numOfLogs += 1

                # This part can be improved. The split_rank can be calculated once.
                # If is 1-M
                elif logL[p1] in oneToMP1D:
                    split_rank = self.Get_Rank_Posistion(
                        len(mapRelation1DS[logL[p1]]), oneToMP1D[logL[p1]], True
                    )
                    if split_rank == 1:
                        if logL[p1] not in newPartitionsD:
                            newPartitionsD[logL[p1]] = Partition(
                                stepNo=3, numOfLogs=0, lenOfLogs=partition.lenOfLogs
                            )
                        newPartitionsD[logL[p1]].logLL.append(logL)
                        newPartitionsD[logL[p1]].numOfLogs += 1
                    else:
                        if logL[p2] not in newPartitionsD:
                            newPartitionsD[logL[p2]] = Partition(
                                stepNo=3, numOfLogs=0, lenOfLogs=partition.lenOfLogs
                            )
                        newPartitionsD[logL[p2]].logLL.append(logL)
                        newPartitionsD[logL[p2]].numOfLogs += 1

                # If is M-1
                elif logL[p2] in oneToMP2D:
                    split_rank = self.Get_Rank_Posistion(
                        len(mapRelation2DS[logL[p2]]), oneToMP2D[logL[p2]], False
                    )
                    if split_rank == 1:
                        if logL[p1] not in newPartitionsD:
                            newPartitionsD[logL[p1]] = Partition(
                                stepNo=3, numOfLogs=0, lenOfLogs=partition.lenOfLogs
                            )
                        newPartitionsD[logL[p1]].logLL.append(logL)
                        newPartitionsD[logL[p1]].numOfLogs += 1
                    else:
                        if logL[p2] not in newPartitionsD:
                            newPartitionsD[logL[p2]] = Partition(
                                stepNo=3, numOfLogs=0, lenOfLogs=partition.lenOfLogs
                            )
                        newPartitionsD[logL[p2]].logLL.append(logL)
                        newPartitionsD[logL[p2]].numOfLogs += 1

                # M-M
                else:
                    if partition.stepNo == 2:
                        newPartitionsD["dumpKeyforMMrelationInStep2__"].logLL.append(
                            logL
                        )
                        newPartitionsD["dumpKeyforMMrelationInStep2__"].numOfLogs += 1
                    else:
                        if len(p1Set) < len(p2Set):
                            if logL[p1] not in newPartitionsD:
                                newPartitionsD[logL[p1]] = Partition(
                                    stepNo=3, numOfLogs=0, lenOfLogs=partition.lenOfLogs
                                )
                            newPartitionsD[logL[p1]].logLL.append(logL)
                            newPartitionsD[logL[p1]].numOfLogs += 1
                        else:
                            if logL[p2] not in newPartitionsD:
                                newPartitionsD[logL[p2]] = Partition(
                                    stepNo=3, numOfLogs=0, lenOfLogs=partition.lenOfLogs
                                )
                            newPartitionsD[logL[p2]].logLL.append(logL)
                            newPartitionsD[logL[p2]].numOfLogs += 1

            if (
                "dumpKeyforMMrelationInStep2__" in newPartitionsD
                and newPartitionsD["dumpKeyforMMrelationInStep2__"].numOfLogs == 0
            ):
                newPartitionsD["dumpKeyforMMrelationInStep2__"].valid = False
            # Add all the new partitions to collection
            for key in newPartitionsD:
                if (
                    self.params.PST != 0
                    and 1.0 * newPartitionsD[key].numOfLogs / partition.numOfLogs
                    < self.params.PST
                ):
                    self.partitionsL[0].logLL += newPartitionsD[key].logLL
                    self.partitionsL[0].numOfLogs += newPartitionsD[key].numOfLogs
                else:
                    self.partitionsL.append(newPartitionsD[key])

            partition.valid = False

    def step4(self):
        self.partitionsL[0].valid = False
        if self.params.PST == 0 and self.partitionsL[0].numOfLogs != 0:
            event = Event(["Outlier"])
            event.eventCount = self.partitionsL[0].numOfLogs
            self.eventsL.append(event)

            for logL in self.partitionsL[0].logLL:
                logL.append(str(event.eventId))

        for partition in self.partitionsL:
            if not partition.valid:
                continue

            if partition.numOfLogs == 0:
                print(str(partition.stepNo) + "\t")

            uniqueTokensCountLS = []
            for columnIdx in range(partition.lenOfLogs):
                uniqueTokensCountLS.append(set())

            for logL in partition.logLL:
                for columnIdx in range(partition.lenOfLogs):
                    uniqueTokensCountLS[columnIdx].add(logL[columnIdx])

            e = copy.deepcopy(partition.logLL[0])[: partition.lenOfLogs]

            for columnIdx in range(partition.lenOfLogs):
                if len(uniqueTokensCountLS[columnIdx]) == 1:
                    continue
                else:
                    e[columnIdx] = "<*>"

            event = Event(e)
            event.eventCount = partition.numOfLogs

            self.eventsL.append(event)

            for logL in partition.logLL:
                logL.append(str(event.eventId))

    def get_out_put(self):
        output = []
        if self.params.PST == 0 and self.partitionsL[0].numOfLogs != 0:
            for logL in self.partitionsL[0].logLL:
                output.append(logL[-2:] + logL[:-2])
        for partition in self.partitionsL:
            if not partition.valid:
                continue
            for logL in partition.logLL:
                output.append(logL[-2:] + logL[:-2])
        return output

    def fit(self, loglines: pd.Series):
        self.step1(loglines)
        self.step2()
        self.step3()
        self.step4()


    def parse(self, loglines: pd.Series) -> pd.DataFrame:
        self.fit(loglines)
        return self.get_out_put()


In [34]:
params = IPLoMParams()

parser = IPLoM(params)

In [35]:
df = pd.read_pickle("/Users/qcheng/workspace/gitsoma/logai/tests/logai/test_data/default_logrecord_body")

In [36]:
loglines = df.logline

In [37]:
res = parser.parse(loglines)

IndexError: list index out of range

In [None]:
len(parser.partitionsL)


In [41]:
# identify timestamp
import re
from datetime import datetime

logline = "20171223-22:15:29:606|Step_LSC|30002312|onStan."

match = re.search(r'\d{4}-\d{2}-\d{2}', logline)
# datetime = datetime.strptime(match.group(), '%Y-%m-%d')


# def identify_datetime(str):
#     match = re.search(r'\d{4}-\d{2}-\d{2}', text)
#     datetime = datetime.strptime(match.group(), '%Y-%m-%d')

AttributeError: 'NoneType' object has no attribute 'group'

In [44]:
import dateutil.parser as dparser

for l in logline.split("|"):
    print(dparser.parse(l, fuzzy=True))



ParserError: Unknown string format: 20171223-22:15:29:606