In [None]:
import json
# Load the notebook configuration; later cells read the 'pnda-helm-release' and
# 'topic' keys from it. The encoding is pinned because JSON text is UTF-8, while
# open() would otherwise fall back to a locale-dependent default.
with open('properties.json', 'r', encoding='utf-8') as f:
    properties = json.load(f)
# Bare expression so the notebook displays the loaded settings.
properties

In [None]:
# Kafka bootstrap address, built from the Helm release name found in the properties.
brokers = str.format('{}-cp-kafka:9092', properties['pnda-helm-release'])
# Topic the consumers in the cells below subscribe to.
topic = properties['topic']

In [None]:
import pandas as pd

In [None]:
def _split_label_set(line: str, start: int):
    """Scan the label set that opens at ``line[start] == '{'``.

    Commas, braces and whitespace inside double-quoted label values are treated
    as ordinary characters; a backslash inside quotes escapes the next character.

    Returns:
        ``(tags, end)`` where ``tags`` is the list of raw ``key="value"`` strings
        and ``end`` is the index of the closing brace, or ``(None, -1)`` when the
        label set is never closed.
    """
    tags = []
    current = []
    in_quotes = False
    escaped = False
    for pos in range(start + 1, len(line)):
        ch = line[pos]
        if in_quotes:
            current.append(ch)
            if escaped:
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == '"':
                in_quotes = False
        elif ch == '"':
            in_quotes = True
            current.append(ch)
        elif ch == ",":
            tags.append("".join(current))
            current = []
        elif ch == "}":
            tags.append("".join(current))
            return tags, pos
        else:
            current.append(ch)
    return None, -1


def decode_openmetrics(input: str) -> dict:
    """Parse one sample line of the form ``name{labels} value [timestamp]``.

    Args:
        input: A single text sample line. The label set and the timestamp are
            both optional.

    Returns:
        A dict with keys ``source`` (metric name plus label set, as written),
        ``metricname``, ``tags`` (list of raw ``key="value"`` strings, empty when
        there is no label set), ``timestamp`` (float or ``None``) and ``value``
        (float).

    Raises:
        IndexError: if the line has no metric name or no value.
        ValueError: if the value or timestamp is not a valid float.
    """
    line = input.strip()
    brace = line.find("{")
    # Length of the first whitespace-delimited token; a '{' beyond it does not
    # belong to the metric identifier.
    head_len = len(line.split(None, 1)[0]) if line else 0

    tags, close = None, -1
    if 0 <= brace < head_len:
        tags, close = _split_label_set(line, brace)

    if close != -1:
        # Label values may contain whitespace, so the identifier ends at the
        # closing brace rather than at the first space.
        source = line[:close + 1]
        fields = line[close + 1:].split()
        metricname = line[:brace]
    else:
        # No (well-formed) label set: plain whitespace tokenisation.
        tokenized = line.split()
        source, fields = tokenized[0], tokenized[1:]
        source_tokenized = source.split("{")
        metricname = source_tokenized[0]
        if len(source_tokenized) > 1:
            tags = source_tokenized[1].split("}")[0].split(",")
        else:
            tags = []

    value = float(fields[0])
    timestamp = float(fields[1]) if len(fields) > 1 else None
    return {'source': source, 'metricname': metricname, 'tags': tags, 'timestamp': timestamp, 'value': value}

In [None]:
class InterArrivalTransformer:
    """Converts timestamped events into per-source inter-arrival times.

    The transformer is stateful: it remembers the last timestamp seen for every
    source, so consecutive calls to ``transform`` continue the same stream.
    """

    def __init__(self):
        # source -> timestamp of the most recent event seen for that source
        self.latest_arrivaltime_dict = dict()

    # NOTE: this is implemented for streaming, so it transforms one event at a time,
    # i.e. the expected input is a pandas dataframe with a single row. If multiple rows
    # are provided they are iterated with iterrows(). It should be optimized if the
    # function is ever used to transform full dataframes.
    def transform(self, input: pd.DataFrame) -> pd.DataFrame:
        """Return the inter-arrival time of each event relative to its source's previous event.

        Args:
            input: DataFrame with at least the columns ``source`` and ``timestamp``.

        Returns:
            DataFrame with columns ``source``, ``timestamp`` and ``value``, where
            ``value`` is ``timestamp`` minus the previous timestamp of the same
            source. The first event of a source produces no output row.
        """
        interarrival_rows = []
        for index, row in input.iterrows():
            source = row['source']
            timestamp = row['timestamp']
            latest_timestamp = self.latest_arrivaltime_dict.get(source)
            self.latest_arrivaltime_dict[source] = timestamp
            # Compare against None explicitly: a previous timestamp of 0 is a
            # valid reference point and must not be mistaken for "never seen".
            if latest_timestamp is not None:
                interarrival_rows.append([source, timestamp, timestamp - latest_timestamp])
        return pd.DataFrame(interarrival_rows,
                            columns=['source', 'timestamp', 'value'])

## Loading anomaly detection model


In [None]:
#import mlflow.pyfunc
import pandas as pd
import numpy as np
import math

# Two-sided quantiles of the standard normal distribution: each key p is a
# central coverage probability and the value is the z such that P(|Z| <= z) = p
# (e.g. 0.95 -> 1.96). Only the probabilities listed here are available.
normal_quantiles_zp = {
    0.8: 1.281551565545,
    0.9: 1.644853626951,
    0.95: 1.959963984540,
    0.98: 2.326347874041,
    0.99: 2.575829303549,
    0.995: 2.807033768344,
    0.998: 3.090232306168,
    0.999: 3.290526731492,
    0.9999: 3.890591886413,
    0.99999: 4.417173413469,
    0.999999: 4.891638475699,
    0.9999999: 5.326723886384,
    0.99999999: 5.730728868236,
    0.999999999: 6.109410204869,
}

# Define the model class
#class MultisourceNormalDetector(mlflow.pyfunc.PythonModel):
class MultisourceNormalDetector:
    """Per-source anomaly detector based on a mean/variance interval.

    For every source the detector keeps the running sum, count, average and
    sample variance of the observed values. A value is flagged as anomalous
    when it falls outside ``avg +/- z_p * sqrt(var)`` or when its source has
    never been fitted.
    """

    def __init__(self):
        # source -> {'sum': ..., 'count': ..., 'avg': ..., 'var': ...}
        self.stats = {}

    def fit(self, input: pd.DataFrame):
        """Recompute the statistics from scratch using the whole DataFrame.

        Args:
            input: DataFrame with at least the columns ``source`` and ``value``.
        """
        # Non-incremental avg-var computation. Stats are reset
        self.reset()
        agg_df = input.groupby(['source'])['value'].agg(['sum', 'count', 'var'])
        for source, row in agg_df.iterrows():
            stats = {'sum': row['sum'],
                     'count': row['count'],
                     'avg': row['sum'] / row['count'],
                     'var': row['var']}
            self.stats[source] = stats

    def incremental_fit(self, input: pd.DataFrame):
        """Update the statistics in place with every row of the DataFrame.

        Args:
            input: DataFrame with at least the columns ``source`` and ``value``.
        """
        # incremental avg-var computation
        # https://math.stackexchange.com/questions/102978/incremental-computation-of-standard-deviation
        for index, row in input.iterrows():
            stats = self.stats.get(row['source'], {'sum': 0, 'count': 0, 'var': 0})
            total = stats['sum'] + row['value']
            count = stats['count'] + 1
            if count > 1:
                # stats['avg'] still holds the mean *before* this sample, as the
                # recurrence requires; it is only overwritten further down.
                stats['var'] = (count - 2) / (count - 1) * stats['var'] + 1 / count * math.pow((row['value'] - stats['avg']), 2)
            else:
                stats['var'] = stats['var'] + math.pow(row['value'] - total / count, 2)
            stats['avg'] = total / count
            stats['sum'] = total
            stats['count'] = count
            self.stats[row['source']] = stats

    def reset(self, source=None):
        """Forget the learned statistics.

        Args:
            source: When given, only that source is forgotten; when ``None``
                (the default) every source is cleared.
        """
        if source is not None:
            # Remove the entry entirely. Leaving an empty dict behind would make
            # the source look "known" and the next incremental_fit/predict call
            # would fail with a KeyError on the missing statistics.
            self.stats.pop(source, None)
        else:
            self.stats = {}

    def predict(self, context, model_input: pd.DataFrame, p=0.99999) -> pd.DataFrame:
        """Classify every row of ``model_input`` as anomalous or not.

        Args:
            context: Not used by this method.
                NOTE(review): presumably kept for compatibility with the
                mlflow pyfunc ``predict(context, model_input)`` signature - confirm.
            model_input: DataFrame with the columns ``source``, ``timestamp``
                and ``value``.
            p: Coverage probability of the acceptance interval; must be one of
                the keys of ``normal_quantiles_zp``.

        Returns:
            DataFrame with columns ``source``, ``timestamp`` (as a string),
            ``value``, ``anomaly`` (bool) and ``anomaly_type`` (``None`` for
            normal rows).

        Raises:
            KeyError: if a known source is evaluated with a ``p`` that is not
                in ``normal_quantiles_zp``.
        """
        #NOTE can be optimized using pandas to perform this condition check
        output = []
        for index, row in model_input.iterrows():
            source = row['source']
            value = row['value']
            timestamp = row['timestamp']
            if source not in self.stats:
                output_row = [source, str(timestamp), value, True, 'unknown source']
            else:
                stats = self.stats[source]
                zp = normal_quantiles_zp[p]
                diff = zp * math.sqrt(stats['var'])
                lower = stats['avg'] - diff
                upper = stats['avg'] + diff
                if lower <= value <= upper:
                    output_row = [source, str(timestamp), value, False, None]
                else:
                    output_row = [source, str(timestamp), value, True, 'value out of limits ({},{})'.format(lower, upper)]
            output.append(output_row)
        return pd.DataFrame(output,
                            columns=['source', 'timestamp', 'value', 'anomaly', 'anomaly_type'])


# Model Fitting

In [None]:
# Fresh, unfitted detector; the fitting and prediction cells below both use this instance.
model = MultisourceNormalDetector()

In [None]:
from kafka import KafkaConsumer

def run_fit(brokers, topic, model, interarrival_transformer):
    """Consume samples from Kafka and fit the model on their inter-arrival times.

    Runs until interrupted (KeyboardInterrupt, e.g. the notebook "stop" button),
    then prints the statistics learned for every source.

    Args:
        brokers: Whitespace-separated list of Kafka bootstrap servers.
        topic: Topic to subscribe to.
        model: Object exposing ``incremental_fit(df)`` and a ``stats`` dict.
        interarrival_transformer: Object exposing ``transform(df)``.
    """
    consumer = None
    try:
        consumer = KafkaConsumer(topic, bootstrap_servers=brokers.split())
        for msg in consumer:
            openmetric_sample = decode_openmetrics(msg.value.decode('utf-8'))
            if openmetric_sample['timestamp'] is None:
                # Fall back to the timestamp carried by the Kafka record.
                # NOTE(review): units may differ from timestamps embedded in the sample - confirm.
                openmetric_sample['timestamp'] = msg.timestamp
            # Build exactly one row per message. Passing the dict itself to
            # DataFrame.from_dict would expand the list-valued 'tags' entry into
            # one row per tag (zero rows for unlabeled metrics, duplicated rows
            # for multi-label ones), corrupting the inter-arrival stream.
            openmetric_df = pd.DataFrame([openmetric_sample])
            interarrival_df = interarrival_transformer.transform(openmetric_df)
            model.incremental_fit(interarrival_df)
    except KeyboardInterrupt:
        print("Model Fitting Stage interrupted.")
        print("model:       avg           var        count  source")
        for k, v in model.stats.items():
            print("      {:10.2f}    {:10.2f} {:10.2f}  {}".format(v['avg'], v['var'], v['count'], k))
    finally:
        # Release the broker connections whichever way the loop ended.
        if consumer is not None:
            consumer.close()

# Dedicated transformer for the fitting stage, so its per-source "last seen"
# state is not shared with any other stage.
fit_interarrival_transformer = InterArrivalTransformer()
# Blocks while consuming from Kafka; interrupt the kernel to end fitting and print the model.
run_fit(brokers, topic, model, fit_interarrival_transformer)


# Model Prediction

In [None]:
#Getting anomalies from source

from kafka import KafkaConsumer

# Predict Model
def run_predict( brokers, topic, model, interarrival_transformer):
    """Consume samples from Kafka and print those the model flags as anomalous.

    Runs until the consumer loop ends or an exception (including
    KeyboardInterrupt) propagates; the consumer is closed in either case.

    Args:
        brokers: Whitespace-separated list of Kafka bootstrap servers.
        topic: Topic to subscribe to.
        model: Object exposing ``predict(context, model_input, p)``.
        interarrival_transformer: Object exposing ``transform(df)``.
    """
    consumer = KafkaConsumer(topic, bootstrap_servers=brokers.split())
    try:
        for msg in consumer:
            openmetric_sample = decode_openmetrics(msg.value.decode('utf-8'))
            if openmetric_sample['timestamp'] is None:
                # Fall back to the timestamp carried by the Kafka record.
                # NOTE(review): units may differ from timestamps embedded in the sample - confirm.
                openmetric_sample['timestamp'] = msg.timestamp
            # Build exactly one row per message. Passing the dict itself to
            # DataFrame.from_dict would expand the list-valued 'tags' entry into
            # one row per tag (zero rows for unlabeled metrics, duplicated rows
            # for multi-label ones).
            openmetric_df = pd.DataFrame([openmetric_sample])
            interarrival_df = interarrival_transformer.transform(openmetric_df)
            predictions_df = model.predict(context=[], model_input=interarrival_df, p=0.999999)
            # Element-wise comparison on the column; filter once and reuse the result.
            anomalies_df = predictions_df[predictions_df.anomaly == True]
            if len(anomalies_df) > 0:
                print(anomalies_df)
    finally:
        # Release the broker connections whichever way the loop ended.
        consumer.close()

# Separate transformer for the prediction stage, so inter-arrival times are
# measured from the first message seen here rather than from the fitting stage.
predict_interarrival_transformer = InterArrivalTransformer()
# Blocks while consuming from Kafka; anomalies are printed as they are detected.
run_predict(brokers, topic, model, predict_interarrival_transformer)
