In [None]:
#import mlflow.pyfunc
import pandas as pd
import numpy as np
import math

# Lookup table of standard-normal critical values z_p, keyed by the two-sided
# coverage probability p, i.e. P(|Z| <= z_p) = p for Z ~ N(0, 1)
# (for example 0.95 -> 1.96).
# MultisourceNormalDetector.predict indexes this table with its `p` argument
# to build the interval avg +/- z_p * sqrt(var), so only the probabilities
# listed here are valid values for `p`.
normal_quantiles_zp = {
    0.8: 1.281551565545,
    0.9: 1.644853626951,
    0.95: 1.959963984540,
    0.98: 2.326347874041,
    0.99: 2.575829303549,
    0.995: 2.807033768344,
    0.998: 3.090232306168,
    0.999: 3.290526731492,
    0.9999: 3.890591886413,
    0.99999: 4.417173413469,
    0.999999: 4.891638475699,
    0.9999999: 5.326723886384,
    0.99999999: 5.730728868236,
    0.999999999: 6.109410204869,
}

# Define the model class
#class MultisourceNormalDetector(mlflow.pyfunc.PythonModel):
class MultisourceNormalDetector:
    """Per-source anomaly detector based on a normal-distribution interval.

    For every distinct ``source`` the detector keeps the statistics of the
    ``value`` column (``sum``, ``count``, ``avg`` and sample variance ``var``).
    ``predict`` flags a value as anomalous when it lies outside
    ``avg +/- z_p * sqrt(var)``, where ``z_p`` is read from the module-level
    ``normal_quantiles_zp`` table.

    Attributes:
        stats: dict mapping a source to its statistics dict with the keys
            ``'sum'``, ``'count'``, ``'avg'`` and ``'var'``.
    """

    def __init__(self):
        self.stats = {}

    def fit(self, input: pd.DataFrame):
        """Recompute the statistics of every source from scratch.

        Any previously learned statistics are discarded first.

        Args:
            input: frame with at least the columns ``'source'`` and
                ``'value'``.
        """
        # Non-incremental avg-var computation. Stats are reset
        self.reset()
        agg_df = input.groupby('source')['value'].agg(['sum', 'count', 'var'])
        for source, row in agg_df.iterrows():
            total = row['sum']
            # iterrows() upcasts the whole row to float; keep count integral.
            count = int(row['count'])
            variance = row['var']
            if count == 1:
                # pandas returns NaN for the sample variance of one value,
                # which would make every later prediction an anomaly with
                # NaN limits. Use 0 like incremental_fit does.
                variance = 0.0
            self.stats[source] = {
                'sum': total,
                'count': count,
                'avg': total / count,
                'var': variance,
            }

    def incremental_fit(self, input: pd.DataFrame):
        """Update the statistics one row at a time, keeping earlier state.

        Sources not seen before start from empty statistics; known sources
        continue from the values already stored in ``self.stats``.

        Args:
            input: frame with at least the columns ``'source'`` and
                ``'value'``.
        """
        # incremental avg-var computation
        # https://math.stackexchange.com/questions/102978/incremental-computation-of-standard-deviation
        for _, row in input.iterrows():
            source = row['source']
            value = row['value']
            stats = self.stats.get(source)
            if not stats:
                # Unknown source (or an empty placeholder): start from zero.
                stats = {'sum': 0, 'count': 0, 'avg': 0, 'var': 0}
            total = stats['sum'] + value
            count = stats['count'] + 1
            if count > 1:
                # Sample-variance recurrence; uses the mean *before* this
                # value is added, so 'var' must be updated before 'avg'.
                stats['var'] = ((count - 2) / (count - 1) * stats['var']
                                + 1 / count * math.pow(value - stats['avg'], 2))
            else:
                # A single observation has no spread.
                stats['var'] = 0.0
            stats['avg'] = total / count
            stats['sum'] = total
            stats['count'] = count
            self.stats[source] = stats

    def reset(self, source=None):
        """Forget learned statistics.

        Args:
            source: when given, only that source is forgotten (it becomes an
                unknown source again); when ``None`` every source is
                forgotten.
        """
        if source is None:
            self.stats = {}
        else:
            # Remove the entry instead of storing an empty dict, which would
            # make predict() and incremental_fit() fail with KeyError.
            # `is None` above keeps falsy ids such as 0 or '' addressable.
            self.stats.pop(source, None)

    def predict(self, context, model_input: pd.DataFrame, p=0.99999) -> pd.DataFrame:
        """Label every input row as anomalous or not.

        Args:
            context: not used by this method; presumably kept for
                compatibility with the mlflow ``PythonModel.predict``
                signature.
            model_input: frame with the columns ``'source'``, ``'value'``
                and ``'timestamp'``.
            p: two-sided coverage probability; must be a key of
                ``normal_quantiles_zp``.

        Returns:
            A frame with the columns ``source``, ``timestamp`` (converted to
            ``str``), ``value``, ``anomaly`` (bool) and ``anomaly_type``
            (``None``, ``'unknown source'`` or a ``'value out of limits'``
            message with the limits).

        Raises:
            KeyError: if ``p`` is not in ``normal_quantiles_zp`` and at least
                one row belongs to a known source.
        """
        #NOTE can be optimized using pandas to perform this condition check
        output = []
        for _, row in model_input.iterrows():
            source = row['source']
            value = row['value']
            timestamp = row['timestamp']
            if source not in self.stats:
                output_row = [source, str(timestamp), value, True, 'unknown source']
            else:
                stats = self.stats[source]
                zp = normal_quantiles_zp[p]
                half_width = zp * math.sqrt(stats['var'])
                lower = stats['avg'] - half_width
                upper = stats['avg'] + half_width
                if lower <= value <= upper:
                    output_row = [source, str(timestamp), value, False, None]
                else:
                    output_row = [source, str(timestamp), value, True,
                                  'value out of limits ({},{})'.format(lower, upper)]
            output.append(output_row)
        return pd.DataFrame(
            output,
            columns=['source', 'timestamp', 'value', 'anomaly', 'anomaly_type'])

## Non-incremental fit

In [None]:
# Read the inter-arrival samples from CSV and fit a new detector with the
# batch (non-incremental) method.
interarrival_df = pd.read_csv(filepath_or_buffer='normal_interarrival_samples.csv')

model = MultisourceNormalDetector()
model.fit(input=interarrival_df)

In [None]:
# Score the same frame the model was fitted on and print only the rows that
# were flagged as anomalous.
input_df = interarrival_df
output_df = model.predict(None, input_df)
print(output_df.loc[output_df['anomaly'] == True])


In [None]:
# Score the samples from the "with_anomalies" CSV and print only the rows that
# were flagged as anomalous.
input_df = pd.read_csv(filepath_or_buffer='normal_interarrival_samples_with_anomalies.csv')
output_df = model.predict(None, input_df)
print(output_df.loc[output_df['anomaly'] == True])


## Incremental fit

In [None]:
# Re-load the samples and train a brand-new detector one row at a time.
# A new instance starts with empty statistics, so calling reset() on the
# previous `model` is unnecessary; that call also raised NameError whenever
# this cell ran before any earlier `model` existed.
interarrival_df = pd.read_csv('normal_interarrival_samples.csv')
model = MultisourceNormalDetector()
model.incremental_fit(interarrival_df)

In [None]:
# Score the same frame the model was fitted on and print only the rows that
# were flagged as anomalous.
input_df = interarrival_df
output_df = model.predict(None, input_df)
print(output_df.loc[output_df['anomaly'] == True])


In [None]:
# Score the samples from the "with_anomalies" CSV and print only the rows that
# were flagged as anomalous.
input_df = pd.read_csv(filepath_or_buffer='normal_interarrival_samples_with_anomalies.csv')
output_df = model.predict(None, input_df)
print(output_df.loc[output_df['anomaly'] == True])
