In [None]:
import numpy as np
import pandas as pd

In [None]:
def stat_analysis(series, m=5, n=1.5):
    '''
    Detects anomalous regions of a time series with a sliding-window threshold test.
    Sweeps a window of the m most recent points across the input series. For every point that follows a full
    window, the mean and (population) standard deviation of the window are computed. A point "exceeds" when its
    value is at or above mean + n*std of the m points immediately preceding it. When three consecutive points
    exceed, the position of the third point is recorded; every further consecutive exceeding point is recorded too.

    Arguments
    series: The input time series. List-like of numbers (list, numpy array, pandas Series, ...). The values are
    always read by position, so a pandas Series with a non-default index (for example a slice of a dataframe
    column) is handled the same way as a plain list.
    m: The size of the scanning window. Must be a positive int.
    The smaller the window, the more likely this function is to classify a point as anomalous (even if it's not
    really anomalous). The larger the window, the less likely this function is to misclassify a point as anomalous,
    but it will also be less likely to classify points as anomalous even when they are really anomalous. In other
    words, small m risks Type 1 statistical errors, whereas large m risks Type 2 errors.
    A good value of m is probably around 4 to 50 for an input series of length 44640.
    n: The number of standard deviations above the mean of the window a point must reach to count as exceeding.
    The smaller n is, the more likely this function is to classify a point as anomalous (even if it's not really
    anomalous). The larger n is, the less likely this function is to misclassify a point as anomalous, but it will
    also be less likely to classify points as anomalous even when they are really anomalous. In other words,
    small n risks Type 1 statistical errors, whereas large n risks Type 2 errors.
    A good value of n is probably around 1.5 to 4.

    Returns
    anomaly_location_list: A list of 0-based integer positions in the input series. A position is listed when the
    point at that position and the two points before it each reached their own window's mean + n*std threshold.
    The list is empty when the series has no more than m points.

    Raises
    ValueError: If m is smaller than 1, if series is not one-dimensional, or if series cannot be converted to
    floating point numbers.

    Notes
    The comparison is "greater than or equal", so a point equal to the threshold (for example in a perfectly
    flat stretch, where std is 0) counts as exceeding.
    A NaN sample never exceeds, and while a NaN is inside the window the threshold is NaN as well, so no point is
    flagged until the NaN has left the window.
    '''
    if m < 1:
        raise ValueError('m must be a positive integer')

    #Copy the input into a float array so that indexing is positional regardless of the container type
    values = np.asarray(series, dtype=float)
    if values.ndim != 1:
        raise ValueError('series must be one-dimensional')

    #Positions of significantly anomalous points
    anomaly_location_list = []
    #Length of the current run of consecutive points that reached the threshold
    consecutive_exceedances = 0

    for point in range(m, len(values)):
        #The window is the m points immediately preceding the point under test
        window = values[point - m:point]
        anomaly_threshold = np.mean(window) + n*np.std(window)

        #Any comparison involving NaN is False, so NaN data simply resets the run instead of failing
        if values[point] >= anomaly_threshold:
            consecutive_exceedances += 1
        else:
            consecutive_exceedances = 0

        #Three or more successive exceeding points mark the current point as significantly anomalous
        if consecutive_exceedances >= 3:
            anomaly_location_list.append(point)

    return anomaly_location_list

In [None]:
# NOTE(review): stat_analysis reads `series` element by element and takes the mean/std of those elements,
# so it needs numeric samples. This call passes a file-name string instead; presumably the contents of the
# file should be loaded into a numeric column first -- TODO confirm the file format and loading step.
stat_analysis('Nov_2003_NMDB.txt', m=10, n=2.5)