# Introduction
Anomaly detection has applications in many fields, such as system health monitoring, fraud detection, and intrusion detection.

![](https://www.googleapis.com/download/storage/v1/b/kaggle-user-content/o/inbox%2F3595464%2F4088133a20318f4e47e1e2d738509d12%2F__results___5_0.png?generation=1590869249365044&alt=media)

## Using Luminol
Detecting Outliers and Change Points from Time Series

In [None]:
!pip install luminol
import luminol

from luminol import anomaly_detector,correlator

from luminol.anomaly_detector import AnomalyDetector
from luminol.correlator import Correlator

In [None]:
import matplotlib.pyplot as plt# Standardize/scale the dataset and apply PCA
from sklearn.decomposition import PCA
def Score_data(pred, real):
    """Summarise the absolute error between predictions and observations.

    Parameters
    ----------
    pred, real : array-like
        Predicted and observed values. Lists, tuples and NumPy arrays are
        accepted as long as they broadcast against each other.

    Returns
    -------
    errors : numpy.ndarray
        Flattened absolute differences ``|pred - real|``.
    cov : float
        Population variance of ``errors`` (normalised by ``len(errors)``).
    mean : float
        Arithmetic mean of ``errors``.

    Raises
    ------
    ValueError
        If the inputs contain no elements, since mean and variance are
        undefined in that case.
    """
    # coerce first so that plain Python lists can be subtracted as well
    errors = np.abs(np.asarray(pred) - np.asarray(real)).flatten()
    if errors.size == 0:
        raise ValueError('Score_data needs at least one prediction/observation pair')

    mean = errors.mean()
    # ddof=0 (the NumPy default) matches the original "divide by N" estimate
    cov = errors.var()

    print('mean : ', mean)
    print('cov : ', cov)
    return errors, cov, mean

# calculate Mahalanobis distance
def Mahala_distantce(x,mean,cov):
    """Return the squared deviation of ``x`` from ``mean`` divided by ``cov``.

    The arithmetic is elementwise, so ``x`` may be a scalar or an array.
    """
    deviation = x - mean
    squared_deviation = deviation ** 2
    return squared_deviation / cov


def scale(A):
    """Min-max normalise ``A`` into the range [0, 1].

    Parameters
    ----------
    A : array-like
        Values to rescale (list, NumPy array, ...).

    Returns
    -------
    Values of the same shape as ``A`` where the minimum maps to 0 and the
    maximum maps to 1. A constant input maps to all zeros.
    """
    lowest = np.min(A)
    spread = np.max(A) - lowest
    if spread == 0:
        # constant input: the plain formula would evaluate 0/0 and yield NaN,
        # so return zeros while keeping the container type of (A - lowest)
        return (A - lowest) * 0.0
    return (A - lowest) / spread


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## see this: https://github.com/waico/SKAB/
def evaluating_change_point(true, prediction, metric='nab', numenta_time=None):
    """
    Evaluate outlier / change-point predictions against ground-truth labels.

    Adapted from the SKAB benchmark utilities (https://github.com/waico/SKAB/).

    Parameters
    ----------
    true : pandas.Series or list of pandas.Series
        Binary labels, 1 marks an anomaly / change point.
    prediction : pandas.Series or list of pandas.Series
        Binary predictions in the same form as ``true``.
    metric : {'nab', 'binary', 'average_delay'}
        'binary' prints the false alarm rate and missing alarm rate and
        returns F1; 'average_delay' returns the mean detection delay;
        'nab' returns the Numenta Anomaly Benchmark scores.
    numenta_time : str or None
        Width of the detection window opened at every true change point,
        passed to ``pd.Timedelta``. When None, the width is the span of the
        series index divided by the number of true change points
        (assumes a datetime-like index). Not used by 'binary'.

    Returns
    -------
    'binary': F1 rounded to two decimals.
    'average_delay': mean of the detection delays.
    'nab': dict mapping 'Standart', 'LowFP', 'LowFN' to the score in percent.

    Raises
    ------
    ValueError
        If ``metric`` is not one of the supported names.
    """
    supported_metrics = ('nab', 'binary', 'average_delay')
    if metric not in supported_metrics:
        # fail loudly instead of silently returning None for a misspelled metric
        raise ValueError(
            f'Unknown metric {metric!r}; expected one of {supported_metrics}')

    def binary(true, prediction):
        """
        true - true binary series with 1 as anomalies
        prediction - predicted binary series with 1 as anomalies
        """
        def single_binary(true, prediction):
            true_ = true == 1
            prediction_ = prediction == 1
            TP = (true_ & prediction_).sum()
            TN = (~true_ & ~prediction_).sum()
            FP = (~true_ & prediction_).sum()
            FN = (true_ & ~prediction_).sum()
            return TP, TN, FP, FN

        if not isinstance(true, list):
            TP, TN, FP, FN = single_binary(true, prediction)
        else:
            # pool the confusion-matrix counts over all datasets
            TP, TN, FP, FN = 0, 0, 0, 0
            for true_i, prediction_i in zip(true, prediction):
                TP_, TN_, FP_, FN_ = single_binary(true_i, prediction_i)
                TP, TN, FP, FN = TP+TP_, TN+TN_, FP+FP_, FN+FN_

        f1 = round(TP/(TP+(FN+FP)/2), 2)
        print(f'False Alarm Rate {round(FP/(FP+TN)*100,2)} %' )
        print(f'Missing Alarm Rate {round(FN/(FN+TP)*100,2)} %')
        print(f'F1 metric {f1}')
        return f1

    def average_delay(detecting_boundaries, prediction):
        """Print and return the mean delay between a window start and the first detection in it."""
        def single_average_delay(detecting_boundaries, prediction):
            missing = 0
            detectHistory = []
            for couple in detecting_boundaries:
                t1 = couple[0]
                t2 = couple[1]
                if prediction[t1:t2].sum()==0:
                    # no detection inside the window: the change point is missed
                    missing+=1
                else:
                    # delay = index of the first predicted 1 in the window minus the window start
                    detectHistory.append(prediction[prediction ==1][t1:t2].index[0]-t1)
            return missing, detectHistory

        if not isinstance(prediction, list):
            missing, detectHistory = single_average_delay(detecting_boundaries, prediction)
        else:
            missing, detectHistory = 0, []
            for boundaries_i, prediction_i in zip(detecting_boundaries, prediction):
                missing_, detectHistory_ = single_average_delay(boundaries_i, prediction_i)
                missing, detectHistory = missing+missing_, detectHistory+detectHistory_

        add = pd.Series(detectHistory).mean()
        print('Average delay', add)
        print(f'A number of missed CPs = {missing}')
        return add

    def evaluate_nab(detecting_boundaries, prediction, table_of_coef=None):
        """
        Score labeled time series following the
        Numenta Anomaly Benchmark methodology.

        Parameters
        ----------
        detecting_boundaries: list of [left, right] pairs (or a list of such
            lists, one per dataset)
            Boundaries of the detection window of every true change point.
        prediction: pd.Series (or list of pd.Series) in which 1
            is a change point and 0 otherwise.
        table_of_coef: pandas DataFrame (3x4) of float values
            Table of coefficients for the NAB score function
            index: 'Standart','LowFP','LowFN'
            columns: 'A_tp','A_fp','A_tn','A_fn'

        Returns
        -------
        results: dict
            For each profile 'Standart','LowFP','LowFN' the value
            100 * (score - null score) / (perfect score - null score),
            rounded to two decimals.
        """
        def single_evaluate_nab(detecting_boundaries, prediction, table_of_coef=None, name_of_dataset=None):
            """Return a 3x3 array: rows are raw, null and perfect scores; columns are the profiles."""
            if table_of_coef is None:
                table_of_coef = pd.DataFrame([[1.0,-0.11,1.0,-1.0],
                                              [1.0,-0.22,1.0,-1.0],
                                              [1.0,-0.11,1.0,-2.0]])
                table_of_coef.index = ['Standart','LowFP','LowFN']
                table_of_coef.index.name = "Metric"
                table_of_coef.columns = ['A_tp','A_fp','A_tn','A_fn']

            alist = detecting_boundaries.copy()
            prediction = prediction.copy()

            def sigm_scale(y, A_tp, A_fp, window=1):
                # sigmoid that decreases with y: very negative y approaches A_tp,
                # very positive y approaches A_fp
                return (A_tp-A_fp)*(1/(1+np.exp(5*y/window))) + A_fp

            Scores, Scores_perfect, Scores_null=[], [], []
            for profile in ['Standart', 'LowFP', 'LowFN']:
                A_tp = table_of_coef.loc[profile, 'A_tp']
                A_fp = table_of_coef.loc[profile, 'A_fp']
                A_fn = table_of_coef.loc[profile, 'A_fn']

                # First part: every detection before the first window is a false positive
                score = 0
                if len(alist)>0:
                    score += prediction[:alist[0][0]].sum()*A_fp
                else:
                    score += prediction.sum()*A_fp
                # Second part: score each window together with the gap up to the next one
                for i in range(len(alist)):
                    if i<=len(alist)-2:
                        win_space = prediction[alist[i][0]:alist[i+1][0]].copy()
                    else:
                        win_space = prediction[alist[i][0]:].copy()
                    win_fault = prediction[alist[i][0]:alist[i][1]]
                    # tail of a quarter window length appended after the window
                    slow_width = int(len(win_fault)/4)

                    if len(win_fault) + slow_width >= len(win_space):
                        print(f'Intersection of the windows of too wide widths for dataset {name_of_dataset}')
                        win_fault_slow = win_fault.copy()
                    else:
                        win_fault_slow= win_space[:len(win_fault)  +  slow_width]

                    # NOTE(review): this takes the LAST len(win_fault_slow) samples of
                    # win_space as the false-positive region - confirm this is intended
                    # rather than "everything after the window".
                    win_fp = win_space[-len(win_fault_slow):]

                    if win_fault_slow.sum() == 0:
                        score+=A_fn
                    else:
                        # re-index so that positions inside the window are negative and
                        # the tail starts at 0, then take the first detection
                        tr = pd.Series(win_fault_slow.values,index = range(-len(win_fault), len(win_fault_slow)-len(win_fault)))
                        tr_values= tr[tr==1].index[0]
                        tr_score = sigm_scale(tr_values, A_tp,A_fp,slow_width)
                        score += tr_score
                        # NOTE(review): the false-positive penalty is only applied when
                        # the window was detected - verify against the reference code.
                        score += win_fp.sum()*A_fp
                Scores.append(score)
                Scores_perfect.append(len(alist)*A_tp)
                Scores_null.append(len(alist)*A_fn)
            return np.array([np.array(Scores),np.array(Scores_null), np.array(Scores_perfect)])

        if not isinstance(prediction, list):
            matrix = single_evaluate_nab(detecting_boundaries, prediction, table_of_coef=table_of_coef)
        else:
            # sum raw / null / perfect scores over all datasets before normalising
            matrix = np.zeros((3,3))
            for i, (boundaries_i, prediction_i) in enumerate(zip(detecting_boundaries, prediction)):
                matrix_ = single_evaluate_nab(boundaries_i, prediction_i, table_of_coef=table_of_coef,name_of_dataset=i)
                matrix = matrix + matrix_

        results = {}
        desc = ['Standart', 'LowFP', 'LowFN']
        for t, profile_name in enumerate(desc):
            results[profile_name] = round(100*(matrix[0,t]-matrix[1,t])/(matrix[2,t]-matrix[1,t]), 2)
            print(profile_name,' - ', results[profile_name])

        return results

    #=========================================================================
    if metric == 'binary':
        return binary(true, prediction)

    def single_detecting_boundaries(true, numenta_time, true_items):
        """Return [start, start + width] for every true change point."""
        if len(true_items) == 0:
            # nothing to bound; also avoids dividing by zero for the default width
            return []
        if numenta_time is not None:
            td = pd.Timedelta(numenta_time)
        else:
            td = pd.Timedelta((true.index[-1]-true.index[0])/len(true_items))
        return [[val, val + td] for val in true_items]

    if not isinstance(true, list):
        true_items = true[true==1].index
        detecting_boundaries = single_detecting_boundaries(true=true, numenta_time=numenta_time, true_items=true_items)
    else:
        detecting_boundaries = []
        for true_i in true:
            true_items = true_i[true_i==1].index
            detecting_boundaries.append(single_detecting_boundaries(true=true_i, numenta_time=numenta_time, true_items=true_items))

    if metric == 'nab':
        return evaluate_nab(detecting_boundaries, prediction)
    return average_delay(detecting_boundaries, prediction)

In [None]:
# collect the path of every CSV file found below the SKAB input directory
import os

all_files = [
    os.path.join(folder, filename)
    for folder, _subdirs, filenames in os.walk("../input/skoltech-anomaly-benchmark-skab/SKAB")
    for filename in filenames
    if filename.endswith(".csv")
]

In [None]:
# every benchmark CSV is read the same way: ';' separated, indexed by the parsed 'datetime' column
csv_options = dict(sep=';', index_col='datetime', parse_dates=True)

# files whose path contains 'anomaly-free' hold the reference run, all others carry anomalies
labelled_files = [path for path in all_files if 'anomaly-free' not in path]
anomaly_free_files = [path for path in all_files if 'anomaly-free' in path]

# datasets with anomalies
list_of_df = [pd.read_csv(path, **csv_options) for path in labelled_files]
# anomaly-free dataset (the first matching file is used)
anomaly_free_df = pd.read_csv(anomaly_free_files[0], **csv_options)

### Data description and visualization

In [None]:
# summary of the loaded benchmark: dataset count, shape and label counts
first_df = list_of_df[0]
n_cp = sum(len(frame[frame.changepoint==1.]) for frame in list_of_df)
n_outlier = sum(len(frame[frame.anomaly==1.]) for frame in list_of_df)

print(f'A number of datasets in the SkAB v1.0: {len(list_of_df)}\n')
print(f'Shape of the random dataset: {first_df.shape}\n')
print(f'A number of changepoints in the SkAB v1.0: {n_cp}\n')
print(f'A number of outliers in the SkAB v1.0: {n_outlier}\n')
print('Head of the random dataset:')
display(first_df.head())

In [None]:
# plot every column of the first dataset against its index
signals_ax = list_of_df[0].plot(figsize=(12,6))
signals_ax.set_xlabel('Time')
signals_ax.set_ylabel('Value')
signals_ax.set_title('Signals')
plt.show()

###  Labels

In [None]:
# draw the outlier labels and the changepoint labels of the first dataset on one axes
labelled_df = list_of_df[0]
labels_ax = labelled_df['anomaly'].plot(figsize=(12,3))
labelled_df['changepoint'].plot(ax=labels_ax)
labels_ax.legend()
plt.show()

### Applying the method

In [None]:
def scoreLuminolALLData(ts_dict):
    """Return the luminol anomaly score of every point of a value sequence.

    ``ts_dict`` is a sequence of values (despite its name). It is turned
    into a ``{position: value}`` dict, passed to luminol's
    ``AnomalyDetector``, and the resulting scores are returned as a list in
    the iteration order of the score series.
    """
    # the positional index of the values serves as the timestamp
    indexed_values = pd.Series(np.array(ts_dict)).to_dict()
    all_scores = anomaly_detector.AnomalyDetector(indexed_values).get_all_scores()
    return [value for _timestamp, value in all_scores.iteritems()]

In [None]:
# inference: score every dataset with luminol, then derive outlier and change-point flags
predicted_outlier, predicted_cp = [], []
for df in list_of_df:
    # keep only the flow-rate signal; the other sensors and both label columns are dropped
    X_train = df.drop(['Accelerometer1RMS','Accelerometer2RMS', 'Current', 'Temperature', 'Thermocouple', 'Voltage', 'Pressure', 'anomaly','changepoint'], axis=1)
    ts_s = scoreLuminolALLData(X_train['Volume Flow RateRMS'].values)

    # absolute difference between the luminol scores and the raw signal
    errors, cov, mean = Score_data(ts_s , X_train['Volume Flow RateRMS'].values)
    # Mahala_distantce is elementwise, so the whole error vector is scored in one call
    mahala_dist = Mahala_distantce(errors, mean, cov)

    X_train['pca1_value'] = X_train['Volume Flow RateRMS']
    X_train['pca1_scores'] = mahala_dist
    X_train['pca1_scores_norm'] = scale(mahala_dist)

    # NOTE(review): the "first quartile" is taken at the 10 % quantile, not 25 % -
    # presumably a deliberate tuning choice; confirm.
    q1_pc1, q3_pc1 = X_train['pca1_scores'].quantile([0.10, 0.75])
    iqr_pc1 = q3_pc1 - q1_pc1
    # upper and lower outlier bounds (1.5 x inter-quantile range)
    lower_pc1 = q1_pc1 - (1.5*iqr_pc1)
    upper_pc1 = q3_pc1 + (1.5*iqr_pc1)
    # flag every score outside the bounds
    X_train['outlier_pca1'] = ((X_train['pca1_scores']>upper_pc1) | (X_train['pca1_scores']<lower_pc1)).astype('int')

    # smooth the flags with a rolling median over 3 samples; the leading NaNs become 0
    prediction = pd.Series(X_train['outlier_pca1'],
                           index=df.index).rolling(3).median().fillna(0).replace(-1,0)

    # predicted outliers saving
    predicted_outlier.append(prediction)

    # predicted CPs saving: a change point is any switch of the outlier flag
    prediction_cp = prediction.diff().abs()
    # diff() leaves NaN in the first position; use .iloc because the index is
    # datetime-based and integer keys must not be interpreted as labels
    prediction_cp.iloc[0] = prediction.iloc[0]
    predicted_cp.append(prediction_cp)


# Results

Selecting the true changepoint indices

In [None]:
# ground-truth changepoint labels, one Series per dataset
true_cp = [frame['changepoint'] for frame in list_of_df]

# overlay predictions and ground truth of the first dataset
cp_ax = predicted_cp[0].plot(figsize=(12,3), label='predictions', marker='o', markersize=5)
true_cp[0].plot(ax=cp_ax, marker='o', markersize=2)
cp_ax.legend();

Selecting the true outlier indices

In [None]:
# ground-truth outlier labels, one Series per dataset
true_outlier = [frame['anomaly'] for frame in list_of_df]

# overlay predictions and ground truth of the first dataset
outlier_ax = predicted_outlier[0].plot(figsize=(12,3), label='predictions', marker='o', markersize=5)
true_outlier[0].plot(ax=outlier_ax, marker='o', markersize=2)
outlier_ax.legend();

# Metrics calculation

Calculating the binary classification metrics

In [None]:
# F1 plus false-alarm and missing-alarm rates of the outlier predictions over all datasets
binary = evaluating_change_point(
    true=true_outlier,
    prediction=predicted_outlier,
    metric='binary',
    numenta_time='30 sec',
)

Calculating the average detection delay

In [None]:
# mean delay of the change-point detections, using 30-second detection windows
add = evaluating_change_point(
    true=true_cp,
    prediction=predicted_cp,
    metric='average_delay',
    numenta_time='30 sec',
)

Calculating the NAB metric

In [None]:
# NAB scores of the change-point detections, using 30-second detection windows
nab = evaluating_change_point(
    true=true_cp,
    prediction=predicted_cp,
    metric='nab',
    numenta_time='30 sec',
)

#### Visualizations

In [None]:
# X_train and df still refer to the dataset handled last by the inference loop;
# copy its ground-truth labels next to the computed scores
for label_column in ('anomaly', 'changepoint'):
    X_train[label_column] = df[label_column]

In [None]:
# score curve with the ground-truth anomalies marked on top
labelled_anomalies = X_train.loc[X_train['anomaly'] == 1]
plt.figure(figsize=(18,6))
plt.plot(X_train[['pca1_scores']], color='blue', label='Inline')
plt.plot(labelled_anomalies[['pca1_scores']], linestyle='none', marker='X', color='red', markersize=12, label='Anomaly')
plt.xlabel('Series')
plt.ylabel('Readings')
plt.title('True Anomaly')
plt.legend(loc='best')
plt.show()

In [None]:
# score curve with the predicted outliers marked on top
flagged_outliers = X_train.loc[X_train['outlier_pca1'] == 1]
plt.figure(figsize=(18,6))
plt.plot(X_train[['pca1_scores']], color='blue', label='Inline')
plt.plot(flagged_outliers[['pca1_scores']], linestyle='none', marker='X', color='red', markersize=12, label='Anomaly')
plt.xlabel('Series')
plt.ylabel('Readings')
plt.title('Anomaly')
plt.legend(loc='best')
plt.show()

In [None]:
# running total of the normalised scores over all readings
N = X_train.shape[0]
cumulative_norm_score = X_train['pca1_scores_norm'][:N].cumsum()
plt.scatter(range(N), cumulative_norm_score, marker='1', label='PCA ')
plt.xlabel('Readings')
plt.ylabel('anomalies frequency')
plt.legend()
plt.show()

In [None]:
# histograms of the normalised scores, split by the predicted outlier flag
labels=['Positive','Negative']
norm_scores = X_train['pca1_scores_norm']
for flag, colour, caption in ((1, 'green', labels[0]), (0, 'red', labels[1])):
    plt.hist(norm_scores[X_train['outlier_pca1']==flag], density=False, bins=100,
             alpha=.5, color=colour, label=caption)
plt.axvline(.5, color='blue', linestyle='--', label='decision boundary')
plt.title('Distributions', size=13)
plt.xlabel('Norm values', size=13)
plt.ylabel('Readings (norm.)', size=13)
plt.legend(loc="upper right")

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# predicted outlier flags versus the ground-truth anomaly labels
y_true, y_pred = X_train['anomaly'], X_train['outlier_pca1']
print(classification_report(y_true, y_pred))
confusion_matrix(y_true, y_pred)

In [None]:
# predicted outlier flags versus the ground-truth changepoint labels
y_true, y_pred = X_train['changepoint'], X_train['outlier_pca1']
print(classification_report(y_true, y_pred))
confusion_matrix(y_true, y_pred)

In [None]:
from sklearn.metrics import roc_auc_score
# roc_auc_score takes (y_true, y_score): ground-truth labels first, prediction second
roc_auc_score(X_train['anomaly'], X_train['outlier_pca1'])

In [None]:
# roc_auc_score takes (y_true, y_score): ground-truth labels first, prediction second
roc_auc_score(X_train['changepoint'], X_train['outlier_pca1'])