In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
def Score_data(pred, real):
    """Compute absolute prediction errors and their mean and variance.

    Parameters
    ----------
    pred, real : array-like
        Predicted and observed values with matching (or broadcastable)
        shapes. Lists, NumPy arrays and pandas Series are accepted; values
        are compared positionally.

    Returns
    -------
    errors : numpy.ndarray
        Flattened 1-D array of ``|pred - real|``.
    cov : float
        Population variance of ``errors`` (divides by ``len(errors)``).
    mean : float
        Arithmetic mean of ``errors``.

    Raises
    ------
    ValueError
        If the inputs contain no elements.

    Notes
    -----
    The mean and variance are also printed, as a side effect.
    """
    # np.asarray makes plain lists and pandas Series usable here: lists do not
    # support "-" and Series have no .flatten().
    errors = np.abs(np.asarray(pred) - np.asarray(real)).ravel()
    if errors.size == 0:
        raise ValueError("Score_data needs at least one prediction/observation pair")

    # Vectorised estimates; ndarray.var defaults to ddof=0, i.e. the same
    # "divide by n" estimator as the explicit loop this replaces.
    mean = errors.mean()
    cov = errors.var()

    print('mean : ', mean)
    print('cov : ', cov)
    return errors, cov, mean

# Squared, variance-normalised distance of a value from the mean
# (the one-dimensional Mahalanobis distance, squared).
def Mahala_distantce(x,mean,cov):
    """Return ``(x - mean)**2 / cov``.

    ``x`` may be a scalar or an array; the arithmetic is applied as written,
    so no guard is made against ``cov`` being zero.
    """
    deviation = x - mean
    squared_deviation = deviation ** 2
    return squared_deviation / cov


def scale(A):
    """Min-max scale ``A`` so its smallest value maps to 0 and its largest to 1.

    The result is ``(A - min(A)) / (max(A) - min(A))``; a constant input
    therefore divides by zero.
    """
    lowest = np.min(A)
    value_range = np.max(A) - lowest
    return (A - lowest) / value_range


def stats_dfs(path):
    """Load a ``;``-separated CSV and print a short label summary.

    Printed, each preceded by a divider line: the path, the frame's shape,
    and for the ``anomaly`` and ``changepoint`` columns the value counts and
    the same counts as a percentage of all rows.

    Returns the loaded DataFrame.
    """
    divider = "\n_________________\n"
    frame = pd.read_csv(path, sep=";")

    def emit(item):
        # Every report item is introduced by the divider.
        print(divider)
        print(item)

    emit(path)
    emit(frame.shape)
    for column in ("anomaly", "changepoint"):
        counts = getattr(frame, column).value_counts()
        emit(counts)
        emit(counts / frame.shape[0] * 100)
    return frame


def stats_dfs_freeanomaly(path):
    """Load a ``;``-separated CSV, print its path and shape, and return it.

    Unlike ``stats_dfs`` this does not touch any label columns.
    """
    divider = "\n_________________\n"
    frame = pd.read_csv(path, sep=";")
    for item in (path, frame.shape):
        print(divider)
        print(item)
    # Closing divider after the last item.
    print(divider)
    return frame


In [None]:
# SKAB valve2 recording used for the rest of the analysis.
valve2_csv = "/kaggle/input/skoltech-anomaly-benchmark-skab/SKAB/valve2/1.csv"
list_df_1 = [valve2_csv]
# Load it (this also prints the label statistics) and preview the first rows.
df = stats_dfs(valve2_csv)
df.head()

In [None]:
# Hold out the rows from index label 550 onward as the evaluation segment
# (`df` itself is left untouched).
# .copy() makes `test` an independent frame: later cells add columns to it,
# and assigning into a bare slice of `df` raises SettingWithCopyWarning.
test = df.loc[550:].copy()

In [None]:
# SKAB anomaly-free recording.
anomaly_free_csv = "/kaggle/input/skoltech-anomaly-benchmark-skab/SKAB/anomaly-free/anomaly-free.csv"
list_a_free = [anomaly_free_csv]
# Load it (printing path and shape) and preview the first rows.
df_a_free = stats_dfs_freeanomaly(anomaly_free_csv)
df_a_free.head()

In [None]:
# Load the combined SKAB file and take a first look at it.
raw_data = pd.read_csv("../input/benckmark-anomaly-timeseries-skab/alldata_skab.csv")
print(raw_data.columns)
print(raw_data.head())

# Class balance of the two label columns.
print("anomaly ", raw_data.anomaly.value_counts())
print("changepoint ",raw_data.changepoint.value_counts())

# Plot the flow-rate signal together with both label columns.
plot_columns = ['Volume Flow RateRMS', 'anomaly', 'changepoint']
plot_frame = pd.DataFrame(
    raw_data[plot_columns].values,
    columns=plot_columns,
    index=raw_data.index,
)
plot_frame.plot(figsize=(12,6))

plt.xlabel('Values')
plt.ylabel('Values')
plt.title('Residuals')
plt.show()

In [None]:
# Plot every column of the valve2 frame against time.
raw_data = df.copy()
# NOTE(review): assumes the `datetime` column holds parseable timestamp
# strings — confirm against the CSV.
raw_data['datetime'] = pd.to_datetime(raw_data['datetime'])
# DataFrame.set_index returns a new frame, so the result has to be assigned.
# Without it the datetime strings stay among the values, the frame rebuilt
# from `.values` is object-typed, and there is nothing numeric to plot.
raw_data = raw_data.set_index('datetime')
raw_data.plot(figsize=(12,6))
plt.xlabel('Time')
plt.ylabel('Residuals')
plt.title('Residuals')
plt.show()

In [None]:
import matplotlib.pyplot as plt# Standardize/scale the dataset and apply PCA
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
# Column(s) fed to the scaler / PCA below. Only the flow-rate signal is used;
# the full sensor set is kept here, disabled, for reference:
# x = df[['Accelerometer1RMS', 'Accelerometer2RMS', 'Current', 'Pressure', 'Temperature', 'Thermocouple', 'Voltage', 'Volume Flow RateRMS']]
feature_columns = ['Volume Flow RateRMS']
x = df[feature_columns]

In [None]:
# Standardise the feature matrix, then run a full PCA on it.
# (Disabled 8-feature variant: pipeline.fit(x.values.reshape(-8, 8)))
pca = PCA()
scaler = StandardScaler()
pipeline = make_pipeline(scaler, pca)
# One column per feature; a single feature gives shape (n_samples, 1).
feature_matrix = x.values.reshape(-1, 1)
pipeline.fit(feature_matrix)

In [None]:
# Bar chart of the variance explained by each principal component.
features = range(pca.n_components_)
fig, ax = plt.subplots(figsize=(15, 5))
ax.bar(features, pca.explained_variance_)
ax.set_xlabel('PCA feature')
ax.set_ylabel('Variance')
ax.set_xticks(features)
ax.set_title("Importance of the Principal Components based on inertia")
plt.show()

In [None]:
# Disabled 8-component variant, kept for reference:
# pca = PCA(n_components=8)
# principalComponents = pca.fit_transform(x.values.reshape(-8,8))
# principalDf = pd.DataFrame(data = principalComponents, columns = ['pc1', 'pc2', 'pc3', 'pc4', 'pc5', 'pc6', 'pc7', 'pc8'])

# Project the selected feature onto a single principal component.
# Note: this PCA is fitted on the raw values of `x`, not on the output of
# the scaler used in the pipeline.
pca = PCA(n_components=1)
feature_matrix = x.values.reshape(-1,1)
principalComponents = pca.fit_transform(feature_matrix)
principalDf = pd.DataFrame(principalComponents, columns=['pc1'])


In [None]:
from statsmodels.tsa.stattools import adfuller
# Augmented Dickey-Fuller test on the first principal component.
result = adfuller(principalDf['pc1'])
# Second element of the result is the p-value: show whether it exceeds the
# 0.05 level, followed by the value itself.
p_value = result[1]
print(p_value > 0.05, p_value)

The test's p-value is very small (much smaller than 0.05). Thus, I reject the null hypothesis and conclude that the data is stationary.

## Using PCA1 component with AR model

In [None]:
from statsmodels.tsa.arima_model import ARIMA

# follow lag
model_ar = ARIMA(principalDf['pc1'].loc[550:], order=(1,1,0))  
results_ARIMA_ar = model_ar.fit(disp=-1)

In [None]:
# Forecast
fc, se, conf = results_ARIMA_ar.forecast(513, alpha=0.05)  # 95% conf

In [None]:
# Make as pandas series
fc_series = pd.Series(fc, index=test.index)
lower_series = pd.Series(conf[:, 0], index=test.index)
upper_series = pd.Series(conf[:, 1], index=test.index)

# Plot
plt.figure(figsize=(12,5), dpi=100)
plt.plot(principalDf['pc1'].loc[550:], label='training') # 550, train
plt.plot(principalDf['pc1'].loc[:550], label='actual')  # 513, test
plt.plot(fc_series, label='forecast')
plt.fill_between(lower_series.index, lower_series, upper_series, 
                 color='k', alpha=.15)
plt.title('Forecast vs Actuals')
plt.legend(loc='upper left', fontsize=8)
plt.show()

In [None]:
# Absolute forecast errors on the held-out rows, with their mean and variance.
actual_values = principalDf['pc1'].loc[550:].values
errors, cov, mean = Score_data(fc_series.values , actual_values)

# One distance score per error value.
mahala_dist = [Mahala_distantce(e, mean, cov) for e in errors]



In [None]:
# Attach the component value, the raw score and the min-max scaled score
# to the held-out frame.
new_columns = {
    'pca1_value': principalDf['pc1'].loc[550:],
    'pca1_scores': mahala_dist,
    'pca1_scores_norm': scale(mahala_dist),
}
for column_name, column_values in new_columns.items():
    test[column_name] = column_values

# Distribution of the scaled scores.
fig, ax = plt.subplots(figsize=(12, 8))
ax.hist(test['pca1_scores_norm'], bins=50);

In [None]:
# Tukey-style fences on the raw scores. Note that the "quartiles" here are
# the 10th and 60th percentiles, not the conventional 25th / 75th.
scores = test['pca1_scores']
score_quantiles = scores.quantile([0.10, 0.60])
q1_pc1 = score_quantiles.iloc[0]
q3_pc1 = score_quantiles.iloc[1]
iqr_pc1 = q3_pc1 - q1_pc1

# Fences sit 1.5 "IQR" beyond either quantile.
lower_pc1 = q1_pc1 - (1.5*iqr_pc1)
upper_pc1 = q3_pc1 + (1.5*iqr_pc1)

# 1 where the score falls outside the fences, 0 otherwise.
is_outlier = (scores > upper_pc1) | (scores < lower_pc1)
test['outlier_pca1'] = is_outlier.astype('int')
test['outlier_pca1'].value_counts()

In [None]:
# fig, axes = plt.subplots(nrows=2, figsize=(15,10))
# axes[0].plot(test[['pca1_scores']], color='blue')
# axes[1].plot(np.array(mahala_dist).ravel(), color='red')

# axes[0].set_title('original data', fontsize=20)
# axes[1].set_title('outlier score', fontsize=20)

# # axes[0].grid()
# # axes[1].grid()
# plt.tight_layout()
# plt.show()

In [None]:
# Score series with the labelled anomalies marked.
a = test[test['anomaly'] == 1]
fig, ax = plt.subplots(figsize=(18,6))
ax.plot(test[['pca1_scores']], color='blue', label='Inline')
ax.plot(
    a[['pca1_scores']],
    linestyle='none',
    marker='X',
    color='red',
    markersize=12,
    label='Anomaly',
)
ax.set_xlabel('Series')
ax.set_ylabel('Readings')
ax.set_title('True Anomaly')
ax.legend(loc='best')
plt.show();

In [None]:
# Score series with the rows flagged by the outlier rule marked.
a = test[test['outlier_pca1'] == 1]
fig, ax = plt.subplots(figsize=(18,6))
ax.plot(test[['pca1_scores']], color='blue', label='Inline')
ax.plot(
    a[['pca1_scores']],
    linestyle='none',
    marker='X',
    color='red',
    markersize=12,
    label='Anomaly',
)
ax.set_xlabel('Series')
ax.set_ylabel('Readings')
ax.set_title('Anomaly')
ax.legend(loc='best')
plt.show();

In [None]:
# Running total of the scaled scores over the held-out rows.
N = test.shape[0]
cumulative_scores = test['pca1_scores_norm'][:N].cumsum()
fig, ax = plt.subplots()
ax.scatter(range(N), cumulative_scores, marker='1', label='PCA ')
ax.set_xlabel('Readings')
ax.set_ylabel('anomalies frequency')
ax.legend()
plt.show()

In [None]:
# Histograms of the scaled scores, split by the outlier flag.
labels=['Positive','Negative']
fig, ax = plt.subplots()
for flag, colour, legend_label in ((1, 'green', labels[0]), (0, 'red', labels[1])):
    flagged_scores = test[test['outlier_pca1']==flag]['pca1_scores_norm']
    ax.hist(flagged_scores, density=False, bins=100,
            alpha=.5, color=colour, label=legend_label)
# Reference line at 0.5 on the scaled axis.
ax.axvline(.5, color='blue', linestyle='--', label='decision boundary')
# ax.set_xlim([0,1])
ax.set_title('Distributions', size=13)
ax.set_xlabel('Norm values', size=13)
ax.set_ylabel('Readings (norm.)', size=13)
ax.legend(loc="upper right")

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Outlier flags scored against the `anomaly` labels (truth first, prediction second).
anomaly_truth = test['anomaly']
outlier_flags = test['outlier_pca1']
print(classification_report(anomaly_truth, outlier_flags))
confusion_matrix(anomaly_truth, outlier_flags)

In [None]:
# Same outlier flags, scored against the `changepoint` labels.
changepoint_truth = test['changepoint']
outlier_flags = test['outlier_pca1']
print(classification_report(changepoint_truth, outlier_flags))
confusion_matrix(changepoint_truth, outlier_flags)

In [None]:
from sklearn.metrics import roc_auc_score

# roc_auc_score takes (y_true, y_score): the ground-truth `anomaly` labels go
# first and the detector output second. With the arguments the other way
# round the predictions are treated as the truth and the AUC is wrong.
# NOTE(review): a non-thresholded score column would give a fuller ROC curve
# than the 0/1 flag — consider it if the score is monotone in "anomalousness".
roc_auc_score(test['anomaly'], test['outlier_pca1'])

In [None]:
# roc_auc_score takes (y_true, y_score): the ground-truth `changepoint` labels
# go first, the detector output second.
roc_auc_score(test['changepoint'], test['outlier_pca1'])