In [None]:
# This cell only sets up the environment and lists the input files; it can be skimmed.
# Import numpy (array processing) and pandas (DataFrame processing).
# The DataFrame concept comes from the data-analysis language R and resembles a single Excel sheet.
# `import X as Y` imports package X under the alias Y, which is shorter to type in later cells.
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

读取csv文件

此处调用pandas的read_csv函数，其中可以指定是否读取首行首列，是否解析时间，是否指定索引

此处我们只是读取csv，并未指定其他参数，实际上使用的是默认的参数，具体的参数使用请见[官网](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html)

In [None]:
# Load the series with read_csv's default options (no index column, no date parsing).
df=pd.read_csv("/kaggle/input/nab/realKnownCause/realKnownCause/ec2_request_latency_system_failure.csv")

这个数据是一台电脑的CPU使用率数据，可以知道数据范围应该为0-100

In [None]:
# Preview the first two rows of the loaded CSV.
df.head(2)

通过查看内容我们可以看到，数据里面有两列数据，其中一列为时间戳，一列为值。可以知道我们现在检测的只是其值的异常，这里检测的并不是时间序列上的异常。这是因为我们的数据在时间序列上：1、无趋势性  2、无周期性  3、无季节性  ，所以检测的只是值的异常，而非从时间序列上对异常值进行观测

In [None]:
# Show descriptive statistics (count, mean, std, min, quartiles, max).
df.describe()

通过描述性统计我们可以看到，总共有4032行数据，其中平均值为45（代表平均使用率为45%），标准差为2.28(代表方差较小，可知数据大多集中分布在平均值左右)，最小值为22，最大值为99，中位数为45.

25%的值小于43, 50%小于45 ， 75%小于46. 那么可以先考虑从统计学意义上计算异常值。计算IQR=Q3-Q1=46-43=3 ,所以正常范围为43-1.5\*3，46+1.5\*3，则为38.5-50.5，超出此范围则可认为是异常值。

In [None]:
# Convert the timestamp column to pandas datetime values (overwrites the column in place).

df['timestamp']=pd.to_datetime(df['timestamp'])

In [None]:
# Draw a line chart of value over time.

import plotly.express as px

px.line(df,x='timestamp',y='value')

我们可以从图中发现三个特别明显的异常值

In [None]:
# Add an `hour` column (hour of day taken from the timestamp) and draw one box plot of `value` per hour.
# Note: this adds a third column to `df`, which later cells that inspect `df` will see.
df['hour']=df['timestamp'].dt.hour
px.box(df,x='hour',y='value')

大量的异常值出现在三点

In [None]:
# Histogram of the raw values to inspect their distribution.
px.histogram(df['value'])

画出直方图，可以看出数据多分布在40-50以内

In [None]:
# Import the detection algorithms from scikit-learn.
# One-Class SVM
from sklearn.svm import OneClassSVM
# Isolation Forest
from sklearn.ensemble import IsolationForest
# Local Outlier Factor
from sklearn.neighbors import LocalOutlierFactor

我们使用EM（Excess-Mass）值来评估异常检测的性能：该指标只依赖模型给出的异常分数，不需要真实标签，EM值越高说明模型越好。

In [None]:
#此部分不用看
import numpy as np
from sklearn.metrics import auc

#Source:https://github.com/ngoix/EMMV_benchmarks/blob/master/em.py

def em(t, t_max, volume_support, s_unif, s_X, n_generated):
    """Build an Excess-Mass (EM) curve from anomaly scores and integrate it.

    For every distinct score level ``u`` found in ``s_X`` the candidate curve is
    ``(fraction of s_X above u) - t * (count of s_unif above u) / n_generated * volume_support``;
    the returned curve is the element-wise maximum over all levels, with its
    first entry initialised to 1.

    Parameters
    ----------
    t : array of levels at which the curve is evaluated.
    t_max : the curve is integrated up to the first index where it is <= t_max.
    volume_support : scalar multiplied into the uniform-sample term.
    s_unif : scores of the uniformly generated samples.
    s_X : scores of the observed samples.
    n_generated : number of uniform samples used as the denominator.

    Returns
    -------
    (AUC, EM_t, amax) : area under the truncated curve, the full curve, and
    the truncation index. When the truncation index comes out as 1 a message
    is printed and ``amax`` is set to -1, so the slice drops only the last point.
    """
    n_samples = s_X.shape[0]

    curve = np.zeros(t.shape[0])
    curve[0] = 1.

    for level in np.unique(s_X):
        # Same evaluation order as the reference implementation.
        data_term = 1. / n_samples * (s_X > level).sum()
        unif_term = t * (s_unif > level).sum() / n_generated * volume_support
        curve = np.maximum(curve, data_term - unif_term)

    # argmax over a boolean array yields the first True (or 0 if there is none).
    amax = np.argmax(curve <= t_max) + 1
    if amax == 1:
        print("failed to achieve t_max")
        amax = -1

    AUC = auc(t[:amax], curve[:amax])
    return AUC, curve, amax

In [None]:
# Check the DataFrame's shape (rows, columns).
df.shape

可见共有4032行数据。注意：原始数据只有 timestamp 和 value 两列，但前面的单元格已经添加了 hour 列，因此此时数据框共有三列；后面建模时只使用 value 这一个维度。

In [None]:
# Set-up for the Excess-Mass evaluation (boilerplate; can be skimmed).
# parameters of the algorithm:
n_generated = 100000   # number of uniform reference samples
t_max = 0.9            # the EM curve is integrated until it drops to this value

# Range of the observed values; for this 1-D series the "volume" is just max - min.
lim_inf = df['value'].values.min(axis=0)
lim_sup = df['value'].values.max(axis=0)
volume_support = (lim_sup - lim_inf).prod()
t = np.arange(0, 100 / volume_support, 0.01 / volume_support)

# Uniform reference samples over the observed range. A seeded generator is used so
# that the EM values reported at the end are identical on every Restart & Run All.
unif = np.random.default_rng(0).uniform(lim_inf, lim_sup, size=(n_generated, 1))

#### One Class SVM

In [None]:
# Instantiate the estimator with default hyper-parameters, then fit it and label every sample.
one_svm = OneClassSVM()
one_svm_result = one_svm.fit_predict(df[['value']].values)

# New frame holding the inputs next to the prediction.
one_svm_result_df = df[['timestamp', 'value']].copy()

# Recode sklearn's -1 (outlier) / 1 (inlier) into 1 (anomaly) / 0 (normal) so the plotting
# cells can use the same convention for every model.
one_svm_result_df['anomaly'] = [int(label == -1) for label in one_svm_result]

# Scores on the data and on the uniform reference samples, fed to the EM evaluation (can be skimmed).
s_X_ocsvm = one_svm.decision_function(df[['value']].values).ravel()
s_unif_ocsvm = one_svm.decision_function(unif).ravel()
auc_ocsvm, em_ocsvm, amax_ocsvm = em(
    t, t_max, volume_support, s_unif_ocsvm, s_X_ocsvm, n_generated
)

In [None]:
# The mean EM value of every model is collected in two parallel lists
# (boilerplate; can be skimmed).
em_values=[]
model_name=[]
em_values.append(em_ocsvm.mean())
# Label shown in the final comparison table (spelling corrected from "One Clas SVM").
model_name.append("One Class SVM")

In [None]:
# Count how many samples were labelled normal (0) and anomalous (1).
one_svm_result_df['anomaly'].value_counts()

In [None]:
# Plot with plotly: the whole series as a line, the flagged samples as markers.
import plotly.graph_objects as go

# Rows the model labelled as anomalies.
a = one_svm_result_df.loc[one_svm_result_df['anomaly'].eq(1)]

fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=one_svm_result_df['timestamp'],
        y=one_svm_result_df['value'],
        mode='lines',
        name='lines',
    )
)
fig.add_trace(
    go.Scatter(x=a['timestamp'], y=a['value'], mode='markers', name='markers')
)
fig.update_layout(title='Anomaly detection using One Class SVM')
fig.show("notebook")

可见检测出的异常值较多，并非我们所期望的，于是重新调整nu值，通过修改参数来改善拟合结果。

In [None]:
import plotly.graph_objects as go

# Try several values of nu (other hyper-parameters can be varied too) and keep the one
# that looks best. It is normal for the algorithm not to fit the data perfectly.
one_svm=OneClassSVM(nu=0.03)
one_svm_result=one_svm.fit_predict(df['value'].values.reshape(-1,1))
one_svm_result_df=pd.DataFrame()
one_svm_result_df['timestamp']=df['timestamp']
one_svm_result_df['value'] = df['value']

# sklearn labels inliers 1 and outliers -1; recode to 1 = anomaly, 0 = normal.
one_svm_result_df['anomaly']  = [1 if i==-1 else 0 for i in one_svm_result]
s_X_ocsvm = one_svm.decision_function(df['value'].values.reshape(-1,1)).reshape(1, -1)[0]
s_unif_ocsvm = one_svm.decision_function(unif).reshape(1, -1)[0]
auc_ocsvm, em_ocsvm, amax_ocsvm = em(t, t_max, volume_support,s_unif_ocsvm, s_X_ocsvm, n_generated)

# Restart the per-model EM lists so that only this tuned One-Class SVM is recorded.
em_values=[]
model_name=[]
em_values.append(em_ocsvm.mean())
# Label shown in the final comparison table (spelling corrected from "One Clas SVM").
model_name.append("One Class SVM")

fig = go.Figure()

# Whole series as a line.
fig.add_trace(go.Scatter(x=one_svm_result_df['timestamp'], y=one_svm_result_df['value'],
                    mode='lines',
                    name='lines'))

# Flagged samples as markers.
a=one_svm_result_df[one_svm_result_df['anomaly']==1]

fig.add_trace(go.Scatter(x=a.timestamp, y=a.value,
                    mode='markers',
                    name='markers'))

fig.update_layout(title='Anomaly detection using One Class SVM')
fig.show("notebook")

可以看到，用ocsvm进行预测时得到的效果不太好，此时可以考虑换用其他算法进行处理，而不要只依赖这一种算法。

In [None]:
# Alternative approach: a purely statistical rule that treats values outside the IQR fences as anomalies.

def detect_IQR(df, feature, k=3):
    """Return the lower and upper IQR fences of one column.

    Parameters
    ----------
    df : DataFrame containing the column.
    feature : name of the column to analyse.
    k : fence multiplier. The default of 3 keeps the original behaviour
        (only far-out points are flagged); pass 1.5 for the conventional
        box-plot fences.

    Returns
    -------
    (lower, upper) : ``Q1 - k * IQR`` and ``Q3 + k * IQR``, where Q1 and Q3 are
    the 25th and 75th percentiles and ``IQR = Q3 - Q1``.
    """
    q1 = df[feature].quantile(q=0.25)
    q3 = df[feature].quantile(q=0.75)
    iqr = q3 - q1
    return q1 - k * iqr, q3 + k * iqr
import plotly.graph_objects as go

# Lower / upper fence for the value column.
下界点, 上界点 = detect_IQR(df, "value")
print(下界点, 上界点)

IQR_result_df = df[['timestamp', 'value']].copy()

# Rows outside the fences get anomaly = 1; every other row is left as NaN here.
IQR_result_df.loc[
    IQR_result_df["value"].lt(下界点) | IQR_result_df["value"].gt(上界点),
    "anomaly",
] = 1

# Rows flagged by the IQR rule.
a = IQR_result_df.loc[IQR_result_df['anomaly'].eq(1)]

fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=IQR_result_df['timestamp'],
        y=IQR_result_df['value'],
        mode='lines',
        name='lines',
    )
)
fig.add_trace(
    go.Scatter(x=a['timestamp'], y=a['value'], mode='markers', name='markers')
)
fig.update_layout(title='Anomaly detection using IQR')
fig.show("notebook")

可以看到利用统计学进行计算得到的结果是比较好的，也是比较符合我们的期望的。

In [None]:
# From here on the IQR flags are treated as the ground-truth anomalies and are used to
# tune OneClassSVM's hyper-parameters.
# Note: df_true is another name for IQR_result_df (no copy is made), so the clean-up
# below is visible through both names.
df_true = IQR_result_df
print(df_true.head())

# Rows that were never flagged hold NaN: turn them into 0 and store the column as integers.
df_true["anomaly"] = df_true["anomaly"].fillna(0).astype("int")
print(df_true["anomaly"].unique())

In [None]:
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import f1_score,precision_score,recall_score

# Hold out a test split, then carve a validation split out of the remainder.
X_trainval,X_test,y_trainval,y_test = train_test_split(df_true["value"].values.reshape(-1,1),df_true["anomaly"].values.reshape(-1,1),random_state=0)
X_train ,X_val,y_train,y_val = train_test_split(X_trainval,y_trainval,random_state=1)

# The one-class model is fitted on the normal samples only; this does not depend on nu.
trainX = X_train[y_train==0].reshape(-1,1)

# Translate the 0/1 ground truth into sklearn's convention (inlier 1, outlier -1) ONCE.
# Doing this in place inside the loop re-maps the already converted labels on the
# second pass (every 1 becomes -1), which makes all later F1 scores meaningless.
y_val_signed = np.where(y_val.ravel() == 1, -1, 1)

# grid search start
best_score = 0
best_parameters = None  # stays None if no candidate scores above 0
for nu in [0.001,0.0001,0.004,0.01,0.03,0.05,0.00001,0.000001]:
    # Train once per candidate value.
    ocsvm = OneClassSVM(nu=nu)
    ocsvm.fit(trainX)
    yhat = ocsvm.predict(X_val)
    # F1 of the outlier class; precision_score / recall_score accept the same arguments.
    score = f1_score(y_val_signed, yhat, pos_label=-1)
    print(nu,score)
    # Keep the best-performing parameter.
    if score > best_score:
        best_score = score
        best_parameters= {'nu':nu}
        print(best_parameters,best_score)

可以看到nu值越小，f1值越大，说明此时效果比较好。


In [None]:
import plotly.graph_objects as go

# nu is hard-coded to the smallest candidate tried in the grid search.
ocsvm = OneClassSVM(nu=0.000001)
# Fit on the samples labelled normal (anomaly == 0) only.
trainX = X_train[y_train==0].reshape(-1,1)
ocsvm.fit(trainX)
# Despite its name, y_true holds the model's PREDICTIONS for the whole series.
y_true = ocsvm.predict(df_true["value"].values.reshape(-1,1))
one_svm_result_df=pd.DataFrame()
one_svm_result_df['timestamp']=df_true['timestamp']
one_svm_result_df['value'] = df_true['value']

# sklearn labels inliers 1 and outliers -1; recode to 1 = anomaly, 0 = normal.
one_svm_result_df['anomaly']  = [1 if i==-1 else 0 for i in y_true]

fig = go.Figure()

# Plot the One-Class SVM predictions computed above. The original cell plotted
# IQR_result_df here, i.e. the IQR labels under an OneClassSVM title.
fig.add_trace(go.Scatter(x=one_svm_result_df['timestamp'], y=one_svm_result_df['value'],
                    mode='lines',
                    name='lines'))

a=one_svm_result_df[one_svm_result_df['anomaly']==1]

fig.add_trace(go.Scatter(x=a.timestamp, y=a.value,
                    mode='markers',
                    name='markers'))

fig.update_layout(title='Anomaly detection using OneClassSVM')
fig.show("notebook")

In [None]:
# Display the raw -1/1 label array.
# NOTE(review): `one_svm_result` was last assigned by the nu=0.03 fit; the tuned model's
# predictions are stored in `y_true` -- confirm which one is meant to be shown here.
one_svm_result

总体上来看我们认为ocsvm的效果并不是最好

### Isolation Forest

因此我们可以选择使用孤立森林来进行异常检测

* The lower, the more abnormal.
* Negative scores represent outliers, positive scores represent inliers.

In [None]:
# Isolation Forest is randomised; fix the seed so labels and EM values are reproducible.
iso=IsolationForest(random_state=0)
iso_result=iso.fit_predict(df['value'].values.reshape(-1,1))
iso_result_df=pd.DataFrame()
iso_result_df['timestamp']=df['timestamp']
iso_result_df['value'] = df['value']

# sklearn labels inliers 1 and outliers -1; recode to 1 = anomaly, 0 = normal.
iso_result_df['anomaly']  = [1 if i==-1 else 0 for i in iso_result]

# Scores on the data and on the uniform reference samples for the EM evaluation.
s_X_iso = iso.decision_function(df['value'].values.reshape(-1,1)).reshape(1, -1)[0]
s_unif_iso = iso.decision_function(unif).reshape(1, -1)[0]
auc_iso, em_iso, amax_iso = em(t, t_max, volume_support,s_unif_iso, s_X_iso, n_generated)

In [None]:
# Record Isolation Forest's mean EM value for the final comparison table.
# Re-running this cell appends a duplicate entry.
em_values.append(em_iso.mean())
model_name.append("Isolation Forest")

In [None]:
# Count how many samples were labelled normal (0) and anomalous (1).
iso_result_df['anomaly'].value_counts()

In [None]:
import plotly.graph_objects as go

# Rows Isolation Forest labelled as anomalies.
a = iso_result_df.loc[iso_result_df['anomaly'].eq(1)]

fig = go.Figure()
# Whole series as a line, flagged samples as markers.
fig.add_trace(
    go.Scatter(
        x=iso_result_df['timestamp'],
        y=iso_result_df['value'],
        mode='lines',
        name='lines',
    )
)
fig.add_trace(
    go.Scatter(x=a['timestamp'], y=a['value'], mode='markers', name='markers')
)
fig.update_layout(title='使用孤立森林检测CPU使用率异常')
fig.show("notebook")

In [None]:
import plotly.graph_objects as go

# Lower the expected share of anomalies to 0.8 %. The seed is fixed because
# Isolation Forest is randomised and would otherwise flag different points on each run.
iso=IsolationForest(contamination=0.008, random_state=0)
iso_result=iso.fit_predict(df['value'].values.reshape(-1,1))
iso_result_df=pd.DataFrame()
iso_result_df['timestamp']=df['timestamp']
iso_result_df['value'] = df['value']

# sklearn labels inliers 1 and outliers -1; recode to 1 = anomaly, 0 = normal.
iso_result_df['anomaly']  = [1 if i==-1 else 0 for i in iso_result]

fig = go.Figure()

# Whole series as a line.
fig.add_trace(go.Scatter(x=iso_result_df['timestamp'], y=iso_result_df['value'],
                    mode='lines',
                    name='lines'))

# Flagged samples as markers.
a=iso_result_df[iso_result_df['anomaly']==1]

fig.add_trace(go.Scatter(x=a.timestamp, y=a.value,
                    mode='markers',
                    name='markers'))

fig.update_layout(title='使用孤立森林检测CPU使用率异常')
fig.show("notebook")

可以看到孤立森林算法比较好地找到了这个数据集中的异常值。

### Local Outlier Factor

In [None]:
# novelty=True is required so that decision_function can score the uniform reference
# samples (unseen data) for the EM evaluation.
lof=LocalOutlierFactor(novelty=True)
lof.fit(df['value'].values.reshape(-1,1))

# With novelty=True, scikit-learn documents that predict / decision_function must only be
# applied to unseen data; calling them on the training samples gives wrong results.
# For the training samples, use the scores computed during fit instead:
# negative_outlier_factor_ shifted by offset_ follows the decision_function convention
# (negative values are outliers, non-negative values are inliers).
s_X_lof = lof.negative_outlier_factor_ - lof.offset_
lof_result = np.where(s_X_lof < 0, -1, 1)

lof_result_df=pd.DataFrame()
lof_result_df['timestamp']=df['timestamp']
lof_result_df['value'] = df['value']

# Inliers are labelled 1 and outliers -1; recode to 1 = anomaly, 0 = normal.
lof_result_df['anomaly']  = [1 if i==-1 else 0 for i in lof_result]

# Scores of the uniform reference samples, flattened to 1-D like for the other models.
s_unif_lof = lof.decision_function(unif).reshape(1, -1)[0]
auc_lof, em_lof, amax_lof = em(t, t_max, volume_support,s_unif_lof, s_X_lof, n_generated)

In [None]:
# Record LOF's mean EM value for the final comparison table.
# Re-running this cell appends a duplicate entry.
em_values.append(em_lof.mean())
model_name.append("LOF")

In [None]:
# Count how many samples were labelled normal (0) and anomalous (1).
lof_result_df['anomaly'].value_counts()

In [None]:
import plotly.graph_objects as go

# Rows LOF labelled as anomalies.
a = lof_result_df.loc[lof_result_df['anomaly'].eq(1)]

fig = go.Figure()
# Whole series as a line, flagged samples as markers.
fig.add_trace(
    go.Scatter(
        x=lof_result_df['timestamp'],
        y=lof_result_df['value'],
        mode='lines',
        name='lines',
    )
)
fig.add_trace(
    go.Scatter(x=a['timestamp'], y=a['value'], mode='markers', name='markers')
)
fig.update_layout(title='Anomaly detection using LOF')
fig.show("notebook")

### GMM

Source: [Link to Github](https://github.com/rhasanbd/Anomaly-Detection-LOF-IsolationForest-FactMCD-GMM/blob/master/Anomaly%20Detection-LOF-IsolationForest-FastMCD-GMM.ipynb)

To determine whether a data point is an anomaly we need to compute the log-likelihood of the given data.

We use the `score_samples` method of GMM to compute the log-likelihood of each individual sample.

Then, compare the likelihood values with the density threshold.

We identify the outliers by using the first percentile of the densities (the lowest 1%) as the threshold, i.e., approximately 1% of the instances will be flagged as anomalies.

In [None]:
from sklearn.mixture import GaussianMixture

# Fit a Gaussian mixture with default settings (fixed seed) to the values.
gm = GaussianMixture(random_state=0).fit(df[['value']].values)

# Per-sample log-likelihood under the fitted mixture; the 1st percentile is the
# density threshold below which a sample counts as an anomaly.
densities = gm.score_samples(df[['value']].values)
density_threshold = np.percentile(densities, q=1)

In [None]:
# -1 for samples whose density falls below the threshold, 0 for all others.
gm_result = np.where(densities < density_threshold, -1, 0).tolist()

In [None]:
# Inputs next to the GMM decision: 1 = anomaly, 0 = normal.
gm_result_df = df[['timestamp', 'value']].copy()
gm_result_df['anomaly'] = [int(flag == -1) for flag in gm_result]

# Log-likelihood scores on the data and on the uniform reference samples for the EM evaluation.
s_X_gm = gm.score_samples(df[['value']].values).ravel()
s_unif_gm = gm.score_samples(unif).ravel()
auc_gm, em_gm, amax_gm = em(
    t, t_max, volume_support, s_unif_gm, s_X_gm, n_generated
)

In [None]:
# Count how many samples were labelled normal (0) and anomalous (1).
gm_result_df['anomaly'].value_counts()

In [None]:
# Record the GMM's mean EM value for the final comparison table.
# Re-running this cell appends a duplicate entry.
em_values.append(em_gm.mean())
model_name.append("GMM")

In [None]:
import plotly.graph_objects as go

# Rows the GMM threshold labelled as anomalies.
a = gm_result_df.loc[gm_result_df['anomaly'].eq(1)]

fig = go.Figure()
# Whole series as a line, flagged samples as markers.
fig.add_trace(
    go.Scatter(
        x=gm_result_df['timestamp'],
        y=gm_result_df['value'],
        mode='lines',
        name='lines',
    )
)
fig.add_trace(
    go.Scatter(x=a['timestamp'], y=a['value'], mode='markers', name='markers')
)
fig.update_layout(title='Anomaly detection using GMM')
fig.show("notebook")

In [None]:
# One row per model: its display name and its mean EM value.
final_result = dict([('Model Name', model_name), ('EM Value', em_values)])
final_result_df = pd.DataFrame(data=final_result)

In [None]:
# Display the comparison table of mean EM values per model.
final_result_df

Higher EM value corresponds to a better model. In this case, Isolation Forest has performed the best followed by GMM. 