In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and echo the full path of every file found.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)
        

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

To use PyODDS, we first have to install its required dependencies:
- pandas>=0.25.0
- taos==1.4.15
- tensorflow==2.0.0b1
- numpy>=1.16.4
- seaborn>=0.9.0
- torch>=1.1.0
- luminol==0.4
- tqdm>=4.35.0
- matplotlib>=3.1.1
- scikit_learn>=0.21.3

In [None]:
# Required dependencies for PyODDS

# pandas >= 0.25.0
# taos == 1.4.15
!pip install taos
# tensorflow == 2.0.0b1
!pip install tensorflow==2.0.0b1
# numpy >= 1.16.4
np.version.version
# seaborn >= 0.9.0
import seaborn as sns
sns.__version__
# torch >= 1.1.0
import torch
torch.__version__
# luminol == 0.4
!pip install luminol==0.4
# tqdm >= 4.35.0
import tqdm
tqdm.__version__
# matplotlib >=3.1.1
import matplotlib
matplotlib.__version__
# scikit_learn >= 0.21.3
!pip install scikit_learn==0.21.3

Then we install PyODDS.

In [None]:
# We will use PyODDs. For this we have to install PyODDS and its required dependencies.
!pip install pyodds
import pyodds as pyodds

The datasets are from the Numenta Anomaly Benchmark (NAB) corpus. Two datasets are used:
- an artificially generated dataset containing anomalies (`df1`)
- a real traffic occupancy dataset (`df2`)

In [None]:
# Both datasets come from the NAB corpus. Each one is loaded, summarised
# (column info plus the first ten timestamps) and plotted as value over time.

# The first is an artificial dataset with anomaly.
df1 = pd.read_csv('../input/nab/artificialWithAnomaly/artificialWithAnomaly/art_daily_flatmiddle.csv')
print('ARTIFICIAL DATASET\n')
# DataFrame.info() writes its report itself and returns None, so it must not be
# wrapped in print() -- that would append a stray "None" to the output.
df1.info()
print(df1['timestamp'].head(10))

df1.plot(x='timestamp', y='value', title='Artificial dataset: art_daily_flatmiddle')


# The second is a real traffic occupancy dataset.
df2 = pd.read_csv('../input/nab/realTraffic/realTraffic/occupancy_t4013.csv')
print('\n\n\nREAL DATASET\n')
df2.info()
print(df2['timestamp'].head(10))

df2.plot(x='timestamp', y='value', title='Real traffic dataset: occupancy_t4013')


In [None]:
def to_epoch_seconds(timestamps):
    """Convert a timestamp column to float seconds since the Unix epoch.

    Parameters
    ----------
    timestamps : pandas.Series
        Either date/time strings or datetimes, or a numeric column that has
        already been converted.

    Returns
    -------
    pandas.Series
        Float seconds since 1970-01-01. Timezone-naive values are interpreted
        as UTC; timezone-aware values are converted to UTC first. A numeric
        input is returned unchanged.
    """
    # Guard for re-runs of this cell: feeding epoch floats back into
    # pd.to_datetime would reinterpret them as nanoseconds and corrupt the column.
    if pd.api.types.is_numeric_dtype(timestamps):
        return timestamps

    parsed = pd.to_datetime(timestamps)
    if parsed.dt.tz is not None:
        # Normalise to naive UTC so the subtraction from the naive epoch is valid.
        parsed = parsed.dt.tz_convert('UTC').dt.tz_localize(None)

    # Vectorised replacement for calling .timestamp() on every element.
    return (parsed - pd.Timestamp('1970-01-01')) / pd.Timedelta(seconds=1)


# Converting the timestamp column (object dtype) into float epoch seconds
df1['timestamp'] = to_epoch_seconds(df1['timestamp'])
df2['timestamp'] = to_epoch_seconds(df2['timestamp'])

In [None]:
# Hold-out split by row position: the leading rows of each dataset (roughly 70%)
# form the training set and the remaining rows (roughly 30%) form the test set.
DF1_SPLIT_ROW = 2822
DF2_SPLIT_ROW = 1750

df1_train, df1_test = df1.iloc[:DF1_SPLIT_ROW], df1.iloc[DF1_SPLIT_ROW:]
df2_train, df2_test = df2.iloc[:DF2_SPLIT_ROW], df2.iloc[DF2_SPLIT_ROW:]

In [None]:
from pyodds.utils.importAlgorithm import algorithm_selection
from pyodds.utils.plotUtils import visualize_outlierscore,visualize_distribution, visualize_distribution_static, visualize_distribution_time_serie
from pyodds.utils.utilities import output_performance
import matplotlib.pyplot as plt

# Build the kNN detector once; it is re-fitted for each dataset below.
clf_knn = algorithm_selection('knn', random_state=9, contamination=0.1)

# Same procedure for both datasets, first dataset first: fit on the training
# part, score the test part, then draw both distribution plots.
for train_part, test_part in ((df1_train, df1_test), (df2_train, df2_test)):
    clf_knn.fit(train_part)

    # outlier labels and outlierness scores for the held-out rows
    prediction_result = clf_knn.predict(test_part)
    outlierness_score = clf_knn.decision_function(test_part)

    # visual summary of the predictions
    visualize_distribution(test_part, prediction_result, outlierness_score)
    visualize_distribution_static(test_part, prediction_result, outlierness_score)



In [None]:
# Build the isolation-forest detector once; it is re-fitted for each dataset below.
# (algorithm_selection and the visualize_* helpers are imported in an earlier cell.)
clf_if = algorithm_selection('iforest', random_state=9, contamination=0.1)

# Same procedure for both datasets, first dataset first: fit on the training
# part, score the test part, then draw both distribution plots.
for train_part, test_part in ((df1_train, df1_test), (df2_train, df2_test)):
    clf_if.fit(train_part)

    # outlier labels and outlierness scores for the held-out rows
    prediction_result = clf_if.predict(test_part)
    outlierness_score = clf_if.decision_function(test_part)

    # visual summary of the predictions
    visualize_distribution(test_part, prediction_result, outlierness_score)
    visualize_distribution_static(test_part, prediction_result, outlierness_score)

