Skip to content

Commit

Permalink
Merge pull request #1 from sinhrks/input_types
Browse files Browse the repository at this point in the history
ENH: support non pandas input
  • Loading branch information
sinhrks committed Sep 24, 2016
2 parents 847fb49 + f7d9599 commit 74d8150
Show file tree
Hide file tree
Showing 10 changed files with 341 additions and 69 deletions.
28 changes: 17 additions & 11 deletions README.rst
Original file line number Diff line number Diff line change
@@ -1,29 +1,35 @@
pydetect
========

.. image:: https://readthedocs.org/projects/pydetect/badge/?version=latest
:target: http://pydetect.readthedocs.org/en/latest/
:alt: Latest Docs
.. image:: https://travis-ci.org/sinhrks/pydetect.svg?branch=master
:target: https://travis-ci.org/sinhrks/pydetect
.. image:: https://coveralls.io/repos/sinhrks/pydetect/badge.svg?branch=master&service=github
:target: https://coveralls.io/github/sinhrks/pydetect?branch=master

Change point and anomaly detection for time series.
See the `notebook <https://github.com/sinhrks/pydetect/tree/master/notebook>`_ directory for basic usage.

Change point detection
----------------------

Mean and/or variance shift (at most one change)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
"""""""""""""""""""""""""""""""""""""""""""""""""

- CUSUM (cumulative sum) statistics

- ``MeanDetector``
- ``VarianceDetector``
- ``MeanVarianceDetector``
- ``MeanDetector``
- ``VarianceDetector``
- ``MeanVarianceDetector``

Anomaly detection
-----------------

Generalized ESD Test
^^^^^^^^^^^^^^^^^^^^
""""""""""""""""""""

- ``GESDDetector``

see http://www.itl.nist.gov/div898/handbook/eda/section3/eda35h3.htm

- ``TimeSeriesGESDDetector``

Decomposes the time series into trend, seasonal and residual components,
then applies the generalized ESD test to the residuals.
See http://www.itl.nist.gov/div898/handbook/eda/section3/eda35h3.htm
18 changes: 13 additions & 5 deletions notebook/changepoint.ipynb

Large diffs are not rendered by default.

152 changes: 142 additions & 10 deletions notebook/outlier.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pydetect/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@
from pydetect.changepoint import (MeanDetector, VarianceDetector, # noqa
MeanVarianceDetector) # noqa
import pydetect.datasets as datasets # noqa
from pydetect.outlier import GESDDetector, TimeSeriesGESDDetector # noqa
from pydetect.outlier import GESDDetector # noqa
48 changes: 41 additions & 7 deletions pydetect/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@
import pandas as pd


# ToDo: maybe needs generic detector and time series detector?


class BaseDetector(object):

def __init__(self, decompose=False):
    """Store the ``decompose`` flag on the detector.

    ``decompose`` is read back by ``_validate`` and ``_wrap_result`` to
    decide whether the seasonal-decomposition code path is taken.
    """
    self.decompose = decompose

def detect(self, data):
    """Run detection on ``data``; the base class provides no implementation.

    Raises
    ------
    NotImplementedError
        Always.
    """
    raise NotImplementedError

Expand All @@ -18,18 +18,52 @@ def get_statistics(self, data):

def _validate(self, data):
    """Validate ``data`` and coerce it to a univariate ``numpy`` array.

    The untouched input is remembered on ``self._original``.  When
    ``self.decompose`` is true the input is first replaced by whatever
    ``self._decompose`` returns, and only then converted.

    Parameters
    ----------
    data : array-like
        Anything ``np.asarray`` accepts (list, ndarray, Series, ...).

    Returns
    -------
    numpy.ndarray
        The converted data.

    Raises
    ------
    ValueError
        If the converted array has more than 2 dimensions, or is 2-D
        with more than one column.
    """
    # Keep the caller's object as-is; no pandas-only type check here,
    # because plain lists and ndarrays are valid input as well.
    self._original = data

    if self.decompose:
        data = self._decompose(data)

    data = np.asarray(data)
    if data.ndim > 2:
        # 2-D input is allowed (see the next check), so say "at most".
        raise ValueError('Input must be at most 2 dimensions')
    if data.ndim == 2 and data.shape[1] > 1:
        raise ValueError('Input must be univariate')

    return data

def _decompose(self, data):
    """Return the non-null residuals of a seasonal decomposition of ``data``.

    ``statsmodels`` is imported lazily, so it is only required when
    decomposition is actually requested.  The boolean mask of the
    residual positions that were kept is stored on
    ``self._decompose_indexer``.

    Parameters
    ----------
    data : object accepted by ``sm.tsa.seasonal_decompose``

    Returns
    -------
    The ``resid`` component of the decomposition with null entries
    removed.

    Raises
    ------
    ImportError
        If ``statsmodels`` cannot be imported.
    """
    try:
        import statsmodels.api as sm
    except ImportError:
        msg = ('statsmodels >= 0.6.0 is required to perform '
               'seasonal decomposition')
        raise ImportError(msg)

    # TODO: validate that data is a pd.Series
    # TODO: check DatetimeIndex and freq

    decomposed = sm.tsa.seasonal_decompose(data)
    resid = decomposed.resid

    # Remember which positions survive so the detection result can be
    # mapped back onto the full-length input later.
    self._decompose_indexer = pd.notnull(resid.values)
    resid = resid[self._decompose_indexer]
    return resid

def _wrap_result(self, data, result):
    """Wrap ``result`` so that it is compatible with the original input.

    Parameters
    ----------
    data : object
        Not used by this implementation; the object stored on
        ``self._original`` is consulted instead.
    result : array-like
        Detection result to wrap.

    Returns
    -------
    ``result`` unchanged for non-pandas input; otherwise an object
    built with the original's ``_constructor`` and carrying the
    original's index.
    """
    if self.decompose:
        # Positions dropped during decomposition are filled with zeros
        # so the result has the same length as the original input.
        pad = np.zeros(len(self._original))
        pad[self._decompose_indexer] = result
        result = pad

    if isinstance(self._original, (pd.Series, pd.DataFrame)):
        index = self._original.index
        result = self._original._constructor(result, index=index)
    return result


Expand Down
25 changes: 13 additions & 12 deletions pydetect/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,18 @@


def get_nile():
    """Return the bundled Nile dataset as a new ``pd.Series``.

    Returns
    -------
    pandas.Series
        100 values named ``'Nile'``, indexed by year-start dates
        beginning at 1871-01-01.  A fresh object is built on every call.
    """
    values = [1120, 1160, 963, 1210, 1160, 1160, 813, 1230, 1370, 1140,
              995, 935, 1110, 994, 1020, 960, 1180, 799, 958, 1140,
              1100, 1210, 1150, 1250, 1260, 1220, 1030, 1100, 774, 840,
              874, 694, 940, 833, 701, 916, 692, 1020, 1050, 969,
              831, 726, 456, 824, 702, 1120, 1100, 832, 764, 821,
              768, 845, 864, 862, 698, 845, 744, 796, 1040, 759,
              781, 865, 845, 944, 984, 897, 822, 1010, 771, 676,
              649, 846, 812, 742, 801, 1040, 860, 874, 848, 890,
              744, 749, 838, 1050, 918, 986, 797, 923, 975, 815,
              1020, 906, 901, 1170, 912, 746, 919, 718, 714, 740]
    # YearBegin() is the offset the 'AS' alias stands for; passing the
    # offset object avoids depending on an alias that newer pandas
    # releases deprecate.
    index = pd.date_range('1871-01-01', freq=pd.offsets.YearBegin(),
                          periods=100)
    return pd.Series(values, index=index, name='Nile')


def get_airpassengers():
Expand All @@ -32,4 +33,4 @@ def get_airpassengers():
360, 342, 406, 396, 420, 472, 548, 559, 463, 407, 362, 405,
417, 391, 419, 461, 472, 535, 622, 606, 508, 461, 390, 432]
index = pd.date_range('1949-01-01', freq='MS', periods=144)
return pd.Series(values, index=index)
return pd.Series(values, index=index, name='Air Passengers')
26 changes: 4 additions & 22 deletions pydetect/outlier.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

import numpy as np
from scipy.stats import t, zscore
import pandas as pd

from pydetect.base import OutlierDetector

Expand All @@ -16,9 +15,10 @@ class GESDDetector(OutlierDetector):
http://www.itl.nist.gov/div898/handbook/eda/section3/eda35h3.htm
"""

def __init__(self, alpha=0.05, max_outliers=None, decompose=False):
    """Configure the detector.

    Parameters
    ----------
    alpha : float, default 0.05
        Stored on ``self.alpha``.  Presumably the significance level
        of the generalized ESD test -- confirm against ``_get_lambda``.
    max_outliers : int or None, default None
        Stored on ``self.max_outliers``.  Presumably an upper bound on
        the number of outliers tested -- confirm against
        ``get_statistics``.
    decompose : bool, default False
        Forwarded to the parent class initializer.
    """
    # Store the values the caller passed in, not hard-coded defaults.
    self.alpha = alpha
    self.max_outliers = max_outliers
    super(GESDDetector, self).__init__(decompose=decompose)

def detect(self, data):
data = self._validate(data)
Expand Down Expand Up @@ -89,21 +89,3 @@ def get_statistics(self, data):
outliers = (outliers != 0) & (outliers <= n_outliers)

return outliers, np.array(r), np.array(l)


class TimeSeriesGESDDetector(GESDDetector):
    """GESD detector that runs on seasonal-decomposition residuals."""

    def detect(self, data):
        """Detect outliers in the residual component of ``data``.

        ``data`` is decomposed with ``sm.tsa.seasonal_decompose``; the
        non-null residuals are passed to ``get_statistics`` and the
        resulting flags are written back at their original positions
        (all other positions are zero) before wrapping.
        """
        import statsmodels.api as sm

        residuals = sm.tsa.seasonal_decompose(data).resid

        # positions where the decomposition produced a usable residual
        keep = pd.notnull(residuals.values)
        flags, _, _ = self.get_statistics(residuals[keep])

        padded = np.zeros(len(data))
        padded[keep] = flags
        return self._wrap_result(data, padded)
71 changes: 70 additions & 1 deletion pydetect/tests/test_changepoint.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
#!/usr/bin/env python

import pydetect

import numpy as np
import pandas as pd
import pandas.util.testing as tm


class TestMeanDetector(tm.TestCase):

def test_nile(self):
def test_nile_statistics(self):
nile = pydetect.datasets.get_nile()

d = pydetect.MeanDetector()
Expand All @@ -16,6 +19,28 @@ def test_nile(self):
self.assertEqual(2835156.75, n)
self.assertEqual(1597457.1944444478, v)

def test_input_class(self):
    """MeanDetector.detect returns a container matching the input type."""
    series = pydetect.datasets.get_nile()
    detector = pydetect.MeanDetector()

    flags = np.zeros(100)
    flags[27] = 1.
    dates = pd.date_range('1871-01-01', freq='AS', periods=100)
    # the expected result carries no name
    expected = pd.Series(flags, dates)

    tm.assert_series_equal(detector.detect(series), expected)

    framed = detector.detect(series.to_frame())
    tm.assert_frame_equal(framed, expected.to_frame())

    # non-pandas inputs come back as plain arrays
    for raw in (np.array(series), series.tolist()):
        tm.assert_numpy_array_equal(detector.detect(raw), expected.values)


class TestVarianceDetector(tm.TestCase):

Expand All @@ -29,6 +54,28 @@ def test_nile(self):
self.assertEqual(1025.2437598632698, n)
self.assertEqual(1013.6146385197355, v)

def test_input_class(self):
    """VarianceDetector.detect returns a container matching the input."""
    series = pydetect.datasets.get_nile()
    detector = pydetect.VarianceDetector()

    flags = np.zeros(100)
    flags[46] = 1.
    dates = pd.date_range('1871-01-01', freq='AS', periods=100)
    # the expected result carries no name
    expected = pd.Series(flags, dates)

    tm.assert_series_equal(detector.detect(series), expected)

    framed = detector.detect(series.to_frame())
    tm.assert_frame_equal(framed, expected.to_frame())

    # non-pandas inputs come back as plain arrays
    for raw in (np.array(series), series.tolist()):
        tm.assert_numpy_array_equal(detector.detect(raw), expected.values)


class TestMeanVarianceDetector(tm.TestCase):

Expand All @@ -41,3 +88,25 @@ def test_nile(self):
self.assertEqual(27, i)
self.assertEqual(1025.2437598632698, n)
self.assertEqual(967.68788456436801, v)

def test_input_class(self):
    """MeanVarianceDetector.detect returns a container matching the input."""
    series = pydetect.datasets.get_nile()
    detector = pydetect.MeanVarianceDetector()

    flags = np.zeros(100)
    flags[27] = 1.
    dates = pd.date_range('1871-01-01', freq='AS', periods=100)
    # the expected result carries no name
    expected = pd.Series(flags, dates)

    tm.assert_series_equal(detector.detect(series), expected)

    framed = detector.detect(series.to_frame())
    tm.assert_frame_equal(framed, expected.to_frame())

    # non-pandas inputs come back as plain arrays
    for raw in (np.array(series), series.tolist()):
        tm.assert_numpy_array_equal(detector.detect(raw), expected.values)
10 changes: 10 additions & 0 deletions pydetect/tests/test_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,17 @@ class TestDatasets(tm.TestCase):
def test_nile(self):
    """get_nile returns a named Series and a new object per call."""
    first = pydetect.datasets.get_nile()
    self.assertIsInstance(first, pd.Series)
    self.assertEqual(first.name, 'Nile')

    # a second call must not hand back the same object
    second = pydetect.datasets.get_nile()
    self.assertIsNot(first, second)

def test_airpassengers(self):
    """get_airpassengers returns a named Series and a new object per call."""
    first = pydetect.datasets.get_airpassengers()
    self.assertIsInstance(first, pd.Series)
    self.assertEqual(first.name, 'Air Passengers')

    # a second call must not hand back the same object
    second = pydetect.datasets.get_airpassengers()
    self.assertIsNot(first, second)
30 changes: 30 additions & 0 deletions pydetect/tests/test_outlier.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
#!/usr/bin/env python

import pydetect

import numpy as np
import pandas as pd
import pandas.util.testing as tm


Expand All @@ -23,3 +26,30 @@ def test_sample(self):
self.assertEqual(d._get_lambda(54, 7), 3.1032430776016966)
self.assertEqual(d._get_lambda(54, 8), 3.0944564470227012)
self.assertEqual(d._get_lambda(54, 9), 3.0854245712422848)

def test_detect(self):
    """GESDDetector flags only the three largest values of a plain list."""
    sample = [-0.25, 0.68, 0.94, 1.15, 1.20, 1.26, 1.26,
              1.34, 1.38, 1.43, 1.49, 1.49, 1.55, 1.56,
              1.58, 1.65, 1.69, 1.70, 1.76, 1.77, 1.81,
              1.91, 1.94, 1.96, 1.99, 2.06, 2.09, 2.10,
              2.14, 2.15, 2.23, 2.24, 2.26, 2.35, 2.37,
              2.40, 2.47, 2.54, 2.62, 2.64, 2.90, 2.92,
              2.92, 2.93, 3.21, 3.26, 3.30, 3.59, 3.68,
              4.30, 4.64, 5.34, 5.42, 6.01]

    # 54 points: everything False except the last three
    expected = np.zeros(54, dtype=bool)
    expected[51:] = True

    detector = pydetect.GESDDetector(alpha=0.05)
    tm.assert_numpy_array_equal(detector.detect(sample), expected)

def test_decompose(self):
    """With decompose=True an injected spike is the only flagged point."""
    ap = pydetect.datasets.get_airpassengers()
    # Use .iloc for the positional write: integer keys on a
    # DatetimeIndex-ed Series only work through a deprecated fallback.
    ap.iloc[50] += 100

    d = pydetect.GESDDetector(alpha=0.05, decompose=True)
    res = d.detect(ap)

    exp = np.zeros(144)
    exp[50] = 1.0
    exp = pd.Series(exp, index=ap.index)
    tm.assert_series_equal(res, exp)

0 comments on commit 74d8150

Please sign in to comment.