Skip to content

Commit

Permalink
initial implementation of SAX for TimeSeries objects
Browse files Browse the repository at this point in the history
  • Loading branch information
stefan-pdx committed Apr 10, 2011
1 parent cea7703 commit be33b5c
Showing 1 changed file with 86 additions and 0 deletions.
86 changes: 86 additions & 0 deletions nitime/timeseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
#-----------------------------------------------------------------------------

import numpy as np
from scipy.stats import norm
import string

# Our own
from nitime import descriptors as desc
Expand Down Expand Up @@ -1111,6 +1113,90 @@ def during(self,e):
def shape(self):
return self.data.shape

def sax(self, symbol_count=5, phrase_length=10):
""" Return the SAX representation of the timeseries data
For a given multi-variate TimeSeries object, we can represent the data in
symbolic form, making it considerably easier to perform indexing,
clustering, classification, etc. In this approach, the data is normalized
under the assumption that it forms a Gaussian distribution about the mean,
where we compute the Piecewise Aggregrate Approximation (PAA). From that
point on, we characterize those given segments by a character determined
by where the mean value of that segment falls in with the global mean.
Parameters
----------
symbol_count : integer
The number of characters to use in the representation. Characters will
be selected in order from [a-z][A-Z].
phrase_length : integer
The overall length of the SAX representation.
Returns
-------
A list containing the SAX representational string for each component
in the TimeSeries data collection
Examples
--------
The SAX representation of sine and cosine waves
>>> t1 = ts.TimeSeries([np.cos(np.linspace(0, 2*np.pi, 500)),
... np.sin(np.linspace(0, 2*np.pi, 500))], sampling_rate=1)
>>> t1.sax()
['edcbaabcde', 'deeedbaaab']
>>> t1.sax(symbol_count=8)
['hgebaabegh', 'fghgfcbabc']
>>> t1.sax(symbol_count=8, phrase_length=15)
['hggecbaaabceggh', 'eghhhgfdcbaaabd']
Resources
---------
Original Paper:
http://www.cs.ucr.edu/~eamonn/SAX.pdf
Credits:
Jessica Lin, Eamonn Keogh, Stefano Lonardi, Bill Chiu
"""
# First, we're going to need to cache off a copy of self's data collection
# so that we can properly normalize it.
normalized_data = np.copy(self.data)

# Shift the data so that the mean is zero. Note that we're calculating the
# mean for each data subset so that we can output a SAX representation
# for each dimension.
normalized_data -= np.split(np.mean(normalized_data, axis=1), normalized_data.shape[0])

# Scale the data so that it has a standard deviation of 1.
normalized_data *= np.split(1.0/normalized_data.std(axis=1), normalized_data.shape[0])

# Calculate our breakpoint locations given symbol_count. We're going to
# concatenate Inf to our breakpoints to help in calculating our
# intervals.
breakpoints = norm.ppf(np.linspace(1./symbol_count, 1-1./symbol_count, symbol_count-1))
breakpoints = np.concatenate((breakpoints, np.array([np.Inf])))

# Split the data into phrase_length pieces.
normalized_data = np.array_split(normalized_data, phrase_length, axis=1)

# Calculate the mean for each section.
section_means = [np.mean(section, axis=1) for section in normalized_data]

# Figure out which break each section is in based on the section_means and
# calculated breakpoints.
section_locations = [[np.where(breakpoints > axis_mean)[0][0]
for axis_mean in section_mean] for section_mean in section_means]
section_locations = zip(*section_locations)

# Convert the location into the corresponding letter.
sax_phrases = [''.join([string.ascii_letters[ind] for ind in section_location])
for section_location in section_locations]

return sax_phrases

_epochtype = np.dtype({'names':['start','stop'],'formats':[np.int64]*2})
class Epochs():
"""Represents a time interval"""
Expand Down

0 comments on commit be33b5c

Please sign in to comment.