## Time-Series Random Forest

To do:

* split code into two files, one for each implementation approach (changing BaseDecisionTree or allowing for pipelines as base_estimators)
* clean files (only include the objects and methods that had to be changed; inherit the rest)
* push to repo

In [2]:
from sktime.tsforest import TSRandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from sktime.load_data import load_from_web_to_xdataframe
import numpy as np
import pandas as pd

%load_ext autoreload
%autoreload 2

%load_ext line_profiler

%matplotlib inline

In [3]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Load the breast-cancer data as a single object (return_X_y=False) whose
# .data and .target attributes are unpacked below.
data = load_breast_cancer(return_X_y=False)

y = data.target
X = data.data
# Default train/test split; no random_state is passed, so the split is
# presumably not reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [4]:
# Tabular baseline: fit a standard random forest (100 trees, fixed seed) on
# the training split and print its score on the test split.
clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))
# Disabled snippet: table of the five largest feature importances, indexed
# by feature name.
# (pd.DataFrame({'feat_importance': clf.feature_importances_}, 
#               index=data.feature_names)
#  .sort_values('feat_importance', ascending=False)
#  .head())

0.965034965034965


In [5]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
clf.predict_proba(X_test)[:5,:]

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.]])

## Implementing TimeSeriesRandomForestClassifier
In general, this requires
* removing the existing input checks, as nested pandas/xpandas data frames do not comply with them
* replacing them with new input checks, ideally implemented as part of the data container, as input checking is more expensive for time-series data

More specifically, there are two ways to implement this:
* change the basic DecisionTreeClassifier class to include specific transforms in its fit/predict methods
* change base_estimator to a pipeline with transforms, which additionally requires
    * changing how parameters are set in the construction of individual trees
    * adding a random state attribute to the pipeline
    * adapting some helper functions for parallel building and accumulating predictions of trees

In [6]:
# Values passed to the loader as cache_path and dataset name.
cache_path = 'data/'
dataset_name = 'GunPoint'

# Load the train and test parts of the dataset separately.
X_train, y_train = load_from_web_to_xdataframe(dataset_name, is_train_file=True,
                                               cache_path=cache_path) 
X_test, y_test = load_from_web_to_xdataframe(dataset_name, is_test_file=True,
                                             cache_path=cache_path)
# Re-wrap the loader output in plain pandas containers.
X_train = pd.DataFrame(X_train)
y_train = pd.Series(y_train)
X_test = pd.DataFrame(X_test)
y_test = pd.Series(y_test)
# Sanity check of the resulting shapes.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(50, 1) (50,) (150, 1) (150,)


In [7]:
# define features to be calculated for each time-series interval
def ts_slope(ts):
    """
    Compute slope of time series using linear regression

    Fits an ordinary least-squares line of the values against the
    positions 1, 2, ..., n and returns its slope.

    Parameters
    ----------
    ts : array-like
        One-dimensional sequence of numeric values, e.g. a list, tuple,
        numpy array or pandas Series.

    Returns
    -------
    float
        Slope of the fitted line, or 0.0 if there are fewer than two
        observations (a slope is undefined in that case).
    """
    # Convert first so that plain lists/tuples (which have no ``shape``)
    # are accepted as well as arrays and Series.
    y = np.asarray(ts)
    n = y.shape[0]
    if n < 2:
        return 0.0

    x = np.arange(n) + 1
    # slope = cov(x, y) / var(x); var(x) > 0 because n >= 2
    covariance = (x * y).mean() - x.mean() * y.mean()
    variance = (x ** 2).mean() - x.mean() ** 2
    return covariance / variance

In [9]:
from sktime.tsforest import TSDecisionTreeClassifier
tree = TSDecisionTreeClassifier(criterion='entropy')
%time tree.fit(X_train, y_train, check_input=True) 
tree.score(X_test, y_test)

CPU times: user 1.18 s, sys: 9.5 ms, total: 1.18 s
Wall time: 1.19 s


0.8666666666666667

In [11]:
from sktime.tsforest import TSRandomForestClassifier
forest = TSRandomForestClassifier(n_estimators=2, criterion='entropy')
%time forest.fit(X_train, y_train)

CPU times: user 2.38 s, sys: 11.6 ms, total: 2.39 s
Wall time: 2.38 s


TSRandomForestClassifier(bootstrap=True, class_weight=None,
             criterion='entropy', max_depth=None, max_features='auto',
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=2, n_jobs=None, oob_score=False,
             random_state=None, verbose=0, warm_start=False)

In [12]:
%time forest.score(X_test, y_test)

CPU times: user 6.57 s, sys: 23 ms, total: 6.59 s
Wall time: 6.59 s


0.9266666666666666

In [14]:
from sktime.tsforest import RandomIntervalSegmenter, FeatureExtractor, TimeSeriesRandomForest
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.tree import DecisionTreeClassifier

clf = TimeSeriesRandomForest(n_estimators=2)

%time clf.fit(X_train, y_train)

CPU times: user 2.05 s, sys: 3.86 ms, total: 2.06 s
Wall time: 2.06 s


TimeSeriesRandomForest(base_estimator=TSPipeline(memory=None,
      steps=[('segment', RandomIntervalSegmenter()), ('extract', FeatureExtractor(feature_calculators=[<function mean at 0x1066be1e0>, <function std at 0x1066be268>, <function TimeSeriesRandomForest.__init__.<locals>._ts_slope at 0x106545c80>])), ('clf', DecisionTreeClassifier(class_weight=None, criterion...      min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'))]),
            bootstrap=True, class_weight=None, criterion=None,
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=None, min_impurity_split=None,
            min_samples_leaf=None, min_samples_split=None,
            min_weight_fraction_leaf=None, n_estimators=2, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [15]:
%time clf.score(X_test, y_test)

CPU times: user 5.77 s, sys: 13.5 ms, total: 5.79 s
Wall time: 5.79 s


0.8933333333333333

# Time-series transformer + RandomForest

### Random Interval Transformer
Split data into random time intervals, series-to-series transformer

In [18]:
def random_interval(index, random_state=None):
    """
    Sample random intervals of a time-series index.

    First draws int(sqrt(m)) distinct interval widths from 1..m, where m
    is the length of ``index``. Then, for each width w, draws
    int(sqrt(m - w + 1)) distinct start positions among the m - w + 1
    positions at which an interval of that width still fits.

    Parameters
    ----------
    index : object with a ``shape`` attribute
        Index of the series; only its length ``index.shape[0]`` is used.
    random_state : None, int or numpy.random.RandomState, optional
        Source of randomness. ``None`` (default) draws from numpy's global
        random state, exactly as before. An int seeds a private generator
        so that the sampled intervals are reproducible. A ``RandomState``
        instance is used as is.

    Returns
    -------
    starts : list
        Zero-based start position of every interval.
    ends : list
        Exclusive end position of every interval, so interval i covers
        positions ``starts[i]`` up to but not including ``ends[i]``.
    """
    if random_state is None:
        rng = np.random
    elif isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    m = index.shape[0]  # series length
    starts = []
    ends = []

    widths = rng.choice(np.arange(1, m + 1), replace=False,
                        size=int(np.sqrt(m)))
    for width in widths:
        # number of positions at which an interval of this width fits
        n_positions = m - width + 1
        # sample 1-based positions, then shift to 0-based
        width_starts = rng.choice(np.arange(1, n_positions + 1),
                                  replace=False,
                                  size=int(np.sqrt(n_positions))) - 1
        starts.extend(width_starts)
        ends.extend(width_starts + width)
    return starts, ends

In [349]:
# test random_interval function: over many draws, every sampled interval
# must be non-empty and lie within the given index
n = 10_000
mins = np.empty(n)
maxs = np.empty(n)
index = np.arange(100)
for i in range(n):
    starts, ends = random_interval(index)
    # compares the sampled starts/ends; this check previously referenced
    # names `e` and `s` that are not defined in this cell
    assert all((np.array(ends) - np.array(starts)) > 0)  # only non-empty intervals
    assert all([e <= index.shape[0] for e in ends])  # within given index
    assert all([s >= 0 for s in starts])  # within given index
    mins[i] = min(starts)
    maxs[i] = max(ends)
# smallest start and largest end seen over all draws
print(mins.min(), maxs.max())

0.0 100.0


In [350]:
# split series into random intervals
df = X_train
n_rows, n_cols = df.shape
# index of the first series; the sampled positions are applied to every cell
ts_index = df.iloc[0, 0].index
# `random_interval` is the sampler defined in this notebook (this line
# previously called `sample`, which is not defined in the visible notebook)
starts, ends = random_interval(ts_index)
intervals = np.column_stack([starts, ends])
n_intervals = intervals.shape[0]

# one output column per (input column, interval) pair; each cell holds the
# positional slice [start:end) of the corresponding input series
interval_data_dict = {}
for col in range(n_cols):
    col_name = df.columns[col]
    for start, end in intervals:
        interval_data_dict[f'{col_name}_{start}_{end}'] = [
            df.iloc[row, col].iloc[start:end] for row in range(n_rows)
        ]

dft = pd.DataFrame(interval_data_dict)

assert dft.shape == (n_rows, n_cols * n_intervals)
assert dft.replace([np.inf, -np.inf], np.nan).isna().sum().sum() == 0

### Compute features for each interval
Apply feature calculators to each cell, series-to-tabular transformer
* For pipelines, when using sklearn classifiers/regressors, check if penultimate estimator is series-to-tabular transformer

In [516]:
n_rows, n_cols = dft.shape
feature_calculators = [np.mean, np.std, ts_slope]
n_features = len(feature_calculators)

# one output column per (interval column, calculator) pair, named
# '<interval column>_<calculator name>'; each calculator maps the series
# stored in a cell to a single value
calculated_data_dict = {}
for calculator in feature_calculators:
    for col in range(n_cols):
        col_name = f'{dft.columns[col]}_{calculator.__name__}'
        calculated_data_dict[col_name] = dft.iloc[:, col].apply(calculator)
dfc = pd.DataFrame(calculated_data_dict)
assert dfc.shape == (n_rows, n_cols * n_features)
assert dfc.replace([np.inf, -np.inf], np.nan).isna().sum().sum() == 0

### Using transformers

In [8]:
from sktime.ts_forest import RandomIntervalSegmenter, FeatureExtractor
from sklearn.ensemble import RandomForestClassifier

In [9]:
# NOTE(review): hard-coded absolute Windows path, whereas the earlier loading
# cell in this notebook uses the relative 'data/' — confirm which is intended.
cache_path = "C:/temp/sktime_temp_data/"
dataset_name = "GunPoint"

# Load the train and test parts of the dataset separately.
X_train, y_train = load_from_web_to_xdataframe(dataset_name, 
                                               is_train_file=True, cache_path=cache_path) 
X_test, y_test = load_from_web_to_xdataframe(dataset_name,
                                             is_test_file=True, cache_path=cache_path)
# Re-wrap the loader output in plain pandas containers.
X_train = pd.DataFrame(X_train)
y_train = pd.Series(y_train)
X_test = pd.DataFrame(X_test)
y_test = pd.Series(y_test)
# Sanity check of the resulting shapes.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(50, 1) (50,) (150, 1) (150,)


In [10]:
X_train.head()

Unnamed: 0,dim_0
0,0 -0.647885 1 -0.641992 2 -0.63818...
1,0 -0.644427 1 -0.645401 2 -0.64705...
2,0 -0.778353 1 -0.778279 2 -0.77715...
3,0 -0.750060 1 -0.748103 2 -0.74616...
4,0 -0.599539 1 -0.597422 2 -0.59926...


In [11]:
# functions applied to every interval to turn it into one tabular feature
feature_calculators = [np.mean, np.std, ts_slope]

# pipeline, applied step by step by hand
# set up transformers
segmenter = RandomIntervalSegmenter()
extractor = FeatureExtractor(feature_calculators=feature_calculators)

# fit-transform training data: segment first, then extract features
X_train_trans = segmenter.fit_transform(X_train)
X_train_trans = extractor.fit_transform(X_train_trans)

# transform test data using fitted transformers
X_test_trans = segmenter.transform(X_test)
X_test_trans = extractor.transform(X_test_trans)
# both sets should end up with the same number of feature columns
print(X_train_trans.shape, X_test_trans.shape)

# train and score using random forest classifier
# NOTE(review): no random_state is passed here, so the score presumably
# varies between runs
clf = RandomForestClassifier(n_estimators=200)
clf.fit(X_train_trans, y_train)
clf.score(X_test_trans, y_test)

(50, 273) (150, 273)


0.94

#### Using sklearn pipeline

In [12]:
from sklearn.pipeline import Pipeline

In [17]:
# Same three stages (segment -> extract features -> classify), chained in a
# single sklearn Pipeline that is fitted and scored on the untransformed data.
steps = [
    ('segment', RandomIntervalSegmenter()), 
    ('extract', FeatureExtractor(feature_calculators=feature_calculators)),
    ('classify', RandomForestClassifier(n_estimators=200))
]
pipe = Pipeline(steps)
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

0.96

In [23]:
def profile_func():
    """
    Build, fit and run the segment -> extract -> classify pipeline once.

    Reads the notebook-level ``feature_calculators``, ``X_train``,
    ``y_train`` and ``X_test``. The predictions are discarded and nothing
    is returned; presumably this exists only as a target for profiling.
    """
    segmenter = RandomIntervalSegmenter()
    extractor = FeatureExtractor(feature_calculators=feature_calculators)
    forest = RandomForestClassifier(n_estimators=200)

    model = Pipeline([
        ('segment', segmenter),
        ('extract', extractor),
        ('classify', forest),
    ])
    model.fit(X_train, y_train)
    model.predict(X_test)

In [481]:
from sklearn.dummy import DummyClassifier
# Reference point for the scores above: a DummyClassifier using the 'prior'
# strategy, fitted and scored on the same train/test data.
dummy = DummyClassifier(strategy='prior')
dummy.fit(X_train, y_train)
dummy.score(X_test, y_test)

0.49333333333333335

## pandas indexing does not accept series into single cells

* input checking for time-series data is more expensive than for tabular data, so it makes more sense to outsource the main checks to the dataframe, with the check results stored as attributes which can be checked, if relevant, by methods/estimators

In [75]:
df = pd.DataFrame(np.zeros((3, 3)), dtype='object')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
0    3 non-null object
1    3 non-null object
2    3 non-null object
dtypes: object(3)
memory usage: 152.0+ bytes


In [79]:
# df.iloc[0,0] = pd.Series(np.random.normal(size=10)) # breaks 
# NOTE(review): df[0,0] below is column-style assignment with the tuple key
# (0, 0), not single-cell access — presumably it adds a new column instead of
# storing the series in cell (0, 0); confirm against the pandas indexing docs.
df[0,0] = pd.Series(np.random.normal(size=10))