In [None]:
import datetime
import os
try:
    from tqdm import notebook as tqdm
except ImportError:
    tqdm = None
    
import pandas as pd
import tensorflow as tf
import numpy as np
import ruptures
from metrics import find_change_indices

%matplotlib inline
%load_ext tensorboard
%load_ext autoreload
%autoreload 2

import dataset_ops

pd.set_option('display.max_columns', None)  # show all columns
GPUs = tf.config.list_physical_devices('GPU')
if GPUs is None or len(GPUs) == 0:
    print("WARNING: No GPU, all there is is:")
    for device in tf.config.list_physical_devices():
        print(f'- {device}')
else:
    for gpu in GPUs:
        tf.config.experimental.set_memory_growth(gpu, True)
        print("Initialized", gpu)

dataset_manager = dataset_ops.TestsManager(dataset_dir='./h5', runs_filename='runs.hdf')
all_runs = dataset_manager.get_all_available_tests()
selected_runs = all_runs.loc[(all_runs['Test Length'] > 200) & (all_runs['Test Length'] < 20000)]

inputs = ('SpeedFts', 'Pitch', 'Roll', 'Yaw', 'current_altitude', )
outputs= ('elev', 'ai', 'rdr', 'throttle', 'Flaps')

# max_length = selected_runs['Test Length'].max()
# max_length = 18000 

# dataset_manager.preload_data(selected_runs, max_length=max_length, features=inputs + outputs)
# tfdataset = dataset_ops.TensorflowDataset(dataset_manager)
# dataset = tfdataset.get_dataset(selected_runs, batch_size=25, features=inputs+outputs, max_length=max_length)

dataset = dataset_manager.preload_data(selected_runs, features=inputs+outputs)

augmented_dataset = []
for i, data in enumerate(dataset):
    l = data[2].to_numpy()
    ind = find_change_indices(tf.constant(l)).numpy()
    changes = np.concatenate((np.expand_dims(ind, 1), np.expand_dims(l[ind-1], 1), np.expand_dims(l[ind], 1)), 1)
    toappend = data[1], ind, changes
    
    augmented_dataset.append(toappend)

In [None]:
def precision_recall(changes_true, changes_pred, tolerance):
    if tf.shape(changes_true)[0] == 0 and tf.shape(changes_pred)[0] == 0:
        true_positive = 1
        false_positive = 0
    else:
        distances = tf.abs(
            tf.reshape(changes_pred, [-1, 1]) - 
            tf.reshape(changes_true, [1, -1])
        )  # axis0 = pred, axis1 = true
        false_positive = tf.reduce_sum(tf.cast(tf.reduce_min(distances, axis=1) > tolerance, 'int32'))
        true_positive = tf.reduce_sum(tf.cast(tf.reduce_min(distances, axis=1) <= tolerance, 'int32'))
    
    if tf.shape(changes_pred)[0] == 0:
        false_negatives = tf.cast(tf.shape(changes_true)[0], 'int32')
    else:
        min_distances = tf.reduce_min(tf.abs(
            tf.reshape(changes_pred, [-1, 1]) - 
            tf.reshape(changes_true, [1, -1])
        ), axis=0)

        false_negatives = tf.reduce_sum(tf.cast(min_distances > tolerance, 'int32'))
    
    precision = tf.math.divide_no_nan(tf.cast(true_positive, 'float32'), tf.cast(true_positive + false_positive, 'float32'))
    recall = tf.math.divide_no_nan(tf.cast(true_positive, 'float32'), tf.cast(true_positive + false_negatives, 'float32'))
    
    # print(changes_true, changes_pred)
    # print(true_positive, false_negatives, false_positive)
    # 
    # return tf.stack((precision, recall))
    return (precision, recall)

In [None]:
tau = [5, 15, 25]

results_header = []
for _tau in tau:
    for type in 'precision', 'recall':
        results_header.append(f'{type}_{_tau}')

session_start = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
# session_start = '20200404-233041'
file_name = f'ruptures/{session_start}.csv'
# file_name = 'ruptures/all.csv'

if os.path.exists(file_name):
    results = pd.read_csv(file_name, index_col=0)
else:
    results = pd.DataFrame(columns=['method_name', 'model_type', 'penalty'] + results_header)

In [None]:
from joblib import Parallel, delayed

def evaluate(method_name, model_type, penalty, method_class, data=augmented_dataset):
    def map_func(datum):
        input, true_changes, _ = datum
        model = method_class(model=model_type)
        prediction = model.fit_predict(input.to_numpy(), pen=penalty)

        cum = []
        for _tau in tau:
            prec, recall = precision_recall(true_changes, prediction, _tau)
            cum += [float(prec),float(recall)]
        # print('Pred:', prediction, '\nTrue:', true_changes,'\nscore:', cum, '\n--')
        return tuple(cum)

    run_results = Parallel(n_jobs=8, prefer='threads')(delayed(map_func)(datum) for datum in data)
    # results = [map_func(datum) for datum in data]

    run_results = pd.DataFrame(run_results, columns=results_header)
    run_results['method_name'], run_results['model_type'], run_results['penalty'] = method_name, model_type, penalty

    print('Results mean:')
    print(run_results.mean(axis=0) * 100)
    
    return run_results


In [52]:
configurations = list()
# for model_type in ["l2", "l1", "rbf", "linear", "normal", "ar", "rank"]:
# for model_type in ["linear", "rank"]:
for model_type in ["normal"]:
    for method_name, method_class in (
        # ('window', ruptures.detection.Window),
        # ('bottomup', ruptures.BottomUp),
        ('exact', ruptures.Pelt)
    ):
        for penalty in (100, 500, 1000):
            configurations.append((
                method_name, 
                model_type, 
                penalty,
                method_class,
            ))

configurations

[('window', 'normal', 100, ruptures.detection.window.Window),
 ('window', 'normal', 500, ruptures.detection.window.Window),
 ('window', 'normal', 1000, ruptures.detection.window.Window),
 ('bottomup', 'normal', 100, ruptures.detection.bottomup.BottomUp),
 ('bottomup', 'normal', 500, ruptures.detection.bottomup.BottomUp),
 ('bottomup', 'normal', 1000, ruptures.detection.bottomup.BottomUp)]

In [53]:
for configuration in (tqdm.tqdm(configurations) if tqdm else configurations):
    try:
        print(configuration[:-1])
        run_results = evaluate(*configuration)
        results = results.append(run_results, ignore_index=True)
        results.to_csv(file_name)
    except:
        print('Failed')

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [60]:
for _tau in tau:
    results[f'F1_{_tau}'] = 2 * (results[f'precision_{_tau}']*results[f'recall_{_tau}']) / (results[f'precision_{_tau}']+results[f'recall_{_tau}'])


F1_columns = [c for c in results.columns if 'F1' in c]
results.groupby(by=['method_name', 'model_type', 'penalty'])[F1_columns].aggregate('mean') * 100


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,F1_5,F1_15,F1_25
method_name,model_type,penalty,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
bottomup,l1,100,14.294108,28.834491,39.016098
bottomup,l1,500,14.538393,30.525945,40.449982
bottomup,l1,1000,14.268346,30.76921,39.226578
bottomup,l2,100,11.917485,24.437654,33.808718
bottomup,l2,500,13.524666,27.415726,37.700742
bottomup,l2,1000,14.760537,28.981596,39.375014
bottomup,linear,100,47.829911,57.60061,60.329068
bottomup,linear,500,43.810534,56.277822,60.46532
bottomup,linear,1000,43.418841,55.956824,60.267122
bottomup,rank,100,23.905744,32.281255,37.338147


In [58]:
results.drop(columns=F1_columns).to_csv(file_name)