In [1]:
import numpy as np
import pandas as pd
from scipy.io import arff

# Base directory for the dataset folders. Paths are built from it by plain
# string concatenation, so the trailing slash is required.
INPUT_DIR = "../Datasets/"

In [2]:
# Load the GunPoint training split. The following cell reads element 0 of
# the value returned here.
data = arff.loadarff(INPUT_DIR+"GunPoint/GunPoint_TRAIN.arff") 

In [3]:
# Element 0 of the loadarff result holds the records (scipy documents the
# return value as a (data, meta) pair); the metadata is not used here.
df = pd.DataFrame(data[0])


In [15]:
def generate_candidates(data, max_len=3, min_len=2):
    """Enumerate every contiguous subsequence of each series as a candidate.

    Parameters
    ----------
    data : pandas.DataFrame
        One time series per row. The class label is read from the column
        named ``'target'``; all remaining columns are treated as the series
        values, in column order.
    max_len, min_len : int
        Inclusive bounds on the candidate length. Lengths are visited from
        ``max_len`` down to ``min_len``; nothing is generated when
        ``max_len < min_len``.

    Returns
    -------
    pandas.DataFrame
        One row per candidate. Column ``0`` holds the subsequence values as
        a NumPy array and column ``1`` the label of the series it came from.
        Rows are ordered by length (longest first), then by series, then by
        start offset.
    """
    # Split values and labels once; neither depends on the candidate length.
    # The copy keeps the returned slices from aliasing the caller's frame.
    series_values = data.drop('target', axis=1).values.copy()
    labels = data['target'].values

    candidates = []
    for length in range(max_len, min_len - 1, -1):
        # Walk rows and labels together by position. Iterating the frame
        # itself would loop over its column labels and repeat every
        # candidate once per column, and pairing by position also keeps the
        # labels correct when the frame does not use a default RangeIndex.
        for row, label in zip(series_values, labels):
            for start in range(len(row) - length + 1):
                candidates.append((row[start:start + length], label))
    return pd.DataFrame(candidates)

In [16]:
# Enumerate shapelet candidates from the training frame using the default
# length bounds of generate_candidates (max_len=3, min_len=2).
candidates = generate_candidates(df)

In [17]:
def check_candidate(data, shapelet):
    """Score a shapelet by the best split of the series on distance to it.

    Parameters
    ----------
    data : iterable
        Entries indexable as ``entry[0]`` (the time series) and ``entry[1]``
        (its label).
        NOTE(review): iterating a DataFrame directly yields column labels,
        not rows, so callers presumably pass a list of pairs - confirm.
    shapelet : sequence
        Candidate subsequence, compared to each series with
        ``subsequence_dist``.

    Returns
    -------
    Whatever ``find_best_split_point`` returns for the distance histogram.
    """
    histogram = {}
    for entry in data:
        # TODO: entropy pre-pruning in each iteration
        time_serie, label = entry[0], entry[1]
        d, _ = subsequence_dist(time_serie, shapelet)
        if d is not None:
            # Bucket series by distance. setdefault(...).append(...) mutates
            # the bucket in place; assigning the result of list.append would
            # store None and lose every series already in that bucket.
            histogram.setdefault(d, []).append((time_serie, label))
    return find_best_split_point(histogram)


def calculate_dict_entropy(data):
    """Return the entropy of the labels found in ``data``.

    The label of each entry is read from ``entry[1]``. Label frequencies
    are normalised to probabilities and handed to ``calculate_entropy``.
    """
    tally = {}
    for entry in data:
        label = entry[1]
        tally[label] = tally.get(label, 0) + 1
    occurrences = list(tally.values())
    total = float(sum(occurrences))
    return calculate_entropy(np.divide(occurrences, total))


def find_best_split_point(histogram):
    """Find the distance threshold with the highest information gain.

    Parameters
    ----------
    histogram : dict
        Maps a distance to a list of entries; each entry's label is read
        from ``entry[1]`` by ``calculate_dict_entropy``.

    Returns
    -------
    tuple
        ``(max_ig, best_distance, best_left, best_right)``. ``best_left``
        holds the entries whose distance is ``<= best_distance`` and
        ``best_right`` the rest. When no threshold gives a strictly positive
        gain (including an empty histogram) the result is
        ``(0, 0, None, None)``.
    """
    # Flatten the buckets with a comprehension so the function needs no
    # itertools import (the notebook's import cell does not provide one).
    histogram_values = [item for bucket in histogram.values() for item in bucket]
    total = float(len(histogram_values))
    prior_entropy = calculate_dict_entropy(histogram_values)

    best_distance, max_ig = 0, 0
    best_left, best_right = None, None
    for distance in histogram:
        # Partition every bucket around the candidate threshold.
        data_left, data_right = [], []
        for other_distance, bucket in histogram.items():
            if other_distance <= distance:
                data_left.extend(bucket)
            else:
                data_right.extend(bucket)
        # Gain = prior entropy minus the size-weighted entropy of both sides.
        ig = prior_entropy - (
            float(len(data_left)) / total * calculate_dict_entropy(data_left)
            + float(len(data_right)) / total * calculate_dict_entropy(data_right)
        )
        # Strict comparison: on ties the first threshold encountered wins.
        if ig > max_ig:
            best_distance, max_ig = distance, ig
            best_left, best_right = data_left, data_right
    return max_ig, best_distance, best_left, best_right


def manhattan_distance(a, b, min_dist=float('inf')):
    """Sum of absolute element-wise differences, with early abandoning.

    Elements are paired with ``zip`` (so the longer input is silently
    truncated) and converted to ``float`` before subtraction. As soon as the
    running total reaches or exceeds ``min_dist`` the function gives up and
    returns ``None``; otherwise it returns the full total.
    """
    running_total = 0
    for left, right in zip(a, b):
        running_total = running_total + np.abs(float(left) - float(right))
        # Early abandon: this pair cannot beat the best distance so far.
        if min_dist <= running_total:
            return None
    return running_total

def calculate_entropy(probabilities):
    """Return the base-2 entropy of ``probabilities``.

    Each non-zero probability contributes ``-p * log(p) / log(2)``; a zero
    probability contributes 0, which avoids evaluating ``log(0)``.
    """
    log_of_two = np.log(2)
    terms = []
    for prob in probabilities:
        if prob == 0:
            terms.append(0)
        else:
            terms.append(-prob * np.log(prob) / log_of_two)
    return sum(terms)


def subsequence_dist(time_serie, sub_serie):
    """Slide ``sub_serie`` along ``time_serie`` and return the closest match.

    Returns ``(distance, offset)`` for the window of ``time_serie`` with the
    smallest ``manhattan_distance`` to ``sub_serie``; on ties the earliest
    offset is kept. Returns ``(None, None)`` when ``sub_serie`` is not
    strictly shorter than ``time_serie``.

    NOTE(review): a pattern exactly as long as the series is rejected too,
    although one alignment exists - confirm this is intended.
    """
    width = len(sub_serie)
    if width >= len(time_serie):
        return None, None

    best, best_at = float("inf"), 0
    for offset in range(len(time_serie) - width + 1):
        # Passing the best distance so far lets the helper abandon early;
        # it signals that with None.
        current = manhattan_distance(sub_serie, time_serie[offset:offset + width], best)
        if current is None:
            continue
        if current < best:
            best, best_at = current, offset
    return best, best_at

Unnamed: 0,0,1
0,"[-0.6478854, -0.64199155, -0.63818632]",b'2'
1,"[-0.64199155, -0.63818632, -0.63825875]",b'2'
2,"[-0.63818632, -0.63825875, -0.63834515]",b'2'
3,"[-0.63825875, -0.63834515, -0.63869741]",b'2'
4,"[-0.63834515, -0.63869741, -0.64304876]",b'2'
5,"[-0.63869741, -0.64304876, -0.64376789]",b'2'
6,"[-0.64304876, -0.64376789, -0.64504991]",b'2'
7,"[-0.64376789, -0.64504991, -0.64711823]",b'2'
8,"[-0.64504991, -0.64711823, -0.64915334]",b'2'
9,"[-0.64711823, -0.64915334, -0.65124584]",b'2'
