In [12]:
import os
import pickle
#import random
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.reload_library()

from sktime.classification.interval_based import RandomIntervalSpectralEnsemble
from sktime.classification.dictionary_based import ContractableBOSS, BOSSEnsemble 
from sktime.datatypes._panel._convert import from_2d_array_to_nested


from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import RidgeClassifierCV

from wildboar.datasets import list_datasets, load_dataset
from wildboar.ensemble import ShapeletForestClassifier

import pywt
from pywt import wavedec, waverec

from tqdm.notebook import tqdm

# UCR time series datasets: https://www.cs.ucr.edu/%7Eeamonn/time_series_data_2018/

# UCR time series datasets: https://www.cs.ucr.edu/%7Eeamonn/time_series_data_2018/

In [6]:
# Names of all datasets listed in the 'wildboar/ucr' repository
UCR_datasets = list_datasets(repository='wildboar/ucr')

In [7]:
# Build a summary table with one row per UCR dataset (number of series,
# number of distinct labels, series length) and cache it on disk.
dataset_info = pd.DataFrame(columns=['size', 'classes', 'length'], index = UCR_datasets, dtype=float)

for dataset in tqdm(UCR_datasets):
    x_all, y_all = load_dataset(dataset, repository='wildboar/ucr')

    # Remove rows with missing values; the mask is computed once and
    # applied to both the series and their labels so they stay aligned.
    complete_rows = ~np.isnan(x_all).any(axis=1)
    x = x_all[complete_rows]
    y = y_all[complete_rows]

    classes = np.unique(y) # all class labels
    total_examples, ts_length = x.shape

    dataset_info.loc[dataset] = [total_examples, len(classes), ts_length]

# Drop degenerate datasets: rows where any column (size, classes, length)
# has a value of zero or one.
degenerate = (dataset_info == 0) | (dataset_info == 1)
dataset_info = dataset_info.loc[~degenerate.any(axis=1)]

# Make sure the output directory exists before writing the cache file.
os.makedirs('data', exist_ok=True)
dataset_info.to_pickle('data/datasets_information.pkl')

  0%|          | 0/128 [00:00<?, ?it/s]

In [17]:
# Reload the dataset summary table from its on-disk pickle
dataset_info = pd.read_pickle('data/datasets_information.pkl')

In [18]:
# Display the dataset summary table (size, classes, length per dataset)
dataset_info

Unnamed: 0,size,classes,length
ACSF1,200.0,10.0,1460.0
Adiac,781.0,37.0,176.0
ArrowHead,211.0,3.0,251.0
BME,180.0,3.0,128.0
Beef,60.0,5.0,470.0
...,...,...,...
Wine,111.0,2.0,234.0
WordSynonyms,905.0,25.0,270.0
Worms,258.0,5.0,900.0
WormsTwoClass,258.0,2.0,900.0


In [19]:
# Summary statistics for each column of the dataset table
dataset_info.describe()

Unnamed: 0,size,classes,length
count,117.0,117.0,117.0
mean,1581.57265,8.264957,537.102564
std,2996.395773,12.256872,583.137433
min,40.0,2.0,15.0
25%,258.0,2.0,136.0
50%,724.0,3.0,301.0
75%,1272.0,7.0,720.0
max,24000.0,60.0,2844.0


In [20]:
# 70th percentile of each column of the dataset table
dataset_info.quantile(.70)

size       1035.2
classes       6.2
length      571.4
Name: 0.7, dtype: float64

In [8]:
# Inclusive lower/upper bounds used to select a subset of all datasets
lb_size, up_size = 1, 200
lb_length, up_length = 10, 1000
lb_class, up_class = 2, 8

dataset_info = pd.read_pickle('data/datasets_information.pkl')

# One boolean mask per criterion; `between` is inclusive on both ends.
size_ok = dataset_info['size'].between(lb_size, up_size)
classes_ok = dataset_info['classes'].between(lb_class, up_class)
length_ok = dataset_info['length'].between(lb_length, up_length)

selected = dataset_info.loc[size_ok & classes_ok & length_ok]



# condition = (dataset_info['classes'] <= classes_ub) & (dataset_info['classes'] >= classes_lb) 
# selected = dataset_info.loc[condition]


In [9]:
from wildboar.explain.counterfactual import counterfactuals

In [10]:
import plotly.express as px
# Load plotly's bundled sample stocks data into `df`.
# NOTE(review): later cells assign `df` again without reading this value first — confirm it is still needed.
df = px.data.stocks()


In [13]:
# Pick one dataset out of the selected subset
selected_datasets = selected.index.to_list()
index_dataset = 0
dataset = selected_datasets[index_dataset]

x_all, y_all = load_dataset(dataset, repository='wildboar/ucr')

# Keep only the series (and their labels) that contain no missing values
complete_rows = ~np.isnan(x_all).any(axis=1)
x = x_all[complete_rows]
y = y_all[complete_rows]

classes = np.unique(y) # all class labels
total_examples, ts_length = x.shape

# Split row indices rather than the rows themselves, so the positions of the
# train/test series within `x` stay available.
x_ind = np.arange(total_examples)
x_train_ind, x_test_ind, y_train, y_test = train_test_split(
    x_ind,
    y,
    test_size=.30,
    random_state=0,
    shuffle=True,
    stratify=None,
)
x_train = x[x_train_ind,:]
x_test = x[x_test_ind,:]

# Relabel in place: label 1.0 is kept, every other label becomes -1.0
for split_labels in (y_train, y_test):
    split_labels[split_labels != 1.0] = -1.0

clf_kNN = KNeighborsClassifier(metric="euclidean")
clf_kNN.fit(x_train, y_train)

#clf_RSF = ShapeletForestClassifier(n_estimators=50, metric='scaled_euclidean')
clf_RSF = ShapeletForestClassifier(
    n_estimators=20,
    metric='euclidean',
    max_depth=5,
    max_shapelet_size=.4, # INTERACTION: Make this as input from user
    random_state=1,
)
clf_RSF.n_features_in_ = x_train.shape[-1]
clf_RSF.fit(x_train, y_train)

# Options shared by both counterfactual searches
cf_kwargs = dict(scoring="euclidean", valid_scoring=False)

# The desired label passed to each search is the negated test label,
# i.e. 1 -> -1 and -1 -> 1.
x_counterfactual_RSF, x_valid_RSF, x_score_RSF = counterfactuals(
    clf_RSF,
    x_test,
    -y_test,
    random_state=2,
    epsilon=1,
    **cf_kwargs,
)
x_counterfactual_kNN, x_valid_kNN, x_score_kNN = counterfactuals(
    clf_kNN,
    x_test,
    -y_test,
    **cf_kwargs,
)


In [74]:
# Tabulate one test instance next to the counterfactuals generated for it,
# one row per time step. The rows are taken directly from the counterfactual
# and test arrays so this cell does not rely on helper variables (y1, y2, y3)
# that are only assigned in a later cell.
sel_instance_idx = 5
df = pd.DataFrame({
    "length": np.arange(x_counterfactual_RSF.shape[1]),  # time-step index
    "x_counter_RSF": x_counterfactual_RSF[sel_instance_idx, :],
    "x_counter_kNN": x_counterfactual_kNN[sel_instance_idx, :],
    "x_test": x_test[sel_instance_idx, :],
})

In [76]:
# Show the table, then draw one line per series against the time-step index.
display(df)
# Plot every column except "length": it is the x-axis, and passing all of
# df.columns as y would also draw it as a trace of its own.
series_columns = [col for col in df.columns if col != "length"]
fig = px.line(df, x="length", y=series_columns)
fig.update_layout(template="plotly_dark")
fig.show()

Unnamed: 0,length,x_counter_RSF,x_counter_kNN,x_test
0,0,0.023094,0.021178,0.023094
1,1,0.039898,0.023945,0.039898
2,2,0.011821,0.032206,0.011821
3,3,0.002965,0.020446,0.002965
4,4,-0.000747,0.031415,-0.000747
...,...,...,...,...
123,123,0.384471,0.036203,0.032108
124,124,0.389368,0.017755,0.036226
125,125,0.386318,0.025897,0.030195
126,126,0.377019,0.028528,0.017082


In [69]:
# Collect one test instance together with the counterfactual each classifier
# produced for it.
sel_instance_idx = 5
y1 = x_counterfactual_RSF[sel_instance_idx, :]
y2 = x_counterfactual_kNN[sel_instance_idx, :]
y3 = x_test[sel_instance_idx, :]
x_counter_RSF = x_counterfactual_RSF[sel_instance_idx, :]
x_counter_kNN = x_counterfactual_kNN[sel_instance_idx, :]

cols = ["x_counter_RSF","x_counter_kNN","x_test"]
X = np.array([y1, y2, y3]).T            # one column per series
df = pd.DataFrame(X, columns=cols).T    # transposed back: one row per series
display(df)

# With this orientation the x-axis carries the series names and every time
# step becomes its own trace. For time on the x-axis, transpose first (df = df.T).
fig = px.line(df, x=df.index, y = df.columns)
fig.update_layout(template="plotly_dark")
fig.show()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
x_counter_RSF,0.023094,0.039898,0.011821,0.002965,-0.000747,0.023448,0.009273,0.123,0.455,0.792,...,0.376414,0.373038,0.364174,0.369213,0.378946,0.384471,0.389368,0.386318,0.377019,0.374531
x_counter_kNN,0.021178,0.023945,0.032206,0.020446,0.031415,0.012058,0.031515,0.01636,0.013733,0.032101,...,0.2845,0.13275,0.077906,0.056329,0.036121,0.036203,0.017755,0.025897,0.028528,0.016795
x_test,0.023094,0.039898,0.011821,0.002965,-0.000747,0.023448,0.009273,0.123,0.455,0.792,...,0.023289,0.017181,0.003925,0.01083,0.025278,0.032108,0.036226,0.030195,0.017082,0.014414



DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead.  To get a de-fragmented frame, use `newframe = frame.copy()`



In [55]:
# Display the current contents of df
df

x_test,0.023094,0.039898,0.011821,0.002965,-0.000747,0.023448,0.009273,0.123000,0.455000,0.792000,...,0.023289,0.017181,0.003925,0.010830,0.025278,0.032108,0.036226,0.030195,0.017082,0.014414
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
x_counter_RSF,0.023094,0.039898,0.011821,0.002965,-0.000747,0.023448,0.009273,0.123,0.455,0.792,...,0.376414,0.373038,0.364174,0.369213,0.378946,0.384471,0.389368,0.386318,0.377019,0.374531
x_counter_kNN,0.021178,0.023945,0.032206,0.020446,0.031415,0.012058,0.031515,0.01636,0.013733,0.032101,...,0.2845,0.13275,0.077906,0.056329,0.036121,0.036203,0.017755,0.025897,0.028528,0.016795


In [38]:
# Replace X with its transpose.
# NOTE(review): not idempotent — each re-run of this cell flips X back again.
X = X.T

In [39]:
# Dimensions of X
X.shape

(128, 3)

In [63]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

import plotly.io as pio

# Sample dataframe of a wide format: one cumulative random walk per country
np.random.seed(5)
cols = ['Canada', 'France', 'Germany']
X = np.random.randn(6, len(cols))
df = pd.DataFrame(X, columns=cols)
df.iloc[0] = 0       # every walk starts at zero
df = df.cumsum()

# Year labels as strings, one per row, starting at 2020. They are built
# directly instead of via pd.date_range(freq='Y') because the 'Y' frequency
# alias is deprecated in recent pandas releases.
df['Year'] = [str(2020 + offset) for offset in range(len(df))]

# Reshape to one row per country, with the years as column labels
df = df.T
df.columns = df.iloc[-1]   # the last row holds the year labels
df = df.head(-1)           # drop that label row from the data
df.index.name = 'Country'

# Want time on the x-axis? ###
# just include:
# df = df.T
##############################

# plotly
fig = px.line(df, x=df.index, y = df.columns)
fig.update_layout(template="plotly_dark")



In [65]:
# Display the current contents of df
df

Year,2020,2021,2022,2023,2024,2025
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Canada,0.0,-0.252092,-1.161325,-1.491194,-1.850023,-2.550202
France,0.0,0.10961,-0.482027,-1.674791,-1.07132,0.080071
Germany,0.0,1.582481,1.770084,1.565208,-0.099581,1.75775
