In [17]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from gtda.pipeline import Pipeline
from gtda.time_series import Resampler
from gtda.diagrams import PersistenceEntropy, Scaler, HeatKernel, BettiCurve
import numpy as np
from gtda.time_series import SingleTakensEmbedding, takens_embedding_optimal_parameters, TakensEmbedding
from sklearn.decomposition import PCA
from gtda.plotting import plot_point_cloud
from gtda.homology import VietorisRipsPersistence
from gtda.diagrams import PersistenceImage

from gtda.metaestimators import CollectionTransformer

In [14]:
def read_files_bundle(path, label, limit):
    data = []
    os.chdir(path)
    i = 0
    for file in os.listdir():
        if i < 50 :
            data_df = pd.read_csv(file)
            # fill NaN with an interpolated value
            data_df = data_df.interpolate()
            period = int(len(data_df)/limit)
            periodicSampler = Resampler(period=period)
            #print(i,file)
            # resample the files to that all of them are the same length (in entries)
            # NOTE: timestamps are omitted and timesteps are going to be different for each resampled time series!
            index_sampled, signal_sampled = periodicSampler.fit_transform_resample(data_df.index, data_df[label])
            data.append(signal_sampled)
            i += 1
    data_T = list(map(list, zip(*data)))
    data = np.array(data_T)
    df = pd.DataFrame.from_records(data_T)
    return data.T, df

In [15]:
data, normal_df = read_files("/Users/simo/repos/RareEventsDataset/3w_dataset-master/data/data/0/", "P-TPT",2000)

0 WELL-00002_20170625220127.csv
1 WELL-00008_20170611080445.csv
2 WELL-00002_20170810020026.csv
3 WELL-00007_20170801180000.csv
4 WELL-00002_20170612110022.csv
5 WELL-00001_20170527160000.csv
6 WELL-00001_20170527060000.csv
7 WELL-00005_20170812170000.csv
8 WELL-00008_20170703060123.csv
9 WELL-00006_20170828140031.csv
10 WELL-00006_20170209000114.csv
11 WELL-00008_20170612140047.csv
12 WELL-00005_20170814200000.csv
13 WELL-00002_20170623150127.csv
14 WELL-00002_20170620140116.csv
15 WELL-00002_20170613070011.csv
16 WELL-00002_20170621180032.csv
17 WELL-00005_20170815050000.csv
18 WELL-00002_20170621030054.csv
19 WELL-00006_20170207080124.csv
20 WELL-00002_20170218210134.csv
21 WELL-00002_20170214220322.csv
22 WELL-00006_20170508090031.csv
23 WELL-00002_20170621080422.csv
24 WELL-00002_20170625170232.csv
25 WELL-00008_20170612190111.csv
26 WELL-00001_20170219070031.csv
27 WELL-00002_20170210230203.csv
28 WELL-00008_20170702050135.csv
29 WELL-00002_20170804210000.csv
30 WELL-00005_201708

In [16]:
PatoBar = 1/100000

normal_df = normal_df.apply(lambda x: x*PatoBar)
normal_df 

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,50
0,86.98015,139.8743,86.48771,130.3668,80.55306,141.8415,142.2115,209.3246,133.8594,211.4655,...,76.44843,177.2912,133.1498,180.7431,211.9853,133.3783,211.9446,211.4655,211.4655,208.8075
1,86.98015,139.8735,86.48351,130.3665,80.53976,141.8415,142.2204,209.2863,133.8631,211.4655,...,76.44178,177.2661,133.1558,180.7431,211.9897,133.3808,211.9484,211.4655,211.4655,208.8650
2,86.98015,139.8728,86.47931,130.3663,80.52646,141.8415,142.2293,209.3093,133.8669,211.4655,...,76.43513,177.2410,133.1633,180.7431,211.9942,133.3833,211.9521,211.4655,211.4655,208.8018
3,86.98015,139.8721,86.47511,130.3661,80.51316,141.8415,142.2382,209.3017,133.8626,211.4655,...,76.42848,177.2222,133.1707,180.7431,211.9986,133.3858,211.9559,211.4655,211.4655,208.8343
4,86.98015,139.8713,86.47091,130.3659,80.49986,141.8415,142.2405,209.2787,133.8571,211.4655,...,76.42183,177.2034,133.1782,180.7431,212.0030,133.3883,211.9597,211.4655,211.4655,208.8305
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2068,85.25784,139.8385,85.46398,130.1718,86.29521,142.0410,141.8478,209.0488,133.8275,211.4655,...,86.18216,177.2557,133.1427,180.7431,212.0241,133.3643,212.0228,211.5949,211.4655,208.8650
2069,85.28443,139.8450,85.45068,130.1739,86.28989,142.0410,141.8348,209.1121,133.8280,211.4655,...,86.18216,177.2369,133.1464,180.7431,211.9975,133.3683,212.0317,211.5990,211.4655,208.8650
2070,85.31103,139.8552,85.43738,130.1759,86.28457,142.0410,141.8259,209.1178,133.8355,211.4655,...,86.18216,177.2181,133.1502,180.7431,211.9709,133.3723,212.0406,211.6031,211.4655,208.8688
2071,85.33763,139.8654,85.42408,130.1780,86.27925,142.0410,141.8170,209.0986,133.8430,211.4655,...,86.18216,177.2619,133.1525,180.7431,211.9443,133.3763,212.0495,211.6072,211.4655,208.8880


In [19]:
embedding_dimension = 11
embedding_time_delay = 70
stride = 2

embedder = TakensEmbedding(time_delay=embedding_time_delay,
                           dimension=embedding_dimension,
                           stride=stride)
batch_pca =  CollectionTransformer(PCA(n_components=3),n_jobs=-1)
persistence = VietorisRipsPersistence(homology_dimensions=[0, 1], n_jobs=None) 
image = PersistenceImage()

steps = [
         ("embedder", embedder),
         ("pca", batch_pca),
         ("persistence", persistence),
         ("image", image)
        ]
topological_transfomer = Pipeline(steps)

In [21]:
Per_images_normal = topological_transfomer.fit_transform(normal_df.T)

In [22]:
Per_images_normal.shape

(51, 2, 100, 100)

In [35]:
image.plot(Xt=Per_images_normal, sample=0, homology_dimension_idx=1, colorscale='blues', plotly_params=None)

#(Per_images_normal, y=None, sample=1)#, homology_dimension_idx = )