# Данный модуль выделяет признаки входного временного ряда, производит аугментацию и сохраняет для каждого измерения связанный с ним набор признаков.

Внимание! Работает только при наличии графического процессора и CUDA.


In [None]:
pip install tsfresh

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tsfresh
  Downloading tsfresh-0.19.0-py2.py3-none-any.whl (97 kB)
[K     |████████████████████████████████| 97 kB 1.8 MB/s 
Collecting stumpy>=1.7.2
  Downloading stumpy-1.11.1-py3-none-any.whl (136 kB)
[K     |████████████████████████████████| 136 kB 67.8 MB/s 
[?25hCollecting statsmodels>=0.13
  Downloading statsmodels-0.13.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.9 MB)
[K     |████████████████████████████████| 9.9 MB 69.3 MB/s 
Collecting matrixprofile<2.0.0,>=1.1.10
  Downloading matrixprofile-1.1.10-cp37-cp37m-manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 58.6 MB/s 
Collecting protobuf==3.11.2
  Downloading protobuf-3.11.2-cp37-cp37m-manylinux1_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 51.0 MB/s 
Installing collected packages: protobuf, stumpy, statsmodels, matrixprofile, tsfresh
  

In [None]:
from tsfresh import extract_features, extract_relevant_features, select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import settings

# для построения моделей воспользуемся sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# The notebook is meant to run on several machines. This cell works out where
# it is running (Google Colab or a local PC), sets the project root CrPath
# accordingly, imports the project modules and loads the datasets.

# Imported explicitly: the local branch below calls os.path.exists before any
# of the project star imports has had a chance to bring `os` into scope.
import os
import sys

try:
    from google.colab import drive
except ImportError:
    # google.colab is not importable, so this is not a Colab runtime.
    Colab = False
else:
    Colab = True

if Colab:
    # Mount Google Drive, where the project directory lives.
    drive.mount('/content/drive')
    CrPath = "/content/drive/MyDrive/Uinnopolis/"

    # Make the project modules (Libs, Experiments, NN) importable.
    sys.path.append('/content/drive/MyDrive/Uinnopolis')
else:
    # Local run: use E:/Uinnopolis/ when it exists, otherwise C:/Uinnopolis/.
    Acer = not os.path.exists("E:/Uinnopolis/")
    CrPath = "C:/Uinnopolis/" if Acer else "E:/Uinnopolis/"

# Set before the project imports below.
# NOTE(review): TensorFlow is presumably imported by one of the star imports,
# and this variable is normally read when TensorFlow is first loaded - confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

from Libs import * #ReadCsv, WriteCsv, Graphic, Filter
from Experiments import *
from NN import *

# `tf` and `random` are not imported in this cell; presumably they come from
# the star imports above - verify.
tf.get_logger().setLevel('ERROR')

random.seed(1)

print('Reading Dataset')
# X is the training set read without PostProc; it is used only by the optional
# feature-selection cell. Y from this call is overwritten by the next one.
X, Y = ReadCsv(CrPath, DelZeros=True, SortCtg=False, Train=True, RetPrc=True)
X0, Y = ReadCsv(CrPath, DelZeros=True, SortCtg=False, Train=True, RetPrc=True, PostProc=True)
X_Test, y_test = ReadCsv(CrPath, DelZeros=True, SortCtg=False, Train=False, RetPrc=True, PostProc=True)

# The first 100 samples are held out for validation, the rest is for training.
X_Valid, y_valid = X0[:100], Y[:100]
X_Train, y_train = X0[100:], Y[100:]

# Augment both subsets; only column 1 of the labels is passed on (the 1:2
# slice keeps it two-dimensional).
X_Valid, y_valid = TimeAugmentation(X_Valid, y_valid[:, 1:2], K=20, random=22, LevelK=0.15, UseMain=True)
X_Train, y_train = TimeAugmentation(X_Train, y_train[:, 1:2], K=10, random=22, LevelK=0.15, UseMain=True)

Mounted at /content/drive
Reading Dataset


Функция извлекает признаки набора данных. Если указан FileName, она также сохраняет их в файл.


In [None]:
def GetFeatures(X, FileName = None):
    """Extract the tsfresh "comprehensive" feature set for every row of X.

    Args:
        X: array exposing ``.shape`` and ``.flatten()``; every row is handled
            as one series (assumes a 2-D NumPy array - TODO confirm).
        FileName: optional path; when given, the extracted features are also
            written there with ``np.save``. ``np.save`` stores array data, so
            the feature (column) names are not kept in the file.

    Returns:
        The feature table returned by ``extract_features``, with missing
        values handled by tsfresh's ``impute``.
    """
    n_series, series_len = X.shape[0], X.shape[1]

    # Long format expected by tsfresh: column 0 holds the values, column 1 the
    # id (row index in X) of the series each value belongs to.
    series_ids = np.repeat(np.arange(n_series), series_len)
    long_frame = pd.DataFrame({0: X.flatten(), 1: series_ids})

    features = extract_features(
        long_frame,
        column_id=1,
        impute_function=impute,
        default_fc_parameters=settings.ComprehensiveFCParameters(),
    )

    if FileName is None:
        return features

    np.save(FileName, features)
    return features

Вместе с набором признаков сохраняем и сам исходный набор данных: он аугментирован случайными данными. В принципе, его можно восстановить на месте использования, но есть риск ошибиться и получить набор, отличный от исходного. Обнаружить такую ошибку будет трудно, поэтому лучше сохранить набор заранее.


Тренировочный набор велик, и, чтобы протолкнуть его через бесплатный Colab, разбиваем его на 2 части. Тут обрабатываем первую из них.

In [None]:
# Extract features for the training, validation and test sets and write them
# to Google Drive. The augmented training and validation inputs are saved next
# to their features so they do not have to be regenerated later.
# NOTE(review): the surrounding text calls this the first of two training
# parts, but no slicing of X_Train is visible in this cell - confirm how the
# split is made before running.
GetFeatures(X_Train, '/content/drive/MyDrive/Ftr_train_100_1.npy')
np.save('/content/drive/MyDrive/Train_100_1.npy', X_Train)
GetFeatures(X_Valid, '/content/drive/MyDrive/Ftr_valid_100.npy')
np.save('/content/drive/MyDrive/valid_100_X.npy', X_Valid)
# Last expression of the cell: its return value (the test-set feature table)
# is what the notebook displays as the cell output.
GetFeatures(X_Test, '/content/drive/MyDrive/Ftr_test.npy')

Feature Extraction: 100%|██████████| 47300/47300 [1:11:01<00:00, 11.10it/s]
Feature Extraction: 100%|██████████| 2000/2000 [02:59<00:00, 11.13it/s]
Feature Extraction: 100%|██████████| 2071/2071 [03:03<00:00, 11.29it/s]


Unnamed: 0,0__variance_larger_than_standard_deviation,0__has_duplicate_max,0__has_duplicate_min,0__has_duplicate,0__sum_values,0__abs_energy,0__mean_abs_change,0__mean_change,0__mean_second_derivative_central,0__median,...,0__permutation_entropy__dimension_6__tau_1,0__permutation_entropy__dimension_7__tau_1,0__query_similarity_count__query_None__threshold_0.0,"0__matrix_profile__feature_""min""__threshold_0.98","0__matrix_profile__feature_""max""__threshold_0.98","0__matrix_profile__feature_""mean""__threshold_0.98","0__matrix_profile__feature_""median""__threshold_0.98","0__matrix_profile__feature_""25""__threshold_0.98","0__matrix_profile__feature_""75""__threshold_0.98",0__mean_n_absolute_max__number_of_maxima_7
0,0.0,0.0,0.0,0.0,22.695787,18.625582,0.272091,-0.000213,0.000309,0.100591,...,4.759444,4.869503,0.0,1.139263,2.812087,2.119642,2.189682,1.850100,2.426522,0.832751
1,0.0,0.0,0.0,0.0,21.947912,16.623775,0.243792,0.001016,-0.000371,0.108175,...,4.742658,4.879926,0.0,0.977513,2.890190,2.073293,2.111933,1.836235,2.352109,0.862800
2,0.0,0.0,0.0,0.0,20.939528,16.742744,0.309033,0.001309,-0.000341,0.112645,...,4.753003,4.859079,0.0,0.977513,2.890190,2.073293,2.111933,1.836235,2.352109,0.747117
3,0.0,0.0,0.0,0.0,20.473826,17.877162,0.285551,0.002121,-0.001293,0.076965,...,4.763349,4.848656,0.0,0.977513,2.890190,2.073293,2.111933,1.836235,2.352109,0.805151
4,0.0,0.0,0.0,0.0,19.120524,18.802000,0.358416,0.000625,-0.001883,0.085712,...,4.773694,4.869503,0.0,0.977513,2.890190,2.073293,2.111933,1.836235,2.352109,0.814062
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2066,0.0,0.0,0.0,0.0,20.291925,15.661926,0.243660,0.001402,-0.001384,0.094177,...,4.780135,4.869503,0.0,0.977513,2.890190,2.073293,2.111933,1.836235,2.352109,0.779430
2067,0.0,0.0,0.0,0.0,22.869233,21.950749,0.402449,0.005721,-0.000701,0.119901,...,4.742658,4.869503,0.0,0.977513,2.890190,2.073293,2.111933,1.836235,2.352109,0.760852
2068,0.0,0.0,0.0,0.0,22.298377,16.609165,0.260880,0.000234,-0.000386,0.107310,...,4.732312,4.848656,0.0,0.977513,2.890190,2.073293,2.111933,1.836235,2.352109,0.834018
2069,0.0,0.0,0.0,0.0,23.337640,16.495882,0.199342,-0.000490,0.000390,0.110293,...,4.794385,4.890349,0.0,1.324225,3.472211,2.576634,2.651479,2.327249,2.862273,0.793909


Вторая часть тренировочного набора данных

In [None]:
# Second part of the training set: extract its features and save them together
# with the inputs they were computed from.
# NOTE(review): X_Train is not re-sliced in this cell; unless it was rebound to
# the second half beforehand, this repeats the work of the first part under a
# different file name - confirm.
GetFeatures(X_Train, '/content/drive/MyDrive/Ftr_train_100_2.npy')
np.save('/content/drive/MyDrive/Train_100_2.npy', X_Train)

Этот блок не используется в данной разработке, но на всякий случай я его сохранил: он статистическими методами отбрасывает лишние, с его точки зрения, признаки. В нашей задаче сработало плохо.

In [None]:
# Optional experiment: statistical feature selection with tsfresh. Collects
# the union of the columns that select_features keeps for each class.
# NOTE(review): `random` is not imported in this cell; whether seeding it makes
# train_test_split reproducible depends on what the name is bound to - confirm,
# or pass random_state explicitly.
random.seed(0)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=.2)

# The frame does not depend on the class label, so build it once instead of on
# every loop iteration.
DF = pd.DataFrame(X_train)

relevant_features = set()

# assumes the class labels in column 1 of Y are the integers 0..6 - TODO confirm
for label in range(7):
    # select_features works with binary classification, so turn the task into
    # a one-vs-rest problem for each class and repeat over all classes.
    y_train_binary = y_train[:,1] == label
    X_train_filtered = select_features(DF, y_train_binary)
    relevant_features.update(X_train_filtered.columns)

# A bare len(...) in the middle of a cell is never displayed; print it instead.
print(len(relevant_features))

# np.save receives a Python set here, which NumPy can only store as a pickled
# object array; reading it back needs np.load(..., allow_pickle=True).
np.save('/content/drive/MyDrive/Xfeatures.npy', relevant_features)