In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
data_root = '/media/scottcha/E1/Data/OAPMLData'

ml_path = data_root + '/5.MLData'
num_features = 978
interpolation = 1

In [3]:
from tsai import utils as tsai_utils
import numpy as np
import pandas as pd
from tsfresh import extract_relevant_features, extract_features

In [4]:
interesting_labels =  ['Day1DangerAboveTreeline',
                         'Day1DangerBelowTreeline',
                         'Day1DangerNearTreeline']

In [10]:
l = interesting_labels[0]
print('On: ' + l)
label = l
file_label = 'co_' + l + '_small'
fname = ml_path + '/X_all_' + file_label + '.npy'


        
X = np.load(fname, mmap_mode='r')
X = X[0:5000,:,:]

On: Day1DangerAboveTreeline


In [11]:
X.shape

(5000, 978, 180)

In [12]:
from pandas.api.types import CategoricalDtype
#from tsai.all import *
from joblib import Parallel, delayed
import os.path
import numpy as np

# Cell
class TSAIUtilities:
    def __init__(self, X, label):
        self.X = X
        self.num_features = X.shape[1]
        self.label = label

    def _calculate_feature_mean(self, feature_index, num_samples_to_use=5000):
        return np.nanmean(self.X[0:num_samples_to_use,feature_index,:])

    def _calculate_feature_std(self, feature_index, num_samples_to_use=5000):
        return np.nanstd(self.X[0:num_samples_to_use,feature_index,:])

    def get_feature_means(self, from_cache=None):
        if not os.path.isfile(from_cache):
            feature_means = Parallel(n_jobs=4)(map(delayed(self._calculate_feature_mean), range(0,self.num_features)))

            if from_cache is not None:
                np.save(from_cache, np.asarray(feature_means))
        else:
            feature_means = np.load(from_cache)

        return feature_means

    def get_feature_std(self, from_cache=None):
        if not os.path.isfile(from_cache):
            feature_std = Parallel(n_jobs=4)(map(delayed(self._calculate_feature_std), range(0,self.num_features)))

            if from_cache is not None:
                np.save(from_cache, np.asarray(feature_std))

        else:
            feature_std = np.load(from_cache)

        return feature_std

    def get_y_as_cat(self, y_df):
        #convert the labels to encoded values
        labels = y_df[self.label].unique()
        if 'Low' in labels:
            labels = ['Low', 'Moderate', 'Considerable', 'High']
        else:
            labels.sort()
        cat_type = CategoricalDtype(categories=labels, ordered=True)
        y_df[self.label + '_Cat'] = y_df[self.label].astype(cat_type)
        y = y_df[self.label + '_Cat'].cat.codes.values

        cat_dict = dict( enumerate(y_df[self.label + '_Cat'].cat.categories ) )
        return y, cat_dict


In [13]:
oap_utils = TSAIUtilities(X, label)

In [14]:
feature_means = oap_utils.get_feature_means('./feature_means.csv')
feature_std = oap_utils.get_feature_std('./feature_std.csv')

In [15]:
np.isnan(feature_std).any()

False

In [16]:
X.shape

(5000, 978, 180)

In [17]:
X_std = np.full_like(X, 0)

In [18]:
for i in range(X.shape[1]):
    X_tmp = np.nan_to_num(X[:,i,:], nan=feature_means[i])
    
    X_std[:,i,:] = (X_tmp - feature_means[i])/(feature_std[i])

  X_std[:,i,:] = (X_tmp - feature_means[i])/(feature_std[i])


In [19]:
feature_names = pd.read_csv(ml_path + '/FeatureLabels_co_Day1DangerAboveTreeline_small.csv').sort_values(['0'])

In [20]:
feature_names = feature_names['0'].reset_index(drop=True)

In [21]:
feature_names = feature_names.str.replace('__', '_')

In [22]:
feature_names = pd.concat([pd.Series(['id']), feature_names])

In [23]:
X_df = tsai_utils.to_tsfresh_df(X_std)

In [24]:
X_df.columns = feature_names

In [25]:
X_df.head()

Unnamed: 0,id,ABSV_1000mb_avg,ABSV_1000mb_max,ABSV_1000mb_min,ABSV_100mb_avg,ABSV_100mb_max,ABSV_100mb_min,ABSV_10mb_avg,ABSV_10mb_max,ABSV_10mb_min,...,VWSH_PV_EQ_M2eM06_Km_2_kg_s_surface_min,VWSH_tropopause_avg,VWSH_tropopause_max,VWSH_tropopause_min,WEASD_surface_avg,WEASD_surface_max,WEASD_surface_min,WILT_surface_avg,WILT_surface_max,WILT_surface_min
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
i = 0
y_train_df = pd.read_parquet(ml_path + '/y_train_batch_' + str(i) + '_' + file_label + '.parquet')  
y_test_df = pd.read_parquet(ml_path + '/y_test_batch_' + str(i) + '_' + file_label + '.parquet')  
y_df = pd.concat([y_train_df, y_test_df]).reset_index(drop=True)
y, cat_dict = oap_utils.get_y_as_cat(y_df)
print(cat_dict)

{0: 'Low', 1: 'Moderate', 2: 'Considerable', 3: 'High'}


In [27]:
y = y[:5000]

In [28]:
y.shape

(5000,)

In [29]:
y

array([1, 2, 2, ..., 1, 1, 0], dtype=int8)

In [30]:
(np.isnan(X_df).any()).any()

True

In [31]:
X_df.fillna(0, inplace=True)

In [32]:
len(X_df['id'].unique())

5000

In [33]:
len(y)

5000

In [16]:
#means_fn = ml_path + '/feature_means_interpolation' + str(interpolation) + '_' + file_label + 'x.npy'
#feature_means = utils.get_feature_means(from_cache=means_fn)

In [None]:
#fill_values = torch.zeros_like(TSTensor(X))
#for i in range(0,X.shape[1]):
#    fill_values[:,i,:] = torch.full_like(TSTensor(X[:,i,:]), feature_means[i])
        
#X_noNan = torch.where(torch.isnan(TSTensor(X)), fill_values, TSTensor(X))

In [None]:
#from tsfresh import extract_relevant_features

#features_filtered_direct = extract_relevant_features(X_df, pd.Series(y),
#                                                     column_id='id', n_jobs=62)

Feature Extraction:   0%|          | 0/310 [00:00<?, ?it/s]

In [19]:
#features_filtered_direct.to_csv(ml_path + '/tsfresh_features_0.csv')

In [25]:
#features_filtered_direct.head()

Unnamed: 0,16__last_location_of_minimum,16__first_location_of_minimum,"16__agg_linear_trend__attr_""slope""__chunk_len_10__f_agg_""min""",8__mean_change,"8__change_quantiles__f_agg_""mean""__isabs_False__qh_1.0__ql_0.0",26__index_mass_quantile__q_0.2,"24__agg_linear_trend__attr_""slope""__chunk_len_10__f_agg_""min""","25__agg_linear_trend__attr_""slope""__chunk_len_10__f_agg_""min""",15__last_location_of_minimum,8__time_reversal_asymmetry_statistic__lag_1,...,14__ar_coefficient__coeff_4__k_10,"16__change_quantiles__f_agg_""mean""__isabs_True__qh_0.8__ql_0.0","4__fft_coefficient__attr_""real""__coeff_4","12__change_quantiles__f_agg_""mean""__isabs_False__qh_1.0__ql_0.8","10__cwt_coefficients__coeff_6__w_20__widths_(2, 5, 10, 20)",4__quantile__q_0.8,19__symmetry_looking__r_0.05,24__symmetry_looking__r_0.30000000000000004,0__has_duplicate,22__has_duplicate
0,0.533333,0.0,4.1e-05,3e-06,3e-06,0.633333,4.2e-05,4.4e-05,0.533333,1.519479e-14,...,0.966928,2e-06,-0.000218,-2e-06,5.4e-05,0.000115,0.0,0.0,1.0,1.0
1,0.466667,0.0,4.2e-05,4e-06,4e-06,0.6,3.9e-05,4.5e-05,0.466667,5.949558e-14,...,0.069985,2e-06,-2.2e-05,1.2e-05,8.7e-05,0.000124,0.0,1.0,1.0,1.0
2,0.433333,0.0,4.2e-05,3e-06,3e-06,0.566667,4.3e-05,4.5e-05,0.433333,3.779092e-14,...,0.705611,3e-06,-8e-06,9e-06,0.000108,0.000119,0.0,0.0,1.0,1.0
3,0.433333,0.0,4.5e-05,3e-06,3e-06,0.566667,4.3e-05,4.6e-05,0.433333,3.671881e-14,...,1.064174,2e-06,8e-06,2.6e-05,0.000112,0.000124,0.0,0.0,1.0,1.0
4,0.433333,0.0,4.1e-05,3e-06,3e-06,0.566667,4.1e-05,4.2e-05,0.433333,3.276297e-14,...,0.172475,6e-06,0.000125,3e-06,0.000106,0.000127,0.0,1.0,1.0,1.0


In [36]:
num_features = len(feature_names)-1
num_to_compute = 10

In [37]:
for i in range(421, 431, num_to_compute):
    print('On ' + str(i))
    cols = list(['id'])
    cols.extend(list(feature_names.iloc[i:i+num_to_compute].values))
    features_filtered_direct = extract_relevant_features(X_df.loc[:, cols], pd.Series(y),
                                                         column_id='id', 
                                                         profile=True, profiling_filename='./profile.txt', 
                                                         n_jobs=12)
    features_filtered_direct.to_csv(ml_path + '/tsfresh_features_' + str(i) + '_' + file_label + '.csv')

On 421


Feature Extraction: 100%|██████████| 60/60 [09:29<00:00,  9.48s/it]  


In [7]:
features_filtered_direct.head()

NameError: name 'features_filtered_direct' is not defined

In [28]:
ts_features_df = pd.read_csv(ml_path + '/tsfresh_features_1_co_Day1DangerAboveTreeline_small.csv', index_col=0)

In [30]:
import tsfresh
kind_to_fc_parameters = tsfresh.feature_extraction.settings.from_columns(ts_features_df)

In [39]:
len(list(kind_to_fc_parameters.keys()))

50

In [41]:
len(cols)

51

In [45]:
num_features = len(feature_names)-1
for i in range(1, 51, 50):
    print('On ' + str(i))
    cols = list(['id'])
    cols.extend(list(feature_names.iloc[i:i+50].values))
    features_filtered_direct = extract_features(X_df.loc[:, cols], 
                                                column_id='id', 
                                                kind_to_fc_parameters=kind_to_fc_parameters,
                                                n_jobs=14)


On 1


Feature Extraction: 100%|██████████| 70/70 [06:43<00:00,  5.77s/it] 


In [48]:
features_filtered_direct.to_csv(ml_path + '/tsfresh_features_1_co_Day1DangerAboveTreeline_small_test.csv')

In [None]:
extract_features(df, default_fc_parameters=settings)

In [None]:
num_features = len(feature_names)-1
for i in range(1, 901, 5):
    print('On ' + str(i))
    cols = list(['id'])
    cols.extend(list(feature_names.iloc[i:i+5].values))
    features_filtered_direct = extract_relevant_features(X_df.loc[:, cols], pd.Series(y),
                                                     column_id='id', n_jobs=14)
    features_filtered_direct.to_csv(ml_path + '/tsfresh_features_' + str(i) + '_' + file_label + '.csv')