In [1]:
import numpy as np
import pandas as pd 
from glob import glob
from os import path
import matplotlib.pyplot as plt
from torch.utils.data.dataset import Dataset
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn import preprocessing
from torch.autograd import Variable
import torch.utils.data as utils
import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [2]:
# Input locations, relative to the notebook's working directory.
train_file = r'data/train/train.csv'
test_path = r'data/test'
# Every CSV directly inside `test_path`; glob order is not guaranteed to be sorted.
test_files = glob(path.join(test_path, '*.csv'))

In [3]:
def create_numpy_files(train_csv_file, output_folder, chunksize=10000000):
    """Convert the training CSV into two cached ``.npy`` arrays.

    The CSV is read in chunks. The ``acoustic_data`` column is stored as
    int16 in ``train_acoustic_data.npy`` and the ``time_to_failure`` column
    as float32 in ``train_time_to_failure.npy``, both inside
    ``output_folder``.

    Parameters
    ----------
    train_csv_file : str
        Path to a CSV with ``acoustic_data`` and ``time_to_failure`` columns.
    output_folder : str
        Directory that receives the two ``.npy`` files. It is created when
        it does not exist yet.
    chunksize : int, optional
        Number of CSV rows parsed per chunk (default 10,000,000).

    Returns
    -------
    None
        The conversion is skipped (with a printed notice) when both output
        files are already present.
    """
    import os  # local import: the notebook only imports `path` from os

    acoustic_data_filepath = path.join(output_folder, 'train_acoustic_data')
    ttf_filepath = path.join(output_folder, 'train_time_to_failure')

    # np.save appends '.npy' to these base names. Skip only when BOTH files
    # exist, so a half-written cache is regenerated instead of being trusted.
    if all(path.exists(base + '.npy')
           for base in (acoustic_data_filepath, ttf_filepath)):
        print('Numpy array exists, skipping')
        return

    # Create the destination up front so a missing folder does not surface
    # only after the whole CSV has been parsed.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    acoustic_data = []
    time_to_failure = []
    for chunk in pd.read_csv(train_csv_file, chunksize=chunksize):
        # Downcast per chunk to keep the in-memory footprint small.
        acoustic_data.append(chunk['acoustic_data'].values.astype(np.int16))
        time_to_failure.append(chunk['time_to_failure'].values.astype(np.float32))

    np.save(acoustic_data_filepath, np.concatenate(acoustic_data))
    np.save(ttf_filepath, np.concatenate(time_to_failure))
        
    

In [4]:
# Convert the training CSV to .npy files under 'data'; the function prints a
# notice and returns early when the converted array is already on disk.
create_numpy_files(train_csv_file=train_file, output_folder='data')

Numpy array exists, skipping


In [5]:
def strided_app(a, L, S):
    """Return sliding windows over the first axis of ``a`` as a strided view.

    Parameters
    ----------
    a : numpy.ndarray
        1-D array of shape ``(n,)`` or 2-D array of shape ``(n, ncols)``.
    L : int
        Window length (number of rows per window).
    S : int
        Step between the starts of consecutive windows.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(nrows, L, ncols)`` where
        ``nrows = (n - L) // S + 1``; ``ncols`` is 1 for 1-D input.
        The result is built with ``as_strided`` and shares memory with
        ``a``, so overlapping windows alias the same elements; do not write
        to it.
    """
    nrows = ((a.shape[0] - L) // S) + 1

    n = a.strides[0]
    if len(a.shape) > 1:
        ncols = a.shape[1]
        strides = (S * n, n, a.strides[1])
    else:
        ncols = 1
        # The trailing axis has length 1, so its stride is never used to
        # step; any valid value works. `a.strides[1]` does not exist here.
        strides = (S * n, n, n)
    shape = (nrows, L, ncols)

    # Use the strides computed above so the 1-D branch no longer raises
    # IndexError on `a.strides[1]`.
    return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)

In [6]:
def get_subsequences(x, window_size, step=1):
    """Yield sliding windows over the sequence axis of a batch, one at a time.

    Parameters
    ----------
    x : numpy.ndarray
        Batch of sequences shaped ``(batch_size, seq_len, features)`` or
        ``(batch_size, seq_len)``; the latter is treated as one feature.
    window_size : int
        Length of each window along the sequence axis; must not exceed
        ``seq_len``.
    step : int, optional
        Offset between the starts of consecutive windows (default 1).

    Yields
    ------
    numpy.ndarray
        float64 array of shape ``(batch_size, window_size, features)``
        holding the window that starts at ``i * step``, for
        ``i = 0 .. (seq_len - window_size) // step``.

    Raises
    ------
    AssertionError
        On first iteration, if ``window_size > seq_len``.
    """
    batch_size = x.shape[0]
    seq_len = x.shape[1]
    num_features = x.shape[2] if len(x.shape) > 2 else 1

    assert(window_size <= seq_len)
    nrows = ((seq_len - window_size) // step) + 1

    # Build each window on demand instead of materialising all
    # (batch_size, nrows, window_size, features) values up front, which is
    # prohibitively large for long sequences.
    for i in range(nrows):
        start = i * step
        window = np.array(x[:, start:start + window_size], dtype=np.float64)
        # The reshape adds the trailing feature axis for 2-D input and is a
        # no-op for 3-D input.
        yield window.reshape(batch_size, window_size, num_features)

In [7]:
def load_train_data_from_numpy_files(folder):
    """Load the cached training arrays stored in ``folder``.

    Reads ``train_acoustic_data.npy`` and ``train_time_to_failure.npy`` from
    ``folder`` and returns them as a ``(acoustic_data, time_to_failure)``
    tuple.
    """
    filenames = ('train_acoustic_data.npy', 'train_time_to_failure.npy')
    signal, ttf = [np.load(path.join(folder, fname)) for fname in filenames]
    return signal, ttf

In [8]:
# Load both cached training arrays from the 'data' folder.
acoustic_data, time_to_failure = load_train_data_from_numpy_files('data')

In [9]:
# Cast the signal to float32 before scaling (create_numpy_files stores it as int16).
acoustic_data = acoustic_data.astype(np.float32)

In [10]:
# Add a trailing axis so the signal is a single-feature 2-D array, as the
# scaler expects.
# NOTE(review): this cell is not idempotent; re-running it adds another axis.
acoustic_data = np.expand_dims(acoustic_data, axis=1)
# copy=False asks the scaler to standardise in place rather than on a copy.
# NOTE(review): the scaler is fit on the full signal before the train/test
# split made later in the notebook, so the held-out segments influence the
# scaling statistics. Confirm this is intended.
scaler = preprocessing.StandardScaler(copy=False)
acoustic_data = scaler.fit_transform(acoustic_data)


In [18]:
# Sanity check: expect (n_samples, 1) after the expand_dims step.
acoustic_data.shape

(629145480, 1)

In [24]:
# Number of consecutive samples per segment.
SEGMENT_SIZE = 165000
# Samples left over after the last full segment; these are dropped.
remainder = time_to_failure.shape[0] % SEGMENT_SIZE
# Slice by an explicit end index rather than `[:-remainder]`: when the length
# is an exact multiple of SEGMENT_SIZE, `[:-0]` would produce an empty array.
num_usable = time_to_failure.shape[0] - remainder
X = acoustic_data[:num_usable]
y = time_to_failure[:num_usable]
# Shape both arrays as (n_segments, SEGMENT_SIZE, 1).
X_segments = np.reshape(X, (-1, SEGMENT_SIZE, 1))
y_segments = np.reshape(y, (-1, SEGMENT_SIZE, 1))

In [28]:
# Index of the first held-out segment: the leading 80% of segments (rounded
# down) are used for training.
split_idx = int(X_segments.shape[0]*0.8)

In [29]:
# Ordered hold-out split without shuffling: segments before `split_idx` are
# used for training, the rest for testing.
X_train, X_test = X_segments[:split_idx], X_segments[split_idx:]
y_train, y_test = y_segments[:split_idx], y_segments[split_idx:]

In [34]:
# Training hyperparameters; presumably consumed by a training loop further
# down the notebook (not visible here).
batch_size = 32
num_epochs = 1