In [22]:
import numpy as np
import pandas as pd 
from glob import glob
from os import path
import matplotlib.pyplot as plt
from torch.utils.data.dataset import Dataset
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn import preprocessing
from torch.autograd import Variable
import torch.utils.data as utils
import tensorflow as tf

In [23]:
# Location of the single training CSV and of the folder holding the test CSVs.
train_file = r'data/train/train.csv'
test_path = r'data/test'
# glob() returns matches in arbitrary, filesystem-dependent order; sorting makes
# the list of test files reproducible across machines and kernel restarts.
test_files = sorted(glob(path.join(test_path, '*.csv')))

In [24]:
def create_numpy_files(train_csv_file, output_folder, chunksize=10000000):
    """Convert the training CSV into two cached ``.npy`` arrays.

    The CSV is read in chunks; its ``acoustic_data`` column is stored as int16 in
    ``<output_folder>/train_acoustic_data.npy`` and its ``time_to_failure`` column
    as float32 in ``<output_folder>/train_time_to_failure.npy``.

    Args:
        train_csv_file: Path of a CSV file with ``acoustic_data`` and
            ``time_to_failure`` columns.
        output_folder: Folder that receives the two ``.npy`` files. It is created
            if it does not exist yet.
        chunksize: Number of CSV rows parsed per chunk.

    Returns:
        None. If both output files already exist, nothing is read or written.
    """
    import os  # local import: the notebook itself only imports ``path`` from ``os``

    acoustic_data_filepath = path.join(output_folder, 'train_acoustic_data')
    ttf_filepath = path.join(output_folder, 'train_time_to_failure')

    # np.save appends '.npy' to a file name that lacks it, so that is what ends
    # up on disk. Both files must be present for the cache to count as complete;
    # checking only one would leave a half-written cache in place forever.
    cached_files = [p + '.npy' for p in (acoustic_data_filepath, ttf_filepath)]
    if all(path.exists(p) for p in cached_files):
        print('Numpy array exists, skipping')
        return

    # Create the target folder up front so a missing directory is not discovered
    # only after the whole CSV has been parsed.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    acoustic_data = list()
    time_to_failure = list()
    for chunk in pd.read_csv(train_csv_file, chunksize=chunksize):
        # Down-cast each chunk immediately to keep the accumulated lists small.
        acoustic_data.append(chunk['acoustic_data'].values.astype(np.int16))
        time_to_failure.append(chunk['time_to_failure'].values.astype(np.float32))

    np.save(acoustic_data_filepath, np.concatenate(acoustic_data))
    np.save(ttf_filepath, np.concatenate(time_to_failure))
        
    

In [25]:
# Build the .npy cache under 'data' from the training CSV (the function returns
# early when the cache is already there).
create_numpy_files(train_file, 'data')

Numpy array exists, skipping


In [28]:
def load_train_data_from_numpy_files(folder):
    """Load the two cached training arrays stored in ``folder``.

    Args:
        folder: Directory containing ``train_acoustic_data.npy`` and
            ``train_time_to_failure.npy``.

    Returns:
        Tuple ``(acoustic_data, time_to_failure)`` of the arrays read from those
        two files, in that order.
    """
    file_names = ('train_acoustic_data.npy', 'train_time_to_failure.npy')
    signal, countdown = (np.load(path.join(folder, name)) for name in file_names)
    return signal, countdown

In [29]:
# Read both cached arrays back from the 'data' folder.
acoustic_data, time_to_failure = load_train_data_from_numpy_files(folder='data')

In [30]:
# Cast the signal to float32 and give it a trailing feature axis, i.e. shape
# (n_samples, 1).
# reshape(-1, 1) is used instead of expand_dims so that re-running this cell is
# harmless: expand_dims would add another axis on every run ((N, 1, 1), ...).
# copy=False likewise skips a redundant copy once the array is already float32.
acoustic_data = acoustic_data.astype(np.float32, copy=False).reshape(-1, 1)

In [31]:
# ends_mask[i] is True when time_to_failure rises from sample i to sample i + 1,
# i.e. where the countdown jumps back up; segment_ends holds those indices i.
ends_mask = time_to_failure[:-1] < time_to_failure[1:]
segment_ends = ends_mask.nonzero()[0]

In [32]:
# Turn the boundary indices into half-open (start, end) ranges usable as slices.
# Each entry e of segment_ends is the LAST sample before time_to_failure jumps
# back up, so that segment must end at e + 1 (exclusive) and the next one must
# start at e + 1. Using e for both would drop the final sample of every segment
# and prepend it to the following one.
# The stretch after the last boundary is kept as a segment too; without it the
# tail of the recording would be lost, and a recording with no boundary at all
# would yield no segments.
boundaries = [0] + [int(e) + 1 for e in segment_ends] + [len(time_to_failure)]
segments = [
    (seg_start, seg_end)
    for seg_start, seg_end in zip(boundaries[:-1], boundaries[1:])
    if seg_end > seg_start
]


In [33]:
# Index of the first held-out segment: the leading 80% of the segments
# (rounded down) are used for training.
segments_split_idx = int(len(segments) * 0.8)

In [48]:
# Split in original order: everything before the split index trains, the rest
# is held out.
train_segments, test_segments = (
    segments[:segments_split_idx],
    segments[segments_split_idx:],
)

In [52]:
# Standardisation scaler; its statistics are accumulated from the training
# segments in the following cell. copy=False asks scikit-learn to work in place
# where it can instead of copying its input, so transforming an array with this
# scaler may overwrite that array.
scaler = preprocessing.StandardScaler(copy=False)

In [67]:
# Feed the scaler one training segment at a time; the held-out segments are
# never shown to it.
for se in train_segments:
    start, end = se
    scaler.partial_fit(acoustic_data[start:end])

In [53]:
# Number of consecutive samples in one window returned by LANLDataset.
SEGMENT_SIZE = 150000
# Passed to LANLDataset as step_size: the offset, in samples, between the starts
# of consecutive windows.
step_size = 1000

In [88]:
# Total number of samples in the signal (length of the first axis).
len(acoustic_data)

629145480

In [89]:
# Two independent index arrays 0 .. n_samples - 1. They are passed to
# LANLDataset below in place of the signal and the targets, so every returned
# window shows exactly which offsets were read.
# NOTE(review): with a 64-bit integer dtype and the sample count displayed above,
# each array takes roughly 5 GB -- confirm this is acceptable.
T, W = (np.arange(acoustic_data.shape[0]) for _ in range(2))

In [99]:
class LANLDataset(Dataset):
    """Map-style dataset of fixed-length sliding windows over signal segments.

    Every ``(start, end)`` entry of ``segments`` is a half-open index range into
    ``X`` and ``y``. Within a segment, windows of ``segment_size`` samples start
    every ``step_size`` samples; if the last step would run past the segment end,
    one final window aligned with the segment end is used instead, so the last
    samples of each segment are always covered. Segments shorter than one window
    contribute no items.

    Items are addressed purely by index and the dataset keeps no read cursor, so
    it can be shuffled, indexed repeatedly, and used from several DataLoader
    workers without windows being skipped or duplicated.

    Args:
        X: Indexable signal; ``X[a:b]`` must return the samples in ``[a, b)``.
        y: Indexable targets aligned with ``X``.
        segments: Sequence of ``(start, end)`` index pairs, ``end`` exclusive.
        step_size: Positive offset between the starts of consecutive windows.
        scaler: Optional object with a ``transform`` method (for example an
            already fitted scikit-learn scaler) applied to every data window.
        segment_size: Window length in samples. Defaults to the notebook-level
            ``SEGMENT_SIZE`` constant when omitted.

    Raises:
        ValueError: If ``step_size`` or the window length is not positive.
    """

    def __init__(self, X, y, segments, step_size, scaler=None, segment_size=None):
        if step_size <= 0:
            raise ValueError('step_size must be a positive integer')
        self.X = X
        self.y = y
        self.segments = segments
        self.num_segments = len(segments)
        self.step_size = step_size
        self.scaler = scaler
        # The global is looked up only when no explicit window length is given.
        self.segment_size = SEGMENT_SIZE if segment_size is None else segment_size
        if self.segment_size <= 0:
            raise ValueError('segment_size must be a positive integer')
        self.len = 0
        self._calculate_len()

    def _windows_in_segment(self, start, end):
        """Return how many windows the segment ``[start, end)`` yields."""
        spare = int(end) - int(start) - self.segment_size
        if spare < 0:
            # Segment is shorter than a single window.
            return 0
        # ceil(spare / step_size) stepped starts past the first window; the last
        # of them is clamped to the segment end in __getitem__.
        return -(-spare // self.step_size) + 1

    def _calculate_len(self):
        """Compute per-segment window counts and the total dataset length."""
        # Use the segments handed to this instance, not a notebook global.
        counts = [self._windows_in_segment(s, e) for s, e in self.segments]
        # _cumulative_windows[i] = number of windows in segments 0..i inclusive.
        self._cumulative_windows = np.cumsum(counts, dtype=np.int64)
        self.len = int(self._cumulative_windows[-1]) if counts else 0

    def __len__(self):
        return self.len

    def __getitem__(self, idx):
        """Return ``(data, targets)`` for window number ``idx``.

        ``data`` is ``X[a:b]`` (passed through ``scaler.transform`` when a scaler
        is set) and ``targets`` is ``y[a:b]`` for the same range, i.e. one target
        per sample of the window. Negative indices count from the end.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        idx = int(idx)
        if idx < 0:
            idx += self.len
        if not 0 <= idx < self.len:
            raise IndexError('LANLDataset index out of range')

        # First segment whose cumulative count exceeds idx; segments that yield
        # no windows are skipped automatically.
        si = int(np.searchsorted(self._cumulative_windows, idx, side='right'))
        windows_before = int(self._cumulative_windows[si - 1]) if si > 0 else 0
        start, end = self.segments[si]

        # Clamp the last window so it ends exactly at the segment end.
        start_offset = min(
            int(start) + (idx - windows_before) * self.step_size,
            int(end) - self.segment_size,
        )
        end_offset = start_offset + self.segment_size

        data = self.X[start_offset:end_offset]
        targets = self.y[start_offset:end_offset]
        if self.scaler is not None:
            # transform(), not fit(): fit() would refit on this one window and
            # return the scaler object instead of data. The window is copied
            # first because a scaler configured for in-place operation would
            # otherwise overwrite the shared backing array through the slice.
            data = self.scaler.transform(np.array(data))

        return data, targets
        
            
            
            
        
        

In [100]:
# Dataset over the training segments, fed with the index arrays T and W rather
# than the real signal and targets; no scaler is passed.
lanl_dataset = LANLDataset(X=T, y=W, segments=train_segments, step_size=step_size)

In [101]:
# Batches of four windows, read in dataset order by two worker processes.
dataloader = utils.DataLoader(
    dataset=lanl_dataset,
    batch_size=4,
    shuffle=False,
    num_workers=2,
)