In [1]:
import numpy as np
import pandas as pd 
from glob import glob
from os import path
import matplotlib.pyplot as plt
from torch.utils.data.dataset import Dataset
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn import preprocessing
from torch.autograd import Variable
import torch.utils.data as utils
import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [8]:
train_file = r'data/train/train.csv'
test_path = r'data/test'
test_files = glob(path.join(test_path, '*.csv'))

In [9]:
def create_numpy_files(train_csv_file, output_folder):
    if path.exists(path.join(output_folder, 'train_acoustic_data.npy')):
        print('Numpy array exists, skipping')
        return
    
    train_df_chunked = pd.read_csv(train_csv_file, chunksize=10000000)
    acoustic_data_filepath = path.join(output_folder, 'train_acoustic_data')
    
    ttf_filepath = path.join(output_folder, 'train_time_to_failure')
    acoustic_data = list()
    time_to_failure = list()

    for chunk in train_df_chunked:
        a = chunk['acoustic_data'].values
        t = chunk['time_to_failure'].values
        acoustic_data.append(a.astype(np.int16))
        time_to_failure.append(t.astype(np.float32))
    
    acoustic_data = np.concatenate(acoustic_data)
    time_to_failure = np.concatenate(time_to_failure)
    np.save(acoustic_data_filepath, acoustic_data)
    np.save(ttf_filepath, time_to_failure)
        
    

In [10]:
create_numpy_files(train_csv_file=train_file, output_folder='data')

In [118]:
def strided_app(a, L, S):
    nrows = ((a.shape[0]-L)//S)+1
    
    n = a.strides[0]    
    if len(a.shape) > 1:
        ncols = a.shape[1]        
        strides = (S*n, n, a.strides[1])
    else:
        ncols = 1
        strides = (S*n,n)
    shape = (nrows,L, ncols)
    
    return np.lib.stride_tricks.as_strided(a, shape=shape, strides=(S*n,n, a.strides[1]))

In [162]:
def get_subsequences(x, window_size, step=1):
    # x : (batch_size, seq_len, features)
    # Split a batch of sequence data to generate all the subsequences
    # with a step size of 1 are returned.        
    batch_size = x.shape[0]
    seq_len = x.shape[1]
    if len(x.shape) > 2:
        num_features = x.shape[2]
    else:
        num_features = 1
        
    
    assert(window_size <= seq_len)
    num_steps = (seq_len - window_size + 1)//step
    nrows = ((seq_len - window_size)//step) + 1
    
    batched_subsequences = np.empty((batch_size, nrows, window_size, num_features))
    for i in range(batch_size):
        batched_subsequences[i,:,:] = strided_app(x[i], window_size, step)
    
    for i in range(nrows):
        yield batched_subsequences[:,i]

In [206]:
def load_train_data_from_numpy_files(folder):
    acoustic_data = np.load(path.join(folder, 'train_acoustic_data.npy'))
    time_to_failure = np.load(path.join(folder, 'train_time_to_failure.npy'))
    return acoustic_data, time_to_failure

In [217]:
acoustic_data, time_to_failure = load_train_data_from_numpy_files('data')

In [218]:
acoustic_data = acoustic_data.astype(np.float32)

In [219]:
acoustic_data = np.expand_dims(acoustic_data, axis=1)
scaler = preprocessing.StandardScaler(copy=False)
acoustic_data = scaler.fit_transform(acoustic_data)

In [220]:
SEGMENT_SIZE = 150000
class LANLDataset(Dataset):
    def __init__(self, acoustic_data, time_to_failure, segment_size=SEGMENT_SIZE, split_fraction=0.8):        
        self.acoustic_data = acoustic_data
        self.time_to_failure = time_to_failure
        self.segment_size = segment_size
        
    def __getitem__(self, index):
        a = torch.from_numpy(self.acoustic_data[index : index + self.window_size])
        t = torch.from_numpy(self.time_to_failure[index : index + self.window_size])
        return a, t

    def __len__(self):
        return self.acoustic_data.size



In [221]:
batch_size = 32

In [71]:
dataset = LANLDataset(acoustic_data, time_to_failure, window_size=10)
dataset_loader = torch.utils.data.DataLoader(dataset=dataset,
                                                    batch_size=batch_size,
                                                    shuffle=False)

In [11]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()        
        num_layers = 16
        self.lstm1 = nn.LSTM(input_size=1, hidden_size=64, num_layers=num_layers, batch_first=True)
        self.fc1 = nn.Linear(64, 1)
        self.num_layers = num_layers
        self.reset_states()
        
    def reset_states(self):
        self.h1 = Variable(torch.zeros(self.num_layers, batch_size, 64, dtype=torch.float32))
        self.c1 = Variable(torch.zeros(self.num_layers, batch_size, 64, dtype=torch.float32))
    
    def forward(self, x):
        lstm_output, (h1, c1) = self.lstm1(x, (self.h1, self.c1))
        self.h1 = Variable(h1)
        self.c1 = Variable(c1)
        y_pred = self.fc1(lstm_output)
        return y_pred.squeeze(-1)
        

In [12]:
net = Net()


In [13]:
learning_rate = 1e-3
num_epochs = 10

In [14]:
loss_fn = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)


In [15]:
hist = list()

In [16]:
A = 5600000
B = 5700000

In [17]:
X = acoustic_data[A:B]
y = time_to_failure[A:B]

In [18]:
dataset = utils.TensorDataset(torch.from_numpy(X), torch.from_numpy(y))
dataloader = utils.DataLoader(dataset, batch_size=batch_size)

In [19]:
hist = list()
    

In [29]:
X.shape

(100000, 1)

In [31]:
yyy = net.forward(torch.from_numpy(X).un)

RuntimeError: input must have 3 dimensions, got 2

In [27]:
for i in range(1):
    net.reset_states()
    
    for j,(batch_data, batch_label) in enumerate(dataloader):
        optimizer.zero_grad()
        
        y_pred = net(batch_data.unsqueeze(-1))

        loss = loss_fn(y_pred, batch_label.unsqueeze(-1))

        hist.append(loss.item())
        loss.backward()

        optimizer.step()
        if j % 100 == 0:
            print(j)
    print('finished')


0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
finished


In [35]:
for i in range(1):
    net.reset_states()
    for (j, (batch_data, batch_label)) in enumerate(dataset_loader):
        if j == SEGMENT_SIZE:
            break
            net.reset_states()
            
        net.zero_grad()
        
        optimizer.zero_grad()
        
        y_pred = net(batch_data)
        
        loss = loss_fn(y_pred, batch_label)
            
        hist.append(loss.item())
        loss.backward()
        
        optimizer.step()
        print(j)


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78


KeyboardInterrupt: 