In [1]:
import numpy as np
import pandas as pd 
from glob import glob
from os import path
import matplotlib.pyplot as plt
from torch.utils.data.dataset import Dataset
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn import preprocessing
from torch.autograd import Variable
import torch.utils.data as utils
import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [2]:
train_file = r'data/train/train.csv'
test_path = r'data/test'
test_files = glob(path.join(test_path, '*.csv'))

In [3]:
def create_numpy_files(train_csv_file, output_folder):
    if path.exists(path.join(output_folder, 'train_acoustic_data.npy')):
        print('Numpy array exists, skipping')
        return
    
    train_df_chunked = pd.read_csv(train_csv_file, chunksize=10000000)
    acoustic_data_filepath = path.join(output_folder, 'train_acoustic_data')
    
    ttf_filepath = path.join(output_folder, 'train_time_to_failure')
    acoustic_data = list()
    time_to_failure = list()

    for chunk in train_df_chunked:
        a = chunk['acoustic_data'].values
        t = chunk['time_to_failure'].values
        acoustic_data.append(a.astype(np.int16))
        time_to_failure.append(t.astype(np.float32))
    
    acoustic_data = np.concatenate(acoustic_data)
    time_to_failure = np.concatenate(time_to_failure)
    np.save(acoustic_data_filepath, acoustic_data)
    np.save(ttf_filepath, time_to_failure)
        
    

In [67]:
create_numpy_files(train_csv_file=train_file, output_folder='data')

Numpy array exists, skipping


In [68]:
def load_train_data_from_numpy_files(folder):
    acoustic_data = np.load(path.join(folder, 'train_acoustic_data.npy'))
    time_to_failure = np.load(path.join(folder, 'train_time_to_failure.npy'))
    return acoustic_data, time_to_failure

In [69]:
acoustic_data, time_to_failure = load_train_data_from_numpy_files('data')

In [70]:
acoustic_data = np.expand_dims(acoustic_data, axis=1)
time_to_failure = np.expand_dims(time_to_failure, axis=1)

In [71]:
SEGMENT_SIZE = 150000
step_size = 5000

In [72]:
ends_mask = np.less(time_to_failure[:-1], time_to_failure[1:])
segment_ends = np.nonzero(ends_mask)[0]

In [73]:
del ends_mask

In [74]:
segments = list()
start = 0
for e in segment_ends:
    segments.append((start, e))
    start = e


In [75]:
segments_split_idx = int(0.8*len(segments))

In [76]:
train_segments = segments[:segments_split_idx]
test_segments = segments[segments_split_idx:]

In [77]:
scaler = preprocessing.StandardScaler(copy=False)

In [78]:
for se in train_segments:
    start = se[0]
    end = se[1]
    scaler.partial_fit(acoustic_data[start:end])



In [79]:
class LANLDataset(Dataset):
    def __init__(self, X, y, segments, step_size, segment_size=SEGMENT_SIZE, scaler=None):
        self.X = X 
        self.y = y 
        self.segments = segments 
        self.segment_idx = 0
        self.step_size = step_size 
        self.segment_size = segment_size
        self.scaler = scaler        
        self._calculate_len()
        self._assign_iterator()

        
    def _calculate_len(self):   
        self.len = 0
        for s in segments:
            start = s[0]
            end = s[1]
            assert(end > start + self.segment_size)
            L = len(range(start + self.segment_size, end, self.step_size)) + 1
            self.len += L
            
    def _assign_iterator(self):
        idx = (self.segment_idx + 1) % len(self.segments)
        start = self.segments[idx][0] + self.segment_size
        end = self.segments[idx][1]
        self.range = range(start, end, self.step_size)
        self.iterator = iter(self.range)
        
            
    def __len__(self):
        return self.len        
    
    def __getitem__(self, idx):
        try:
            end_offset = next(self.iterator)            
        except StopIteration:
            end_offset = self.range.stop                                        
            self._assign_iterator()
                
        start_offset = end_offset - self.segment_size            
            
        data = self.X[start_offset:end_offset].astype(np.float32)
        targets = self.y[end_offset]
        if self.scaler is not None:
            data = self.scaler.transform(data, copy=True)
        
        return data, targets
            
        
            
            
            
        
        

In [80]:
batch_size=4

In [81]:
def test_lanl_dataset(acoustic_data, time_to_failure, segments):    
    dataset = LANLDataset(acoustic_data, time_to_failure, segments, 
                           step_size=step_size, segment_size=SEGMENT_SIZE)

    dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=False, num_workers=0)

    it = dataloader.__iter__()
    for ii,segment in enumerate(segments_):
        start = segment[0]
        end = segment[1]
        stop = False
        for j in range(start, end, step_size):
            xx = acoustic_data[j:j+SEGMENT_SIZE]      
            yy = time_to_failure[j+SEGMENT_SIZE]

            try:
                aa, bb = next(it)
            except StopIteration:
                print('STOP ITERATION ', ii, j)
            if j + SEGMENT_SIZE >= end:
                xx = acoustic_data[end - SEGMENT_SIZE:end]
                yy = time_to_failure[end]
                stop=True

            error = np.sum(xx-aa.numpy()[0])
            if error != 0.0:
                print('ERROR ', j, error)

            if stop:
                break
    try:
        next(it)
    except StopIteration:
        print('done')
        

    
    
    

In [82]:
# test_lanl_dataset(acoustic_data, time_to_failure, segments)

In [83]:
train_segments = [segments[0]]

In [84]:
lanl_train_dataset = LANLDataset(acoustic_data, time_to_failure, train_segments, 
                           step_size=step_size, segment_size=SEGMENT_SIZE, scaler=scaler)

In [85]:
train_dataloader = torch.utils.data.DataLoader(lanl_train_dataset, batch_size=batch_size, shuffle=False, num_workers=0)

In [86]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [87]:
class LANLModel(nn.Module):
    def __init__(self, batch_size, segment_size=SEGMENT_SIZE):
        super(LANLModel, self).__init__()
        self.batch_size = batch_size
        self.segment_size = segment_size
        self.conv1 = nn.Conv1d(1,16,kernel_size=11,stride=7)
        self.pool1 = nn.AvgPool1d(10)
        self.relu = nn.ReLU()
        self.conv2 = nn.Conv1d(self.conv1.out_channels, 32, kernel_size=5, stride=3)
        self.pool2 = nn.AvgPool1d(3)
        self.lstm = nn.LSTM(input_size=237, hidden_size=64, num_layers=1, batch_first=True, bidirectional=True)
        self.linear = nn.Linear(128, 1)
        
        
    def forward(self, x):
        x = x.transpose(-1,-2)
        x = self.conv1(x)
        x = self.pool1(x)
        x = self.relu(x)
        x = self.conv2(x)
        x = self.pool2(x)
        x = self.relu(x)
        x,_ = self.lstm(x)
        
        x = self.linear(x[:,-1])
        return x
        

In [88]:
learning_rate = 1e-3
model = LANLModel(batch_size=batch_size)
loss_fn = torch.nn.MSELoss()

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


In [89]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [90]:
count_parameters(model)

158049

In [91]:
if torch.cuda.is_available():
    model.cuda()

In [92]:
num_epochs = 1

optimizer.zero_grad()
for e in range(num_epochs):
    losses = []
    for ii,(batch_data, batch_label) in enumerate(train_dataloader):    
        batch_data, batch_label = batch_data.to(device), batch_label.to(device)
        
        y_pred = model(batch_data)
        loss = loss_fn(y_pred, batch_label)
        losses.append(loss.item())
        if (ii % 100) == 0:
            print(np.mean(losses))
            losses = []
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        



2.011781930923462
0.07692489345589365
0.0002556336320412811
0.12497828779459269
0.0045576704796447135
0.0008293036422128353
0.15043276524438626
0.0007396474149300047
0.0007766483907471411
0.17401542290463112
0.0007846757121171777
0.00040390212469901596
0.12279064118448786
0.0004999761789804324
0.11673823950821031
0.004220626076485132
0.0004219115480259461
0.1842719793283868
0.0017650774040748728
0.0011698018601782679
0.1661822741459946


KeyboardInterrupt: 