## Simple EDA to understand the dataset
- What's in our dataset
- What the data looks like

In [1]:
import pandas as pd
import numpy as np

In [31]:
%%time
train = pd.read_csv('./data/train.csv', dtype={'acoustic_data': np.int16, 'time_to_failure': np.float32})
# Note: this read took about 2 min 10 s on my computer.

CPU times: user 2min 15s, sys: 14.9 s, total: 2min 30s
Wall time: 2min 29s


In [3]:
train.head()

Unnamed: 0,acoustic_data,time_to_failure
0,12,1.4691
1,6,1.4691
2,8,1.4691
3,5,1.4691
4,8,1.4691


In [4]:
train.shape

(629145480, 2)

In [5]:
%%time
sample_sub = pd.read_csv('./data/sample_submission.csv')

CPU times: user 3.75 ms, sys: 2.55 ms, total: 6.3 ms
Wall time: 7.23 ms


In [6]:
sample_sub.head()

Unnamed: 0,seg_id,time_to_failure
0,seg_00030f,0
1,seg_0012b5,0
2,seg_00184e,0
3,seg_003339,0
4,seg_0042cc,0


In [7]:
sample_test = pd.read_csv('./data/test/seg_0a0fbb.csv')

In [8]:
sample_test.head()

Unnamed: 0,acoustic_data
0,3
1,-3
2,-1
3,6
4,8


In [9]:
sample_test.shape

(150000, 1)

## LSTM Using PyTorch

In [36]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [56]:
# we only use mean and std here just for testing at the current stage
def feature_extraction(time_step):
    return np.c_[time_step.mean(axis=1), 
                 np.percentile(np.abs(time_step), q=[0, 25, 50, 75, 100], axis=1).T,
                 time_step.std(axis=1)]

In [57]:
# note: window_size * seq_len = 150000
def create_X(x, window_size=1000, seq_len=150):
    X = x.reshape(seq_len, -1)
    return np.c_[feature_extraction(X),
                 feature_extraction(X[:, -window_size // 10:]),]

In [58]:
features = create_X(train.acoustic_data.values[0:150000]).shape[1]

In [59]:
features

14

In [64]:
class TrainData(Dataset):
    def __init__(self, df, window_size=1000, sequence_len=150):
        self.rows = df.shape[0] // (window_size*sequence_len)
        self.data, self.labels = [], []
        
        for s in range(self.rows):
            seg = df.iloc[s*window_size*sequence_len: (s+1)*window_size*sequence_len]
            x = seg.acoustic_data.values
            y = seg.time_to_failure.values[-1]
            self.data.append(create_X(x))
            self.labels.append(y)
            
    def __len__(self):
        return self.rows
    
    def __getitem__(self, idx):
        return (
            torch.from_numpy(self.data[idx].astype(np.float32)),
            self.labels[idx]
        )
            

In [65]:
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)
    
    def forward(self, x):
        hidden = (
            torch.zeros(1, x.size(0), self.hidden_size),
            torch.zeros(1, x.size(0), self.hidden_size)
        )
        
        out, _ = self.lstm(x, hidden)
        
        out = self.fc(out[:, -1, :])
        return out.view(-1)

In [71]:
def train_model(input_size, hidden_size, train_data, lr = 0.01, batch_size=100):
    
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=False)
    model = LSTM(input_size, hidden_size)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    

    
    for epoch in range(2):
        for i, (data, labels) in enumerate(train_loader):
            outputs = model(data)
            loss = criterion(outputs, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if i % 10 == 0:
                print(f'[Epoch {epoch}/2, Step {i}/{n_steps}]  loss: {loss.item(): .4f}')
    

In [68]:
train_data = TrainData(train)

In [70]:
train_model(features, 32, train_data)

UnboundLocalError: local variable 'model' referenced before assignment