# <center>LSTM

In [1]:
import os
os.chdir("..")

%load_ext autoreload
%autoreload 2

In [69]:
from pathlib import Path
import pandas as pd
import numpy as np
from tqdm.autonotebook import tqdm

In [3]:
data_path = Path("data")

In [4]:
example_predictions_name = "example_predictions.pkl"
test_features_name = "test_features.pkl"
train_features_name = "train_features.pkl"
train_outcomes_name = "train_outcomes.pkl"

In [5]:
pd.set_option('display.max_columns', 600)
pd.set_option('display.max_rows', 10)

In [6]:
example_predictions = pd.read_pickle(data_path / example_predictions_name)
example_predictions.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,g__12m_binary
date,tradingitemid,Unnamed: 2_level_1
2006-12-29,2585893,0.541588
2006-12-29,2585895,0.522537
2006-12-29,2585941,0.500847
2006-12-29,2585945,0.496815
2006-12-29,2585957,0.552594


In [7]:
test_features = pd.read_pickle(data_path /test_features_name)
train_features = pd.read_pickle(data_path / train_features_name)
train_outcomes = pd.read_pickle(data_path / train_outcomes_name)

In [8]:
from dataprocessing.process import StandardScaler, SmartLogtransformer, TrainTestSplit, Pipeline
from dataprocessing.impute import SimpleImputer

In [9]:
train_test_split = TrainTestSplit(test_size=0.2)
X_train, X_val, y_train, y_val = train_test_split(train_features, train_outcomes)

### Pipeline for data processing

In [10]:
pipe = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")), 
        ("logtransformer", SmartLogtransformer()), 
        ("scaler", StandardScaler())
    ]
)
pipe

[(imputer, <dataprocessing.impute.SimpleImputer>), 
(logtransformer, <dataprocessing.process.SmartLogtransformer>), 
(scaler, <dataprocessing.process.StandardScaler>)]

In [11]:
%%time
X_train = pipe.fit_transform(X_train)
X_val = pipe.transform(X_val)
X_test = pipe.transform(test_features)

Wall time: 5.09 s


In [12]:
X_train

Unnamed: 0_level_0,Unnamed: 1_level_0,f__buytransactions_1m,f__selltransactions_1m,f__buyshares_1m,f__sellshares_1m,f__buysharespctsharesout_1m,f__sellsharespctsharesout_1m,f__uniquebuyers_1m,f__uniquesellers_1m,f__buytransactions_3m,f__selltransactions_3m,f__buyshares_3m,f__sellshares_3m,f__buysharespctsharesout_3m,f__sellsharespctsharesout_3m,f__uniquebuyers_3m,f__uniquesellers_3m,f__buytransactions_6m,f__selltransactions_6m,f__buyshares_6m,f__sellshares_6m,f__buysharespctsharesout_6m,f__sellsharespctsharesout_6m,f__uniquebuyers_6m,f__uniquesellers_6m,f__buytransactions_12m,f__selltransactions_12m,f__buyshares_12m,f__sellshares_12m,f__buysharespctsharesout_12m,f__sellsharespctsharesout_12m,f__uniquebuyers_12m,f__uniquesellers_12m
date,tradingitemid,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1
2004-08-06,2585893,1.124676,-0.745641,-0.443263,0.003256,-0.372335,0.000805,1.137242,-0.745811,0.667193,0.795100,-0.849923,0.004573,-0.885121,0.000811,0.689566,0.775471,0.430536,0.560260,-1.313943,0.007727,-1.489867,0.000819,0.471217,0.550649,0.273003,0.383375,-1.921067,0.007480,-2.278363,0.000844,0.338890,0.389804
2004-08-06,2585895,1.173487,1.348587,-0.288378,0.003541,0.005511,0.000805,1.186371,1.368299,0.637492,0.843336,-0.659300,0.005440,-0.421348,0.000813,0.638244,0.853088,0.396030,0.611525,-1.123632,0.008496,-0.966126,0.000821,0.411208,0.633622,0.234136,0.439430,-1.718303,0.007815,-1.676731,0.000846,0.270767,0.481275
2004-08-06,2585941,-0.821051,-0.745641,0.027006,0.003256,0.087960,0.000805,-0.821200,-0.745811,-1.488105,-1.175993,0.058943,0.005130,0.098922,0.000812,-1.488955,-1.176650,-2.073409,-1.534573,0.092823,0.007950,0.100471,0.000819,-2.076055,-1.536194,-2.547422,-1.907210,0.089716,0.007298,0.039101,0.000843,-2.552820,-1.910768
2004-08-06,2585957,-0.821051,1.268505,0.027006,0.003540,0.087960,0.000805,-0.821200,1.287456,0.637492,0.795100,0.210314,0.005433,-0.230532,0.000813,0.608222,0.824442,0.396030,0.560260,-0.255439,0.008490,-0.750636,0.000821,0.376105,0.602999,0.234136,0.383375,-0.793301,0.007813,-1.429194,0.000846,0.230917,0.447516
2004-08-06,2585970,-0.821051,-0.745641,0.027006,0.003256,0.087960,0.000805,-0.821200,-0.745811,-1.488105,-1.175993,0.058943,0.005130,0.098922,0.000812,-1.488955,-1.176650,-2.073409,-1.534573,0.092823,0.007950,0.100471,0.000819,-2.076055,-1.536194,-2.547422,-1.907210,0.089716,0.007298,0.039101,0.000843,-2.552820,-1.910768
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2012-11-23,586093745,-0.821051,-0.745641,0.027006,0.003256,0.087960,0.000805,-0.821200,-0.745811,-1.488105,-1.175993,0.058943,0.005130,0.098922,0.000812,-1.488955,-1.176650,-2.073409,-1.534573,0.092823,0.007950,0.100471,0.000819,-2.076055,-1.536194,-2.547422,-1.907210,0.089716,0.007298,0.039101,0.000843,-2.552820,-1.910768
2012-11-23,607745150,-0.821051,-0.745641,0.027006,0.003256,0.087960,0.000805,-0.821200,-0.745811,-1.488105,-1.175993,0.058943,0.005130,0.098922,0.000812,-1.488955,-1.176650,-2.073409,-1.534573,0.092823,0.007950,0.100471,0.000819,-2.076055,-1.536194,-2.547422,-1.907210,0.089716,0.007298,0.039101,0.000843,-2.552820,-1.910768
2012-11-23,610592433,-0.821051,-0.745641,0.027006,0.003256,0.087960,0.000805,-0.821200,-0.745811,-1.488105,-1.175993,0.058943,0.005130,0.098922,0.000812,-1.488955,-1.176650,-2.073409,-1.534573,0.092823,0.007950,0.100471,0.000819,-2.076055,-1.536194,-2.547422,-1.907210,0.089716,0.007298,0.039101,0.000843,-2.552820,-1.910768
2012-11-23,618650356,-0.821051,-0.745641,0.027006,0.003256,0.087960,0.000805,-0.821200,-0.745811,-1.488105,-1.175993,0.058943,0.005130,0.098922,0.000812,-1.488955,-1.176650,-2.073409,-1.534573,0.092823,0.007950,0.100471,0.000819,-2.076055,-1.536194,-2.547422,-1.907210,0.089716,0.007298,0.039101,0.000843,-2.552820,-1.910768


#### perform data to the 3D sequence data

In [20]:
def get_features3D_labels2D_lengths(X, y):
    # init
    lengths = []
    
    tr_shape1 = len(X.index.get_level_values(1).unique())
    tr_shape2 = len(X.index.get_level_values(0).unique())
    tr_shape3 = X.shape[1]
    
    features_3D = np.zeros((tr_shape1, tr_shape2, tr_shape3))
    labels_2D = np.zeros((tr_shape1, tr_shape2))
    
    idx = pd.IndexSlice
    for num, compID in enumerate(X.index.get_level_values(1).unique()):
        df_feat = X.loc[idx[:, compID], idx[:]]
        df_labels = y.loc[idx[:, compID], idx[:]]
        features_3D[num, :len(df_feat), :] = df_feat.values
        labels_2D[num, :len(df_labels)] = df_labels.values.flatten()
        lengths.append(len(df_feat))
    lengths = np.array(lengths)
    return features_3D, labels_2D, lengths

In [21]:
%%time
features_3D_train, labels_2D_train, lengths_train = get_features3D_labels2D_lengths(X_train, y_train)

Wall time: 5min 18s


In [22]:
features_3D_train.shape, labels_2D_train.shape, lengths_train.shape

((8335, 434, 32), (8335, 434), (8335,))

In [27]:
%%time
features_3D_val, labels_2D_val, lengths_val = get_features3D_labels2D_lengths(X_val, y_val)

Wall time: 49.8 s


In [28]:
features_3D_val.shape, labels_2D_val.shape, lengths_val.shape

((4663, 109, 32), (4663, 109), (4663,))

In [29]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
import torch.nn.functional as F

In [88]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cpu'

In [152]:
class SeqDataset(Dataset):
    
    def __init__(self, features_3D, labels_2D, lengths):
        self.features_3D = features_3D
        self.labels_2D = labels_2D
        self.lengths = lengths
        self.sorted_data = sorted(zip(features_3D, labels_2D, lengths), key=lambda x: x[2], reverse=True)
            
    def __getitem__(self, idx):
        return self.sorted_data[idx]
    
    def __len__(self):
        return len(self.sorted_data)

In [153]:
train_dataset = SeqDataset(features_3D_train, labels_2D_train, lengths_train)
train_loader = DataLoader(train_dataset, batch_size=64)

In [154]:
val_dataset = SeqDataset(features_3D_val, labels_2D_val, lengths_val)
val_loader = DataLoader(val_dataset, batch_size=64)

In [155]:
class Model(nn.Module):
    def __init__(self, emb_dim, hid_dim, output_dim, n_layers, dropout, bidirectional):
        super().__init__()
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.dropout = dropout
        self.bidirectional = bidirectional
        
        self.rnn = nn.LSTM(emb_dim, hid_dim, num_layers=n_layers, dropout=dropout, bidirectional=bidirectional)
        self.fc1 = nn.Linear(hid_dim, hid_dim)
        self.fc2 = nn.Linear(hid_dim, output_dim)
        
        self.dropout = nn.Dropout(p=dropout)
        
    def forward(self, inp, lens):
        inp = inp.permute(1, 0, 2)
        packed_seq = nn.utils.rnn.pack_padded_sequence(inp, lens.cpu(), enforce_sorted=False)
        packed_output, (hidden, cell) = self.rnn(packed_seq)
        output = F.relu(self.fc1(packed_output.data))
        output = self.fc2(output)
        return output

In [160]:
emb_dim = 32
hid_dim = 100
output_dim = 1
n_layers = 2
dropout = 0.5
bidirectional = False

patience = 3

In [157]:
model = Model(emb_dim, hid_dim, output_dim, n_layers, dropout, bidirectional).to(device)

In [158]:
optimizer = torch.optim.Adam(model.parameters())
loss_func = nn.BCEWithLogitsLoss()

epochs = 1

In [159]:
min_loss = np.inf

cur_patience = 0

for epoch in range(1, epochs + 1):
    train_loss = 0.0
    model.train()
    pbar = tqdm(enumerate(train_loader), total=len(train_loader), leave=False)
    pbar.set_description(f"Epoch {epoch}")
    for it, batch in pbar: 
        optimizer.zero_grad()
        feats, labs, lens = batch
        labs = nn.utils.rnn.pack_padded_sequence(labs.permute(1, 0), lens.cpu(), enforce_sorted=False).data
        predict = model(feats.float(), lens)
#         print("predict.flatten().shape", predict.flatten().shape, "labs.flatten().shape", labs.flatten().shape)
#         print("lens", lens)
        loss = loss_func(predict.flatten(), labs)
        loss.backward()
#         torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        train_loss += loss.item()

    train_loss /= len(train_loader)
    val_loss = 0.0
    model.eval()
    pbar = tqdm(enumerate(val_loader), total=len(val_loader), leave=False)
    pbar.set_description(f"Epoch {epoch}")
    for it, batch in pbar:
        feats, labs, lens = batch
        labs = nn.utils.rnn.pack_padded_sequence(labs.permute(1, 0), lens.cpu(), enforce_sorted=False).data
        predict = model(feats.float(), lens)
        loss = loss_func(predict.flatten(), labs)
        val_loss += loss.item()
    val_loss /= len(val_loader)
    if val_loss < min_loss:
        min_loss = val_loss
        best_model = model.state_dict()
    else:
        cur_patience += 1
        if cur_patience == patience:
            cur_patience = 0
            break
    
    print(f'Epoch: {epoch}, Training Loss: {train_loss}, Validation Loss: {val_loss}')

  0%|          | 0/131 [00:00<?, ?it/s]

  0%|          | 0/73 [00:00<?, ?it/s]

Epoch: 1, Training Loss: 0.5880597013566955, Validation Loss: 0.938692019578655
