In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as utils_data

In [None]:
# ALL GLOBAL VARIABLES

# Device used for the model and every batch: first CUDA GPU when available, CPU otherwise.
GLOBAL_DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Seed applied below to the NumPy and PyTorch random number generators.
GLOBAL_SEED = 42
# Event-type id reserved for padding; real event types start at PADDING_CONST + 1.
PADDING_CONST = 0

# Seed the RNGs so weight initialisation, DataLoader shuffling and Monte Carlo
# sampling are repeatable after "Restart Kernel -> Run All".
np.random.seed(GLOBAL_SEED)
torch.manual_seed(GLOBAL_SEED)

In [None]:
!nvidia-smi

Mon Dec  7 18:52:56 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.45.01    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P0    32W / 250W |   1593MiB / 16280MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Ask PyTorch's CUDA caching allocator to release cached memory it is not using,
# so memory left over from earlier runs in this kernel is handed back to the GPU.
torch.cuda.empty_cache()

In [None]:
def compute_integral_mc(intensity_network, hidden_states, cond_lam, src_padding_mask, time, events, alpha=-0.1, n_samples=100):
    """
    Monte Carlo estimate of the intensity integral over every inter-event interval.

    Inputs:
        intensity_network (callable) - called as
            intensity_network(hidden_states, events, current_inf=..., mc_trick=True);
            its result is multiplied elementwise with the (B, S-1) interval lengths,
        hidden_states - forwarded unchanged to intensity_network,
        cond_lam (tensor) - only its device is used, to place the random samples,
        src_padding_mask (B, S) - boolean mask, True at padded positions,
        time (B, S) - event times,
        events - forwarded unchanged to intensity_network,
        alpha (float) - scaling constant of the time-dependent term,
        n_samples (int) - number of uniform samples drawn per interval
    Output:
        (B, S-1) tensor: interval length times the value returned by intensity_network.
    """
    previous_time = time[:, :-1]
    is_real_step = ~src_padding_mask[:, 1:]

    # interval lengths (t_j - t_{j-1}); intervals that end on a padded event are zeroed
    dt = (time[:, 1:] - previous_time) * is_real_step

    # For t sampled in (t_{j-1}, t_j) the relative offset is
    # (t_j - t_{j-1}) / (t_{j-1} + 1) * u with u ~ U[0, 1); one u is drawn per sample.
    relative_gap = dt.unsqueeze(2) / (previous_time + 1).unsqueeze(2)
    uniform_draws = torch.rand([*dt.size(), n_samples], device=cond_lam.device)
    current_influence = alpha * relative_gap * uniform_draws

    # sample-averaged intensity: sum(lambda(u_i)) / N
    mean_intensity = intensity_network(hidden_states, events, current_inf=current_influence, mc_trick=True)

    return dt * mean_intensity

def compute_integral_li(lam, time, src_padding_mask):
    """
    Approximate the intensity integral over each inter-event interval by linear
    interpolation between consecutive intensity values (trapezoidal rule):

        integral_j = 0.5 * (t_j - t_{j-1}) * (lam_j + lam_{j-1})

    Inputs:
        lam (B, S) - conditional intensity evaluated at the events,
        time (B, S) - event times,
        src_padding_mask (B, S) - boolean mask, True at padded positions
    Output:
        (B, S-1) tensor of per-interval integrals; intervals that end on a padded
        event contribute 0.
    """
    is_real_step = ~src_padding_mask[:, 1:]

    dt = (time[:, 1:] - time[:, :-1]) * is_real_step
    # The trapezoid needs the SUM of the two endpoint intensities. Using their
    # difference (as before) does not approximate the integral and can go negative.
    lam_sum = (lam[:, 1:] + lam[:, :-1]) * is_real_step

    return 0.5 * dt * lam_sum

In [None]:
class IntensityNetwork(nn.Module):
    """
    Maps hidden states to a conditional intensity: a linear layer produces one
    term per event type, softplus keeps it positive, and a one-hot mask of the
    observed event type selects the type-specific value at every position.
    """

    def __init__(self, hidden_size, n_event_types, device):
        """
        Inputs:
            hidden_size (int) - size of the hidden dimension (d_model),
            n_event_types (int) - number of possible event types in the data,
            device - stored as `self.device` for backward compatibility only;
                tensors created by this module follow the device of its inputs
        """
        super(IntensityNetwork, self).__init__()

        self.n_events = n_event_types
        self.device = device

        # accounts for "history" and "base" (through bias) terms in eq.(6) of the paper
        self.linear = nn.Linear(hidden_size, n_event_types)
        self.softplus = nn.Softplus(threshold=10)

    def generate_type_mask(self, events):
        """
        One-hot encode event types.

        Input:
            events (B, S) - integer event ids; real types are 1..n_event_types,
                any other value (e.g. padding) yields an all-zero row
        Output:
            (B, S, n_event_types) tensor of 0./1. values on the device of `events`.
        """
        # Built on the input's device (not the constructor's) so the module keeps
        # working after being moved with `.to(...)`.
        type_ids = torch.arange(1, self.n_events + 1, device=events.device)
        one_hot = events.unsqueeze(-1) == type_ids
        return one_hot.to(torch.get_default_dtype())

    def forward(self, hidden_states, events, current_inf=None, mc_trick=False):
        """
        Inputs:
            hidden_states (B, S, hidden_size) - encoder outputs,
            events (B, S) - integer event ids,
            current_inf - optional time-dependent term added before the softplus;
                with mc_trick=True it must be (B, S-1, n_samples),
            mc_trick (bool) - evaluate all Monte Carlo samples in one vectorised pass
        Output:
            (B, S) conditional intensity, or (B, S-1) sample-averaged intensity
            when mc_trick=True.
        """
        intensity_terms = self.linear(hidden_states)
        type_mask = self.generate_type_mask(events).to(intensity_terms.device)

        if mc_trick:
            # this is a trick for Monte-Carlo integration, which allows to vectorize
            # computation of (num_samples) intensity functions instead of making a loop

            assert current_inf is not None, "current influence cannot be None when mc_trick is True"

            # keep only the term of the observed type -> (B, S-1, 1), which broadcasts
            # against the (B, S-1, n_samples) sampled influences
            selected_terms = (intensity_terms[:, :-1, :] * type_mask[:, :-1, :]).sum(dim=2, keepdim=True)
            continuous_intensity = self.softplus(selected_terms + current_inf)
            conditional_lambda = continuous_intensity.mean(dim=2)
        else:
            if current_inf is not None:
                # out-of-place add: leaves the linear layer's output untouched
                intensity_terms = intensity_terms + current_inf
            continuous_intensity = self.softplus(intensity_terms)

            # (continuous_intensity * type_mask) gets type-specific intensity function (eq. (6))
            # after summation along the 2nd dimension, conditional intensity function is obtained
            conditional_lambda = (continuous_intensity * type_mask).sum(dim=2)

        return conditional_lambda

In [None]:
class HawkesTransformer(nn.Module):
    """
    Transformer encoder over (time, event type) sequences that produces a
    conditional intensity together with next-event time and type predictions.
    """

    def __init__(self, n_event_types, device, d_model=512, n_heads=8, dim_feedforward=2048, n_layers=6, dropout=0.1, activation='relu'):
        """
        Input parameters:
          n_event_types (int) - number of event types in the data,
          device - stored as `self.device` and forwarded to the intensity layer;
            tensors created by this class follow the module / its inputs instead,
          d_model (int) - size of model's latent dimension,
          n_heads (int) - number of heads in the Multihead Attention module,
          dim_feedforward (int) - size of the feedforward network dimension,
          n_layers (int) - number of Transformer encoder layers,
          dropout (float) - dropout rate,
          activation (string) - activation function for the feedforward network (relu or gelu)
        """
        super(HawkesTransformer, self).__init__()

        self.n_events = n_event_types
        self.d_model = d_model
        self.device = device

        # initialize div term for temporal encoding
        self.init_temporal_encoding()

        # event type embedding (index PADDING_CONST is the padding entry)
        self.event_embedding = nn.Embedding(n_event_types + 1, d_model, padding_idx=PADDING_CONST)

        # transformer encoder layers
        encoder_layer = nn.TransformerEncoderLayer(d_model, n_heads, dim_feedforward, dropout, activation)
        self.transformer_layers = nn.TransformerEncoder(encoder_layer, n_layers)

        # linear transformation of hidden states ("history" and "base" terms in eq.(6) of the THP paper) to
        # type specific intensity
        self.intensity_layer = IntensityNetwork(d_model, n_event_types, self.device)

        # output prediction layers
        self.time_predictor  = nn.Linear(d_model, 1, bias=False)
        self.event_predictor = nn.Linear(d_model, n_event_types, bias=False)

        # small constant added before log(); a non-persistent buffer moves with
        # `.to(...)` but is not written to the state_dict, so checkpoint keys are unchanged
        self.register_buffer('eps', torch.tensor([1e-8]), persistent=False)

    def generate_subsequent_mask(self, seq):
        """
        Function to generate masking for the subsequent information in the sequences (masked self-attention).
        Input:
          seq (B, S) - batch of sequences; only its length and device are used.
        Output:
          (S, S) boolean mask, True strictly above the diagonal (future positions).
        """
        seq_len = seq.size(1)
        return torch.triu(torch.ones(seq_len, seq_len, device=seq.device, dtype=torch.bool), diagonal=1)

    def generate_key_padding_mask(self, seq):
        """
        Masking the padded part of the sequence.
        Input:
          seq (B, S) - batch of event-type sequences.
        Output:
          (B, S) boolean mask, True where the entry equals PADDING_CONST.
        """
        return seq.eq(PADDING_CONST)

    def init_temporal_encoding(self):
        """
        Initializing the internal temporal encoding tensors.

        Registers `te_div_term` (d_model,), the per-dimension frequency of the
        sinusoidal encoding, as a non-persistent buffer: it follows the module
        across devices and stays out of the state_dict.
        """
        encoding_constant = torch.tensor(10000.0)

        # computed through exp/log for better numerical stability
        div_term = torch.exp(2.0 * (torch.arange(0, self.d_model) // 2) * -torch.log(encoding_constant) / self.d_model)
        self.register_buffer('te_div_term', div_term, persistent=False)

    def temporal_encoding(self, t, non_padded_mask):
        """
        Function to perform the temporal encoding on input timestamps.
        Input:
          t (B, S) - batch of timestamp sequences,
          non_padded_mask (B, S) - boolean mask, True for real events and False for padding
        Output:
          (B, S, d_model) - sinusoidal encoding (sin on even, cos on odd dimensions),
            zeroed at padded positions
        """
        temporal_enc = t.unsqueeze(-1) * self.te_div_term

        temporal_enc[:, :, 0::2] = torch.sin(temporal_enc[:, :, 0::2])
        temporal_enc[:, :, 1::2] = torch.cos(temporal_enc[:, :, 1::2])

        return temporal_enc * non_padded_mask.unsqueeze(-1)

    def log_likelihood(self, hidden_states, cond_lam, time, events, alpha, integral='mc'):
        """
        Inputs:
            hidden_states (B, S, E) - hidden states of the network,
            cond_lam (B, S) - conditional intensity function at the events,
            time (B, S) - ground truth for times,
            events (B, S) - ground truth for event types,
            alpha (float) - scaling constant (used by the Monte-Carlo integral only),
            integral (string) - method of integration: 'mc' for Monte-Carlo, any other
                value selects linear interpolation
        Output:
            scalar tensor - log-likelihood summed over the batch
        """

        src_padding_mask = self.generate_key_padding_mask(events)

        # compute event log-likelihood; padded positions are set to 1 so that log() gives 0
        event_part = cond_lam + self.eps
        event_part.masked_fill_(src_padding_mask, 1.0)
        event_part = event_part.log()
        event_part = event_part.sum(dim=1)

        # compute non-event log-likelihood
        if integral == 'mc':
            non_event_part = compute_integral_mc(self.intensity_layer, hidden_states, cond_lam, src_padding_mask, time, events, alpha)
        else:
            non_event_part = compute_integral_li(cond_lam, time, src_padding_mask)
        non_event_part = non_event_part.sum(dim=1)

        # compute total log-likelihood
        log_likelihood = (event_part - non_event_part).sum()

        return log_likelihood

    def time_error(self, time_pred, time):
        """
        Inputs:
            time_pred (B, S) - time predictions,
            time (B, S) - ground truth for times
        Output:
            scalar tensor - summed squared error between the prediction at position j
            and the observed gap time[j+1] - time[j]
        """

        time_ground_truth = time[:, 1:] - time[:, :-1]
        time_pred = time_pred[:, :-1]

        time_error = nn.MSELoss(reduction='sum')(time_pred, time_ground_truth)
        return time_error

    def event_error(self, event_pred, events):
        """
        Inputs:
            event_pred (B, S, K) - event type scores (passed to CrossEntropyLoss as logits),
            events (B, S) - ground truth for event types,
        Output:
            scalar tensor - summed cross entropy between the scores at position j and
            the type of event j+1
        """

        # shift ids to 0-based classes; padding (0) becomes -1 and is ignored by the loss
        event_ground_truth = events[:, 1:] - 1
        event_pred = event_pred[:, :-1, :]

        event_error = nn.CrossEntropyLoss(reduction='sum', ignore_index=-1)(event_pred.transpose(1, 2), event_ground_truth)
        return event_error

    def loss_function(self, hidden_states, cond_lam, time_pred, event_pred, time, events, alpha=-0.1):
        """
        Inputs:
            hidden_states (B, S, E) - hidden states of the network,
            cond_lam (B, S) - conditional intensity function at the events,
            time_pred (B, S) - time predictions,
            event_pred (B, S, K) - event type scores,
            time (B, S) - ground truth for times,
            events (B, S) - ground truth for event types,
            alpha (float) - scaling constant
        Output:
            scalar tensor - negative log-likelihood + event error + scaled time error
        """

        # compute log-likelihood
        log_likelihood = self.log_likelihood(hidden_states, cond_lam, time, events, alpha)

        # compute timestamp forecasting error
        time_error = self.time_error(time_pred, time)

        # compute event prediction error through cross entropy loss
        event_error = self.event_error(event_pred, events)

        scale = 0.01 # for numerical stability
        loss  = -log_likelihood + event_error + time_error * scale

        return loss

    def forward(self, time, events):
        """
        Input:
          time (B, S) - event times,
          events (B, S) - integer event types, PADDING_CONST marks padding
        Output:
          h (B, S, d_model) - encoder hidden states,
          cond_lam (B, S) - conditional intensity at the events,
          time_pred (B, S) - next-event time prediction, zeroed at padded positions,
          event_pred (B, S, n_event_types) - next-event type scores, zeroed at padded positions
        """

        # generate masks
        src_key_padding_mask = self.generate_key_padding_mask(events)
        src_non_padded_mask = ~src_key_padding_mask
        src_mask = self.generate_subsequent_mask(events)

        # perform encodings
        temp_enc  = self.temporal_encoding(time, src_non_padded_mask)
        event_enc = self.event_embedding(events)

        # make pass through transformer encoder layers (they take sequence-first input)
        h = event_enc + temp_enc
        h = self.transformer_layers(h.permute(1, 0, 2), mask=src_mask, src_key_padding_mask=src_key_padding_mask)
        h = h.permute(1, 0, 2)

        # obtain conditional intensity function
        cond_lam = self.intensity_layer(h, events)

        # make predictions
        time_pred  = self.time_predictor(h).squeeze(2) * src_non_padded_mask
        event_pred = self.event_predictor(h) * src_non_padded_mask.unsqueeze(-1)

        return h, cond_lam, time_pred, event_pred

In [None]:
def run_epoch(model, dataloader, device, optimizer=None, metric=None):
    """
    Run one pass over `dataloader`, training when an optimizer is given.

    Inputs:
        model - callable as model(time, events) -> (h, cond_lam, time_pred, event_pred),
            exposing `log_likelihood` and `event_error`,
        dataloader - iterable of (time, events) batches that supports len(),
        device - device the batches are moved to,
        optimizer - optimizer to step; None runs evaluation with gradients disabled,
        metric - truthy to compute next-event-type accuracy, falsy (None/False) to skip it
    Output:
        (epoch_loss, epoch_accuracy) as Python floats, both averaged over batches;
        epoch_accuracy is 0.0 when `metric` is falsy.
    """
    is_training = optimizer is not None
    # The flag is kept in its own variable: the accumulator below must not
    # overwrite the `metric` argument, otherwise the flag is silently ignored.
    track_accuracy = bool(metric)

    epoch_loss = 0.
    epoch_accuracy = 0.
    with torch.set_grad_enabled(is_training):
        for time, events in dataloader:
            time = time.to(device)
            events = events.to(device)

            h, cond_lam, time_pred, event_pred = model(time, events)
            # negative log-likelihood (linear-interpolation integral) + event-type cross entropy
            loss = -model.log_likelihood(h, cond_lam, time, events, -0.1, 'li') + model.event_error(event_pred, events)

            if is_training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            batch_size, seq_len = events.shape[0], events.shape[1]
            # normalised by every position in the batch, padded ones included
            epoch_loss += loss.item() / (seq_len * batch_size)

            n_targets = (seq_len - 1) * batch_size
            if track_accuracy and n_targets > 0:
                # argmax of the raw scores equals argmax of their softmax
                predicted_type = event_pred.argmax(dim=2)[:, :-1]
                target_type = events[:, 1:] - 1
                # padded targets become -1 and never match, so they count as misses
                n_correct = (predicted_type == target_type).sum().item()
                epoch_accuracy += n_correct / n_targets

    n_batches = len(dataloader)
    epoch_loss /= n_batches
    epoch_accuracy /= n_batches

    return epoch_loss, epoch_accuracy

In [None]:
import time as t

def train(model, n_epochs, optimizer, train_loader, val_loader, scheduler=None, device=None, verbose=True, freq=None, compute_metric=False):
    """
    Alternate one training pass and one validation pass for `n_epochs` epochs.

    Inputs:
        model - model handed to run_epoch,
        n_epochs (int) - number of epochs,
        optimizer - optimizer used for the training passes,
        train_loader, val_loader - batch iterables for training and validation,
        scheduler - optional learning-rate scheduler, stepped once per epoch,
        device - device for the batches; GLOBAL_DEVICE when None,
        verbose (bool) - print progress,
        freq (int) - print every `freq` epochs; about ten reports per run when None,
        compute_metric - forwarded to run_epoch as `metric`; accuracy is printed when > 0
    Output:
        (train_loss_history, val_loss_history, train_metric_history, val_metric_history),
        four lists with one entry per epoch.
    """
    device = GLOBAL_DEVICE if device is None else device
    if verbose and freq is None:
        freq = max(n_epochs // 10, 1)

    loss_log = {'train': [], 'val': []}
    metric_log = {'train': [], 'val': []}

    started_at = t.time()
    for epoch_index in range(n_epochs):
        epoch = epoch_index + 1

        # training pass: weights are updated
        model.train()
        epoch_loss, epoch_metric = run_epoch(model, train_loader, device, optimizer, metric=compute_metric)
        loss_log['train'].append(epoch_loss)
        metric_log['train'].append(epoch_metric)

        # validation pass: no optimizer, so no updates
        model.eval()
        epoch_loss, epoch_metric = run_epoch(model, val_loader, device, metric=compute_metric)
        loss_log['val'].append(epoch_loss)
        metric_log['val'].append(epoch_metric)

        if scheduler is not None:
            scheduler.step()

        elapsed = t.time() - started_at
        if not (verbose and epoch % freq == 0):
            continue

        print("Epoch {}: train loss - {:.3f} | validation loss - {:.3f}".format(epoch, loss_log['train'][-1], loss_log['val'][-1]))
        if compute_metric > 0:
            print("train accuracy - {:.3f} | validation accuracy - {:.3f}".format(metric_log['train'][-1], metric_log['val'][-1]))
        print("Time elapsed: {:.2f} s".format(elapsed))

    return loss_log['train'], loss_log['val'], metric_log['train'], metric_log['val']

In [None]:
!unzip -q fin_data.zip -d data

In [None]:
import pickle

class NHPDataset(utils_data.Dataset):
    '''
    Dataset of event sequences stored in the Neural Hawkes Process pickle format:
    a dict with one key per split ('train' / 'dev' / 'test'), each holding a list of
    sequences whose events are dicts with 'type_event' and 'time_since_last_event'.
    '''

    # checked in this order against the path, matching the original precedence
    _SPLIT_KEYS = ('dev', 'train', 'test')

    def __init__(self, file_path):
        '''
        Inputs:
            file_path (str) - path to the pickle file; the split is chosen by the
                first of 'dev', 'train', 'test' that occurs in the path
        Raises:
            ValueError - if the path names none of the known splits
        '''
        split = next((key for key in self._SPLIT_KEYS if key in file_path), None)
        if split is None:
            raise ValueError(
                "cannot infer the data split from {!r}: expected 'dev', 'train' or 'test' in the path".format(file_path)
            )

        # SECURITY: pickle.load can execute arbitrary code; only open trusted files.
        with open(file_path, 'rb') as f:
            seqs = pickle.load(f, encoding='latin1')[split]

        # NOTE(review): 'time_since_last_event' holds inter-event gaps, while
        # HawkesTransformer documents `time` as timestamps and differences consecutive
        # entries - confirm whether a cumulative sum (or an absolute-time field, if the
        # data has one) is intended here.
        self.event_type = [torch.Tensor([int(event['type_event']) for event in seq]) for seq in seqs]
        self.event_time = [torch.Tensor([float(event['time_since_last_event']) for event in seq]) for seq in seqs]

    def __len__(self):
        '''Number of sequences in the split.'''
        return len(self.event_type)

    def __getitem__(self, index):
        '''
        Output:
            event_time (S,) - float tensor of the stored times,
            event_type (S,) - long tensor of event types shifted by +1, so that 0
                never denotes a real event type
        '''
        event_type = self.event_type[index].long() + 1
        event_time = self.event_time[index]

        return event_time, event_type

In [None]:
# One dataset per split, read from the archive extracted into data/.
train_dataset, val_dataset, test_dataset = (
    NHPDataset(split_path)
    for split_path in ('data/train.pkl', 'data/dev.pkl', 'data/test.pkl')
)

# Only the training split is shuffled; validation and test keep their stored order.
train_loader = utils_data.DataLoader(dataset=train_dataset, batch_size=5, shuffle=True)
val_loader = utils_data.DataLoader(dataset=val_dataset, batch_size=5, shuffle=False)
test_loader = utils_data.DataLoader(dataset=test_dataset, batch_size=5, shuffle=False)

In [None]:
# Two event types, 4 encoder layers with 4 heads each, GELU feed-forward blocks.
model = HawkesTransformer(
    n_event_types=2,
    device=GLOBAL_DEVICE,
    d_model=512,
    n_heads=4,
    dim_feedforward=1024,
    n_layers=4,
    dropout=0.1,
    activation='gelu',
).to(GLOBAL_DEVICE)

# Adam with the learning rate halved every 10 epochs.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4, betas=(0.9, 0.999), eps=1e-05)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

# Train for 50 epochs, reporting loss and accuracy after every epoch.
train_loss_hist, val_loss_hist, train_metric_hist, val_metric_hist = train(
    model=model,
    n_epochs=50,
    optimizer=optimizer,
    train_loader=train_loader,
    val_loader=val_loader,
    scheduler=scheduler,
    device=GLOBAL_DEVICE,
    freq=1,
    compute_metric=True,
)

Epoch 1: train loss - -2.110 | validation loss - -3.519
train accuracy - 0.591 | validation accuracy - 0.584
Time elapsed: 10.65 s
Epoch 2: train loss - -8.328 | validation loss - -4.347
train accuracy - 0.615 | validation accuracy - 0.599
Time elapsed: 21.29 s
Epoch 3: train loss - -9.765 | validation loss - -4.820
train accuracy - 0.620 | validation accuracy - 0.599
Time elapsed: 31.92 s
Epoch 4: train loss - -10.620 | validation loss - -5.470
train accuracy - 0.622 | validation accuracy - 0.599
Time elapsed: 42.55 s
Epoch 5: train loss - -11.255 | validation loss - -5.367
train accuracy - 0.622 | validation accuracy - 0.599
Time elapsed: 53.18 s
Epoch 6: train loss - -11.717 | validation loss - -5.602
train accuracy - 0.619 | validation accuracy - 0.599
Time elapsed: 63.81 s
Epoch 7: train loss - -12.767 | validation loss - -5.816
train accuracy - 0.620 | validation accuracy - 0.599
Time elapsed: 74.45 s
Epoch 8: train loss - -13.414 | validation loss - -6.154
train accuracy - 0.621

In [None]:
# Persist only the model's state_dict (not the module object itself) to
# 'model.pth' in the current working directory.
torch.save(model.state_dict(), 'model.pth')

In [None]:
# Rebuild the network with the SAME hyperparameters that were used for training
# (d_model=512, 4 heads, feed-forward 1024, 4 layers, dropout 0.1, GELU). With the
# constructor defaults (6 layers, feed-forward 2048) the checkpoint does not match:
# load_state_dict fails on missing keys and size mismatches.
model = HawkesTransformer(2, GLOBAL_DEVICE, 512, 4, 1024, 4, 0.1, 'gelu')
# map_location lets a checkpoint saved on a GPU be restored on a CPU-only machine
model.load_state_dict(torch.load('model.pth', map_location=GLOBAL_DEVICE))
# switch to inference mode so dropout is disabled for the evaluation that follows
model.to(GLOBAL_DEVICE).eval()

In [None]:
# Evaluate the reloaded model on the validation split. No optimizer is passed,
# so run_epoch disables gradients and performs no parameter updates; it returns
# the batch-averaged loss and the next-event-type accuracy.
loss, metrics = run_epoch(model, val_loader, GLOBAL_DEVICE, metric=True)

In [None]:
loss

-9.863696708345888