# AMEX CosSquareFormer Starter

Based on our work in a soon-to-be-published paper (https://github.com/mayukh18/DEAP).

Also special thanks to Chris Deotte's wonderful notebooks, especially https://www.kaggle.com/code/cdeotte/tensorflow-gru-starter-0-790/notebook, which formed a portion of the skeleton of this notebook.

In [None]:
# all imports here
import os
import sys
import glob
import math
import random
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


import gc
import sklearn
from sklearn.metrics import accuracy_score, f1_score
from sklearn import model_selection
from sklearn import preprocessing
from sklearn import linear_model
from sklearn import feature_selection
from sklearn.preprocessing import StandardScaler

import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import TensorDataset, DataLoader
from torch.optim.lr_scheduler import _LRScheduler
from torch.autograd import Variable

from datetime import datetime
from tqdm import tqdm
from copy import deepcopy

import warnings
warnings.filterwarnings('ignore')

# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA instead of failing on the first `.to(device)`.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

# Switch matplotlib to the ggplot style and keep that style's colour cycle
# around as a plain list.
plt.style.use("ggplot")
color_pal = plt.rcParams["axes.prop_cycle"].by_key()["color"]

## Setting up the standard scaler

In [None]:
# The scaler is fitted on the first two training chunks (data_1 and data_2).
scaler = StandardScaler()

x = np.concatenate(
    [
        np.load(f'../input/amex-data-for-transformers-and-rnns/data/data_{i+1}.npy')
        for i in range(2)
    ],
    axis=0,
)

# Flatten everything to rows of 188 values so that every feature column gets
# its own mean / standard deviation.
scaler.fit(np.reshape(x, (-1, 188)))

## Competition Metric

In [None]:
def amex_metric_numpy(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Compute the competition metric ``0.5 * (G + D)``.

    ``D`` is the fraction of positives found in the top 4% of the ranking
    (by cumulative row weight) and ``G`` is the weighted Gini coefficient
    normalized by its maximum. Negatives carry weight 20, positives weight 1.

    Args:
        y_true: 1-D array-like of binary labels (1 = positive, 0 = negative).
        y_pred: 1-D array-like of scores; a higher score means "more likely
            positive". Only the ordering of the scores matters.

    Returns:
        The metric value. The result is not meaningful when ``y_true``
        contains a single class, because the normalizations divide by zero.
    """
    # Accept any array-like. Labels are promoted to float64 so the cumulative
    # sums below are not accumulated in reduced precision when callers pass
    # float32 arrays.
    y_true = np.asarray(y_true, dtype=np.float64).ravel()
    y_pred = np.asarray(y_pred).ravel()

    # count of positives and negatives
    n_pos = y_true.sum()
    n_neg = y_true.shape[0] - n_pos

    # sort by descending prediction value
    indices = np.argsort(y_pred)[::-1]
    target = y_true[indices]

    # filter the top 4% by cumulative row weights
    weight = 20.0 - target * 19.0
    cum_norm_weight = (weight / weight.sum()).cumsum()
    four_pct_filter = cum_norm_weight <= 0.04

    # default rate captured at 4%
    d = target[four_pct_filter].sum() / n_pos

    # weighted gini coefficient
    lorentz = (target / n_pos).cumsum()
    gini = ((lorentz - cum_norm_weight) * weight).sum()

    # max weighted gini coefficient
    gini_max = 10 * n_neg * (1 - 19 / (n_pos + 20 * n_neg))

    # normalized weighted gini coefficient
    g = gini / gini_max

    return 0.5 * (g + d)

## Transformer

In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product (softmax) self-attention.

    Every head works on a full ``D``-dimensional projection, so the query,
    key and value projections map ``D -> D * H`` and the output projection
    maps ``D * H -> D``.

    Args:
        D: model dimension (also the per-head dimension).
        H: number of heads.
    """

    def __init__(self, D, H):
        super().__init__()
        self.H = H  # number of heads
        self.D = D  # dimension

        self.wq = nn.Linear(D, D * H)
        self.wk = nn.Linear(D, D * H)
        self.wv = nn.Linear(D, D * H)

        self.dense = nn.Linear(D * H, D)

    def concat_heads(self, x):
        '''(B, H, S, D) => (B, S, D*H)'''
        B, H, S, D = x.shape
        x = x.permute((0, 2, 1, 3)).contiguous()  # (B, S, H, D)
        return x.reshape((B, S, H * D))  # (B, S, D*H)

    def split_heads(self, x):
        '''(B, S, D*H) => (B, H, S, D)'''
        B, S, _ = x.shape
        x = x.reshape(B, S, self.H, self.D)  # (B, S, H, D)
        return x.permute((0, 2, 1, 3))  # (B, H, S, D)

    def forward(self, x, mask):
        """Apply self-attention to ``x``.

        Args:
            x: input of shape (B, S, D).
            mask: ``None`` or a tensor broadcastable to (B, H, S, S) in which
                a value of 1 marks a position that must not be attended to.

        Returns:
            ``(output, attention_weights)`` with shapes (B, S, D) and
            (B, H, S, S).
        """
        q = self.split_heads(self.wq(x))  # (B, H, S, D)
        k = self.split_heads(self.wk(x))  # (B, H, S, D)
        v = self.split_heads(self.wv(x))  # (B, H, S, D)

        attention_scores = torch.matmul(q, k.transpose(-1, -2))  # (B, H, S, S)
        attention_scores = attention_scores / math.sqrt(self.D)

        # Masked positions get a large negative score, so their softmax
        # weight underflows to zero.
        if mask is not None:
            attention_scores = attention_scores + mask * -1e9

        attention_weights = torch.softmax(attention_scores, dim=-1)
        scaled_attention = torch.matmul(attention_weights, v)  # (B, H, S, D)
        concat_attention = self.concat_heads(scaled_attention)  # (B, S, D*H)
        output = self.dense(concat_attention)  # (B, S, D)

        return output, attention_weights

class MultiHeadAttentionCosformerNew(nn.Module):
    """Multi-head linear self-attention with cosine re-weighting.

    Queries and keys go through ``elu(.) + 1`` and the pairwise similarity
    ``q_i . k_j`` is re-weighted by ``cos(theta_i - theta_j)`` with
    ``theta_i = 1.57 * i / S``. Using
    ``cos(a - b) = cos(a)cos(b) + sin(a)sin(b)`` the attention is evaluated
    without ever building the (S, S) score matrix.

    Args:
        D: model dimension (also the per-head dimension).
        H: number of heads.
    """

    # Largest re-weighting angle (approximately pi / 2). The literal is kept
    # unchanged so the numerical behaviour of the layer stays the same.
    _MAX_ANGLE = 1.57

    def __init__(self, D, H):
        super().__init__()
        self.H = H  # number of heads
        self.D = D  # dimension

        self.wq = nn.Linear(D, D * H)
        self.wk = nn.Linear(D, D * H)
        self.wv = nn.Linear(D, D * H)

        self.dense = nn.Linear(D * H, D)

    def concat_heads(self, x):
        '''(B, H, S, D) => (B, S, D*H)'''
        B, H, S, D = x.shape
        x = x.permute((0, 2, 1, 3)).contiguous()  # (B, S, H, D)
        return x.reshape((B, S, H * D))  # (B, S, D*H)

    def split_heads(self, x):
        '''(B, S, D*H) => (B, H, S, D)'''
        B, S, _ = x.shape
        x = x.reshape(B, S, self.H, self.D)  # (B, S, H, D)
        return x.permute((0, 2, 1, 3))  # (B, H, S, D)

    def forward(self, x, mask):
        """Apply cosine re-weighted linear attention to ``x``.

        Args:
            x: input of shape (B, S, D).
            mask: accepted for interface compatibility with
                ``MultiHeadAttention``; it is not used, so every position
                attends to every other position.

        Returns:
            ``(output, None)`` where ``output`` has shape (B, S, D). No
            attention matrix is materialized, hence ``None``.
        """
        B, S, _ = x.shape

        # Project and lay the heads out as (B, S, H, D).
        q = self.wq(x).reshape(B, S, self.H, self.D)
        k = self.wk(x).reshape(B, S, self.H, self.D)
        v = self.wv(x).reshape(B, S, self.H, self.D)

        # elu(.) + 1 keeps the query / key features from going negative.
        q = torch.nn.functional.elu(q) + 1
        k = torch.nn.functional.elu(k) + 1

        # Position-dependent angles, created directly on the input's device
        # and in its dtype so the module does not depend on a global device.
        positions = torch.arange(S, device=q.device, dtype=q.dtype)
        angles = self._MAX_ANGLE * positions / S
        cos = torch.cos(angles).view(1, S, 1, 1)
        sin = torch.sin(angles).view(1, S, 1, 1)

        q_cos, q_sin = q * cos, q * sin  # (B, S, H, D)
        k_cos, k_sin = k * cos, k * sin  # (B, S, H, D)

        # Linear-attention trick: contract keys with values first.
        kv_cos = torch.einsum('bsnx,bsnz->bnxz', k_cos, v)  # (B, H, D, D)
        qkv_cos = torch.einsum('bsnx,bnxz->bsnz', q_cos, kv_cos)  # (B, S, H, D)

        kv_sin = torch.einsum('bsnx,bsnz->bnxz', k_sin, v)  # (B, H, D, D)
        qkv_sin = torch.einsum('bsnx,bnxz->bsnz', q_sin, kv_sin)  # (B, S, H, D)

        # Per-query normalizer (sum of re-weighted similarities); the small
        # constant guards against division by zero.
        denominator = 1.0 / (
            torch.einsum('bsnd,bnd->bsn', q_cos, k_cos.sum(dim=1))
            + torch.einsum('bsnd,bnd->bsn', q_sin, k_sin.sum(dim=1))
            + 1e-5
        )  # (B, S, H)

        O = (qkv_cos + qkv_sin) * denominator.unsqueeze(-1)  # (B, S, H, D)

        concat_attention = O.reshape(B, S, self.H * self.D)  # (B, S, D*H)
        output = self.dense(concat_attention)  # (B, S, D)

        return output, None

class MultiHeadAttentionCosSquareformerNew(nn.Module):
    """Multi-head linear self-attention with ``1 + cos`` re-weighting.

    Queries and keys go through ``elu(.) + 1`` and the pairwise similarity
    ``q_i . k_j`` is re-weighted by ``1 + cos(theta_i - theta_j)`` with
    ``theta_i = 3.1415 * i / S`` (note ``1 + cos(a) = 2 * cos(a / 2) ** 2``).
    The plain, cosine and sine terms are each evaluated with the
    linear-attention trick, so the (S, S) score matrix is never built.

    Args:
        D: model dimension (also the per-head dimension).
        H: number of heads.
    """

    # Largest re-weighting angle (approximately pi). The literal is kept
    # unchanged so the numerical behaviour of the layer stays the same.
    _MAX_ANGLE = 3.1415

    def __init__(self, D, H):
        super().__init__()
        self.H = H  # number of heads
        self.D = D  # dimension

        self.wq = nn.Linear(D, D * H)
        self.wk = nn.Linear(D, D * H)
        self.wv = nn.Linear(D, D * H)

        self.dense = nn.Linear(D * H, D)

    def concat_heads(self, x):
        '''(B, H, S, D) => (B, S, D*H)'''
        B, H, S, D = x.shape
        x = x.permute((0, 2, 1, 3)).contiguous()  # (B, S, H, D)
        return x.reshape((B, S, H * D))  # (B, S, D*H)

    def split_heads(self, x):
        '''(B, S, D*H) => (B, H, S, D)'''
        B, S, _ = x.shape
        x = x.reshape(B, S, self.H, self.D)  # (B, S, H, D)
        return x.permute((0, 2, 1, 3))  # (B, H, S, D)

    def forward(self, x, mask):
        """Apply ``1 + cos`` re-weighted linear attention to ``x``.

        Args:
            x: input of shape (B, S, D).
            mask: accepted for interface compatibility with
                ``MultiHeadAttention``; it is not used, so every position
                attends to every other position.

        Returns:
            ``(output, None)`` where ``output`` has shape (B, S, D). No
            attention matrix is materialized, hence ``None``.
        """
        B, S, _ = x.shape

        # Project and lay the heads out as (B, S, H, D).
        q = self.wq(x).reshape(B, S, self.H, self.D)
        k = self.wk(x).reshape(B, S, self.H, self.D)
        v = self.wv(x).reshape(B, S, self.H, self.D)

        # elu(.) + 1 keeps the query / key features from going negative.
        q = torch.nn.functional.elu(q) + 1
        k = torch.nn.functional.elu(k) + 1

        # Position-dependent angles, created directly on the input's device
        # and in its dtype so the module does not depend on a global device.
        positions = torch.arange(S, device=q.device, dtype=q.dtype)
        angles = self._MAX_ANGLE * positions / S
        cos = torch.cos(angles).view(1, S, 1, 1)
        sin = torch.sin(angles).view(1, S, 1, 1)

        q_cos, q_sin = q * cos, q * sin  # (B, S, H, D)
        k_cos, k_sin = k * cos, k * sin  # (B, S, H, D)

        # Linear-attention trick: contract keys with values first.
        kv_cos = torch.einsum('bsnx,bsnz->bnxz', k_cos, v)  # (B, H, D, D)
        qkv_cos = torch.einsum('bsnx,bnxz->bsnz', q_cos, kv_cos)  # (B, S, H, D)

        kv_sin = torch.einsum('bsnx,bsnz->bnxz', k_sin, v)  # (B, H, D, D)
        qkv_sin = torch.einsum('bsnx,bnxz->bsnz', q_sin, kv_sin)  # (B, S, H, D)

        # Un-weighted term: the "1" in 1 + cos(theta_i - theta_j).
        kv = torch.einsum('bsnx,bsnz->bnxz', k, v)  # (B, H, D, D)
        qkv = torch.einsum('bsnx,bnxz->bsnz', q, kv)  # (B, S, H, D)

        # Per-query normalizer (sum of re-weighted similarities); the small
        # constant guards against division by zero.
        denominator = 1.0 / (
            torch.einsum('bsnd,bnd->bsn', q, k.sum(dim=1))
            + torch.einsum('bsnd,bnd->bsn', q_cos, k_cos.sum(dim=1))
            + torch.einsum('bsnd,bnd->bsn', q_sin, k_sin.sum(dim=1))
            + 1e-5
        )  # (B, S, H)

        O = (qkv + qkv_cos + qkv_sin) * denominator.unsqueeze(-1)  # (B, S, H, D)

        concat_attention = O.reshape(B, S, self.H * self.D)  # (B, S, D*H)
        output = self.dense(concat_attention)  # (B, S, D)

        return output, None

# Positional encodings
def get_angles(pos, i, D):
    """Return the positional-encoding angles ``pos / 10000 ** (2 * (i // 2) / D)``.

    ``pos`` holds positions, ``i`` holds feature indices and ``D`` is the
    model dimension; ``pos`` and ``i`` broadcast against each other.
    """
    # Indices 2k and 2k + 1 share the same frequency.
    exponent = (2 * (i // 2)) / np.float32(D)
    inverse_wavelength = 1 / np.power(10000, exponent)
    return pos * inverse_wavelength


def positional_encoding(D, position=20, dim=3, device=device):
    """Build the sinusoidal positional-encoding table as a tensor.

    Args:
        D: model dimension (number of columns of the table).
        position: number of positions (rows of the table).
        dim: rank of the returned tensor. ``3`` gives shape
            (1, position, D); ``4`` gives shape (1, 1, position, D).
        device: device on which the tensor is created.

    Returns:
        A tensor holding ``sin`` of the angles in the even columns and
        ``cos`` of the angles in the odd columns. The dtype is whatever
        NumPy produced for the table; no cast is applied here.

    Raises:
        ValueError: if ``dim`` is neither 3 nor 4.
    """
    # Fail early with a clear message instead of hitting an unbound local
    # variable further down.
    if dim not in (3, 4):
        raise ValueError(f"dim must be 3 or 4, got {dim!r}")

    angle_rads = get_angles(np.arange(position)[:, np.newaxis],
                            np.arange(D)[np.newaxis, :],
                            D)
    # apply sin to even indices in the array; 2i
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
    # apply cos to odd indices in the array; 2i+1
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])

    if dim == 3:
        pos_encoding = angle_rads[np.newaxis, ...]
    else:
        pos_encoding = angle_rads[np.newaxis, np.newaxis, ...]
    return torch.tensor(pos_encoding, device=device)

class TransformerLayer(nn.Module):
    """One post-norm transformer block: self-attention followed by an MLP.

    Both sub-layers use dropout, a residual connection and a LayerNorm.
    ``attention_type`` picks the attention module: ``'cosine'``,
    ``'cosine_square'``, or anything else for plain softmax attention.
    """

    def __init__(self, D, H, hidden_mlp_dim, dropout_rate, attention_type='cosine_square'):
        super().__init__()
        self.dropout_rate = dropout_rate

        # Position-wise feed-forward network.
        self.mlp_hidden = nn.Linear(D, hidden_mlp_dim)
        self.mlp_out = nn.Linear(hidden_mlp_dim, D)

        # One norm / dropout pair per sub-layer.
        self.layernorm1 = nn.LayerNorm(D, eps=1e-9)
        self.layernorm2 = nn.LayerNorm(D, eps=1e-9)
        self.dropout1 = nn.Dropout(dropout_rate)
        self.dropout2 = nn.Dropout(dropout_rate)

        # Unknown attention types fall back to softmax attention.
        attention_by_name = {
            'cosine': MultiHeadAttentionCosformerNew,
            'cosine_square': MultiHeadAttentionCosSquareformerNew,
        }
        attention_cls = attention_by_name.get(attention_type, MultiHeadAttention)
        self.mha = attention_cls(D, H)

    def forward(self, x, look_ahead_mask):
        """Run the block on ``x`` (B, S, D); returns ``(output, attn_weights)``."""
        attended, attn_weights = self.mha(x, look_ahead_mask)  # (B, S, D)
        attended = self.layernorm1(self.dropout1(attended) + x)  # (B, S, D)

        hidden = torch.relu(self.mlp_hidden(attended))
        projected = self.dropout2(self.mlp_out(hidden))

        return self.layernorm2(projected + attended), attn_weights  # (B, S, D)
  
class Transformer(nn.Module):
    """Stack of ``TransformerLayer`` blocks with input and output projections.

    The input is projected to ``D`` dimensions, scaled by ``sqrt(D)``,
    offset by a fixed sinusoidal positional encoding and passed through
    ``num_layers`` blocks. Three stacked linear layers (D -> 96 -> 16 ->
    ``out_features``) followed by a sigmoid produce the per-position output.

    Args:
        num_layers: number of transformer blocks.
        D: model dimension.
        H: number of attention heads.
        hidden_mlp_dim: hidden width of the MLP inside each block.
        inp_features: number of input features per position.
        out_features: number of outputs per position.
        dropout_rate: dropout probability used throughout.
        attention_type: forwarded to every ``TransformerLayer``.
    """

    def __init__(self, num_layers, D, H, hidden_mlp_dim, inp_features, out_features, dropout_rate, attention_type='cosine_square'):
        super().__init__()
        self.attention_type = attention_type
        self.sqrt_D = torch.tensor(math.sqrt(D))
        self.num_layers = num_layers
        self.input_projection = nn.Linear(inp_features, D)  # multivariate input
        self.output_projection = nn.Linear(D, 96)  # multivariate output
        self.output_projection1 = nn.Linear(96, 16)
        self.output_projection2 = nn.Linear(16, out_features)
        # Fixed (non-trainable) table of shape (1, max_positions, D).
        self.pos_encoding = positional_encoding(D)
        self.dec_layers = nn.ModuleList([
            TransformerLayer(D, H, hidden_mlp_dim,
                             dropout_rate=dropout_rate,
                             attention_type=self.attention_type)
            for _ in range(num_layers)
        ])
        self.dropout = nn.Dropout(dropout_rate)
        self.sig = torch.nn.Sigmoid()

    def forward(self, x, mask):
        """Map ``x`` (B, S, inp_features) to sigmoid outputs (B, S, out_features).

        Args:
            x: input batch of shape (B, S, inp_features).
            mask: look-ahead mask handed to every block.

        Raises:
            ValueError: if ``S`` exceeds the number of positions available
                in the positional-encoding table.
        """
        S = x.shape[1]
        max_positions = self.pos_encoding.shape[1]
        # Without this check an over-long sequence only surfaces as an
        # opaque broadcasting error on the in-place addition below.
        if S > max_positions:
            raise ValueError(
                f"sequence length {S} exceeds the {max_positions} positions "
                "available in the positional encoding"
            )

        x = self.input_projection(x)
        x *= self.sqrt_D
        x += self.pos_encoding[:, :S, :]
        x = self.dropout(x)

        # Attention weights returned by the blocks are not collected.
        for layer in self.dec_layers:
            x, _ = layer(x=x, look_ahead_mask=mask)

        x = self.output_projection(x)
        x = self.output_projection1(x)
        x = self.output_projection2(x)

        return self.sig(x)


In [None]:
# Builds the look-ahead mask that hides future time steps.
def create_look_ahead_mask(size, device=device):
    """Return a (size, size) mask with ones strictly above the main diagonal.

    Entry ``[i, j]`` is 1 when ``j > i`` (a future step) and 0 otherwise.
    """
    all_ones = torch.ones((size, size), device=device)
    return torch.triu(all_ones, diagonal=1)  # (size, size)


<!-- rfe_selector.get_support()
rfe_selector.get_support()
array([ True, False,  True,  True,  True,  True,  True,  True, False,
        True, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True,  True,  True,  True,  True, False,  True,  True,
        True,  True, False,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True, False,
       False,  True,  True,  True,  True,  True, False,  True,  True,
       False,  True,  True,  True, False,  True,  True, False,  True,
        True,  True,  True, False,  True,  True,  True,  True, False,
        True,  True,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True, False,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True,  True, False,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True, False,  True, False,  True,  True,  True,  True,
        True, False,  True, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False, False,  True,
        True,  True,  True, False,  True,  True,  True,  True]) -->

In [None]:
from sklearn.model_selection import KFold

# Per-fold arrays of training / validation chunk numbers.
train_ids = []
val_ids = []

# Shuffled K-fold split with a fixed seed; the number of splits is left at
# KFold's default.
kf = KFold(shuffle=True, random_state=42)
# split() yields 0-based index arrays over the 10 entries, so `+ 1` turns
# them into the 1-based chunk numbers used in the data file names.
# NOTE: the loop variable reuses the name `x`, so it rebinds the
# module-level `x` assigned earlier in the notebook.
for x in kf.split(list(range(1,11))):
    train_ids.append(x[0] + 1)
    val_ids.append(x[1] + 1)
    
# Show both lists as the cell output.
train_ids, val_ids

## Training

In [None]:
# Categorical feature names (currently unused):
#cat_features = ["B_30","B_38","D_114","D_116","D_117","D_120","D_126","D_63","D_64","D_66","D_68"]

# Root folder of the pre-processed dataset.
DATA_PATH = '../input/amex-data-for-transformers-and-rnns'

# Number of features per time step in the stored arrays.
N_FEATS = 188

# Optimisation settings.
BATCH_SIZE = 128
n_epochs = 16
lr = 1e-4

# Flush pending stderr output and reclaim memory before training starts.
sys.stderr.flush()
gc.collect()

In [None]:
print('Start model training')

criterion = nn.BCELoss(reduction='none')
FOLDS = list(range(5))


def _load_chunk(chunk_id):
    """Load training chunk ``chunk_id`` and build the model inputs.

    Returns ``(features, targets)``. ``features`` has shape
    (n, 13, 3 * N_FEATS): the scaled data, the chunk-wide mean and the
    deviation from that mean, concatenated along the feature axis.
    """
    x_chunk = np.load(f'{DATA_PATH}/data/data_{chunk_id}.npy')
    x_chunk = scaler.transform(np.reshape(x_chunk, (-1, N_FEATS)))
    x_chunk = np.reshape(x_chunk, (-1, 13, N_FEATS))

    # Mean over axis 0, i.e. across all sequences of this chunk; the
    # broadcast view avoids materializing one copy per sequence.
    x_chunk_mean = np.mean(x_chunk, axis=0, keepdims=True)
    x_chunk_mean = np.broadcast_to(x_chunk_mean, x_chunk.shape)

    x_chunk = np.concatenate([x_chunk, x_chunk_mean, x_chunk - x_chunk_mean], axis=-1)
    y_chunk = np.array(pd.read_parquet(f'{DATA_PATH}/data/targets_{chunk_id}.pqt')['target'])
    return x_chunk, y_chunk


for fold in FOLDS:
    print(f'============================= | Fold: {fold} |=============================')

    model = Transformer(num_layers=1, D=96, H=8, hidden_mlp_dim=3072, inp_features=3*N_FEATS, out_features=1, dropout_rate=0.25, attention_type='cosine_square').to(device)
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    sched = torch.optim.lr_scheduler.StepLR(opt, step_size=2, gamma=0.8)

    best_score = 0.0
    for epoch in range(n_epochs):
        epoch_loss = 0
        batch_idx = 0
        model.train()

        bar = tqdm(train_ids[fold])
        for i in bar:
            x_chunk, y_chunk = _load_chunk(i)
            # The sequence length is constant within a chunk, so one mask
            # serves every batch.
            mask = create_look_ahead_mask(x_chunk.shape[1]).to(device)

            # Training uses full batches only; the tail of a chunk is dropped.
            batches = x_chunk.shape[0] // BATCH_SIZE
            for b in range(batches):
                start = b * BATCH_SIZE

                opt.zero_grad()
                x = torch.tensor(x_chunk[start:start + BATCH_SIZE]).float().to(device)
                y = torch.tensor(y_chunk[start:start + BATCH_SIZE]).float().to(device)

                out = model(x, mask)

                # Only the prediction at the last time step is trained on.
                loss = criterion(out[:, -1, :], y.unsqueeze(1))
                loss = torch.mean(loss)

                # Running mean of the batch losses seen so far in this epoch.
                epoch_loss = (epoch_loss * batch_idx + loss.item()) / (batch_idx + 1)
                loss.backward()
                opt.step()

                bar.set_description(f'Train Id: {i}, Loss: {str(epoch_loss)}')
                batch_idx += 1

        sched.step()
        model.eval()
        all_true = []
        all_pred = []

        print(f'>> Epoch {epoch}: Validation')
        with torch.no_grad():
            for i in val_ids[fold]:
                x_chunk, y_chunk = _load_chunk(i)
                mask = create_look_ahead_mask(x_chunk.shape[1]).to(device)

                # Step through the whole chunk, including the final partial
                # batch, so the metric covers every validation row.
                for start in range(0, x_chunk.shape[0], BATCH_SIZE):
                    x = torch.tensor(x_chunk[start:start + BATCH_SIZE]).float().to(device)
                    out = model(x, mask)

                    all_true.append(y_chunk[start:start + BATCH_SIZE])
                    # out[:, -1, 0] is already 1-D; no squeeze, so a batch
                    # of one row stays an array of length one.
                    all_pred.append(out[:, -1, 0].cpu().numpy())

        all_true = np.concatenate(all_true).astype(np.float64)
        all_pred = np.concatenate(all_pred)

        epoch_score = amex_metric_numpy(all_true, all_pred)
        print('>> Eval Amex: {}: '.format(epoch_score))

        # Keep only the best checkpoint of each fold.
        if epoch_score > best_score:
            best_score = epoch_score
            print(">> Saving model ...")
            torch.save(model.state_dict(), f'model_cossqformer_fold_{fold}')
        
        
    

In [None]:
# Drop the references left over from the training cell and run the garbage
# collector.
del model, all_true, all_pred
gc.collect()

## Inference

In [None]:
sub = pd.read_csv('../input/amex-default-prediction/sample_submission.csv')
# Re-order the sample submission so its rows line up with the test arrays:
# the last 16 hex characters of customer_ID are parsed as an integer and
# matched against the stored hash index.
sub['hash'] = sub['customer_ID'].str[-16:].apply(lambda x: int(x, 16)).astype('int64')
test_hash_index = np.load(f'{DATA_PATH}/data/test_hashes_data.npy')
sub = sub.set_index('hash').loc[test_hash_index].reset_index(drop=True)

all_preds = np.zeros(len(sub))

for fold in FOLDS:
    # Same architecture as in training. Dropout is inactive in eval mode,
    # so the rate only needs to match for consistency.
    model = Transformer(num_layers=1, D=96, H=8, hidden_mlp_dim=3072, inp_features=3*N_FEATS, out_features=1, dropout_rate=0.25, attention_type='cosine_square').to(device)
    # map_location lets a checkpoint saved on one device load on another.
    model.load_state_dict(torch.load(f'model_cossqformer_fold_{fold}', map_location=device))
    model.eval()

    fold_preds = []

    print(f'Fold ID {fold}')

    bar = tqdm(range(1,21))
    for i in bar:
        x_chunk = np.load(f'{DATA_PATH}/data/test_data_{i}.npy')
        x_chunk = scaler.transform(np.reshape(x_chunk, (-1, N_FEATS)))
        x_chunk = np.reshape(x_chunk, (-1, 13, N_FEATS))

        # Same feature construction as in training: data, chunk-wide mean
        # and deviation from that mean.
        x_chunk_mean = np.mean(x_chunk, axis=0, keepdims=True)
        x_chunk_mean = np.repeat(x_chunk_mean, repeats=x_chunk.shape[0], axis=0)

        x_chunk = np.concatenate([x_chunk, x_chunk_mean, x_chunk - x_chunk_mean], axis=-1)

        mask = create_look_ahead_mask(x_chunk.shape[1]).to(device)

        with torch.no_grad():
            # Stepping by BATCH_SIZE covers the final partial batch as well.
            for start in range(0, len(x_chunk), BATCH_SIZE):
                x = torch.tensor(x_chunk[start:start + BATCH_SIZE]).float().to(device)
                out = model(x, mask)
                # out[:, -1, 0] is already 1-D. Squeezing it would turn a
                # final batch of one row into a 0-d array and break the
                # accumulation.
                fold_preds.append(out[:, -1, 0].cpu().numpy())

    all_preds += np.concatenate(fold_preds)

# Average over the folds that were actually evaluated.
all_preds = all_preds / len(FOLDS)
sub['prediction'] = all_preds

In [None]:
# Write the predictions without the index column, then preview the first
# rows as the cell output.
submission_path = "submission.csv"
sub.to_csv(submission_path, index=False)
sub.head()