In [1]:
import os
import gc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from functools import partial

from fastai.tabular import *
from fastai.callbacks import SaveModelCallback
from fastai.basic_data import DataBunch

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Standard-library modules that the notebook uses by module name
# (copy.deepcopy, math.sqrt, random.seed). They are deliberately imported AFTER
# the fastai star import: a star import can rebind these names (presumably
# fastai re-exports `copy` as a function), and importing them last guarantees
# they refer to the stdlib modules.
import copy
import math
import random

## Define some constants

In [2]:
# Column of the *_idxs_8_fold_cv.csv files that selects the cross-validation fold.
FOLD_ID = 1
VERSION = 1

# Scalar-coupling type labels; TYPES_MAP maps each label to its position in TYPES.
TYPES              = np.array(['1JHC', '2JHH', '1JHN', '2JHN', '2JHC', '3JHH', '3JHC', '3JHN'])
TYPES_MAP          = {t: i for i, t in enumerate(TYPES)}
# Per-coupling-pair feature columns (collate_fn stacks them into `sc_x`).
SC_EDGE_FEATS      = ['type_0', 'type_1', 'type_2', 'type_3', 'type_4', 'type_5', 'type_6', 'type_7', 
                      'dist', 'dist_min_rad', 'dist_electro_neg_adj', 'normed_dist', 
                      'diangle', 'cos_angle', 'cos_angle0', 'cos_angle1', 
                      #'inv_dist', 'normed_inv_dist'
                     ]
# Pair features plus molecule-level features (collate_fn stacks them into `sc_m_x`).
SC_MOL_FEATS       = ['type_0', 'type_1', 'type_2', 'type_3', 'type_4', 'type_5', 'type_6', 'type_7', 
                      'dist', 'dist_min_rad', 'dist_electro_neg_adj', 'normed_dist', 
                      'diangle', 'cos_angle', 'cos_angle0', 'cos_angle1', 
                      'num_atoms', 'num_C_atoms', 'num_F_atoms', 'num_H_atoms', 'num_N_atoms', 'num_O_atoms', 
                      #'inv_dist', 'normed_inv_dist', 
                      'std_bond_length', 'ave_bond_length', #'total_bond_length',  
                      #'ave_inv_bond_length', 'total_inv_bond_length', 
                      'ave_atom_weight'#, 'total_atom_weight'
                     ]
# Per-atom feature columns (collate_fn stacks them into `x`).
ATOM_FEATS         = ['type_H', 'type_C', 'type_N', 'type_O', 'type_F', 
                      'degree_1', 'degree_2', 'degree_3', 'degree_4', 'degree_5', 
                      'SP', 'SP2', 'SP3', 'hybridization_unspecified', 
                      'aromatic', 'formal_charge', 'atomic_num',
                      'donor', 'acceptor', 
                      'ave_bond_length', 
                      #'ave_inv_bond_length',
                      'ave_neighbor_weight']
# Per-bond feature columns (collate_fn stacks them into `bond_x`).
EDGE_FEATS         = ['single', 'double', 'triple', 'aromatic', 
                      'conjugated', 'in_ring',
                      'dist', 'normed_dist', 
                      #'inv_dist', 'normed_inv_dist'
                     ]
# Prediction target and the contribution columns that are learned alongside it.
TARGET_COL         = 'scalar_coupling_constant'
CONTRIB_COLS       = ['fc', 'sd', 'pso', 'dso']
# Every molecule is padded to this many atoms when batches are unstacked.
MAX_N_ATOMS        = 29
N_EDGE_FEATURES    = len(EDGE_FEATS)
N_SC_EDGE_FEATURES = len(SC_EDGE_FEATS)
N_SC_MOL_FEATURES  = len(SC_MOL_FEATS)
N_ATOM_FEATURES    = len(ATOM_FEATS)
N_TYPES            = len(TYPES)
N_MOLS             = 130775
# Fixed shift/scale used to standardise the target and its contributions
# (and to undo the standardisation inside the metric).
SC_MEAN            = 16
SC_STD             = 35

# Columns of each table that are standardised with training-set statistics.
SC_FEATS_TO_SCALE   = ['dist', 'dist_min_rad', 'dist_electro_neg_adj', 'num_atoms', 'num_C_atoms', 
                       'num_F_atoms', 'num_H_atoms', 'num_N_atoms', 'num_O_atoms', 'inv_dist', 
                       'ave_bond_length', 'std_bond_length', 'total_bond_length',  'ave_inv_bond_length', 
                       'total_inv_bond_length', 'ave_atom_weight', 'total_atom_weight']
ATOM_FEATS_TO_SCALE = ['atomic_num', 'ave_bond_length', 'ave_inv_bond_length', 'ave_neighbor_weight']
EDGE_FEATS_TO_SCALE = ['dist', 'inv_dist']

## Import data

In [3]:
# Directory layout: DATA_PATH holds the raw competition CSVs, PATH holds the
# preprocessed tables; the CV index files and graph distances are read from PATH too.
DATA_PATH = '../data/'
PATH = '../tmp/'
CV_IDXS_PATH = PATH
GRAPH_PATH = PATH
# Alternative locations kept for reference (presumably the Kaggle kernel layout):
# DATA_PATH = '../input/champs-scalar-coupling/'
# PATH = '../input/champs-processed-data-3/'
# CV_IDXS_PATH = '../input/champs-cv-8-fold-idxs/'
# GRAPH_PATH = '../input/champs-graph-dists/'

In [4]:
def show_csv_files(path):
    """Print `path` followed by the directory entries whose name contains '.csv'."""
    csv_names = [name for name in os.listdir(path) if '.csv' in name]
    print(f'{path}:', csv_names)
# List the CSV files available in the processed-data and raw-data directories.
show_csv_files(PATH)
show_csv_files(DATA_PATH)
# show_csv_files(CV_IDXS_PATH)

../tmp/: ['atomic_features.csv', 'angle_out_df.csv', 'train_proc_df.csv', 'graph_dist_df.csv', 'mask.csv', 'train_idxs_8_fold_cv.csv', 'edge_mask.csv', 'atom_df.csv', 'pairs_idx.csv', 'edge_df.csv', 'train_idxs_4_fold_cv.csv', 'edge_features.csv', 'dist_df.csv', 'angle_in_df.csv', 'angle_df.csv', 'val_idxs_8_fold_cv.csv', 'val_idxs_4_fold_cv.csv', 'test_proc_df.csv']
../data/: ['scalar_coupling_contributions.csv', 'mulliken_charges.csv', 'structures.csv', 'test.csv', 'train.csv', 'magnetic_shielding_tensors.csv', 'dipole_moments.csv', 'sample_submission.csv', 'potential_energy.csv']


In [5]:
# Preprocessed tables; the first CSV column is used as the index of each frame.
train_df = pd.read_csv(PATH+'train_proc_df.csv', index_col=0)
test_df  = pd.read_csv(PATH+'test_proc_df.csv', index_col=0)
atom_df  = pd.read_csv(PATH+'atom_df.csv', index_col=0)
edge_df  = pd.read_csv(PATH+'edge_df.csv', index_col=0)
angle_in_df  = pd.read_csv(PATH+'angle_in_df.csv', index_col=0)
angle_out_df = pd.read_csv(PATH+'angle_out_df.csv', index_col=0)
graph_dist_df = pd.read_csv(GRAPH_PATH+'graph_dist_df.csv', index_col=0, dtype=np.int32)

# Raw atom coordinates; molecule names are mapped to integer ids in sorted-name order.
structures_df = pd.read_csv(DATA_PATH+'structures.csv')
mol_id_map = {m_name: m_id for m_id, m_name in enumerate(sorted(structures_df['molecule_name'].unique()))}
structures_df['molecule_id'] = structures_df['molecule_name'].map(mol_id_map)

# Molecule ids of the selected fold: only the index column and column FOLD_ID are
# read; NaN rows are dropped (presumably folds are NaN-padded to equal length).
train_mol_ids = pd.read_csv(CV_IDXS_PATH+'train_idxs_8_fold_cv.csv', usecols=[0, FOLD_ID], index_col=0).dropna().astype(int).iloc[:,0]
val_mol_ids   = pd.read_csv(CV_IDXS_PATH+'val_idxs_8_fold_cv.csv', usecols=[0, FOLD_ID], index_col=0).dropna().astype(int).iloc[:,0]
test_mol_ids  = pd.Series(test_df['molecule_id'].unique())

# Attach the four coupling contributions as extra training targets.
# NOTE(review): the column-wise concat aligns on the index, so this assumes
# train_df's index matches the row order of the contributions file - confirm.
contribs_df = pd.read_csv(DATA_PATH+'scalar_coupling_contributions.csv')
train_df = pd.concat((train_df, contribs_df[CONTRIB_COLS]), axis=1)
del contribs_df
gc.collect()

# Standardise the targets: the total and 'fc' are shifted and scaled, the
# remaining contributions are only scaled (no shift).
train_df[[TARGET_COL, 'fc']] = (train_df[[TARGET_COL, 'fc']] - SC_MEAN) / SC_STD
train_df[CONTRIB_COLS[1:]] = train_df[CONTRIB_COLS[1:]] / SC_STD

# Total atom count = sum of the per-element counts; all counts are then divided by 10.
# NOTE: these in-place updates are not idempotent - re-running the cell rescales again.
train_df['num_atoms'] = train_df[['num_C_atoms', 'num_F_atoms', 'num_H_atoms', 
                                  'num_N_atoms', 'num_O_atoms']].sum(axis=1)
test_df['num_atoms'] = test_df[['num_C_atoms', 'num_F_atoms', 'num_H_atoms', 
                                'num_N_atoms', 'num_O_atoms']].sum(axis=1)
train_df[['num_atoms', 'num_C_atoms', 'num_F_atoms', 'num_H_atoms', 'num_N_atoms', 'num_O_atoms']] /= 10
test_df[['num_atoms', 'num_C_atoms', 'num_F_atoms', 'num_H_atoms', 'num_N_atoms', 'num_O_atoms']] /= 10

  mask |= (ar1 == a)


In [6]:
# Preview the processed training table (targets are already standardised here).
train_df.head()

Unnamed: 0,atom_0,atom_1,atom_index_0,atom_index_1,cos_angle,cos_angle0,cos_angle1,diangle,dist,dist_electro_neg_adj,...,std_bond_length,total_bond_length,ave_inv_bond_length,total_inv_bond_length,ave_atom_weight,total_atom_weight,fc,sd,pso,dso
0,H,C,1,0,0.0,0.0,-0.333335,0.0,1.091953,2.593389,...,3e-06,4.367799,0.915793,3.663173,0.2,1.0,1.914926,0.007274,0.035961,0.007772
1,H,H,1,2,-0.333287,0.816483,0.816482,0.0,1.78312,3.922863,...,3e-06,4.367799,0.915793,3.663173,0.2,1.0,-0.77242,0.010085,0.081668,-0.098103
2,H,H,1,3,-0.333335,0.816498,0.816496,0.0,1.783147,3.922924,...,3e-06,4.367799,0.915793,3.663173,0.2,1.0,-0.772357,0.010084,0.081672,-0.098111
3,H,H,1,4,-0.333347,0.816502,0.8165,0.0,1.783157,3.922945,...,3e-06,4.367799,0.915793,3.663173,0.2,1.0,-0.77234,0.010084,0.081673,-0.098112
4,H,C,2,0,0.0,0.0,-0.333352,0.0,1.091952,2.593385,...,3e-06,4.367799,0.915793,3.663173,0.2,1.0,1.91492,0.007274,0.03596,0.007772


In [7]:
# Preview the processed test table.
test_df.head()

Unnamed: 0,atom_0,atom_1,atom_index_0,atom_index_1,cos_angle,cos_angle0,cos_angle1,diangle,dist,dist_electro_neg_adj,...,type_7,inv_dist,normed_inv_dist,ave_bond_length,std_bond_length,total_bond_length,ave_inv_bond_length,total_inv_bond_length,ave_atom_weight,total_atom_weight
4658147,H,C,2,0,-1.0,1.0,-1.0,0.0,2.261178,5.370298,...,0,0.442247,-0.815269,1.107759,0.064573,3.323277,0.905679,2.717037,0.35,1.4
4658148,H,C,2,1,0.0,0.0,-1.0,0.0,1.062099,2.522485,...,0,0.941532,4.621731,1.107759,0.064573,3.323277,0.905679,2.717037,0.35,1.4
4658149,H,H,2,3,0.0,1.0,1.0,0.0,3.323277,7.31121,...,0,0.300908,-2.038452,1.107759,0.064573,3.323277,0.905679,2.717037,0.35,1.4
4658150,H,C,3,0,0.0,0.0,-1.0,0.0,1.062099,2.522485,...,0,0.941532,4.621731,1.107759,0.064573,3.323277,0.905679,2.717037,0.35,1.4
4658151,H,C,3,1,-1.0,1.0,-1.0,0.0,2.261178,5.370298,...,0,0.442247,-0.815269,1.107759,0.064573,3.323277,0.905679,2.717037,0.35,1.4


## Define Molecule Transformer

In [8]:
def clones(module, N):
    """Return an nn.ModuleList holding N independent deep copies of `module`."""
    copies = []
    for _ in range(N):
        copies.append(copy.deepcopy(module))
    return nn.ModuleList(copies)

def unstack_batch(x, mask, exp_idx, n_cols):
    """
    Expand stacked per-atom rows into a padded per-molecule tensor.

    Rows are gathered with `exp_idx` (a new tensor, so `x` itself is not
    modified), rows whose first mask column is 0 are zeroed, and the result is
    viewed as (-1, MAX_N_ATOMS, n_cols, *trailing dims beyond the second).
    """
    padded = x.index_select(0, exp_idx)
    is_padding = mask[:, 0] == 0
    padded[is_padding] = 0
    trailing = padded.size()[2:] if padded.dim() > 2 else ()
    return padded.view(-1, MAX_N_ATOMS, n_cols, *trailing)

def stack_batch(x, mask, n_cols):
    """Flatten `x` to rows of `n_cols` and keep the rows whose first mask column is 1."""
    rows = x.view(-1, n_cols)
    keep = mask[:, 0] == 1
    return rows[keep]

In [9]:
def bn_init(m):
    """Re-draw the scale weights of an nn.BatchNorm1d module uniformly; leave other modules alone."""
    if not isinstance(m, nn.BatchNorm1d):
        return
    nn.init.uniform_(m.weight)

def hidden_layer(n_in, n_out, batch_norm, dropout, layer_norm=False, act=None):
    """
    Build the module list for one dense layer.

    The list always starts with nn.Linear(n_in, n_out) and is followed, in this
    order, by `act` (if truthy), BatchNorm1d (if `batch_norm`), LayerNorm
    (if `layer_norm`) and Dropout (if `dropout` is non-zero).
    """
    modules = [nn.Linear(n_in, n_out)]
    if act:
        modules += [act]
    if batch_norm:
        modules += [nn.BatchNorm1d(n_out)]
    if layer_norm:
        modules += [nn.LayerNorm(n_out)]
    if dropout != 0:
        modules += [nn.Dropout(dropout)]
    return modules

class FullyConnectedNet(nn.Module):
    """
    Multi-layer perceptron assembled from `hidden_layer` blocks.

    Args:
        n_input: width of the input features.
        n_output: width of the optional output layer (skipped when falsy).
        layers: widths of the hidden layers.
        act: activation module inserted after every hidden Linear.
        dropout: dropout rate per hidden layer. Missing trailing entries are
            treated as 0.0; the output layer always gets 0.0 appended.
        batch_norm / layer_norm: normalisation used in the hidden layers.
        out_act / final_bn / final_ln: activation and normalisation used in
            the output layer.

    The `layers` and `dropout` arguments are never modified: the constructor
    works on private copies, so a list shared between several networks (or the
    default list) stays intact.
    """

    def __init__(self, n_input, n_output=None, layers=[], act=nn.ReLU(True), dropout=[], 
                 batch_norm=False, out_act=None, final_bn=False, layer_norm=False, 
                 final_ln=False):
        super().__init__()
        # Private copies: the previous in-place `dropout += [0.0]` grew the
        # caller's list (and the shared default) on every construction.
        hidden_sizes = list(layers)
        drop_rates = list(dropout)

        sizes = [n_input] + hidden_sizes
        if n_output:
            sizes.append(n_output)
            drop_rates.append(0.0)  # no dropout after the output layer

        # zip() below stops at the shortest input, so a short dropout list used
        # to drop layers silently; pad it with "no dropout" instead.
        n_linear = len(sizes) - 1
        drop_rates += [0.0] * max(0, n_linear - len(drop_rates))

        n_hidden = len(hidden_sizes)
        modules = []
        for i, (n_in, n_out, dr) in enumerate(zip(sizes[:-1], sizes[1:], drop_rates)):
            is_hidden = i < n_hidden
            modules += hidden_layer(
                n_in, n_out,
                batch_norm if is_hidden else final_bn,
                dr,
                layer_norm if is_hidden else final_ln,
                act if is_hidden else out_act,
            )
        self.layers = nn.Sequential(*modules)
        self.layers.apply(bn_init)

    def forward(self, x):
        """Apply the stacked layers to `x`."""
        return self.layers(x)

In [10]:
class SublayerConnection(nn.Module):
    """
    Pre-norm residual wrapper: returns ``x + dropout(sublayer(norm(x)))``.

    The normalisation is an nn.BatchNorm1d over `size` features (a LayerNorm
    alternative is left commented out) and is applied before the sublayer
    rather than after it.
    """
    def __init__(self, size, dropout):
        super().__init__()
        self.norm = nn.BatchNorm1d(size) #nn.LayerNorm(size)
        self.dropout = nn.Dropout(dropout)
        bn_init(self.norm)

    def forward(self, x, sublayer):
        "Normalise `x`, run `sublayer` on it, apply dropout and add the result to `x`."
        normed = self.norm(x)
        update = self.dropout(sublayer(normed))
        return x + update

In [11]:
def scatter_sum(src, idx, num=None, out=None):
    """
    Sum the rows of `src` into the output rows selected by `idx`.

    The output has `num` rows (or `idx.max() + 1` when `num` is falsy) and as
    many columns as `src`. When `out` is given, the sums are added to a copy of
    it (out-of-place `scatter_add`); otherwise a zero tensor is used.
    """
    n_rows = num if num else idx.max().item() + 1
    n_cols = src.size(1)
    col_idx = idx.unsqueeze(1).repeat(1, n_cols)
    if out is None:
        out = torch.zeros((n_rows, n_cols), dtype=src.dtype, device=src.device)
    return out.scatter_add(0, col_idx, src)

def scatter_mean(src, idx, num=None, out=None):
    """
    Per-index mean of the rows of `src`.

    The numerator is `scatter_sum(src, idx, num, out)` (so any `out` baseline
    is included in it); the denominator is the per-index row count, clamped to
    be at least 1 so that empty groups do not divide by zero.
    """
    totals = scatter_sum(src, idx, num, out)
    counts = scatter_sum(torch.ones_like(src), idx, num)
    return totals / counts.clamp(1.0)

def softmax(x, idx, num=None):
    """
    Softmax over groups of rows that share the same entry in `idx`.

    Each exponentiated row is divided by the sum of the exponentials of its
    group (plus 1e-16). No maximum is subtracted before exponentiating.
    """
    exp_x = x.exp()
    group_totals = scatter_sum(exp_x, idx, num=num)[idx]
    return exp_x / (group_totals + 1e-16)
    
class ENNMessage(nn.Module):
    """
    Edge-network message function: each pair's feature vector is turned into a
    matrix 'A(e)' that transforms the hidden state of the neighbouring node, and
    the transformed states are summed per receiving node.

    `enn` maps n_e edge features to n_h * stride values; `ann` (optional) maps
    a single angle value to n_h attention values.
    """
    def __init__(self, n_h, n_e, stride, enn_args={}, ann_args=None):
        super().__init__()
        assert stride <= n_h
        self.n_h, self.stride = n_h, stride
        # NOTE: enn_args / ann_args are forwarded as keyword arguments, so any
        # list inside them is shared with the constructed network.
        self.enn = FullyConnectedNet(n_e, n_h * self.stride, **enn_args)
        # An empty or missing ann_args disables the angle attention.
        if ann_args: self.ann = FullyConnectedNet(1, n_h, **ann_args)
        else: self.ann = None
    
    def forward(self, h, e, pairs_idx, angles=None, angles_idx=None, t=0):
        """
        Return the summed messages for every node in `h`.

        At step t == 0 the edge matrices, the angle attention values and the
        bidirectional pair index are computed and cached on the module; later
        steps (t > 0) reuse the cached values, so t == 0 must be called first
        for every new batch.
        """
        # compute 'A(e)' and angle attention masks
        if t==0: 
            self.a_mat = self.get_a_mat(e)
            if self.ann: self.att = self.ann(angles.view(-1,1))
            # every pair is used in both directions: (i, j) followed by (j, i)
            self.pairs_idx = torch.cat((pairs_idx, pairs_idx[:, [1, 0]]))
            
        # compute 'm_{i} = sum_{j in N(i)}(A_{ij}h_{j})' for all nodes 'i'
        return self.add_message(torch.zeros_like(h), self.a_mat, h, 
                                self.pairs_idx, angles_idx)
    
    def get_a_mat(self, e):
        """Edge network output scaled by 1/sqrt(stride), viewed as (-1, n_h, stride) and duplicated for the reversed pairs."""
        a_vect = self.enn(e) / (self.stride ** .5)
        a_mat = a_vect.view(-1, self.n_h, self.stride)
        return torch.cat([a_mat, a_mat])
    
    def add_message(self, m, a, h, pairs_idx, angles_idx=None):
        """Add 'A_{ij}h_{j}' to row i of `m` for every directed pair (i, j) in `pairs_idx`."""
        # select the 'h_{j}' feeding into the 'm_{i}'
        h_in = h.index_select(0, pairs_idx[:,1])
        
        # do the matrix multiplication 'A_{ij}h_{j}'
        if self.stride==self.n_h:
            # full n_h x n_h matrix per pair
            ah = (h_in.unsqueeze(1) @ a).squeeze(1)
        else:
            # banded variant: pad h and take a sliding window of width `stride`
            # per output feature, then weight each window by the matching row of 'A'
            h_unfolded = F.pad(h_in, self.n_pad).unfold(1, self.stride, 1)
            ah = (h_unfolded * a).sum(-1)
        
        # apply attention
        if self.ann:
            # average the angle attention per directed pair; `out` starts at one,
            # so pairs without any angle keep a factor of 1
            ave_att = scatter_mean(self.att, angles_idx, num=pairs_idx.size(0), 
                                   out=torch.ones_like(ah))
            ah = ave_att * ah
        
        # Sum up all 'A_{ij}h_{j}' per node 'i'
        return m.scatter_add(0, pairs_idx[:,0,None].repeat(1, self.n_h), ah)
    
    @property
    def n_pad(self):
        """(left, right) padding for the sliding window: stride // 2 on the left, one less on the right when stride is even."""
        return (self.stride // 2, self.stride // 2 - int(self.stride % 2 == 0))

In [12]:
class MultiHeadedDistAttention(nn.Module):
    """
    Base class for multi-head attention whose weights depend only on pairwise
    distances between atoms, not on the atom states.

    Subclasses implement `create_raw_attn`, which turns the stacked distance
    rows into per-head attention weights of shape (n_atoms_total, MAX_N_ATOMS, h).
    The most recent attention tensor is kept in `self.attn`.
    """
    def __init__(self, h, d_model, dropout=0.0):
        "Take in model size and number of heads."
        super().__init__()
        assert d_model % h == 0
        # We assume d_v always equals d_k
        self.d_model, self.d_k, self.h, self.attn = d_model, d_model // h, h, None
        # linears[0] projects the input, linears[-1] projects the attended output
        self.linears = clones(nn.Linear(d_model, d_model), 2)
        self.dropout = nn.Dropout(p=dropout) if dropout > 0.0 else None
        
    def forward(self, dists, x, mask, exp_idx):
        "Project `x`, mix atoms within each molecule using the distance attention, project again."
        x = self.linears[0](x).view(-1, self.h, self.d_k)
        x, self.attn = self.apply_attn(dists, x, mask, exp_idx)
        x = x.view(-1, self.h * self.d_k)
        return self.linears[-1](x)
    
    def apply_attn(self, dists, x, mask, exp_idx):
        "Return the attended values in stacked layout together with the attention weights."
        attn = self.create_raw_attn(dists, mask)
        attn = unstack_batch(attn, mask, exp_idx, MAX_N_ATOMS)
        # two transposes move the head axis forward so the last two axes are atom x atom
        attn = attn.transpose(-2,-1).transpose(1, 2)
        if self.dropout: attn = self.dropout(attn)
        
        # bring the head axis forward on the values as well
        x = unstack_batch(x, mask, exp_idx, self.h).transpose(1, 2)
        x = torch.matmul(attn, x)
        x = x.transpose(1, 2).contiguous()
        return stack_batch(x, mask, self.d_model), attn
    
    def create_raw_attn(self, dists, mask):
        """
        Compute per-head attention weights from `dists`; must be overridden.

        Raises:
            NotImplementedError: always, in this base class. Previously the
                method silently returned None, which only failed later inside
                `apply_attn` with an unrelated-looking error.
        """
        raise NotImplementedError(
            f'{type(self).__name__} must implement create_raw_attn(dists, mask)')

In [13]:
class MultiHeadedGraphDistAttention(MultiHeadedDistAttention):
    """Distance attention driven by integer graph distances through a learned per-head embedding."""
    def __init__(self, h, d_model, dropout=0.0):
        "Take in model size and number of heads."
        super().__init__(h, d_model, dropout)
        assert d_model % h == 0
        # one learned logit per head for each of the 11 graph-distance indices
        self.embedding = nn.Embedding(11, h)
    
    def create_raw_attn(self, dists, mask):
        "Embed the graph distances into per-head logits, zero the masked entries and softmax over dim 1."
        # mask rows that belong to real atoms (first mask column equal to 1)
        real_rows = mask[mask[:, 0] == 1]
        logits = self.embedding(dists) * real_rows.float().unsqueeze(-1)
        # NOTE(review): masked entries become logit 0 rather than being excluded,
        # so they still take part in the softmax normalisation - confirm intended.
        return F.softmax(logits, dim=1)

In [14]:
class MultiHeadedEuclDistAttention(MultiHeadedDistAttention):
    """Distance attention using one learned Gaussian-shaped kernel over Euclidean distance per head."""
    def __init__(self, h, d_model, dropout=0.0):
        "Take in model size and number of heads."
        super().__init__(h, d_model, dropout)
        assert d_model % h == 0
        # per-head kernel parameters: log of the precision and the kernel centre
        self.log_prec = nn.Parameter(torch.Tensor(1, 1, h))
        self.locs = nn.Parameter(torch.Tensor(1, 1, h))
        nn.init.normal_(self.log_prec, mean=0.0, std=0.1)
        nn.init.normal_(self.locs, mean=0.0, std=1.0)
    
    def create_raw_attn(self, dists, mask):
        "Evaluate each head's kernel on the distances and normalise over dim 1 (`mask` is not used)."
        per_head = dists.unsqueeze(-1).expand(-1, -1, self.h)
        precision = torch.exp(self.log_prec)
        scaled = precision * (per_head - self.locs)
        kernel = torch.exp(-0.5 * scaled ** 2)
        return kernel / kernel.sum(dim=1, keepdim=True)

In [15]:
def attention(query, key, value, mask=None, dropout=None):
    """
    Scaled dot-product attention.

    Scores are `query @ key^T / sqrt(d_k)`; positions where `mask == 0` are
    filled with -1e9 before the softmax over the last axis. Returns the
    attended values and the (optionally dropped-out) attention weights.
    """
    scale = math.sqrt(query.size(-1))
    scores = torch.matmul(query, key.transpose(-2, -1)) / scale
    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)
    weights = F.softmax(scores, dim=-1)
    if dropout is not None:
        weights = dropout(weights)
    return torch.matmul(weights, value), weights

class MultiHeadedSelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention applied within each padded molecule."""
    def __init__(self, h, d_model, dropout=0.1):
        "Take in model size and number of heads."
        super().__init__()
        assert d_model % h == 0
        # We assume d_v always equals d_k
        self.d_model, self.d_k, self.h, self.attn = d_model, d_model // h, h, None
        # three input projections (query, key, value) and one output projection
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.dropout = nn.Dropout(p=dropout) if dropout > 0.0 else None
        
    def forward(self, x, mask, exp_idx):
        "Attend over the atoms of each molecule and return the result in stacked layout."
        # pad to one block of MAX_N_ATOMS rows per molecule
        padded = unstack_batch(x, mask, exp_idx, self.d_model)
        n_mols = padded.size(0)

        # one mask per molecule, with a singleton axis so it applies to all h heads
        head_mask = mask.view(-1, MAX_N_ATOMS, MAX_N_ATOMS).unsqueeze(1)

        def split_heads(projection):
            # d_model => h x d_k, with the head axis moved in front of the atom axis
            return projection(padded).view(n_mols, -1, self.h, self.d_k).transpose(1, 2)

        query = split_heads(self.linears[0])
        key = split_heads(self.linears[1])
        value = split_heads(self.linears[2])

        attended, self.attn = attention(query, key, value, mask=head_mask, dropout=self.dropout)

        # merge the heads again and apply the output projection
        merged = attended.transpose(1, 2).contiguous().view(n_mols, -1, self.d_model)
        projected = self.linears[-1](merged)

        # drop the padding rows to return to the stacked layout
        return stack_batch(projected, mask, self.d_model)

In [16]:
class AttendingLayer(nn.Module):
    """
    One attention block: Euclidean-distance attention, graph-distance attention,
    self-attention and a feed-forward network, each wrapped in its own
    SublayerConnection (residual + norm + dropout).
    """
    def __init__(self, size, eucl_dist_attn, graph_dist_attn, self_attn, ff, dropout):
        super().__init__()
        self.eucl_dist_attn = eucl_dist_attn
        self.graph_dist_attn = graph_dist_attn
        self.self_attn = self_attn
        self.ff = ff
        self.sublayer = clones(SublayerConnection(size, dropout), 4)
        self.size = size

    def forward(self, x, eucl_dists, graph_dists, mask, exp_idx):
        """
        Run the four sublayers in order and return the updated atom states.

        The closures receive the normalised state from the SublayerConnection;
        it is named `h` so that it does not shadow the outer `x`.
        """
        x = self.sublayer[0](x, lambda h: self.eucl_dist_attn(eucl_dists, h, mask, exp_idx))
        x = self.sublayer[1](x, lambda h: self.graph_dist_attn(graph_dists, h, mask, exp_idx))
        x = self.sublayer[2](x, lambda h: self.self_attn(h, mask, exp_idx))
        return self.sublayer[3](x, self.ff)

In [17]:
class MessagePassingLayer(nn.Module):
    """
    Message-passing block shared across all N encoder steps.

    The two message functions (`bond_mess`, `sc_mess`) are shared between
    steps, while every step `t` has its own pair of Linear layers and
    SublayerConnections: index 2*t for the bond messages and 2*t + 1 for the
    scalar-coupling messages.
    """
    def __init__(self, size, bond_mess, sc_mess, dropout, N):
        super().__init__()
        self.bond_mess = bond_mess
        self.sc_mess = sc_mess
        self.linears = clones(nn.Linear(size, size), 2*N)
        self.sublayer = clones(SublayerConnection(size, dropout), 2*N)

    def forward(self, x, bond_x, sc_x, angles, mask, bond_idx, sc_idx, angles_idx, t=0):
        """
        Apply the bond message update followed by the scalar-coupling message
        update for step `t`. `mask` is accepted for interface symmetry but is
        not used here.
        """
        bond_slot, sc_slot = 2 * t, 2 * t + 1

        def bond_update(h):
            return self.linears[bond_slot](
                self.bond_mess(h, bond_x, bond_idx, angles, angles_idx, t=t))

        def sc_update(h):
            return self.linears[sc_slot](self.sc_mess(h, sc_x, sc_idx, t=t))

        x = self.sublayer[bond_slot](x, bond_update)
        return self.sublayer[sc_slot](x, sc_update)

In [18]:
class Encoder(nn.Module):
    "Stack of N steps; each step is one shared message-passing pass followed by its own attention layer."
    def __init__(self, mess_pass_layer, attn_layer, N):
        super().__init__()
        self.mess_pass_layer = mess_pass_layer
        self.attn_layers = clones(attn_layer, N)
        self.norm = nn.BatchNorm1d(attn_layer.size) # nn.LayerNorm(attn_layer.size)
        bn_init(self.norm)
        
    def forward(self, x, bond_x, sc_x, eucl_dists, graph_dists, angles, mask, 
                bond_idx, sc_idx, angles_idx, exp_idx):
        "Alternate message passing (told the step index) and attention, then apply the final norm."
        for step in range(len(self.attn_layers)):
            x = self.mess_pass_layer(x, bond_x, sc_x, angles, mask,
                                     bond_idx, sc_idx, angles_idx, t=step)
            x = self.attn_layers[step](x, eucl_dists, graph_dists, mask, exp_idx)
        return self.norm(x)

In [19]:
def create_contrib_net(n_in, n_h, act, dropout=0.0, layer_norm=True):
    """Two-layer network: one hidden layer (no batch norm) followed by a bare Linear down to a single output."""
    hidden = hidden_layer(n_in, n_h, False, dropout, layer_norm, act)
    output = hidden_layer(n_h, 1, False, 0.0) # output layer
    return nn.Sequential(*hidden, *output)

class ContribsNet(nn.Module):
    """
    Five independent single-output networks whose outputs are divided by fixed
    scales. The result keeps the first four scaled outputs and replaces the
    fifth column with the sum of all five.
    """
    N_CONTRIBS = 5
    CONTIB_SCALES = [1, 250, 45, 35, 500]
    
    def __init__(self, n_in, n_h, vec_in, act, dropout=0.0, layer_norm=True):
        # `vec_in` is accepted but not used by this class.
        super().__init__()
        nets = []
        for _ in range(self.N_CONTRIBS):
            nets.append(create_contrib_net(n_in, n_h, act, dropout, layer_norm))
        self.blocks = nn.ModuleList(nets)
        
    def forward(self, x):
        "Return a (rows, 5) tensor: four scaled block outputs, then the sum of all five."
        scaled = [block(x) / scale for block, scale in zip(self.blocks, self.CONTIB_SCALES)]
        ys = torch.cat(scaled, dim=-1)
        total = ys.sum(dim=-1, keepdim=True)
        return torch.cat([ys[:, :-1], total], dim=-1)

class MyCustomHead(nn.Module):
    """
    Prediction head: a shared pre-processing layer, one correction network per
    coupling type whose output is added to the input, and a ContribsNet that
    maps the corrected features to the contribution/total predictions.

    `pre_layers` and `post_layers` are accepted but not used.
    """
    N_TYPES = 8
    
    def __init__(self, n_input, n_h, n_h_contribs, pre_layers=[], post_layers=[], 
                 act=nn.ReLU(True), dropout=3*[0.0], norm=False):
        super().__init__()
        # dropout[0]: pre-processing layer, dropout[1]: type networks, dropout[2]: contribution networks
        self.preproc = nn.Sequential(*hidden_layer(n_input, n_h, False, dropout[0], norm, act))
        self.types_net = nn.ModuleList([
            nn.Sequential(*hidden_layer(n_h, n_input, False, dropout[1], norm, act))
            for _ in range(self.N_TYPES)
        ])
        self.contribs_net = ContribsNet(n_input, n_h_contribs, n_h, act, dropout[2], layer_norm=norm)
        
    def forward(self, x, sc_types):
        """
        Add the type-specific correction to each row of `x` (rows are routed by
        `sc_types`) and return the ContribsNet output for the corrected rows.
        """
        x_ = self.preproc(x)
        x_types = torch.zeros_like(x)
        for i in range(self.N_TYPES):
            # build the row selector once per type instead of three times
            rows = sc_types == i
            if torch.any(rows):
                x_types[rows] = self.types_net[i](x_[rows])
        return self.contribs_net(x + x_types)

In [20]:
class Transformer(nn.Module):
    """
    Full model: a linear projection of the atom features, an Encoder that
    alternates message passing and attention for N steps, and a head that
    predicts the coupling contributions for every atom pair in `sc_idx`.
    """
    def __init__(self, d_x, d_bond, d_sc_pair, d_sc_m, N=6, d_model=512, d_ff=2048, 
                 d_ff_contrib=128, h=8, dropout=0.1, stride=128, enn_args={}, ann_args={}):
        super().__init__()
        c = copy.deepcopy
        # Message functions: only the bond messages receive the angle network
        # arguments; the same `enn_args` dict is passed to both.
        bond_mess = ENNMessage(d_model, d_bond, stride, enn_args, ann_args)
        sc_mess = ENNMessage(d_model, d_sc_pair, stride, enn_args)
        # The two distance attentions are built with their default dropout (0.0);
        # only the self-attention and feed-forward net use `dropout`.
        eucl_dist_attn = MultiHeadedEuclDistAttention(h, d_model)
        graph_dist_attn = MultiHeadedGraphDistAttention(h, d_model)
        self_attn = MultiHeadedSelfAttention(h, d_model, dropout)
        ff = FullyConnectedNet(d_model, d_model, [d_ff], dropout=[dropout])
        
        message_passing_layer = MessagePassingLayer(d_model, bond_mess, sc_mess, dropout, N)
        attending_layer = AttendingLayer(d_model, c(eucl_dist_attn), c(graph_dist_attn), 
                                         c(self_attn), c(ff), dropout)
        
        self.projection = nn.Linear(d_x, d_model)
        self.encoder = Encoder(message_passing_layer, attending_layer, N)
        # Head input = the two atom states of a pair plus the molecule-level
        # pair features; d_ff is reused as the head's hidden width.
        self.write_head = MyCustomHead(2 * d_model + d_sc_m, d_ff, d_ff_contrib, norm=True)
        
    def forward(self, x, bond_x, sc_x, sc_m_x, eucl_dists, graph_dists, angles, mask, bond_idx, 
                sc_idx, angle_idx, exp_idx, sc_types):
        """Encode the atoms, gather both atom states of every coupling pair and run the head."""
        x = self.encoder(self.projection(x), bond_x, sc_x, eucl_dists, graph_dists, 
                         angles, mask, bond_idx, sc_idx, angle_idx, exp_idx)
        # one row per coupling pair: [state of atom 0 | state of atom 1 | pair/molecule features]
        x = torch.cat([x.index_select(0, sc_idx[:,0]), x.index_select(0, sc_idx[:,1]), sc_m_x], dim=-1)
        return self.write_head(x, sc_types)
        

## Train the model

In [21]:
def set_seed(seed=100):
    """Seed the Python, NumPy and PyTorch (CPU and, when available, CUDA) RNGs and request deterministic cuDNN."""
    # python and numpy RNGs
    random.seed(seed)
    np.random.seed(seed)

    # pytorch RNGs
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

In [22]:
# Random 75% / 25% split of the training molecules (seeded for reproducibility).
# NOTE(review): this overwrites the fold-based train_mol_ids / val_mol_ids that
# were read from the *_idxs_8_fold_cv.csv files earlier, so FOLD_ID has no
# effect from this cell on - confirm that this is intended.
set_seed(100)
mol_ids = train_df['molecule_id'].unique()
n_obs = len(mol_ids)
split = int(n_obs*0.75)
# sampling all ids without replacement is a random permutation
mol_ids_ = np.random.choice(mol_ids, size=n_obs, replace=False)
train_mol_ids, val_mol_ids = pd.Series(mol_ids_[:split]), pd.Series(mol_ids_[split:])

Standardise the continuous features using the mean and standard deviation of the training molecules only.

In [23]:
def scale_features(df, features, train_mol_ids):
    """
    Return (mean, std) of the `features` columns, computed only over the rows
    whose 'molecule_id' is in `train_mol_ids`. The frame itself is not changed.
    """
    in_train = df['molecule_id'].isin(train_mol_ids)
    train_rows = df.loc[in_train, features]
    return train_rows.mean(), train_rows.std()

In [24]:
# Standardise each table only when it does not already look standardised, i.e.
# when any column mean is further than 0.1 from 0 or any column std is further
# than 0.1 from 1 (this guard is what keeps a re-run of the cell from rescaling).
# The statistics come from scale_features, which uses the training molecules only.

# Coupling-pair / molecule features: the training statistics are applied to the test table too.
if any(train_df[SC_FEATS_TO_SCALE].mean().abs()>0.1) or any((train_df[SC_FEATS_TO_SCALE].std()-1.0).abs()>0.1):
    sc_feat_means, sc_feat_stds = scale_features(train_df, SC_FEATS_TO_SCALE, train_mol_ids)
    train_df[SC_FEATS_TO_SCALE] = (train_df[SC_FEATS_TO_SCALE] - sc_feat_means) / sc_feat_stds
    test_df[SC_FEATS_TO_SCALE] = (test_df[SC_FEATS_TO_SCALE] - sc_feat_means) / sc_feat_stds
# Atom features.
if any(atom_df[ATOM_FEATS_TO_SCALE].mean().abs()>0.1) or any((atom_df[ATOM_FEATS_TO_SCALE].std()-1.0).abs()>0.1):
    atom_feat_means, atom_feat_stds = scale_features(atom_df, ATOM_FEATS_TO_SCALE, train_mol_ids)
    atom_df[ATOM_FEATS_TO_SCALE] = (atom_df[ATOM_FEATS_TO_SCALE] - atom_feat_means) / atom_feat_stds
# Bond features.
if any(edge_df[EDGE_FEATS_TO_SCALE].mean().abs()>0.1) or any((edge_df[EDGE_FEATS_TO_SCALE].std()-1.0).abs()>0.1):
    edge_feat_means, edge_feat_stds = scale_features(edge_df, EDGE_FEATS_TO_SCALE, train_mol_ids)
    edge_df[EDGE_FEATS_TO_SCALE] = (edge_df[EDGE_FEATS_TO_SCALE] - edge_feat_means) / edge_feat_stds

In [25]:
# Group every table by molecule so that the dataset can fetch all rows of one
# molecule with get_group(molecule_id).
gb_mol_sc = train_df.groupby('molecule_id')
test_gb_mol_sc = test_df.groupby('molecule_id')
gb_mol_atom = atom_df.groupby('molecule_id')
gb_mol_edge = edge_df.groupby('molecule_id')
gb_mol_struct = structures_df.groupby('molecule_id')
gb_mol_angle_in = angle_in_df.groupby('molecule_id')
gb_mol_angle_out = angle_out_df.groupby('molecule_id')
gb_mol_graph_dist = graph_dist_df.groupby('molecule_id')

Define the PyTorch dataset class and the function that collates molecules into a batch.

In [26]:
def get_existing_group(gb, i):
    """Return group `i` of the groupby `gb`, or None when that group does not exist."""
    try:
        return gb.get_group(i)
    except KeyError:
        return None

class MoleculeDataset(Dataset):
    """
    Dataset with one item per molecule.

    Each item is a 7-tuple of DataFrames for the molecule at position `idx` of
    `mol_ids`: scalar couplings, atoms, bonds, structure, incoming angles,
    outgoing angles (None when the molecule has none) and graph distances.
    """
    def __init__(self, mol_ids, gb_mol_sc, gb_mol_atom, gb_mol_edge, gb_mol_struct, 
                 gb_mol_angle_in, gb_mol_angle_out, gb_mol_graph_dist):
        self.n = len(mol_ids)
        self.mol_ids = mol_ids
        self.gb_mol_sc = gb_mol_sc
        self.gb_mol_atom = gb_mol_atom
        self.gb_mol_edge = gb_mol_edge
        self.gb_mol_struct = gb_mol_struct
        self.gb_mol_angle_in = gb_mol_angle_in
        self.gb_mol_angle_out = gb_mol_angle_out
        self.gb_mol_graph_dist = gb_mol_graph_dist

    def __len__(self):
        return self.n

    def _mol_id_at(self, idx):
        """
        Molecule id at POSITION `idx`.

        A DataLoader hands out positions 0..len-1, but `series[idx]` is a
        label lookup, which breaks (or returns the wrong molecule) when the
        Series does not carry a default 0..n-1 index. Pandas objects are
        therefore read with `.iloc`; plain sequences and arrays are indexed
        directly.
        """
        ids = self.mol_ids
        return ids.iloc[idx] if hasattr(ids, 'iloc') else ids[idx]

    def __getitem__(self, idx):
        mol_id = self._mol_id_at(idx)  # looked up once instead of seven times
        return (self.gb_mol_sc.get_group(mol_id),
                self.gb_mol_atom.get_group(mol_id), 
                self.gb_mol_edge.get_group(mol_id), 
                self.gb_mol_struct.get_group(mol_id), 
                self.gb_mol_angle_in.get_group(mol_id), 
                # not every molecule has outgoing angles
                get_existing_group(self.gb_mol_angle_out, mol_id),
                self.gb_mol_graph_dist.get_group(mol_id))

def np_lst_to_torch(arr_lst, dtype=torch.float):
    """
    Concatenate a list of NumPy arrays along axis 0 and return a tensor of `dtype`.

    An empty list yields an empty 1-D tensor of `dtype` instead of the
    ValueError that `np.concatenate` raises when given no arrays (this
    happens, for example, when no molecule in a batch contributes to a list).
    """
    if len(arr_lst) == 0:
        return torch.empty(0, dtype=dtype)
    stacked = np.ascontiguousarray(np.concatenate(arr_lst))
    return torch.from_numpy(stacked).type(dtype)

def get_dist_matrix(struct_df):
    """Return the (n_atoms, n_atoms) matrix of Euclidean distances between the rows' x/y/z coordinates."""
    coords = struct_df[['x','y','z']].values
    # delta[i, k, j] = coords[j, k] - coords[i, k]   (atom, axis, atom)
    delta = coords.T[np.newaxis, :, :] - coords[:, :, np.newaxis]
    return np.sqrt((delta ** 2).sum(axis=1))

def collate_fn(batch, test=False):
    """
    Collate a list of MoleculeDataset items into one stacked batch.

    Per-atom, per-bond and per-coupling rows of all molecules are concatenated
    (not padded); index arrays are shifted so that they address rows of the
    concatenated tensors. Returns `(inputs, sc_vals)` where `inputs` is the
    13-tuple expected by the model's forward method. With `test=True` no
    target columns are read and `sc_vals` is a dummy zero tensor.
    """
    batch_size, n_atom_sum, n_pairs_sum = len(batch), 0, 0
    x, bond_x, sc_x, sc_m_x = [], [], [], []
    eucl_dists, graph_dists, angles_in, angles_out = [], [], [], []
    mask, bond_idx, sc_idx, exp_idx, angles_in_idx, angles_out_idx = [], [], [], [], [], []
    sc_types, sc_vals = [], []

    for b in range(batch_size):
        sc_df, atom_df, edge_df, struct_df, angle_in_df, angle_out_df, graph_dist_df = batch[b]
        n_atoms, n_pairs = len(atom_df), len(edge_df)
        n_pad = MAX_N_ATOMS - n_atoms
        assert len(struct_df)==n_atoms
        # (n_atoms, n_atoms) distances, columns padded to MAX_N_ATOMS with a large value (999)
        eucl_dists_ = get_dist_matrix(struct_df)
        eucl_dists_ = np.pad(eucl_dists_, [(0, 0), (0, n_pad)] , 'constant', constant_values=999)
        
        x.append(atom_df[ATOM_FEATS].values)
        bond_x.append(edge_df[EDGE_FEATS].values)
        sc_x.append(sc_df[SC_EDGE_FEATS].values)
        sc_m_x.append(sc_df[SC_MOL_FEATS].values)
        sc_types.append(sc_df['type'].values)
        # targets: the four contributions followed by the total coupling constant
        if not test: sc_vals.append(sc_df[CONTRIB_COLS+[TARGET_COL]].values)
        eucl_dists.append(eucl_dists_)
        # the last column is dropped (presumably the molecule id - TODO confirm)
        graph_dists.append(graph_dist_df.values[:,:-1])
        angles_in.append(angle_in_df['cos_angle'].values)
        if angle_out_df is not None: angles_out.append(angle_out_df['cos_angle'].values)
        
        # MAX_N_ATOMS x MAX_N_ATOMS block with ones in the top-left n_atoms x n_atoms corner
        mask.append(np.pad(np.ones(2 * [n_atoms]), [(0, n_pad), (0, n_pad)] , 'constant'))
        # expansion index: the molecule's own atom rows, then the last atom's
        # row repeated until MAX_N_ATOMS entries are reached
        exp_idx.append(np.arange(n_atoms) + n_atom_sum)
        exp_idx.append(np.array((MAX_N_ATOMS - n_atoms) * [n_atoms + n_atom_sum - 1]))
        # atom indices are shifted by the number of atoms seen so far,
        # pair indices by the number of bonds seen so far
        bond_idx.append(edge_df[['idx_0', 'idx_1']].values + n_atom_sum)
        sc_idx.append(sc_df[['atom_index_0', 'atom_index_1']].values + n_atom_sum)
        angles_in_idx.append(angle_in_df['p_idx'].values + n_pairs_sum)
        if angle_out_df is not None: angles_out_idx.append(angle_out_df['p_idx'].values + n_pairs_sum)
        
        n_atom_sum += n_atoms
        n_pairs_sum += n_pairs
        
    x, bond_x = np_lst_to_torch(x), np_lst_to_torch(bond_x), 
    sc_x, sc_m_x = np_lst_to_torch(sc_x), np_lst_to_torch(sc_m_x)
    if not test: sc_vals = np_lst_to_torch(sc_vals)
    # sc_types is still the per-molecule list here, so the dummy has one zero per molecule
    else: sc_vals = torch.tensor([0] * len(sc_types))
    sc_types = np_lst_to_torch(sc_types, torch.long)
    mask = np_lst_to_torch(mask, torch.uint8)
    exp_idx = np_lst_to_torch(exp_idx, torch.long)
    bond_idx = np_lst_to_torch(bond_idx, torch.long)
    sc_idx = np_lst_to_torch(sc_idx, torch.long)
    angles_in_idx = np_lst_to_torch(angles_in_idx, torch.long)
    # outgoing angles address the second (reversed) half of the pair list, hence
    # the extra shift by the total number of pairs in the batch.
    # NOTE(review): if no molecule in the batch has outgoing angles these lists
    # are empty - confirm that np_lst_to_torch copes with an empty list.
    angles_out_idx = np_lst_to_torch(angles_out_idx, torch.long) + n_pairs_sum
    angles_idx = torch.cat((angles_in_idx, angles_out_idx))
    eucl_dists = np_lst_to_torch(eucl_dists)
    graph_dists = np_lst_to_torch(graph_dists, torch.long)
    # incoming angles first, then outgoing, matching the order of angles_idx
    angles = np_lst_to_torch(angles_in + angles_out)
    
    return (x, bond_x, sc_x, sc_m_x, eucl_dists, graph_dists, angles, mask, 
            bond_idx, sc_idx, angles_idx, exp_idx, sc_types), sc_vals

In [27]:
# Re-seed before building the loaders; batch_size is the number of molecules per batch.
set_seed(100)
batch_size = 20

In [28]:
# The three datasets share the atom / bond / structure / angle / graph-distance
# groupbys and differ only in the molecule ids and the coupling table.
train_ds = MoleculeDataset(train_mol_ids, gb_mol_sc, gb_mol_atom, gb_mol_edge, gb_mol_struct, gb_mol_angle_in, gb_mol_angle_out, gb_mol_graph_dist)
val_ds   = MoleculeDataset(val_mol_ids, gb_mol_sc, gb_mol_atom, gb_mol_edge, gb_mol_struct, gb_mol_angle_in, gb_mol_angle_out, gb_mol_graph_dist)
test_ds  = MoleculeDataset(test_mol_ids, test_gb_mol_sc, gb_mol_atom, gb_mol_edge, gb_mol_struct, gb_mol_angle_in, gb_mol_angle_out, gb_mol_graph_dist)
# NOTE(review): train_dl / val_dl are created without a collate_fn; the custom
# collate_fn is only handed to the DataBunch below, which presumably installs
# it on the wrapped loaders - verify against the fastai version in use.
train_dl = DataLoader(train_ds, batch_size, shuffle=True, num_workers=8)
val_dl   = DataLoader(val_ds, batch_size, num_workers=8)
# The test loader collates with test=True, i.e. without reading target columns.
test_dl  = DeviceDataLoader.create(test_ds, batch_size, num_workers=8, collate_fn=partial(collate_fn, test=True))
db = DataBunch(train_dl, val_dl, collate_fn=collate_fn)
db.test_dl = test_dl

In [29]:
# Pull one training batch to sanity-check the collated tensors.
batch = next(iter(train_dl))

In [30]:
# Shapes of the 13 model inputs (in collate_fn order), followed by the target shape.
for el in batch[0]: print(el.size())
print(batch[1].size())

torch.Size([375, 21])
torch.Size([387, 8])
torch.Size([1144, 16])
torch.Size([1144, 25])
torch.Size([375, 29])
torch.Size([375, 29])
torch.Size([1454])
torch.Size([580, 29])
torch.Size([387, 2])
torch.Size([1144, 2])
torch.Size([1454])
torch.Size([580])
torch.Size([1144])
torch.Size([1144, 5])


In [31]:
# Name the batch elements after the arguments of the model's forward method
# (same order as the tuple returned by collate_fn) and print them for inspection.
b_dict = dict(x=batch[0][0], 
              bond_x=batch[0][1], 
              sc_x=batch[0][2], 
              sc_m_x=batch[0][3], 
              eucl_dists=batch[0][4], 
              graph_dists=batch[0][5], 
              angles=batch[0][6], 
              mask=batch[0][7], 
              bond_idx=batch[0][8], 
              sc_idx=batch[0][9],
              angles_idx=batch[0][10],
              exp_idx=batch[0][11], 
              sc_types=batch[0][12], 
              y=batch[1])
for k,v in b_dict.items(): print(f'{k}:\n {v}')

x:
 tensor([[ 0.0000,  1.0000,  0.0000,  ...,  0.8485, -2.6219, -0.3054],
        [ 0.0000,  1.0000,  0.0000,  ...,  0.7292, -0.1759, -0.3054],
        [ 0.0000,  1.0000,  0.0000,  ...,  0.8494, -2.6219, -0.3054],
        ...,
        [ 1.0000,  0.0000,  0.0000,  ...,  0.9419,  0.4355, -0.3054],
        [ 1.0000,  0.0000,  0.0000,  ...,  0.9272,  0.4355, -0.3054],
        [ 1.0000,  0.0000,  0.0000,  ...,  0.9257,  0.4355, -0.3054]])
bond_x:
 tensor([[ 1.0000,  0.0000,  0.0000,  ...,  0.0000,  1.3088,  1.3957],
        [ 1.0000,  0.0000,  0.0000,  ...,  0.0000, -0.8758, -0.7402],
        [ 1.0000,  0.0000,  0.0000,  ...,  0.0000, -0.8752, -0.7397],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  1.0000,  0.6314,  0.9587],
        [ 0.0000,  0.0000,  0.0000,  ...,  1.0000,  0.1754,  0.2646],
        [ 1.0000,  0.0000,  0.0000,  ...,  0.0000, -0.9483, -1.4458]])
sc_x:
 tensor([[ 1.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000, -0.3127],
        [ 0.0000,  0.0000,  0.0000,  ..., 

Implement the metric used for this competition.

In [32]:
def group_mean_log_mae(y_true, y_pred, types, epoch):
    proc = lambda x: x.cpu().numpy().ravel() 
    y_true, y_pred, types = proc(y_true), proc(y_pred), proc(types)
    y_true = SC_MEAN + y_true * SC_STD
    y_pred = SC_MEAN + y_pred * SC_STD
    maes = pd.Series(y_true - y_pred).abs().groupby(types).mean()
    gmlmae = np.log(maes).mean()
    # print(f'Epoch: {epoch} - Group Mean Log Mae: {gmlmae}')
    return gmlmae

class GroupMeanLogMAE(Callback):
    """
    Callback that adds a 'group_mean_log_mae' column to the recorder.

    During each epoch it buffers, for non-training batches only, the last
    element of the model input (used as the grouping key) and the last column
    of the model output and of the target. At epoch end the buffers are
    concatenated and passed to `group_mean_log_mae`.
    """
    _order = -20 #Needs to run before the recorder

    def __init__(self, learn, **kwargs): self.learn = learn

    def on_train_begin(self, **kwargs):
        "Register the extra metric name with the recorder."
        self.learn.recorder.add_metric_names(['group_mean_log_mae'])

    def on_epoch_begin(self, **kwargs):
        "Start every epoch with empty buffers."
        self.input, self.output, self.target = [], [], []

    def on_batch_end(self, last_target, last_output, last_input, train, **kwargs):
        "Buffer grouping keys, predictions and targets of non-training batches."
        if not train:
            # Detach and move to CPU so the buffers do not pile up on the device.
            self.input.append(last_input[-1].detach().cpu())
            self.output.append(last_output[:,-1].detach().cpu())
            self.target.append(last_target[:,-1].detach().cpu())

    def on_epoch_end(self, epoch, last_metrics, **kwargs):
        "Compute the metric over everything buffered this epoch, if anything was buffered."
        if (len(self.input) > 0) and (len(self.output) > 0):
            types = torch.cat(self.input)
            preds = torch.cat(self.output)
            target = torch.cat(self.target)
            # Argument order follows the (y_true, y_pred, types, epoch) signature.
            metric = group_mean_log_mae(target, preds, types, epoch)
            return add_metrics(last_metrics, [metric])

def contribs_rmse_loss(preds, targs):
    """
    Sum of the per-column RMSEs between predictions and targets.

    Each column is one scalar coupling contribution; the last column is the
    total scalar coupling value.

    Args:
        - preds: tensor of shape (batch_size * n_sc, 5) with predictions.
        - targs: tensor of shape (batch_size * n_sc, 5) with true values.
    """
    residuals = preds - targs
    # Mean over rows gives one MSE per column.
    mse_per_column = residuals.pow(2).mean(dim=0)
    return torch.sqrt(mse_per_column).sum()

def rmse(preds, targs):
    "Root mean squared error computed on the last column of `preds` and `targs` only."
    last_col_mse = F.mse_loss(preds[:, -1], targs[:, -1])
    return last_col_mse.sqrt()

def mae(preds, targs):
    "Mean absolute error computed on the last column of `preds` and `targs` only."
    last_col_error = preds[:, -1] - targs[:, -1]
    return last_col_error.abs().mean()

In [33]:
# Weight decay handed to the Learner.
wd = 1e-2

# Model width and the input feature counts handed to the Transformer.
d_model = 512
d_x = N_ATOM_FEATURES
d_bond = N_EDGE_FEATURES
d_sc_pair = N_SC_EDGE_FEATURES
d_sc_m = N_SC_MOL_FEATURES

# Settings forwarded to the Transformer as `enn_args` / `ann_args`.
enn_args = dict(layers=[d_model] * 3, dropout=[0.0] * 3, batch_norm=True)
ann_args = dict(layers=[d_model], dropout=[0.0], batch_norm=True, out_act=nn.Tanh())

In [34]:
# Seed before construction, then build the model.
set_seed(100)
model = Transformer(
    d_x, d_bond, d_sc_pair, d_sc_m,
    N=6,
    d_model=d_model,
    d_ff=d_model*4,
    d_ff_contrib=d_model//4,
    h=8,
    dropout=0.0,
    stride=128,
    enn_args=enn_args,
    ann_args=ann_args,
)

In [35]:
# Sanity check: show the architecture, then run the sample batch through the model
# a single time and print both the output and its size.
print(model)
sanity_out = model(*batch[0])
print(sanity_out)
print(sanity_out.size())
# Drop the reference so the output's autograd graph is not kept alive.
del sanity_out

Transformer(
  (projection): Linear(in_features=21, out_features=512, bias=True)
  (encoder): Encoder(
    (mess_pass_layer): MessagePassingLayer(
      (bond_mess): ENNMessage(
        (enn): FullyConnectedNet(
          (layers): Sequential(
            (0): Linear(in_features=8, out_features=512, bias=True)
            (1): ReLU(inplace)
            (2): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (3): Linear(in_features=512, out_features=512, bias=True)
            (4): ReLU(inplace)
            (5): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (6): Linear(in_features=512, out_features=512, bias=True)
            (7): ReLU(inplace)
            (8): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (9): Linear(in_features=512, out_features=65536, bias=True)
          )
        )
        (ann): FullyConnectedNet(
          (layers): Sequential(
 

tensor([[-1.0633e+00, -2.0671e-03,  4.4499e-03, -2.1575e-04, -1.0633e+00],
        [ 2.3078e-01,  9.3059e-04, -1.2115e-02,  2.1701e-02,  2.4060e-01],
        [ 6.0988e-01, -2.0550e-03, -4.9458e-03,  3.1262e-03,  6.0697e-01],
        ...,
        [ 5.5447e-01,  7.0328e-04, -2.8908e-02, -1.6123e-02,  5.1079e-01],
        [ 8.4897e-01, -4.0527e-04, -8.0956e-03,  2.5952e-02,  8.6873e-01],
        [-1.6030e-01, -5.5304e-03, -4.2218e-03,  5.5054e-04, -1.6857e-01]],
       grad_fn=<CatBackward>)
torch.Size([1144, 5])


In [36]:
class GradientClipping(LearnerCallback):
    "Clip the gradient norm to `clip` once more than `start_it` iterations have run."
    def __init__(self, learn:Learner, clip:float = 0., start_it:int = 100):
        super().__init__(learn)
        self.clip = clip
        self.start_it = start_it

    def on_backward_end(self, iteration, **kwargs):
        "Clip the gradient before the optimizer step."
        # No clipping when disabled (clip is falsy) or during the first iterations.
        if not self.clip or iteration <= self.start_it:
            return
        nn.utils.clip_grad_norm_(self.learn.model.parameters(), self.clip)

In [78]:
# Learner optimising the summed per-column RMSE, reporting rmse / mae on the last
# column, with gradient clipping and the group mean log MAE callback attached.
learn = Learner(
    db, model,
    loss_func=contribs_rmse_loss,
    metrics=[rmse, mae],
    opt_func=partial(AdamW, betas=(0.95, 0.99)),
    wd=wd,
    callback_fns=[partial(GradientClipping, clip=10), GroupMeanLogMAE],
)

In [80]:
# Create the optimizer explicitly so it can be inspected in the next cell.
# NOTE(review): arguments are presumably (lr, wd) -- confirm against the fastai signature.
learn.create_opt(1e-3, 1e-2)

In [84]:
# Display the learner's current optimizer and its parameter groups.
learn.opt

OptimWrapper over Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.95, 0.99)
    eps: 1e-08
    lr: 0.001
    weight_decay: 0

Parameter Group 1
    amsgrad: False
    betas: (0.95, 0.99)
    eps: 1e-08
    lr: 0.001
    weight_decay: 0
).
True weight decay: True

In [None]:
# Learning-rate search between 1e-7 and 1.0 over 100 iterations, then plot the recorder.
# NOTE(review): stop_div=True presumably stops early once the loss diverges -- confirm.
learn.lr_find(start_lr=1e-7, end_lr=1.0, num_it=100, stop_div=True)
learn.recorder.plot()

In [None]:
# Save the model whenever the monitored group_mean_log_mae improves (mode='min'),
# under a name that carries the version and the fold id.
save_best = SaveModelCallback(learn, every='improvement', mode='min',
                              monitor='group_mean_log_mae',
                              name=f'mol_transformer_v{VERSION}_fold{FOLD_ID}')
learn.fit_one_cycle(10, max_lr=1e-3, callbacks=[save_best])

epoch,train_loss,valid_loss,rmse,mae,group_mean_log_mae,time
0,0.063132,0.06079,0.028409,0.019507,-0.46427,15:38
1,0.050103,0.048432,0.022839,0.015548,-0.616353,15:44
2,0.038285,0.038551,0.018126,0.01204,-0.992754,15:40
3,0.032635,0.033202,0.015629,0.00995,-1.134398,15:28
4,0.026503,0.027757,0.012954,0.008388,-1.335252,15:20
5,0.022509,0.023166,0.010843,0.006946,-1.561162,15:20
6,0.018938,0.020002,0.009322,0.005999,-1.668216,15:21
7,0.015883,0.01775,0.008247,0.005192,-1.843306,15:19


Better model found at epoch 0 with group_mean_log_mae value: -0.4642700254917145.
Better model found at epoch 1 with group_mean_log_mae value: -0.616352915763855.
Better model found at epoch 2 with group_mean_log_mae value: -0.9927537441253662.
Better model found at epoch 3 with group_mean_log_mae value: -1.1343979835510254.
Better model found at epoch 4 with group_mean_log_mae value: -1.335252046585083.
Better model found at epoch 5 with group_mean_log_mae value: -1.5611624717712402.
Better model found at epoch 6 with group_mean_log_mae value: -1.6682157516479492.
Better model found at epoch 7 with group_mean_log_mae value: -1.8433058261871338.


In [None]:
# Plot the recorded losses.
# NOTE(review): skip_start=500 presumably hides the first 500 iterations -- confirm.
learn.recorder.plot_losses(skip_start=500)

In [None]:
# Predictions for the validation and test datasets.
# The next cell reads element [0] of each result as the prediction tensor.
val_contrib_preds = learn.get_preds(DatasetType.Valid)
test_contrib_preds = learn.get_preds(DatasetType.Test)

In [None]:
# Keep only the last output column and convert it to a numpy array ...
val_last_col = val_contrib_preds[0][:, -1].detach().numpy()
test_last_col = test_contrib_preds[0][:, -1].detach().numpy()
# ... then rescale it with SC_STD and SC_MEAN.
val_preds = val_last_col * SC_STD + SC_MEAN
test_preds = test_last_col * SC_STD + SC_MEAN

In [None]:
def store_submit(predictions):
    """
    Write `predictions` into the sample submission and save it as a CSV.

    Reads 'sample_submission.csv' from DATA_PATH, prints the row count next to
    the number of predictions, overwrites the 'scalar_coupling_constant' column
    and writes the result without the index.
    """
    submission_df = pd.read_csv(DATA_PATH + 'sample_submission.csv')
    print(len(submission_df), len(predictions))
    submission_df['scalar_coupling_constant'] = predictions
    out_name = f'mpnn-v{VERSION}-idx{FOLD_ID}-submission.csv'
    submission_df.to_csv(out_name, index=False)

def store_oof(predictions, val_ids):
    """
    Save out-of-fold predictions as a single-column CSV (index included).

    Prints the head of the frame before writing.
    NOTE(review): `val_ids` is accepted but never used, so the file carries no ids.
    """
    oof_df = pd.DataFrame(predictions, columns=['scalar_coupling_constants'])
    print(oof_df.head())
    out_name = f'mpnn-v{VERSION}-idx{FOLD_ID}-oof.csv'
    oof_df.to_csv(out_name)

In [None]:
# Write the test-set submission file and the out-of-fold validation predictions.
store_submit(test_preds)
store_oof(val_preds, val_mol_ids)