In [1]:
import gc
import math
import os
import random
from functools import partial

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from fastai.tabular import *
from fastai.callbacks import SaveModelCallback
from fastai.basic_data import DataBunch

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# __print__ = print
# def print(*strings):
#     for string in strings:
#         os.system(f'echo \"{string}\"')
#         __print__(string)

## Define some constants

In [3]:
# FOLD_ID selects which column of the CV index files is read in the data-import cell.
FOLD_ID, VERSION = 2, 2

# Scalar-coupling type labels; TYPES_MAP sends each label to its position in TYPES.
TYPES     = np.array(['1JHC', '2JHH', '1JHN', '2JHN', '2JHC', '3JHH', '3JHC', '3JHN'])
TYPES_MAP = dict(zip(TYPES, range(len(TYPES))))

# Feature columns attached to each scalar-coupling edge.
# Disabled: 'inv_dist', 'normed_inv_dist'
SC_EDGE_FEATS = [f'type_{k}' for k in range(len(TYPES))] + [
    'dist', 'dist_min_rad', 'dist_electro_neg_adj', 'normed_dist',
    'diangle', 'cos_angle', 'cos_angle0', 'cos_angle1',
]

# The edge columns above followed by molecule-level columns.
# Disabled: 'inv_dist', 'normed_inv_dist', 'total_bond_length',
#           'ave_inv_bond_length', 'total_inv_bond_length', 'total_atom_weight'
SC_MOL_FEATS = SC_EDGE_FEATS + [
    'num_atoms', 'num_C_atoms', 'num_F_atoms', 'num_H_atoms', 'num_N_atoms', 'num_O_atoms',
    'std_bond_length', 'ave_bond_length',
    'ave_atom_weight',
]

# Per-atom feature columns.
# Disabled: 'ave_inv_bond_length'
ATOM_FEATS = (
    [f'type_{element}' for element in 'HCNOF']
    + [f'degree_{d}' for d in range(1, 6)]
    + ['SP', 'SP2', 'SP3', 'hybridization_unspecified',
       'aromatic', 'formal_charge', 'atomic_num',
       'donor', 'acceptor',
       'ave_bond_length',
       'ave_neighbor_weight']
)

# Per-bond feature columns.
# Disabled: 'inv_dist', 'normed_inv_dist'
EDGE_FEATS = [
    'single', 'double', 'triple', 'aromatic',
    'conjugated', 'in_ring',
    'dist', 'normed_dist',
]

TARGET_COL   = 'scalar_coupling_constant'
CONTRIB_COLS = ['fc', 'sd', 'pso', 'dso']

# Feature counts derived from the lists above.
N_EDGE_FEATURES    = len(EDGE_FEATS)
N_SC_EDGE_FEATURES = len(SC_EDGE_FEATS)
N_SC_MOL_FEATURES  = len(SC_MOL_FEATS)
N_ATOM_FEATURES    = len(ATOM_FEATS)
N_TYPES            = len(TYPES)

N_MOLS = 130775
# Constants used to shift / scale the target columns in the data-import cell.
SC_MEAN, SC_STD = 16, 35

# Columns that the scaling cell standardises with training-set statistics.
SC_FEATS_TO_SCALE = [
    'dist', 'dist_min_rad', 'dist_electro_neg_adj',
    'num_atoms', 'num_C_atoms', 'num_F_atoms', 'num_H_atoms', 'num_N_atoms', 'num_O_atoms',
    'inv_dist',
    'ave_bond_length', 'std_bond_length', 'total_bond_length',
    'ave_inv_bond_length', 'total_inv_bond_length',
    'ave_atom_weight', 'total_atom_weight',
]
ATOM_FEATS_TO_SCALE = ['atomic_num', 'ave_bond_length', 'ave_inv_bond_length', 'ave_neighbor_weight']
EDGE_FEATS_TO_SCALE = ['dist', 'inv_dist']

## Import data

In [9]:
# DATA_PATH holds the source CSVs; PATH holds the preprocessed frames, and the
# cross-validation index files are read from the same directory.
DATA_PATH, PATH = '../data/', '../tmp/'
CV_IDXS_PATH = PATH
# Alternative locations (disabled):
# DATA_PATH = '../input/champs-scalar-coupling/'
# PATH = '../input/champs-processed-data-2/'
# CV_IDXS_PATH = '../input/champs-cv-4-fold-idxs/'

In [10]:
def show_csv_files(path):
    """Print `path` followed by the directory entries whose names contain '.csv'."""
    csv_names = [name for name in os.listdir(path) if '.csv' in name]
    print(f'{path}:', csv_names)
# List the CSV files found in each input directory.
for csv_dir in (PATH, DATA_PATH, CV_IDXS_PATH):
    show_csv_files(csv_dir)

../tmp/: ['atomic_features.csv', 'train_proc_df.csv', 'mask.csv', 'train_idxs_8_fold_cv.csv', 'edge_mask.csv', 'atom_df.csv', 'pairs_idx.csv', 'edge_df.csv', 'train_idxs_4_fold_cv.csv', 'edge_features.csv', 'val_idxs_8_fold_cv.csv', 'val_idxs_4_fold_cv.csv', 'test_proc_df.csv']
../data/: ['scalar_coupling_contributions.csv', 'mulliken_charges.csv', 'structures.csv', 'test.csv', 'train.csv', 'magnetic_shielding_tensors.csv', 'dipole_moments.csv', 'sample_submission.csv', 'potential_energy.csv']
../tmp/: ['atomic_features.csv', 'train_proc_df.csv', 'mask.csv', 'train_idxs_8_fold_cv.csv', 'edge_mask.csv', 'atom_df.csv', 'pairs_idx.csv', 'edge_df.csv', 'train_idxs_4_fold_cv.csv', 'edge_features.csv', 'val_idxs_8_fold_cv.csv', 'val_idxs_4_fold_cv.csv', 'test_proc_df.csv']


In [11]:
# Load the preprocessed frames; the first CSV column becomes the index.
train_df = pd.read_csv(PATH+'train_proc_df.csv', index_col=0)
test_df  = pd.read_csv(PATH+'test_proc_df.csv', index_col=0)
atom_df  = pd.read_csv(PATH+'atom_df.csv', index_col=0)
edge_df  = pd.read_csv(PATH+'edge_df.csv', index_col=0)

# Read CSV columns 0 (used as index) and FOLD_ID from the 4-fold index files, drop
# missing entries and keep the remaining ids as an integer Series.
# NOTE(review): the Series keeps the CSV index, which may have gaps after dropna().
train_mol_ids = pd.read_csv(CV_IDXS_PATH+'train_idxs_4_fold_cv.csv', usecols=[0, FOLD_ID], index_col=0).dropna().astype(int).iloc[:,0]
val_mol_ids   = pd.read_csv(CV_IDXS_PATH+'val_idxs_4_fold_cv.csv', usecols=[0, FOLD_ID], index_col=0).dropna().astype(int).iloc[:,0]
test_mol_ids  = pd.Series(test_df['molecule_id'].unique())

# Append the contribution columns to the training frame. The concat aligns on the
# index — assumes both frames share the same row index; TODO confirm.
contribs_df = pd.read_csv(DATA_PATH+'scalar_coupling_contributions.csv')
train_df = pd.concat((train_df, contribs_df[CONTRIB_COLS]), axis=1)
del contribs_df
gc.collect()

# The target and 'fc' are shifted by SC_MEAN and divided by SC_STD; the remaining
# contribution columns are only divided by SC_STD.
train_df[[TARGET_COL, 'fc']] = (train_df[[TARGET_COL, 'fc']] - SC_MEAN) / SC_STD
train_df[CONTRIB_COLS[1:]] = train_df[CONTRIB_COLS[1:]] / SC_STD

# 'num_atoms' is the sum of the per-element counts; it must be computed before the
# counts are divided by 10 below.
train_df['num_atoms'] = train_df[['num_C_atoms', 'num_F_atoms', 'num_H_atoms', 
                                  'num_N_atoms', 'num_O_atoms']].sum(axis=1)
test_df['num_atoms'] = test_df[['num_C_atoms', 'num_F_atoms', 'num_H_atoms', 
                                'num_N_atoms', 'num_O_atoms']].sum(axis=1)
train_df[['num_atoms', 'num_C_atoms', 'num_F_atoms', 'num_H_atoms', 'num_N_atoms', 'num_O_atoms']] /= 10
test_df[['num_atoms', 'num_C_atoms', 'num_F_atoms', 'num_H_atoms', 'num_N_atoms', 'num_O_atoms']] /= 10

  mask |= (ar1 == a)


In [12]:
# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0,atom_0,atom_1,atom_index_0,atom_index_1,cos_angle,cos_angle0,cos_angle1,diangle,dist,dist_electro_neg_adj,...,std_bond_length,total_bond_length,ave_inv_bond_length,total_inv_bond_length,ave_atom_weight,total_atom_weight,fc,sd,pso,dso
0,H,C,1,0,0.0,0.0,-0.333335,0.0,1.091953,2.593389,...,3e-06,4.367799,0.915793,3.663173,0.2,1.0,1.914926,0.007274,0.035961,0.007772
1,H,H,1,2,-0.333287,0.816483,0.816482,0.0,1.78312,3.922863,...,3e-06,4.367799,0.915793,3.663173,0.2,1.0,-0.77242,0.010085,0.081668,-0.098103
2,H,H,1,3,-0.333335,0.816498,0.816496,0.0,1.783147,3.922924,...,3e-06,4.367799,0.915793,3.663173,0.2,1.0,-0.772357,0.010084,0.081672,-0.098111
3,H,H,1,4,-0.333347,0.816502,0.8165,0.0,1.783157,3.922945,...,3e-06,4.367799,0.915793,3.663173,0.2,1.0,-0.77234,0.010084,0.081673,-0.098112
4,H,C,2,0,0.0,0.0,-0.333352,0.0,1.091952,2.593385,...,3e-06,4.367799,0.915793,3.663173,0.2,1.0,1.91492,0.007274,0.03596,0.007772


In [13]:
# Preview the first rows of the test frame.
test_df.head()

Unnamed: 0,atom_0,atom_1,atom_index_0,atom_index_1,cos_angle,cos_angle0,cos_angle1,diangle,dist,dist_electro_neg_adj,...,type_7,inv_dist,normed_inv_dist,ave_bond_length,std_bond_length,total_bond_length,ave_inv_bond_length,total_inv_bond_length,ave_atom_weight,total_atom_weight
4658147,H,C,2,0,-1.0,1.0,-1.0,0.0,2.261178,5.370298,...,0,0.442247,-0.815269,1.107759,0.064573,3.323277,0.905679,2.717037,0.35,1.4
4658148,H,C,2,1,0.0,0.0,-1.0,0.0,1.062099,2.522485,...,0,0.941532,4.621731,1.107759,0.064573,3.323277,0.905679,2.717037,0.35,1.4
4658149,H,H,2,3,0.0,1.0,1.0,0.0,3.323277,7.31121,...,0,0.300908,-2.038452,1.107759,0.064573,3.323277,0.905679,2.717037,0.35,1.4
4658150,H,C,3,0,0.0,0.0,-1.0,0.0,1.062099,2.522485,...,0,0.941532,4.621731,1.107759,0.064573,3.323277,0.905679,2.717037,0.35,1.4
4658151,H,C,3,1,-1.0,1.0,-1.0,0.0,2.261178,5.370298,...,0,0.442247,-0.815269,1.107759,0.064573,3.323277,0.905679,2.717037,0.35,1.4


## Define SchNet

In [34]:
class Softplus2(nn.Module):
    """Shifted softplus activation, equal to ``softplus(x) - log(2)`` so that ``f(0) == 0``.

    It is evaluated as ``relu(x) + log(0.5 * exp(-|x|) + 0.5)``, which never
    exponentiates a positive number.
    """

    def forward(self, x):
        positive_part = F.relu(x)
        # exp(-|x|) is at most 1, so the logarithm's argument is at most 1.
        damped = torch.exp(-torch.abs(x))
        return positive_part + torch.log(0.5 * damped + 0.5)

General dense feedforward NN

In [15]:
def bn_init(m):
    """Placeholder hook for batch-norm initialisation; currently does nothing."""
    # Disabled explicit initialisation, kept for reference:
    #     if type(m) == nn.BatchNorm1d:
    #         nn.init.ones_(m.weight)
    #         nn.init.zeros_(m.bias)
    return None

def selu_weights_init(m):
    """Initialise an `nn.Linear` for SELU: weights ~ N(0, 1/fan_in), bias zero.

    Intended for `module.apply(...)`; modules that are not exactly `nn.Linear`
    are left untouched apart from the `bn_init` hook.
    """
    if type(m) == nn.Linear:
        fan_in = m.weight.size(1)
        # Parameters are leaf tensors that require grad, so in-place writes
        # must happen with autograd disabled.
        with torch.no_grad():
            m.weight.normal_(0.0, 1.0 / math.sqrt(fan_in))
            # Layers built with bias=False have no bias to reset.
            if m.bias is not None:
                m.bias.zero_()
    bn_init(m)

def relu_weights_init(m):
    """Initialisation hook for non-SELU nets; keeps PyTorch's default weights."""
    # Disabled Kaiming initialisation, kept for reference:
    #     if type(m) == nn.Linear:
    #         nn.init.kaiming_normal_(m.weight, nonlinearity='relu')
    #         m.bias.data.fill_(0.0)
    bn_init(m)

def hidden_layer(n_in, n_out, batch_norm, dropout, layer_norm=False, act=None):
    """Return the modules of one dense layer as a list.

    The list always starts with `nn.Linear(n_in, n_out)` and is followed, in this
    order, by whichever of these are enabled: the activation `act`,
    `nn.BatchNorm1d`, `nn.LayerNorm`, and `nn.Dropout` (added when `dropout != 0`).
    """
    # Each optional module is built lazily, only if its switch is on.
    optional = (
        (act, lambda: act),
        (batch_norm, lambda: nn.BatchNorm1d(n_out)),
        (layer_norm, lambda: nn.LayerNorm(n_out)),
        (dropout != 0, lambda: nn.Dropout(dropout)),
    )
    return [nn.Linear(n_in, n_out)] + [build() for enabled, build in optional if enabled]

class FullyConnectedNet(nn.Module):
    """Feed-forward stack assembled from `hidden_layer` building blocks.

    Args:
        n_input: width of the input features.
        n_output: width of an optional final layer; when falsy the stack ends
            at the last entry of `layers`.
        layers: widths of the hidden layers, in order.
        act: activation module appended to every hidden layer (the same module
            instance is reused in each of them).
        dropout: dropout probability per layer, in order. Missing trailing
            entries count as 0.0; surplus entries are ignored. The list passed
            in is never modified.
        batch_norm, layer_norm: normalisation switches for the hidden layers.
        out_act, final_bn, final_ln: activation / normalisation switches for
            the final (`n_output`) layer.
    """
    
    def __init__(self, n_input, n_output=None, layers=[], act=nn.ReLU(True), dropout=[], 
                 batch_norm=False, out_act=None, final_bn=False, layer_norm=False, 
                 final_ln=False):
        super().__init__()
        n_hidden = len(layers)
        sizes = [n_input] + list(layers)
        if n_output:
            sizes.append(n_output)
        n_links = len(sizes) - 1
        # Work on a copy: appending to `dropout` itself would grow the caller's
        # list (and the shared default) on every construction.
        rates = list(dropout)
        # Pad so that a short list cannot silently truncate the network.
        rates += [0.0] * (n_links - len(rates))
        modules = []
        for i, (n_in, n_out, rate) in enumerate(zip(sizes[:-1], sizes[1:], rates)):
            is_hidden = i < n_hidden
            modules += hidden_layer(
                n_in, n_out,
                batch_norm if is_hidden else final_bn,
                rate,
                layer_norm if is_hidden else final_ln,
                act if is_hidden else out_act,
            )
        self.layers = nn.Sequential(*modules)
        if type(act) == nn.SELU: self.layers.apply(selu_weights_init)
        else: self.layers.apply(relu_weights_init)
        
    def forward(self, x):
        """Apply the layer stack to `x`."""
        return self.layers(x)
    
class ResFullyConnectedNet(nn.Module):
    """Dense network made of an input layer, residual two-layer blocks and an output layer.

    `layers` must contain an odd number of widths: the first feeds `fc1`, and each
    following pair forms one residual block. The block output is added to its
    input, so the widths at even positions of `layers` have to match.

    Args:
        n_input: width of the input features.
        n_output: width of the output layer; when falsy no output layer is
            added and the last residual state is returned.
        layers: hidden widths (odd count).
        act: activation used in `fc1` and inside the blocks.
        dropout: dropout probability per entry of `layers`; missing trailing
            entries count as 0.0. The list passed in is never modified.
        batch_norm, layer_norm: normalisation switches for `fc1` and the blocks.
        out_act, final_bn, final_ln: activation / normalisation switches for
            the output layer.
    """
    def __init__(self, n_input, n_output=None, layers=[], act=nn.ReLU(True), dropout=[], 
                 batch_norm=False, out_act=None, final_bn=False, layer_norm=False, 
                 final_ln=False):
        super().__init__()
        n_layers = len(layers)
        assert ((n_layers - 1) % 2) == 0, 'layers must hold an odd number of widths'
        self.n_blocks = (n_layers - 1) // 2
        # Copy and pad instead of mutating the caller's list with += / pop.
        rates = list(dropout)
        rates += [0.0] * (n_layers - len(rates))
        self.fc1 = nn.Sequential(*hidden_layer(n_input, layers[0], batch_norm, 
                                               rates[0], layer_norm, act))
        blocks = []
        for i in range(self.n_blocks):
            # rates[0] belongs to fc1, so block i uses the two entries after 1 + 2*i.
            blocks.append(FullyConnectedNet(layers[2*i], layers[2*(i+1)], [layers[(2*i)+1]], act, 
                                            rates[1 + 2*i:1 + 2*(i+1)], batch_norm, act, 
                                            batch_norm, layer_norm, layer_norm))
        self.blocks = nn.ModuleList(blocks)
        # An empty Sequential passes its input through when no output width is given.
        out_modules = []
        if n_output:
            out_modules = hidden_layer(layers[-1], n_output, final_bn, 
                                       0.0, final_ln, out_act)
        self.fc_out = nn.Sequential(*out_modules)
            
    def forward(self, x):
        """Run `fc1`, add each block's output to its input, then apply `fc_out`."""
        x = self.fc1(x)
        for block in self.blocks:
            x = x + block(x)
        return self.fc_out(x)

The LSTM cell as described in the set2set paper (https://arxiv.org/pdf/1511.06391.pdf). It takes no external input; each step is driven only by the previous hidden and cell states.

In [16]:
class HiddenLSTMCell(nn.Module):
    """Input-free LSTM cell from sec. 4.2 of https://arxiv.org/pdf/1511.06391.pdf.

    There is no external 'x' input. The recurrent input `h_prev` has width
    `2 * n_h_out`, while the cell state and the returned hidden state have
    width `n_h_out`.
    """
    
    def __init__(self, n_h_out):
        """Create the fused parameters of the four gates for output width `n_h_out`."""
        super().__init__()
        self.n_h_out = n_h_out
        self.n_h = 2 * n_h_out
        # A single weight matrix and bias hold all four gates side by side.
        self.w_h = nn.Parameter(torch.Tensor(self.n_h, 4 * n_h_out))
        self.b = nn.Parameter(torch.Tensor(4 * n_h_out))
        self.init_weights()
    
    def init_weights(self):
        """Xavier-initialise the weight matrix; zero the bias except the forget gate, set to 1."""
        width = self.n_h_out
        for param in self.parameters():
            if param.data.ndimension() < 2:
                nn.init.zeros_(param.data)
                # The forget gate occupies the second quarter of the fused bias.
                param.data[width:2 * width] = torch.ones(width)
            else:
                nn.init.xavier_uniform_(param.data)
                # nn.init.orthogonal_(param.data)
        
    def forward(self, h_prev, c_prev):
        """Perform one LSTM step from the previous hidden and cell states.

        Returns the tuple `(h, c)` of new hidden and cell states.
        """
        width = self.n_h_out
        # One matrix product yields the pre-activations of every gate.
        gates = h_prev @ self.w_h + self.b
        # Column order: input gate, forget gate, candidate, output gate.
        pre_i, pre_f, pre_g, pre_o = (gates[:, k * width:(k + 1) * width] for k in range(4))
        c = torch.sigmoid(pre_f) * c_prev + torch.sigmoid(pre_i) * torch.tanh(pre_g)
        h = torch.sigmoid(pre_o) * torch.tanh(c)
        return h, c

In [17]:
class IndRNNCell(nn.Module):
    """Recurrent cell whose recurrent weight is a vector applied element-wise.

    Computes ``post(lin(x) + w_h * h_prev)`` where `post` is the activation,
    optionally followed by LayerNorm and Dropout.
    """
    def __init__(self, n_in, n_h, act=nn.ReLU(True), layer_norm=True, dropout=0.0):
        super().__init__()
        self.lin = nn.Linear(n_in, n_h)
        post_modules = [act]
        if layer_norm:
            post_modules.append(nn.LayerNorm(n_h))
        if dropout != 0.0:
            post_modules.append(nn.Dropout(dropout))
        self.act_ln_dr = nn.Sequential(*post_modules)
        # One recurrent weight per hidden unit, drawn uniformly from [0, 1].
        self.w_h = nn.Parameter(torch.Tensor(n_h))
        nn.init.uniform_(self.w_h, a=0, b=1)
        
    def forward(self, x, h_prev):
        """Return the new hidden state given input `x` and previous state `h_prev`."""
        pre_activation = self.lin(x) + self.w_h * h_prev
        return self.act_ln_dr(pre_activation)
        
class IndRNN(nn.Module):
    """Stack of `IndRNNCell` layers applied one after another within a time step.

    Args:
        n_x: width of the input fed to the first layer.
        n_h: hidden width of every layer.
        n_layers: number of stacked cells.
        layer_norm: forwarded to every cell.
        dropout: one dropout probability per layer; an empty list means no
            dropout anywhere.
    """
    def __init__(self, n_x, n_h, n_layers, layer_norm=True, dropout=[]):
        super().__init__()
        self.n_layers = n_layers
        if len(dropout)==0: dropout = n_layers * [0.0]
        assert len(dropout) == n_layers
        # Only the first layer sees the raw input; deeper layers consume the
        # previous layer's hidden state.
        self.layers = nn.ModuleList([
            IndRNNCell(n_x if i == 0 else n_h, n_h, layer_norm=layer_norm, dropout=dr)
            for i, dr in enumerate(dropout)
        ])
            
    def forward(self, x, h_prev):
        """Advance every layer by one step.

        `h_prev[i]` is the previous hidden state of layer `i`. Returns the last
        layer's output and the new states of all layers stacked along a new
        leading dimension, so the second result can be passed back as `h_prev`.
        """
        h, hs = x, []
        for i in range(self.n_layers):
            h = self.layers[i](h, h_prev[i])
            hs.append(h)
        # stack (not cat) keeps one leading entry per layer, matching `h_prev[i]`.
        return h, torch.stack(hs, dim=0)
        
class ResIndRNN(nn.Module):
    """Sequence of two-layer `IndRNN` blocks with a residual connection around each.

    Every block is built with input and hidden width `n_h`; `n_x` is accepted
    but not used, since the residual sum needs the input to have width `n_h`.
    """
    def __init__(self, n_x, n_h, n_blocks, layer_norm=True, dropout=[]):
        super().__init__()
        self.n_blocks = n_blocks
        block_dropout = dropout if len(dropout) != 0 else n_blocks * [0.0]
        assert len(block_dropout) == n_blocks
        self.blocks = nn.ModuleList([
            IndRNN(n_h, n_h, n_layers=2, layer_norm=layer_norm, dropout=2*[rate])
            for rate in block_dropout
        ])
            
    def forward(self, x, h_prev):
        """Run all blocks; block `k` receives `h_prev[2*k:2*(k+1)]` as its previous states.

        Returns the final residual output and the blocks' new states
        concatenated along dimension 0.
        """
        new_states = []
        for k, block in enumerate(self.blocks):
            block_out, block_states = block(x, h_prev[2*k:2*(k+1)])
            x = block_out + x
            new_states.append(block_states)
        return x, torch.cat(new_states, dim=0)
            
            

Set2set module.

In [18]:
def scatter_sum(src, idx, num):
    """Sum the rows of the 2-D tensor `src` that share a group index.

    Row `i` of `src` is added to row `idx[i]` of the `(num, src.size(1))`
    result; groups that receive no row stay zero.
    """
    n_cols = src.size(1)
    row_targets = idx[:, None].repeat(1, n_cols)
    out = torch.zeros((num, n_cols), dtype=src.dtype, device=src.device)
    return out.scatter_add(0, row_targets, src)

def scatter_mean(src, idx, num):
    """Average the rows of `src` per group; groups with no rows yield zeros."""
    counts = scatter_sum(torch.ones_like(src), idx, num)
    # Clamp the counts to at least 1 so empty groups do not divide by zero.
    return scatter_sum(src, idx, num) / counts.clamp(min=1.0)

def softmax(x, idx, num=None):
    """Softmax of `x` taken separately over the rows of each group in `idx`.

    `num` is the number of groups; when omitted it is inferred as
    `idx.max() + 1` (0 for an empty `idx`).
    NOTE(review): the logits are exponentiated without subtracting a maximum,
    so very large values can overflow.
    """
    if num is None:
        num = int(idx.max().item()) + 1 if idx.numel() > 0 else 0
    weights = x.exp()
    # The small constant guards against dividing by a zero group sum.
    return weights / (scatter_sum(weights, idx, num=num)[idx] + 1e-16)

class SumReadout(nn.Module):
    """Readout that sums node rows per group; the group count is `node_idx.max() + 1`."""

    def forward(self, x, node_idx):
        n_groups = node_idx.max().item() + 1
        return scatter_sum(x, node_idx, num=n_groups)
    
class MeanReadout(nn.Module):
    """Readout that averages node rows per group; the group count is `node_idx.max() + 1`."""

    def forward(self, x, node_idx):
        n_groups = node_idx.max().item() + 1
        return scatter_mean(x, node_idx, num=n_groups)

class Set2SetIndRNN(nn.Module):
    """Set2set-style attention readout whose query is produced by a `ResIndRNN`."""
    def __init__(self, n_set_in, proc_steps, n_blocks=3):
        super().__init__()
        self.proc_steps = proc_steps
        # The attribute is called `gru` but holds a ResIndRNN.
        self.gru = ResIndRNN(n_set_in, n_set_in, n_blocks, layer_norm=True, dropout=[])
        # Learned initial recurrent states (two per block) and initial readout.
        self.init_q = nn.Parameter(torch.zeros(2 * n_blocks, 1, n_set_in))
        self.init_r = nn.Parameter(torch.zeros(1, n_set_in))

    def forward(self, x, node_idx):
        """
        x - input tensor of shape (batch_size * n_nodes, in_channels).
        - node_idx: tensor of shape (batch_size * n_nodes) mapping each
            node to its corresponding index in the batch.
        Returns the last query concatenated with the last readout.
        """
        n_sets = node_idx.max().item() + 1
        states = self.init_q.expand(-1, n_sets, -1).contiguous()
        readout = self.init_r.expand(n_sets, -1).contiguous()
        for _ in range(self.proc_steps):
            query, states = self.gru(readout, states)
            # Dot-product score of every node against the query of its own set.
            scores = torch.sum(x * query[node_idx], dim=-1, keepdim=True)
            attention = softmax(scores, node_idx, num=n_sets)
            # Attention-weighted sum of the nodes in each set.
            readout = scatter_sum(attention * x, node_idx, num=n_sets)
        return torch.cat([query, readout], dim=-1)
     
     
class Set2SetGRU(nn.Module):
    """Set2set-style attention readout whose query is produced by a GRU cell."""
    def __init__(self, n_set_in, proc_steps):
        super().__init__()
        self.proc_steps = proc_steps
        self.gru = nn.GRUCell(n_set_in, n_set_in)
        # Learned initial query and readout, shared by every set in the batch.
        self.init_q = nn.Parameter(torch.zeros(1, n_set_in))
        self.init_r = nn.Parameter(torch.zeros(1, n_set_in))

    def forward(self, x, node_idx):
        """
        x - input tensor of shape (batch_size * n_nodes, in_channels).
        - node_idx: tensor of shape (batch_size * n_nodes) mapping each
            node to its corresponding index in the batch.
        Returns the last query concatenated with the last readout.
        """
        n_sets = node_idx.max().item() + 1
        query = self.init_q.expand(n_sets, -1).contiguous()
        readout = self.init_r.expand(n_sets, -1).contiguous()
        for _ in range(self.proc_steps):
            # The previous readout is the GRU input, the previous query its hidden state.
            query = self.gru(readout, query)
            scores = torch.sum(x * query[node_idx], dim=-1, keepdim=True)
            attention = softmax(scores, node_idx, num=n_sets)
            # Attention-weighted sum of the nodes in each set.
            readout = scatter_sum(attention * x, node_idx, num=n_sets)
        return torch.cat([query, readout], dim=-1)

class Set2SetLSTM(nn.Module):
    """
    Set2set readout driven by `HiddenLSTMCell`.

    Adapted from: https://rusty1s.github.io/pytorch_geometric/build/html/_modules/torch_geometric/nn/glob/set2set.html#Set2Set
    """
    def __init__(self, n_set_in, proc_steps):
        super().__init__()
        self.n_set_in = n_set_in
        self.proc_steps = proc_steps
        self.lstm = HiddenLSTMCell(n_set_in)
        # Learned initial q* (query and readout side by side) and initial LSTM carry.
        self.init_q_star = nn.Parameter(torch.zeros(1, 2 * n_set_in))
        self.init_h = nn.Parameter(torch.zeros(1, n_set_in))

    def forward(self, x, node_idx):
        """
        x - input tensor of shape (batch_size * n_nodes, in_channels).
        - node_idx: tensor of shape (batch_size * n_nodes) mapping each
            node to its corresponding index in the batch.
        Returns q*, the last query concatenated with the last readout.
        """
        n_sets = node_idx.max().item() + 1
        carry = self.init_h.expand(n_sets, -1).contiguous()
        q_star = self.init_q_star.expand(n_sets, -1).contiguous()
        for _ in range(self.proc_steps):
            # The cell's first output is the query; its second is carried to the next step.
            query, carry = self.lstm(q_star, carry)
            scores = torch.sum(x * query[node_idx], dim=-1, keepdim=True)
            attention = softmax(scores, node_idx, num=n_sets)
            readout = scatter_sum(attention * x, node_idx, num=n_sets)
            q_star = torch.cat([query, readout], dim=-1)
        return q_star

Edge network message function as described in the MPNN paper (https://arxiv.org/pdf/1704.01212.pdf). It adds a separate edge network so that messages can also flow along scalar coupling edges.

In [19]:
class EdgeNetwork(nn.Module):
    """Edge-conditioned message function with separate networks for bond and scalar-coupling edges.

    Each edge's features are mapped to an `n_h x n_h` matrix. The matrices are
    computed when `forward` is called with `t == 0` and reused on later calls.
    """
    def __init__(self, n_h, n_e, n_sc_e, net_args={}):
        super().__init__()
        self.n_h = n_h
        self.adj_net = FullyConnectedNet(n_e, n_h**2, **net_args)
        self.sc_adj_net = FullyConnectedNet(n_sc_e, n_h**2, **net_args)
        # Bias added to every node's message.
        self.b = nn.Parameter(torch.Tensor(n_h))
        self.weight_inits()
        
    def weight_inits(self):
        """Zero the message bias."""
        nn.init.zeros_(self.b)
    
    def forward(self, h, e, sc_e, pairs_idx, sc_pairs_idx, t=0):
        """
        Compute message vector m_t given the previous hidden state
        h_t-1 and edge features e.
        - h: tensor of hidden states of shape (batch_size * n_nodes, n_h)
        - e: tensor of edge features of shape (batch_size * n_edges, n_e).
        - sc_e: tensor of scalar coupling edge features of shape 
            (batch_size * n_sc, n_sc_e).
        - pairs_idx: tensor of shape (batch_size * n_edges, 2) mapping atom 
            indexes (first column) to the other atom indexes they form a 
            bond with (second column). Atom indices are unique to the entire
            batch.
        - sc_pairs_idx: tensor of shape (batch_size * n_sc, 2) containing atom 
            indices of the atoms for which the scalar coupling constant
            need to be predicted. Atom indices are unique to the entire
            batch.
        - t: update iteration. 
        """
        # The edge matrices 'A(e)' are refreshed only on the first iteration.
        if t == 0:
            self.a_mat = self.get_a_mat(self.adj_net(e))
            self.a_sc_mat = self.get_a_mat(self.sc_adj_net(sc_e))

        # 'm_{i} = sum_{j in N(i)}(A_{ij}h_{j})', first over bonds and then
        # over scalar-coupling pairs.
        bond_messages = self.add_message(torch.zeros_like(h), self.a_mat, h, pairs_idx)
        messages = self.add_message(bond_messages, self.a_sc_mat, h, sc_pairs_idx)
        return messages + self.b
    
    def get_a_mat(self, a_vect):
        """Reshape the flat network output into `n_h x n_h` matrices scaled by 1/sqrt(n_h)."""
        scale = self.n_h ** .5
        return a_vect.view(-1, self.n_h, self.n_h) / scale
    
    def add_message(self, m, a, h, pairs_idx):
        """Add 'A_{ij}h_{j}' to `m` for every listed pair, in both directions."""
        # Duplicate the pairs with swapped columns (and the matrices to match)
        # so messages flow both into and out of every node.
        both_ways = torch.cat((pairs_idx, pairs_idx[:, [1, 0]]))
        receivers, senders = both_ways[:, 0], both_ways[:, 1]
        matrices = torch.cat((a, a))

        # Sender states 'h_{j}', multiplied as row vectors by 'A_{ij}'.
        h_senders = h.index_select(0, senders)
        products = (h_senders.unsqueeze(1) @ matrices).squeeze(1)

        # Accumulate the products into the rows of the receiving nodes.
        return m.scatter_add(0, receivers[:, None].repeat(1, self.n_h), products)

The GRU update function as described in the MPNN paper.

In [20]:
class GRUUpdate(nn.Module):
    """Node update function: a GRU cell whose input and hidden widths are both `n_h`."""
    def __init__(self, n_h):
        super().__init__()
        self.gru = nn.GRUCell(input_size=n_h, hidden_size=n_h)
        
    def forward(self, m, h_prev):
        """
        Update hidden state h.
        - h_prev is vector of hidden states of shape (batch_size * n_nodes, n_h)
        - m is vector of messages of shape (batch_size * n_nodes, n_h)
        """
        # The message is the GRU input; the previous node state is its hidden state.
        h_next = self.gru(m, h_prev)
        return h_next

Custom readout network following the set2set processing stage. It allows some final specialization/fine-tuning for each scalar coupling type.

In [21]:
def create_contrib_head(n_in, n_h, act, dropout=0.0, layer_norm=True):
    """Build a two-layer head: one hidden layer of width `n_h`, then a linear layer with one output."""
    hidden = hidden_layer(n_in, n_h, False, dropout, layer_norm, act)
    output = hidden_layer(n_h, 1, False, 0.0)  # plain linear output layer
    return nn.Sequential(*hidden, *output)

class ContribsHead(nn.Module):
    """Five parallel heads whose scaled outputs are combined into five output columns.

    The first four columns are the first four scaled head outputs; the last
    column is the sum of all five scaled head outputs.
    """
    N_CONTRIBS = 5
    # Each head's raw output is divided by the matching entry.
    CONTIB_SCALES = [1, 250, 45, 35, 500]
    
    def __init__(self, n_in, n_h, act, dropout=0.0, layer_norm=True):
        super().__init__()
        heads = []
        for _ in range(self.N_CONTRIBS):
            heads.append(create_contrib_head(n_in, n_h, act, dropout, layer_norm))
        self.blocks = nn.ModuleList(heads)
        
    def forward(self, x):
        scaled = [head(x) / scale for head, scale in zip(self.blocks, self.CONTIB_SCALES)]
        ys = torch.cat(scaled, dim=-1)
        total = ys.sum(dim=-1, keepdim=True)
        # Replace the last head's own column with the sum over all heads.
        return torch.cat([ys[:, :-1], total], dim=-1)

class MyCustomHead(nn.Module):
    """Readout head: shared preprocessing, one network per coupling type, then `ContribsHead`.

    `dropout` is split in order: the first `len(pre_layers)` entries go to the
    shared preprocessing, the last entry to the contributions head, and the
    entries in between to the per-type networks.
    NOTE(review): the per-type output is added to the raw input `x`, so the
    last entry of `post_layers` presumably has to equal `n_input` — confirm.
    """
    N_OUTPUTS = 5
    
    def __init__(self, n_input, n_h_contribs, pre_layers=[], post_layers=[], 
                 act=nn.ReLU(True), dropout=[], norm=False):
        super().__init__()
        n_pre = len(pre_layers)
        pre_dropout, type_dropout, head_dropout = dropout[:n_pre], dropout[n_pre:-1], dropout[-1]
        self.preproc = FullyConnectedNet(n_input, None, pre_layers, act, 
                                         pre_dropout, batch_norm=norm)
        type_nets = []
        for _ in range(N_TYPES):
            type_nets.append(FullyConnectedNet(pre_layers[-1], None, post_layers, act, 
                                               type_dropout, layer_norm=norm))
        self.types_proc = nn.ModuleList(type_nets)
        self.contribs_head = ContribsHead(post_layers[-1], n_h_contribs, act, head_dropout, layer_norm=norm)
        
    def forward(self, x, sc_types):
        shared = self.preproc(x)
        per_type = torch.zeros(x.size(0), x.size(1), device=x.device)
        for type_id, type_net in enumerate(self.types_proc):
            rows = sc_types == type_id
            # Skip types that do not occur in this batch.
            if torch.any(rows):
                per_type[rows] = type_net(shared[rows])
        return self.contribs_head(x + per_type)

In [22]:
class Set2SetOutput(nn.Module):
    """Readout stage: project node states, pool them per molecule, then predict per coupling.

    `readout_type` selects the pooling module: 'Sum', 'Mean', 'Set2SetGRU',
    'Set2SetLSTM' or 'Set2SetIndRNN'. Any other value leaves `R_proc` unset.
    """
    def __init__(self, n_x, n_h, n_sc_m, proc_steps=10, readout_type='Set2SetGRU', net_args={}):
        super().__init__()
        self.R_proj = nn.Linear(n_h + n_x, n_h)
        if readout_type == 'Sum':
            self.R_proc = SumReadout()
        elif readout_type == 'Mean':
            self.R_proc = MeanReadout()
        elif readout_type == 'Set2SetGRU':
            self.R_proc = Set2SetGRU(n_h, proc_steps)
        elif readout_type == 'Set2SetLSTM':
            self.R_proc = Set2SetLSTM(n_h, proc_steps)
        elif readout_type == 'Set2SetIndRNN':
            self.R_proc = Set2SetIndRNN(n_h, proc_steps)
        # Set2Set readouts return query and readout side by side, i.e. twice the width.
        n_readout = 2 * n_h if readout_type[:7] == 'Set2Set' else n_h
        self.R_write = MyCustomHead(n_readout + (2 * n_h) + n_sc_m, **net_args)
    
    def forward(self, h, x, sc_m, node_idx, sc_idx, sc_pairs_idx, sc_types):
        """
        Make prediction.
        - h is vector of hidden states of shape (batch_size * n_nodes, n_h).
        - x is vector of input features of shape (batch_size * n_nodes, n_x).
        - sc_m: tensor of scalar coupling molecule level features of shape 
            (batch_size * n_sc, n_sc_m).
        - node_idx: tensor of shape (batch_size * n_nodes) mapping each
            node to its corresponding index in the batch.
        - sc_idx: tensor of shape (batch_size * n_sc) mapping each
            scalar coupling constant to its corresponding index in the batch.
        - sc_pairs_idx: tensor of shape (batch_size * n_sc, 2) containing atom 
            indices of the atoms for which the scalar coupling constant
            need to be predicted. Atom indices are unique to the entire
            batch.
        - sc_types: tensor of shape (batch_size * n_sc) containing the scalar 
            coupling type of each observation. 
        """
        projected = self.R_proj(torch.cat([h, x], dim=1))
        pooled = self.R_proc(projected, node_idx)

        # Skip connection: besides the pooled vector of its molecule, every
        # coupling also sees the final states of its two atoms and its own
        # molecule-level features.
        first_atom, second_atom = sc_pairs_idx[:, 0], sc_pairs_idx[:, 1]
        head_input = torch.cat([
            pooled.index_select(0, sc_idx),
            h.index_select(0, first_atom),
            h.index_select(0, second_atom),
            sc_m
        ], dim=-1)
        return self.R_write(head_input, sc_types)

In [None]:
class SchFilterNet(nn.Module):
    """Filter-generating network: two dense layers of width `n_h` with `Softplus2` activations."""
    def __init__(self, n_in, n_h):
        super().__init__()
        self.n_in = n_in
        self.n_h = n_h
        self.rbf_proc = FullyConnectedNet(n_in, None, [n_h, n_h], act=Softplus2(), dropout=[0.0, 0.0])

    def forward(self, rbf_dists):
        """Map the expanded distances `rbf_dists` to filter values."""
        filters = self.rbf_proc(rbf_dists)
        return filters

class IBlock(nn.Module):
    """Interaction block: filter-weighted aggregation over neighbours with a residual update.

    Bond edges and scalar-coupling edges each have their own filter network.
    The aggregated neighbour features pass through `post_atom_wise` and are
    added to the incoming hidden state.
    """
    def __init__(self, n_in, n_h):
        super().__init__()
        self.n_h = n_h
        self.filter_net = SchFilterNet(n_in, n_h)
        self.sc_filter_net = SchFilterNet(n_in, n_h)
        self.pre_atom_wise = nn.Linear(n_h, n_h)
        self.post_atom_wise = FullyConnectedNet(n_h, None, 2*[n_h], act=Softplus2(), dropout=2*[0.0])

    def forward(self, h, rbf_dists, sc_rbf_dists, node_idx, pairs_idx, sc_pairs_idx):
        """Return the updated hidden states.

        - h: hidden states, one row per node.
        - rbf_dists / sc_rbf_dists: expanded distances, one row per bond edge /
            per scalar-coupling pair.
        - pairs_idx / sc_pairs_idx: two-column index tensors naming the nodes
            joined by each bond edge / scalar-coupling pair.
        - node_idx: accepted for interface symmetry; not used here.
        """
        w = self.filter_net(rbf_dists)
        # Scalar-coupling edges use their own filter network.
        sc_w = self.sc_filter_net(sc_rbf_dists)
        x = self.pre_atom_wise(h)
        v = torch.zeros_like(x)
        v = self.cf_conv(v, x, w, pairs_idx)
        v = self.cf_conv(v, x, sc_w, sc_pairs_idx)
        # The residual update is driven by the aggregated messages `v`.
        return h + self.post_atom_wise(v)
    
    def cf_conv(self, v, x, w, pairs_idx):
        """Add `w * x_j` to row `i` of `v` for every pair (i, j), in both directions."""
        # Duplicate the pairs with swapped columns, and the filters to match,
        # so messages go both into and out of all nodes.
        in_out_idx = torch.cat((pairs_idx, pairs_idx[:, [1, 0]]))
        w_ = torch.cat((w, w))
        # Select the 'x_{j}' feeding into the 'v_{i}'.
        x_in = x.index_select(0, in_out_idx[:,1])
        # Sum up all filtered 'x_{j}' per node 'i'.
        return v.scatter_add(0, in_out_idx[:,0,None].repeat(1, self.n_h), w_ * x_in)


In [None]:
class SchNet(nn.Module):
    """SchNet-style model: distance expansion, stacked interaction blocks, readout.

    Args:
        n_h: hidden width of the node states.
        n_i_blocks: number of interaction blocks.
        n_x: width of the node input features.
        readout_type: forwarded to the readout network.
        preproc_type: 'Embdedding' (historical spelling, 'Embedding' is also
            accepted) uses an `nn.Embedding(5, n_h)`; any other value uses a
            `FullyConnectedNet(n_x, n_h, **preproc_net_args)`.
        preproc_net_args: keyword arguments for the dense preprocessing net.
        enn_args: accepted but not used here.
        R_net_args: forwarded to the readout network.
        cutoff, gap: forwarded to `RBFLayer`; `cutoff` must be a whole
            multiple of `gap`.
        n_sc_m: number of molecule-level scalar-coupling features; defaults to
            `N_SC_MOL_FEATURES`.

    NOTE(review): `RBFLayer` and `ReadoutNet` must be defined elsewhere in the
    notebook before this class is instantiated. The positional arguments
    passed to `ReadoutNet` do not match the signature of `Set2SetOutput`
    (which takes `proc_steps` before `readout_type`) — confirm which class is
    intended.
    """
    def __init__(self, n_h=64, n_i_blocks=6, n_x=None, readout_type='Sum', 
                 preproc_type='Embdedding', preproc_net_args={}, enn_args={}, R_net_args={},
                 cutoff=20, gap=0.1, n_sc_m=None):
        super().__init__()
        # Compare with a tolerance: the quotient of two floats is rarely exact.
        n_rbfs = cutoff / gap
        assert abs(n_rbfs - round(n_rbfs)) < 1e-9, 'cutoff must be a whole multiple of gap'
        if n_sc_m is None: n_sc_m = N_SC_MOL_FEATURES
        self.rbf = RBFLayer(cutoff=cutoff, gap=gap)
        if preproc_type in ('Embdedding', 'Embedding'): self.preproc_net = nn.Embedding(5, n_h)
        else: self.preproc_net = FullyConnectedNet(n_x, n_h, **preproc_net_args)
        self.i_blocks = nn.ModuleList(
            [IBlock(self.rbf.fan_out, n_h) for i in range(n_i_blocks)]
        )
        self.readout = ReadoutNet(n_x, n_h, n_sc_m, readout_type, R_net_args)
        self.n_i_blocks = n_i_blocks
        
    def forward(self, x, e, sc_e, sc_m, node_idx, pairs_idx, sc_idx, sc_pairs_idx, sc_types):
        """
        Args:
        - x: tensor of node features of shape (batch_size * n_nodes, n_x).
        - e: tensor of edge features of shape (batch_size * n_edges, n_e).
        - sc_e: tensor of scalar coupling edge features of shape 
            (batch_size * n_sc, n_sc_e).
        - sc_m: tensor of scalar coupling molecule level features of shape 
            (batch_size * n_sc, n_sc_m).
        - node_idx: tensor of shape (batch_size * n_nodes) mapping each
            node to its corresponding index in the batch.
        - pairs_idx: tensor of shape (batch_size * n_edges, 2) mapping atom 
            indexes (first column) to the other atom indexes they form a 
            bond with (second column). Atom indices are unique to the entire
            batch.
        - sc_idx: tensor of shape (batch_size * n_sc) mapping each
            scalar coupling constant to its corresponding index in the batch.
        - sc_pairs_idx: tensor of shape (batch_size * n_sc, 2) containing atom 
            indices of the atoms for which the scalar coupling constant
            need to be predicted. Atom indices are unique to the entire
            batch.
        - sc_types: tensor of shape (batch_size * n_sc) containing the scalar 
            coupling type of each observation. 
        """
        # NOTE(review): the full edge-feature tensors are handed to the RBF
        # layer; confirm that `RBFLayer` expects them rather than only a
        # distance column.
        rbf_dists = self.rbf(e)
        sc_rbf_dists = self.rbf(sc_e)
        h = self.preproc_net(x)
        for t in range(self.n_i_blocks):
            h = self.i_blocks[t](h, rbf_dists, sc_rbf_dists, node_idx, pairs_idx, sc_pairs_idx)
        y = self.readout(h, x, sc_m, node_idx, sc_idx, sc_pairs_idx, sc_types)
        return y

## Train the model

In [None]:
def set_seed(seed=100):
    """Seed the Python, NumPy and PyTorch random number generators.

    Also asks cuDNN for deterministic kernels and switches off its benchmark
    auto-tuner, which may otherwise pick different algorithms between runs.
    """
    # python RNG
    random.seed(seed)

    # numpy RNG
    np.random.seed(seed)

    # pytorch RNGs (CPU, and every CUDA device when available)
    torch.manual_seed(seed)
    if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed)

    # cuDNN: deterministic kernels, no benchmark-driven algorithm selection
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [None]:
# NOTE(review): this cell replaces the `train_mol_ids` / `val_mol_ids` that the
# data-import cell read from the 4-fold CV index files with a fresh random
# 75% / 25% split of the training molecules — confirm which split is intended.
mol_ids = train_df['molecule_id'].unique()
n_obs = len(mol_ids)
split = int(n_obs*0.75)
# Seed first so that the shuffle below is repeatable.
set_seed(100)
# Drawing all ids without replacement yields a random permutation of them.
mol_ids_ = np.random.choice(mol_ids, size=n_obs, replace=False)
train_mol_ids, val_mol_ids = pd.Series(mol_ids_[:split]), pd.Series(mol_ids_[split:])

Scale features.

In [None]:
def scale_features(df, features, train_mol_ids):
    """
    Compute the standardisation statistics of `features` on training rows.

    Despite its name this function does not modify `df`; it only returns the
    statistics needed for scaling.

    Args:
        - df: dataframe with a 'molecule_id' column and the `features` columns.
        - features: list of column names to compute statistics for.
        - train_mol_ids: molecule ids whose rows are used for the statistics.

    Returns:
        Tuple (means, stds) of per-column pandas Series.
    """
    is_train_row = df['molecule_id'].isin(train_mol_ids)
    train_rows = df.loc[is_train_row, features]
    means = train_rows.mean()
    stds = train_rows.std()
    return means, stds

In [None]:
# Each frame is standardised only when at least one of its columns has a mean
# further than 0.1 from 0 or a std further than 0.1 from 1 (measured over the
# whole frame); this skips the step when the data already looks scaled.
# The statistics themselves come from the training molecules only
# (see scale_features).
# NOTE(review): a column with zero std among the training rows would produce
# inf/NaN after the division below -- confirm this cannot happen.
if any(train_df[SC_FEATS_TO_SCALE].mean().abs()>0.1) or any((train_df[SC_FEATS_TO_SCALE].std()-1.0).abs()>0.1):
    sc_feat_means, sc_feat_stds = scale_features(train_df, SC_FEATS_TO_SCALE, train_mol_ids)
    train_df[SC_FEATS_TO_SCALE] = (train_df[SC_FEATS_TO_SCALE] - sc_feat_means) / sc_feat_stds
    # The test frame is scaled with the same training statistics.
    test_df[SC_FEATS_TO_SCALE] = (test_df[SC_FEATS_TO_SCALE] - sc_feat_means) / sc_feat_stds
if any(atom_df[ATOM_FEATS_TO_SCALE].mean().abs()>0.1) or any((atom_df[ATOM_FEATS_TO_SCALE].std()-1.0).abs()>0.1):
    atom_feat_means, atom_feat_stds = scale_features(atom_df, ATOM_FEATS_TO_SCALE, train_mol_ids)
    atom_df[ATOM_FEATS_TO_SCALE] = (atom_df[ATOM_FEATS_TO_SCALE] - atom_feat_means) / atom_feat_stds
if any(edge_df[EDGE_FEATS_TO_SCALE].mean().abs()>0.1) or any((edge_df[EDGE_FEATS_TO_SCALE].std()-1.0).abs()>0.1):
    edge_feat_means, edge_feat_stds = scale_features(edge_df, EDGE_FEATS_TO_SCALE, train_mol_ids)
    edge_df[EDGE_FEATS_TO_SCALE] = (edge_df[EDGE_FEATS_TO_SCALE] - edge_feat_means) / edge_feat_stds

In [None]:
# Group every frame by molecule once, so that the dataset can fetch all rows
# of a molecule with `get_group(molecule_id)`.
gb_mol_sc = train_df.groupby('molecule_id')
test_gb_mol_sc = test_df.groupby('molecule_id')
gb_mol_atom = atom_df.groupby('molecule_id')
gb_mol_edge = edge_df.groupby('molecule_id')

Define the PyTorch dataset class and the collate function that merges several molecules into a single batch.

In [None]:

class MoleculeDataset(Dataset):
    """
    Dataset whose items are molecules.

    Each item is a tuple of three dataframes holding the scalar coupling
    rows, the atom rows and the edge rows of one molecule.

    Args:
        - mol_ids: sequence (list, array or pandas Series) of molecule ids;
            item `i` of the dataset is the i-th id of this sequence.
        - gb_mol_sc: scalar coupling dataframe grouped by molecule id.
        - gb_mol_atom: atom dataframe grouped by molecule id.
        - gb_mol_edge: edge dataframe grouped by molecule id.
    """

    def __init__(self, mol_ids, gb_mol_sc, gb_mol_atom, gb_mol_edge):
        self.n = len(mol_ids)
        self.mol_ids = mol_ids
        self.gb_mol_sc = gb_mol_sc
        self.gb_mol_atom = gb_mol_atom
        self.gb_mol_edge = gb_mol_edge

    def __len__(self):
        return self.n

    def __getitem__(self, idx):
        ids = self.mol_ids
        # `idx` is a position. Plain `[]` on a Series is a label lookup, which
        # is only equivalent when the index happens to be 0..n-1, so use
        # positional access for Series.
        mol_id = ids.iloc[idx] if isinstance(ids, pd.Series) else ids[idx]
        return (self.gb_mol_sc.get_group(mol_id),
                self.gb_mol_atom.get_group(mol_id),
                self.gb_mol_edge.get_group(mol_id))

def np_lst_to_torch(arr_lst, dtype=torch.float):
    """
    Concatenate a list of numpy arrays along the first axis and convert the
    result to a torch tensor of the requested dtype.

    Args:
        - arr_lst: list of numpy arrays that can be concatenated on axis 0.
        - dtype: torch dtype of the returned tensor.
    """
    stacked = np.concatenate(arr_lst)
    # torch.from_numpy needs a well-behaved memory layout.
    contiguous = np.ascontiguousarray(stacked)
    tensor = torch.from_numpy(contiguous)
    return tensor.type(dtype)

def collate_fn(batch, test=False):
    """
    Merge a list of molecules into the flat tensors consumed by the model.

    Args:
        - batch: list of (sc_df, atom_df, edge_df) tuples, one per molecule,
            as returned by MoleculeDataset.
        - test: if True no target columns are read and a dummy target of
            zeros (one entry per scalar coupling) is returned instead.

    Returns:
        Tuple ((x, e, sc_e, sc_m, node_idx, pairs_idx, sc_idx, sc_pairs_idx,
        sc_types), sc_vals). Rows of all molecules are concatenated along the
        first axis. `node_idx` / `sc_idx` hold the position of the owning
        molecule within the batch for every atom / scalar coupling, and the
        atom indices in `pairs_idx` / `sc_pairs_idx` are shifted by the number
        of atoms of the preceding molecules so they are unique in the batch.
    """
    x, e, sc_e, sc_m = [], [], [], []
    sc_types, sc_vals = [], []
    node_idx, pairs_idx, sc_pairs_idx, sc_idx = [], [], [], []

    n_atom_sum = 0
    # Local names deliberately differ from the module-level dataframes.
    for b, (mol_sc, mol_atoms, mol_edges) in enumerate(batch):
        n_atoms, n_sc = len(mol_atoms), len(mol_sc)

        x.append(mol_atoms[ATOM_FEATS].values)
        e.append(mol_edges[EDGE_FEATS].values)
        sc_e.append(mol_sc[SC_EDGE_FEATS].values)
        sc_m.append(mol_sc[SC_MOL_FEATS].values)
        sc_types.append(mol_sc['type'].values)
        if not test: sc_vals.append(mol_sc[CONTRIB_COLS+[TARGET_COL]].values)

        node_idx.append(np.repeat(b, n_atoms))
        sc_idx.append(np.repeat(b, n_sc))
        # Offset the per-molecule atom indices to make them batch-unique.
        pairs_idx.append(mol_edges[['idx_0', 'idx_1']].values + n_atom_sum)
        sc_pairs_idx.append(mol_sc[['atom_index_0', 'atom_index_1']].values + n_atom_sum)

        n_atom_sum += n_atoms

    x, e = np_lst_to_torch(x), np_lst_to_torch(e)
    sc_e, sc_m = np_lst_to_torch(sc_e), np_lst_to_torch(sc_m)
    sc_types = np_lst_to_torch(sc_types, torch.long)
    if test:
        # Size the dummy target from the concatenated tensor: taking len() of
        # the per-molecule list would give the batch size instead of the
        # number of scalar couplings.
        sc_vals = torch.zeros(len(sc_types), dtype=torch.long)
    else:
        sc_vals = np_lst_to_torch(sc_vals)
    node_idx = np_lst_to_torch(node_idx, torch.long)
    sc_idx = np_lst_to_torch(sc_idx, torch.long)
    pairs_idx = np_lst_to_torch(pairs_idx, torch.long)
    sc_pairs_idx = np_lst_to_torch(sc_pairs_idx, torch.long)

    return (x, e, sc_e, sc_m, node_idx, pairs_idx, sc_idx, sc_pairs_idx, sc_types), sc_vals

In [None]:
# Re-seed all RNGs before the data loaders and the model are created.
set_seed(100)
# Number of dataset items (molecules) per mini-batch.
batch_size = 20

In [None]:
# Train / validation datasets share the grouped training frames and differ
# only in the molecule ids they iterate over; the test dataset reads its
# scalar coupling rows from the grouped test frame.
train_ds = MoleculeDataset(train_mol_ids, gb_mol_sc, gb_mol_atom, gb_mol_edge)
val_ds   = MoleculeDataset(val_mol_ids, gb_mol_sc, gb_mol_atom, gb_mol_edge)
test_ds  = MoleculeDataset(test_mol_ids, test_gb_mol_sc, gb_mol_atom, gb_mol_edge)
# Pass collate_fn explicitly: dataset items are tuples of dataframes, which
# the default PyTorch collate cannot batch. Without it these loaders are only
# usable after DataBunch has patched them in place.
train_dl = DataLoader(train_ds, batch_size, shuffle=True, num_workers=8, collate_fn=collate_fn)
val_dl   = DataLoader(val_ds, batch_size, num_workers=8, collate_fn=collate_fn)
# The test loader has no targets, so it uses collate_fn in test mode.
test_dl  = DeviceDataLoader.create(test_ds, batch_size, num_workers=8, collate_fn=partial(collate_fn, test=True))
db = DataBunch(train_dl, val_dl, collate_fn=collate_fn)
db.test_dl = test_dl

In [None]:
# Pull a single mini-batch from the training loader for inspection and for
# the model sanity check further down.
batch = next(iter(train_dl))

In [None]:
# Print the shape of every model input tensor, then of the target tensor.
for el in batch[0]: print(el.size())
print(batch[1].size())

In [None]:
# Label each tensor of the sample batch with its model argument name
# (same order as returned by collate_fn) and dump the contents.
b_dict = {
    'x': batch[0][0],
    'e': batch[0][1],
    'sc_e': batch[0][2],
    'sc_m': batch[0][3],
    'node_idx': batch[0][4],
    'pairs_idx': batch[0][5],
    'sc_idx': batch[0][6],
    'sc_pairs_idx': batch[0][7],
    'sc_types': batch[0][8],
    'y': batch[1],
}
for k, v in b_dict.items():
    print(f'{k}:\n {v}')

Implement the metric used for this competition.

In [None]:
def group_mean_log_mae(y_true, y_pred, types, epoch, floor=1e-9):
    """
    Compute the mean over coupling types of the log of the per-type MAE.

    Targets and predictions are mapped back to the original scale with
    SC_MEAN / SC_STD before the absolute errors are computed.

    Args:
        - y_true: tensor of normalised true scalar coupling values.
        - y_pred: tensor of normalised predicted scalar coupling values.
        - types: tensor with the coupling type of each observation.
        - epoch: epoch number, only used in the printed message.
        - floor: lower bound applied to each per-type MAE before the log, so
            that a type with zero error does not yield -inf.

    Returns:
        The group mean log MAE as a float.
    """
    # detach() makes the conversion safe for tensors that still track gradients.
    proc = lambda x: x.detach().cpu().numpy().ravel()
    y_true, y_pred, types = proc(y_true), proc(y_pred), proc(types)
    y_true = SC_MEAN + y_true * SC_STD
    y_pred = SC_MEAN + y_pred * SC_STD
    maes = pd.Series(y_true - y_pred).abs().groupby(types).mean()
    gmlmae = np.log(maes.clip(lower=floor)).mean()
    print(f'Epoch: {epoch} - Group Mean Log Mae: {gmlmae}')
    return gmlmae

class GroupMeanLogMAE(Callback):
    """
    Callback that reports the group mean log MAE on the validation batches.

    During every epoch it collects, for each non-training batch, the coupling
    types (last element of the model input), the last column of the model
    output and the last column of the target. At the end of the epoch the
    metric is computed over everything collected and appended to the metrics.
    """
    _order = -20 #Needs to run before the recorder

    def __init__(self, learn, **kwargs): self.learn = learn
    def on_train_begin(self, **kwargs): self.learn.recorder.add_metric_names(['group_mean_log_mae'])
    def on_epoch_begin(self, **kwargs): self.input, self.output, self.target = [], [], []

    def on_batch_end(self, last_target, last_output, last_input, train, **kwargs):
        # Only validation batches contribute to the metric.
        if not train:
            self.input.append(last_input[-1])
            self.output.append(last_output[:,-1])
            self.target.append(last_target[:,-1])

    def on_epoch_end(self, epoch, last_metrics, **kwargs):
        # Nothing is reported when no validation batch was seen this epoch.
        if (len(self.input) > 0) and (len(self.output) > 0):
            types = torch.cat(self.input)
            preds = torch.cat(self.output)
            target = torch.cat(self.target)
            # Argument order follows the signature (y_true, y_pred, types, epoch).
            metric = group_mean_log_mae(target, preds, types, epoch)
            return add_metrics(last_metrics, [metric])

def contribs_rmse_loss(preds, targs):
    """
    Sum over columns of the column-wise RMSE between `preds` and `targs`.

    Args:
        - preds: tensor of shape (batch_size * n_sc, 5) containing
            predictions. Last column is the total scalar coupling value.
        - targs: tensor of shape (batch_size * n_sc, 5) containing
            true values. Last column is the total scalar coupling value.

    Returns:
        Scalar tensor: one RMSE per column, added together.
    """
    squared_err = (preds - targs).pow(2)
    rmse_per_col = squared_err.mean(dim=0).sqrt()
    return rmse_per_col.sum()

def rmse(preds, targs):
    """Root mean squared error computed on the last column only."""
    last_pred, last_targ = preds[:, -1], targs[:, -1]
    return F.mse_loss(last_pred, last_targ).sqrt()

def mae(preds, targs):
    """Mean absolute error computed on the last column only."""
    last_col_diff = preds[:, -1] - targs[:, -1]
    return last_col_diff.abs().mean()

In [None]:
# Hyper-parameters of the model and of the optimisation.
# wd: weight decay handed to the Learner; norm: flag forwarded to the network
# argument dicts below; act: shared in-place ReLU activation module.
wd, norm, act = 0, True, nn.ReLU(True)
# Positional arguments forwarded to the MPNN constructor below.
# NOTE(review): presumably the number of message passing updates, the number
# of readout processing steps and the readout variant -- confirm against MPNN.
update_steps, proc_steps, readout_type = 5, 6, 'Set2SetGRU'
# Input feature sizes come from the constants defined at the top of the
# notebook; n_h = 300 is the hidden size reused in the dicts below.
n_x, n_h, n_e, n_sc_e, n_sc_m = N_ATOM_FEATURES, 300, N_EDGE_FEATURES, N_SC_EDGE_FEATURES, N_SC_MOL_FEATURES
# Keyword arguments for the sub-networks built inside the MPNN.
preproc_net_args = dict(layers=[], act=act, dropout=[], out_act=nn.Tanh())
enn_args = dict(layers=3*[n_h], act=act, dropout=3*[0.0], batch_norm=norm)
R_net_args = dict(pre_layers=[1500], post_layers=[n_sc_m+4*n_h], n_h_contribs=200, act=act, dropout=[0.0, 0.0, 0.0], 
                  norm=norm)

In [None]:
# Re-seed right before construction so the weight initialisation is reproducible.
set_seed(100)
model = MPNN(n_x, n_h, n_e, n_sc_e, n_sc_m, update_steps, proc_steps, readout_type, preproc_net_args, enn_args, R_net_args)

In [None]:
# Sanity check: show the architecture, then run the sample batch through the
# untrained model and print the output and its shape.
# NOTE(review): the forward pass is executed twice (once per print call).
print(model)
print(model(*batch[0]))
print(model(*batch[0]).size())

In [None]:
class GradientClipping(LearnerCallback):
    """
    Clip the gradient norm during training, after a number of warm-up iterations.

    Args:
        - learn: the Learner this callback is attached to.
        - clip: maximum gradient norm; a falsy value disables clipping.
        - start_it: clipping is applied only on iterations strictly greater
            than this value.
    """
    def __init__(self, learn:Learner, clip:float = 0., start_it:int = 100):
        super().__init__(learn)
        self.clip = clip
        self.start_it = start_it

    def on_backward_end(self, iteration, **kwargs):
        """Clip the gradients once they are computed, before the optimizer step."""
        # Clipping disabled.
        if not self.clip: return
        # Still within the warm-up iterations.
        if not (iteration > self.start_it): return
        params = self.learn.model.parameters()
        nn.utils.clip_grad_norm_(params, self.clip)

In [None]:
# Build the Learner. The model is trained on the summed per-column RMSE
# (contribs_rmse_loss); rmse / mae on the last output column are reported as
# metrics. Gradient clipping (max norm 10) and the group mean log MAE metric
# are attached as callbacks.
learn = Learner(db, model, metrics=[rmse, mae], 
                callback_fns=[partial(GradientClipping, clip=10), GroupMeanLogMAE], 
                wd=wd, loss_func=contribs_rmse_loss)

In [None]:
# Learning rate range test: sweep the learning rate from 1e-6 to 1.0 over 100
# iterations (stopping early if the loss diverges) and plot loss against
# learning rate.
learn.lr_find(start_lr=1e-6, end_lr=1.0, num_it=100, stop_div=True)
learn.recorder.plot()

In [None]:
# Train for 10 epochs with the one-cycle policy. The model is saved under the
# name 'mpnn1' each time the validation group mean log MAE improves (lower is
# better).
learn.fit_one_cycle(10, max_lr=1e-3, callbacks=[SaveModelCallback(learn, every='improvement', mode='min',
                                                                  monitor='group_mean_log_mae',  name='mpnn1')])

In [None]:
# Plot the recorded losses, skipping the first 450 iterations.
learn.recorder.plot_losses(skip_start=450)

In [None]:
# Model outputs (all output columns) for the validation and the test set.
val_contrib_preds = learn.get_preds(DatasetType.Valid)
test_contrib_preds = learn.get_preds(DatasetType.Test)

In [None]:
# Element 0 of get_preds holds the predictions; keep only the last column and
# map it back to the original scale with SC_STD / SC_MEAN.
val_preds = val_contrib_preds[0][:,-1].detach().numpy() * SC_STD + SC_MEAN
test_preds = test_contrib_preds[0][:,-1].detach().numpy() * SC_STD + SC_MEAN

In [None]:
def store_submit(predictions):
    """
    Write `predictions` into a copy of the sample submission file.

    The sample submission is read from DATA_PATH, its
    'scalar_coupling_constant' column is replaced by `predictions`, and the
    result is saved to a CSV named after VERSION and FOLD_ID in the current
    directory. Both lengths are printed first as a visual check.

    Args:
        - predictions: predicted scalar coupling constants, one per row of
            the sample submission and in the same order.
    """
    submission = pd.read_csv(DATA_PATH + 'sample_submission.csv')
    n_rows, n_preds = len(submission), len(predictions)
    print(n_rows, n_preds)
    submission['scalar_coupling_constant'] = predictions
    out_name = f'mpnn-v{VERSION}-idx{FOLD_ID}-submission.csv'
    submission.to_csv(out_name, index=False)

def store_oof(predictions, val_ids):
    """
    Save the out-of-fold (validation) predictions to a CSV file.

    The file is named after VERSION and FOLD_ID and contains a single
    'scalar_coupling_constants' column plus the default integer index. The
    first rows are printed as a preview.

    Args:
        - predictions: validation predictions.
        - val_ids: accepted but not used.
            NOTE(review): the stored file therefore carries no identifier
            to align the predictions with -- confirm this is intended.
    """
    oof_frame = pd.DataFrame(predictions, columns=['scalar_coupling_constants'])
    preview = oof_frame.head()
    print(preview)
    oof_frame.to_csv(f'mpnn-v{VERSION}-idx{FOLD_ID}-oof.csv')

In [None]:
# store_submit(test_preds)
# store_oof(val_preds, val_mol_ids)