In [None]:
%matplotlib inline

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn import metrics
import itertools

In [None]:
def metric(df, preds, floor=1e-9):
    """Mean over coupling types of the log of the per-type mean absolute error.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``type`` and ``scalar_coupling_constant``.
        The frame is NOT modified.
    preds : array-like or pandas.Series
        Predictions, one per row of ``df``. Assigned with the usual pandas
        column-assignment rules (a Series is aligned on the index).
    floor : float, optional
        Lower bound applied to every per-type MAE before taking the log, so a
        perfect fit on one type yields a finite score instead of ``-inf``.

    Returns
    -------
    numpy.float64
        ``mean_t(log(max(MAE_t, floor)))`` over the types present in ``df``.
    """
    # `assign` returns a new frame, so the caller's DataFrame keeps its columns.
    scored = df.assign(prediction=preds)
    abs_err = (scored["scalar_coupling_constant"] - scored["prediction"]).abs()
    # observed=True: skip categorical levels that have no rows (they would give NaN).
    per_type_mae = abs_err.groupby(scored["type"], observed=True).mean()
    return np.mean(np.log(np.maximum(per_type_mae.values, floor)))

## Read data

In [None]:
def group_structures(df, struct_df, mulliken_df, potential):
    """Bundle the rows of every input frame by molecule.

    Parameters
    ----------
    df, struct_df, mulliken_df, potential : pandas.DataFrame
        Each must have a ``molecule_name`` column.

    Returns
    -------
    dict
        Maps each molecule name found in ``df`` to a 4-tuple
        ``(df rows, struct_df rows, mulliken_df rows, potential rows)``.

    Raises
    ------
    KeyError
        If a molecule of ``df`` is missing from ``struct_df`` or ``mulliken_df``.
        A molecule missing from ``potential`` gets an empty frame instead.
    """
    struct_g = struct_df.groupby('molecule_name')
    mulliken_g = mulliken_df.groupby('molecule_name')
    # Group `potential` once instead of boolean-masking the whole frame for
    # every molecule, which made the loop quadratic in the number of molecules.
    potential_g = potential.groupby('molecule_name')
    potential_names = set(potential_g.groups)
    no_potential = potential.iloc[0:0]  # same columns, zero rows

    groups = {}
    for name, gdf in df.groupby('molecule_name'):
        if name in potential_names:
            potential_rows = potential_g.get_group(name)
        else:
            potential_rows = no_potential
        groups[name] = (gdf,
                        struct_g.get_group(name),
                        mulliken_g.get_group(name),
                        potential_rows)
    return groups

In [None]:
import os.path as osp
# CSV files that must be present in the data directory, in the order
# in which read_csvs returns them.
req_files = [
    'train.csv',
    'structures.csv',
    'mulliken_charges.csv',
    'potential_energy.csv',
]


def read_csvs(path):
    """Load every file named in ``req_files`` from ``path``.

    Returns a tuple of DataFrames in the same order as ``req_files``.
    """
    return tuple(pd.read_csv(osp.join(path, fname)) for fname in req_files)

def get_data_list(path):
    """Read the CSVs under ``path`` and return one tuple of frames per molecule.

    The coupling ``type`` and the structure ``atom`` columns are converted to
    pandas categoricals, and their integer codes are appended as the new last
    columns ``type_c`` and ``atom_c``. The result is the list of values of
    ``group_structures`` (one 4-tuple per molecule).
    """
    def _encode(frame, column, code_column):
        # Make `column` categorical in place, then append its integer codes.
        frame[column] = frame[column].astype('category')
        frame[code_column] = frame[column].cat.codes

    train_df, structures, mulliken, potential = read_csvs(path)
    _encode(train_df, 'type', 'type_c')
    _encode(structures, 'atom', 'atom_c')
    per_molecule = group_structures(train_df, structures, mulliken, potential)
    return list(per_molecule.values())

In [None]:
def to_data(first):
    """Convert one molecule's grouped frames into a torch_geometric ``Data`` graph.

    Parameters
    ----------
    first : sequence
        ``(couplings, structure, mulliken, potential)`` frames for a single
        molecule, as produced by ``group_structures``.

    Returns
    -------
    Data
        Every coupling row becomes two directed edges (i -> j and j -> i), and
        the edge-level arrays are duplicated accordingly so they stay aligned
        with ``edge_index``.
    """
    couplings, structure, mulliken, potential = first[0], first[1], first[2], first[3]

    src, dst = couplings.atom_index_0, couplings.atom_index_1
    # Store both directions of each coupled pair.
    src, dst = np.concatenate((src, dst)), np.concatenate((dst, src))
    edge_idx = np.stack((src, dst))
    scalar_coupling = np.concatenate((couplings.scalar_coupling_constant,
                                      couplings.scalar_coupling_constant))
    edge_types = np.concatenate((couplings.type_c.values, couplings.type_c.values))

    # Positional slicing: the last column is the atom code appended by
    # get_data_list; columns 3..-1 are taken as coordinates.
    # NOTE(review): assumes the structure frame is laid out as
    # (molecule_name, atom_index, atom, x, y, z, atom_c) - confirm.
    xyz, atom = structure.iloc[:, 3:-1].values, structure.iloc[:, -1].values
    # NOTE(review): assumes the charge is the last column of the mulliken frame - confirm.
    mul_charge = mulliken.iloc[:, -1].values

    # The per-molecule debug print of the potential frame was removed: process()
    # calls this once per molecule, which flooded the notebook output.
    data = Data(pos=torch.FloatTensor(xyz),
                edge_index=torch.LongTensor(edge_idx),
                edge_types=torch.LongTensor(edge_types),
                atom=torch.LongTensor(atom),
                charge=torch.FloatTensor(mul_charge),
                energy=torch.FloatTensor(potential.potential_energy.values),
                batch_edge_index=torch.zeros(edge_types.shape, dtype=torch.long),
                scalar_coupling=torch.FloatTensor(scalar_coupling))
    return data

In [None]:
class Complete(object):
    """Transform that replaces ``data.edge_index`` with every ordered node pair.

    Only ``edge_index`` is rewritten; other edge-level attributes on ``data``
    are left untouched.
    NOTE(review): attributes that were aligned with the old edges (e.g. edge
    types) will presumably no longer match the new edge count - confirm before
    combining this transform with them.
    """

    def __init__(self):
        pass

    def __call__(self, data):
        """Set ``data.edge_index`` to a (2, n*(n-1)) int64 tensor and return ``data``."""
        pairs = np.array(list(itertools.permutations(range(data.num_nodes), 2)),
                         dtype=np.int64)
        # reshape keeps the (num_edges, 2) layout even when there are no pairs
        # (graphs with fewer than two nodes), so the result is (2, 0), not (0,).
        pairs = pairs.reshape(-1, 2)
        data.edge_index = torch.from_numpy(np.ascontiguousarray(pairs.T))
        return data

In [None]:
class Squeeze_Edge_Types(object):
    """Transform that drops all size-1 dimensions from ``data.edge_types``."""

    def __call__(self, data):
        """Squeeze ``data.edge_types`` in place on ``data`` and return ``data``."""
        squeezed = data.edge_types.squeeze()
        data.edge_types = squeezed
        return data

In [None]:
import torch
from torch_geometric.data import InMemoryDataset, Data
import torch_geometric.transforms as T

class MyOwnDataset(InMemoryDataset):
    """In-memory graph dataset built from the CSV files found under ``root``.

    On first use ``process`` converts every molecule to a ``Data`` object and
    caches the collated result in the processed directory; later
    instantiations only load that cache.

    Parameters
    ----------
    root : str
        Directory holding the CSV files; also the base for the processed cache.
    transform, pre_transform, pre_filter : callable, optional
        Forwarded to ``InMemoryDataset``. ``pre_filter`` and ``pre_transform``
        are applied once inside ``process``, before the cache is written.
    """

    def __init__(self, root, transform=None, pre_transform=None, pre_filter=None):
        # pre_filter is forwarded so the filtering branch in process() can
        # actually be used; previously it could never be set.
        super().__init__(root, transform, pre_transform, pre_filter)
        # NOTE: torch.load unpickles the file; only load caches written by process().
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        # The files that process() reads (via get_data_list).
        return ['train.csv', 'structures.csv', 'mulliken_charges.csv', 'potential_energy.csv']

    @property
    def processed_file_names(self):
        return ['data.pt']

    def _download(self):
        # Nothing to download: the CSV files are expected to be on disk already.
        pass

    def process(self):
        """Build all graphs, apply pre_filter / pre_transform, and save the cache."""
        # The CSVs are read directly from `root`.
        data_list = get_data_list(self.root)
        data_list = [to_data(data) for data in data_list]

        if self.pre_filter is not None:
            data_list = [data for data in data_list if self.pre_filter(data)]

        if self.pre_transform is not None:
            data_list = [self.pre_transform(data) for data in data_list]

        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])

In [None]:
!rm data/processed/data.pt

In [None]:
# Load the dataset rooted at 'data' (it is processed and cached on first use).
# NOTE(review): T.Distance presumably fills data.edge_attr with the edge lengths
# that Net.forward reads - confirm against the torch_geometric docs.
# An alternative transform that was tried here: Complete().
dataset = MyOwnDataset('data', transform=T.Compose([T.Distance()]))

In [None]:
# Peek at the scalar_coupling attribute of the first graph as a sanity check.
dataset[0]['scalar_coupling']

In [None]:
# Number of graphs, plus 50% and 30% of that count.
# NOTE(review): `train` and `valid` are not referenced again in the visible cells;
# the split further down uses a random mask instead - confirm they are still needed.
size = len(dataset)
train = int(.5 * size)
valid = int(.3 * size)

In [None]:
# Standardise the potential energy to mean=0 and std=1.
# The statistics get their own names so the scalar-coupling normalisation
# (which defines `mean` / `std`) does not overwrite them.
energy_mean = dataset.data.energy.mean(dim=0, keepdim=True)
energy_std = dataset.data.energy.std(dim=0, keepdim=True)
dataset.data.energy = (dataset.data.energy - energy_mean) / energy_std

In [None]:
# Standardise the scalar-coupling targets to mean=0 and std=1.
# `mean` and `std` stay available so standardised values can be mapped back:
# original = standardised * std + mean.
mean = dataset.data.scalar_coupling.mean(dim=0, keepdim=True)
std = dataset.data.scalar_coupling.std(dim=0, keepdim=True)
dataset.data.scalar_coupling = (dataset.data.scalar_coupling - mean) / std

In [None]:
# Plot every scalar-coupling value in storage order as a quick visual check.
fig, ax = plt.subplots()
ax.plot(dataset.data.scalar_coupling)
ax.set(title='Scalar coupling per edge',
       xlabel='edge position in dataset',
       ylabel='scalar coupling');

In [None]:
# Random split over graphs: each graph goes to the training set with probability 0.7.
# A dedicated, seeded generator makes the split reproducible across kernel restarts.
split_rng = torch.Generator().manual_seed(0)
train_mask = torch.rand(len(dataset), generator=split_rng) > 0.3

In [None]:
# Fraction of graphs that landed in the training split (sanity check of the mask).
train_mask.sum() / float(train_mask.size(0))

In [None]:
# Graphs selected by the mask form the training set; all remaining graphs form the validation set.
train_dataset = dataset[train_mask]
valid_dataset = dataset[~train_mask]

## Simple Model

In [None]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import NNConv, Set2Set, GCNConv
from torch_geometric.data import DataLoader

from torch.nn import Sequential, Linear, ReLU, GRU, Embedding, LeakyReLU

In [None]:
# Mini-batch loaders of 100 graphs each. Only the training stream is shuffled;
# the validation loader keeps a fixed order so its per-batch loss curve is
# comparable from epoch to epoch.
train_loader = DataLoader(train_dataset, batch_size=100, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=100, shuffle=False)

In [None]:
# Width of the node hidden state used throughout Net.
dim = 64


class Net(torch.nn.Module):
    """Message-passing network that outputs one scalar per edge.

    Node inputs are the concatenation of a linear embedding of ``data.pos`` and
    an embedding of ``data.atom``; edge inputs are the concatenation of an
    embedding of ``data.edge_types`` and a linear embedding of
    ``data.edge_attr``. After six NNConv + GRU rounds, the hidden states of
    the two endpoint nodes of every edge are concatenated and passed through
    two linear layers.
    """

    def __init__(self):
        super().__init__()
        self.lin0 = torch.nn.Linear(10, dim)

        self.pos_emb = Linear(3, 5)
        self.atom_emb = Embedding(5, 5)
        self.edge_emb = Embedding(8, 5)
        self.dist_emb = Linear(1, 5)
        # Maps each 10-d edge feature to a (dim x dim) weight matrix for NNConv.
        edge_mlp = Sequential(
            Linear(10, 128),
            LeakyReLU(0.1),
            Linear(128, dim * dim),
        )
        self.conv = NNConv(dim, dim, edge_mlp, aggr='mean')
        self.gru = GRU(dim, dim)

        # Constructed (so its parameters are registered) but not called in forward.
        self.set2set = Set2Set(dim, processing_steps=6)
        self.lin1 = torch.nn.Linear(2 * dim, dim)
        self.lin2 = torch.nn.Linear(dim, 1)

    def forward(self, data):
        """Return a 1-D tensor with one prediction per column of ``data.edge_index``."""
        node_in = torch.cat((self.pos_emb(data.pos), self.atom_emb(data.atom)), dim=1)
        out = F.relu(self.lin0(node_in))
        h = out.unsqueeze(0)

        edge_attr = torch.cat(
            (self.edge_emb(data.edge_types), self.dist_emb(data.edge_attr)),
            dim=1,
        )

        # Six rounds of message passing; the GRU carries the node state between rounds.
        for _ in range(6):
            m = F.relu(self.conv(out, data.edge_index, edge_attr))
            out, h = self.gru(m.unsqueeze(0), h)
            out = out.squeeze(0)

        # Flatten the edges to (src0, dst0, src1, dst1, ...), gather those node
        # states, then fold each consecutive pair into a single row per edge.
        endpoints = data.edge_index.T.contiguous().view(-1)
        pair_feats = torch.index_select(out, 0, endpoints)
        pair_feats = pair_feats.view((data.edge_index.shape[1], -1))

        hidden = F.relu(self.lin1(pair_feats))
        return self.lin2(hidden).view(-1)

In [None]:
# Instantiate the model with freshly initialised weights.
net = Net()

In [None]:
# Pull a single mini-batch for a quick smoke test of the model.
b = next(iter(train_loader))

In [None]:
# Inspect the shape of the batch's batch_edge_index attribute.
b.batch_edge_index.shape

In [None]:
# Forward pass on the sample batch to check that the model runs end to end.
net(b)

In [None]:
from fastprogress.fastprogress import master_bar, progress_bar
from collections import deque

In [None]:
# Train on the GPU when one is available; otherwise fall back to the CPU so the
# cell still runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

optimizer = torch.optim.Adam(net.parameters(), lr=1e-5, weight_decay=1e-5)
net = net.to(device)

mb = master_bar(range(50))
# Rolling windows of the most recent per-batch losses, used for the live plot.
losses = deque(maxlen=400)
v_losses = deque(maxlen=400)
num_train = train_mask.sum().item()
for epoch in mb:
    # Re-enter training mode every epoch because validation switches to eval mode.
    net.train()
    e_losses = []
    for batch in progress_bar(train_loader, parent=mb):
        optimizer.zero_grad()
        batch = batch.to(device)
        out = net(batch)
        loss = F.mse_loss(out, batch.scalar_coupling)
        loss.backward()
        optimizer.step()
        mb.child.comment = f'Loss = {loss.item()}'
        # Weight each batch loss by its number of graphs for the epoch average.
        e_losses.append(loss.item() * batch.num_graphs)
        losses.append(loss.item())
        mb.update_graph([[range(len(losses)), losses],
                         [range(len(v_losses)), v_losses]])

    # Validate
    net.eval()
    val_loss = []
    cnt = 0
    with torch.no_grad():
        for b in progress_bar(valid_loader, parent=mb):
            # Rebind so predictions and targets are guaranteed to be on the same device.
            b = b.to(device)
            pred = net(b)
            loss = F.mse_loss(pred, b.scalar_coupling)
            v_losses.append(loss.item())
            val_loss.append(loss.item() * b.num_graphs)
            mb.update_graph([[range(len(losses)), losses],
                             [range(len(v_losses)), v_losses]])
            cnt += b.num_graphs
    valid_loss = np.sum(val_loss) / cnt

    mb.write(f'Finished epoch {epoch}, Loss: {np.sum(e_losses) / num_train}, Valid Loss: {valid_loss}')

In [None]:
# Persist the whole module object (not just a state_dict); loading this file
# later requires the Net class definition to be available.
# NOTE(review): the numbers in the file name presumably record the final
# train / validation losses - confirm.
torch.save(net, 'model_0.0360_0.03757.pt')