In [1]:
from torch_geometric.loader import DataLoader
from rdkit import Chem
import torch.nn.functional as F
import torch
import pandas as pd
from math import sqrt
import matplotlib.pyplot as plt
from tqdm import tqdm

import sys
sys.path.insert(0, '/home/shenwanxiang/Research/bidd-clsar/')

In [2]:
# Pin the notebook to one GPU when that is possible. The index is applied only
# if CUDA is usable and the index exists, so the notebook still starts on
# CPU-only or single-GPU machines (the benchmark cell falls back to CPU).
gpuid = 1
if torch.cuda.is_available() and gpuid < torch.cuda.device_count():
    torch.cuda.set_device(gpuid)
    print(torch.cuda.current_device())
elif torch.cuda.is_available():
    # Requested index does not exist: keep whatever device is current.
    print(f'GPU {gpuid} not found; staying on GPU {torch.cuda.current_device()}')
else:
    print('CUDA is not available; running on CPU')

1


In [3]:
from clsar.dataset import LSSNS, HSSMS  # dataset
from clsar.feature import Gen39AtomFeatures  # feature
from clsar.model.model import ACANet_GCN, ACANet_GIN, ACANet_GAT, ACANet_PNA, get_deg  # model

In [4]:
# --- Optimisation settings ---------------------------------------------------
lr = 1e-4            # learning rate handed to the optimizer
batch_size = 128     # graphs per mini-batch in every DataLoader
epochs = 900         # epoch budget for each model

# --- Data / featurisation ----------------------------------------------------
Dataset = HSSMS                              # dataset class to benchmark on
pre_transform = Gen39AtomFeatures()          # featuriser passed to the dataset
in_channels = pre_transform.in_channels      # input width given to every model


def train(train_loader, model, optimizer, device=None):
    """Run one optimisation epoch and return the training RMSE.

    Parameters
    ----------
    train_loader : iterable
        Yields batches exposing ``x``, ``edge_index``, ``edge_attr``,
        ``batch``, ``y``, ``num_graphs`` and a ``to(device)`` method.
    model : torch.nn.Module
        Called as ``model(x, edge_index, edge_attr, batch)``; must return a
        ``(prediction, embedding)`` pair.
    optimizer : torch.optim.Optimizer
        Optimizer that updates ``model``'s parameters.
    device : torch.device or str, optional
        Where each batch is moved before the forward pass. Defaults to the
        device of the model's first parameter, so the function does not depend
        on a notebook-level ``device`` variable being defined beforehand.

    Returns
    -------
    float
        Square root of the ``num_graphs``-weighted mean of the per-batch MSE
        losses.

    Raises
    ------
    ZeroDivisionError
        If ``train_loader`` yields no batches.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = (first_param.device if first_param is not None
                  else torch.device('cpu'))

    model.train()
    total_loss = total_examples = 0
    for data in train_loader:
        data = data.to(device)
        optimizer.zero_grad()
        # The embedding returned by the model is not needed for the loss.
        out, _ = model(data.x.float(), data.edge_index,
                       data.edge_attr, data.batch)
        loss = F.mse_loss(out, data.y)
        loss.backward()
        optimizer.step()
        # Weight each batch by its size so a short last batch is not over-counted.
        total_loss += float(loss) * data.num_graphs
        total_examples += data.num_graphs
    return sqrt(total_loss / total_examples)

@torch.no_grad()
def test(loader, model):
    mse = []
    model.eval()
    for data in loader:
        data = data.to(device)
        out, embed = model(data.x.float(), data.edge_index,
                           data.edge_attr, data.batch)
        mse.append(F.mse_loss(out, data.y, reduction='none').cpu())
    return float(torch.cat(mse, dim=0).mean().sqrt())

In [None]:
def _best_scores(history):
    """Return ``(val_rmse, test_rmse)`` of the epoch with the lowest validation RMSE.

    ``history`` is the list of per-epoch dicts built in the training loop
    below (keys ``'Epoch'``, ``'train_rmse'``, ``'val_rmse'``, ``'test_rmse'``).
    """
    hist = pd.DataFrame(history)
    # idxmin returns an index *label*, so the row is looked up with .loc.
    best = hist.loc[hist['val_rmse'].idxmin()]
    return float(best['val_rmse']), float(best['test_rmse'])


# Neither the device nor the data root changes between datasets.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
path = './tmpignore/data5'

results = []
# NOTE(review): the [14:] slice skips the first 14 datasets — presumably to
# resume an interrupted run; confirm before launching a fresh full benchmark.
for dataset_name in tqdm(list(Dataset.names.keys())[14:]):
    print(dataset_name)
    dataset = Dataset(path, name=dataset_name,
                      pre_transform=pre_transform).shuffle()

    # Split the shuffled dataset: first fifth -> validation, second fifth ->
    # test, remainder -> train.
    N = len(dataset) // 5
    val_dataset = dataset[:N]
    test_dataset = dataset[N:2 * N]
    train_dataset = dataset[2 * N:]

    # Degree statistics of the training graphs, required by the PNA model.
    deg = get_deg(train_dataset)

    train_loader = DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    # Four architectures that share the same backbone hyper-parameters.
    pub_args = {'in_channels': in_channels,
                'convs_layers': [64, 128, 256],
                'out_channels': 1, 'edge_dim': 10, }
    models = {
        'ACANet_GCN': ACANet_GCN(**pub_args).to(device),
        'ACANet_GIN': ACANet_GIN(**pub_args, eps=0.0,
                                 train_eps=False).to(device),
        'ACANet_GAT': ACANet_GAT(**pub_args, dropout=0.1,
                                 heads=3).to(device),
        'ACANet_PNA': ACANet_PNA(**pub_args,
                                 aggregators=['mean', 'min', 'max', 'sum'],
                                 scalers=['identity', 'amplification',
                                          'attenuation'],
                                 deg=deg).to(device),
    }

    res = {}       # model name -> full per-epoch history
    records = []   # long-format rows: one per (model, metric)
    for name, model in models.items():

        print(name, '#'*50)

        optimizer = torch.optim.Adam(model.parameters(), lr=lr,
                                     weight_decay=10**-5)
        history = []
        # epochs + 1 so that exactly `epochs` epochs are run (numbered 1..epochs).
        for epoch in range(1, epochs + 1):
            train_rmse = train(train_loader, model, optimizer)
            val_rmse = test(val_loader, model)
            test_rmse = test(test_loader, model)

            history.append({'Epoch': epoch, 'train_rmse': train_rmse,
                            'val_rmse': val_rmse, 'test_rmse': test_rmse})
        res[name] = history

        # Model selection on validation RMSE; report the test RMSE of that epoch.
        best_val, best_test = _best_scores(history)
        records.append({'model': name, 'metric': 'val_rmse',
                        'value': best_val, 'dataset': dataset_name})
        records.append({'model': name, 'metric': 'test_rmse',
                        'value': best_test, 'dataset': dataset_name})

    dfres = pd.DataFrame(records,
                         columns=['model', 'metric', 'value', 'dataset'])

    results.append(dfres)

  0%|                                                                                            | 0/16 [00:00<?, ?it/s]

chembl2835_ki
ACANet_GCN ##################################################


In [None]:
# Stack the per-dataset score tables into one long-format frame and persist it.
# ignore_index gives every row a unique label (each per-dataset table starts
# at 0), and index=False keeps that meaningless row index out of the CSV so a
# later read_csv does not produce an extra 'Unnamed: 0' column.
dfp = pd.concat(results, ignore_index=True)
dfp.to_csv('./test/model_comparsion.csv', index=False)

In [None]:
# Reload the saved scores so the plotting cells can run without retraining.
dfp = pd.read_csv('./test/model_comparsion.csv')
# A CSV written with the row index included comes back with a spurious
# 'Unnamed: 0' column; drop it when present so the frame only holds real data.
dfp = dfp.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Separate the long-format scores by metric: dfp1 holds the validation rows,
# dfp2 the test rows.
dfp1 = dfp.loc[dfp['metric'].eq('val_rmse')]
dfp2 = dfp.loc[dfp['metric'].eq('test_rmse')]

In [None]:
# Rank datasets and models by their mean test RMSE (ascending); both lists
# are reused as the axis / hue ordering of the bar plots.
dataset_means = dfp2.groupby('dataset')['value'].mean()
model_means = dfp2.groupby('model')['value'].mean()
order = dataset_means.sort_values().index.tolist()
hue_order = model_means.sort_values().index.tolist()

In [None]:
import seaborn as sns
sns.set(style='white',  font='sans-serif', font_scale=2)

# One colour per model, so the palette follows the number of models compared.
color = sns.color_palette("rainbow", len(hue_order))  # PiYG
sns.palplot(color)

# One grouped bar chart per metric; both share the dataset / model ordering
# derived from the test scores so the figures are directly comparable.
for dfps, label in zip([dfp1, dfp2], ['Val. RMSE', 'Test RMSE']):

    fig, ax = plt.subplots(figsize=(16, 6))
    sns.barplot(data=dfps, x='dataset', y='value', hue='model',
                order=order, hue_order=hue_order,
                palette=color, ax=ax)

    ax.set_ylabel(label)
    ax.set_xlabel('Dataset')
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)

    ax.set_xticklabels(order, rotation=45, ha='right')

    # tick_params expects booleans: the strings 'off'/'on' are both truthy, so
    # 'off' would not hide the tick marks. Hide the marks, keep the labels.
    ax.tick_params(axis='y', left=False, labelleft=True, pad=.3)