In [None]:
import os
import sys
import gc; gc.enable()
import warnings; warnings.filterwarnings("ignore")

import pickle
from tqdm import tqdm

import numpy as np
import pandas as pd
import seaborn as sns
from tabulate import tabulate
from matplotlib import pyplot as plt
%matplotlib inline

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
#from torch.utils.data import Dataset, DataLoader

import torch_geometric
from torch_geometric.nn import GCNConv
from torch_geometric.nn import global_max_pool
from torch_geometric.loader import DataLoader

from openbabel import pybel
pybel.ob.obErrorLog.SetOutputLevel(0)

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score
from ase import Atoms

from nice.blocks import *
from nice.utilities import *
from tqdm import tqdm

In [None]:
# Smoke test for ASE: a throwaway structure with atomic numbers 1, 1, 1, 6, 6
# placed at uniformly random coordinates in the unit cube.
tmp = Atoms(numbers=[1, 1, 1, 6, 6], positions=np.random.rand(5, 3))
print(tmp)

In [None]:
# Read the train and test tables; the first CSV column is used as the index.
data, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)

## build graphs

In [None]:
from pybel_molecule import build_molecules_pybel

# Training labels as an int64 array, and one molecule wrapper per training SMILES.
data_targets = data['Active'].values.astype(np.int64)
data_molecules = build_molecules_pybel(data['Smiles'].values)

# Store the label on each molecule as a plain Python int.
active_values = data['Active'].values
for position, molecule in enumerate(data_molecules):
    molecule.y = int(active_values[position])

# Test molecules are built with fixed=True (semantics defined in pybel_molecule).
test_molecules = build_molecules_pybel(test['Smiles'].values, fixed=True)

In [None]:
def get_min_distance(atoms):
    """Return the smallest Euclidean distance between any two atoms.

    Parameters
    ----------
    atoms : object with a ``positions`` attribute
        ``positions`` must be convertible to a 2-D float array with one row
        of Cartesian coordinates per atom (e.g. an ``ase.Atoms`` instance).

    Returns
    -------
    float or None
        The minimum pairwise distance, or ``None`` when the structure has
        fewer than two atoms (there is no pair to measure).

    Notes
    -----
    The distance is ``sqrt(sum(delta ** 2))``. Taking the square root before
    the sum would yield the Manhattan distance, which overestimates how far
    apart two atoms are and lets overlapping atoms slip through a cutoff.
    """
    positions = np.asarray(atoms.positions, dtype=float)
    min_d = None
    for i in range(len(positions) - 1):
        # Offsets from atom i to every later atom, handled in one numpy call
        # so that each unordered pair is visited exactly once.
        deltas = positions[i + 1:] - positions[i]
        now = np.sqrt(np.sum(deltas * deltas, axis=1)).min()
        if (min_d is None) or (now < min_d):
            min_d = now
    return min_d

In [None]:
# Convert each molecule wrapper to the object returned by its get_ase() method,
# first for the training set and then for the test set.
data_ase = []
for molecule in tqdm(data_molecules):
    data_ase.append(molecule.get_ase())

test_ase = []
for molecule in tqdm(test_molecules):
    test_ase.append(molecule.get_ase())

In [None]:
# Closest-pair distance of every structure (one value per molecule).
data_distance = list(map(get_min_distance, tqdm(data_ase)))
test_distance = list(map(get_min_distance, tqdm(test_ase)))

In [None]:
# A structure is "bad" when its closest pair of atoms is nearer than 0.5
# (in whatever units get_min_distance reports). Print the overall minimum and
# the bad/good counts, for the training set and then for the test set.
print(np.min(data_distance))
data_bad_mask = np.asarray(data_distance) < 0.5
data_good_mask = ~data_bad_mask
print(data_bad_mask.sum())
print(data_good_mask.sum())

print(np.min(test_distance))
test_bad_mask = np.asarray(test_distance) < 0.5
test_good_mask = ~test_bad_mask

print(test_bad_mask.sum())
print(test_good_mask.sum())


In [None]:
# Drop the "bad" training structures from all three parallel containers.
# Only the training set is filtered; the test containers are left untouched.
# NOTE: this cell overwrites its own inputs, so it must run exactly once after
# the masks are computed.
data_ase = [structure for structure, keep in zip(data_ase, data_good_mask) if keep]
data_molecules = [molecule for molecule, keep in zip(data_molecules, data_good_mask) if keep]
data = data[data_good_mask]



In [None]:
# Build one graph object per molecule.
data_graphs = []
for molecule in tqdm(data_molecules):
    data_graphs.append(molecule.get_graph())

test_graphs = []
for molecule in tqdm(test_molecules):
    test_graphs.append(molecule.get_graph())

# Label each training graph from the (already filtered) 'Active' column.
active_values = data['Active'].values
for position, graph in enumerate(data_graphs):
    graph.y = int(active_values[position])

In [None]:
# Consistency check: graphs, molecules and labels should all have the same length.
print(len(data_graphs), len(data_molecules), data['Active'].values.shape, sep='\n')

In [None]:
# Hyperparameters handed to get_spherical_expansion for both train and test.
HYPERS = dict(
    interaction_cutoff=4.3,
    max_radial=5,
    max_angular=5,
    gaussian_sigma_type='Constant',
    gaussian_sigma_constant=0.2,
    cutoff_smooth_width=0.3,
    radial_basis='GTO',
)

In [None]:
def get_nice():
    """Build a fresh, unfitted NICE ``StandardSequence`` of three blocks.

    The first two blocks each combine a covariant branch (expansion to 150
    features followed by a 50-component PCA) with an invariant branch
    (expansion to 300 features followed by a 20-component PCA). The last
    block has only the invariant branch. Inputs are rescaled by an
    ``InitialScaler`` in 'signal integral' mode with ``individually=True``.
    """
    def covariant_branch():
        # New estimator instances on every call; blocks must not share them.
        return (ThresholdExpansioner(num_expand=150),
                None,
                IndividualLambdaPCAsBoth(n_components=50))

    def invariant_branch():
        return (ThresholdExpansioner(num_expand=300, mode='invariants'),
                None,
                InvariantsPCA(n_components=20))

    blocks = [StandardBlock(*covariant_branch(), *invariant_branch())
              for _ in range(2)]
    blocks.append(StandardBlock(None, None, None, *invariant_branch()))

    scaler = InitialScaler(mode='signal integral', individually=True)
    return StandardSequence(blocks, initial_scaler=scaler)

In [None]:
# Species are collected over train and test together, so both expansions
# below are computed against the same species list.
all_species = get_all_species([*data_ase, *test_ase])
print(all_species)

In [None]:
# Spherical expansion coefficients for train and test, using identical
# hyperparameters and species, without splitting by central species.
data_coefficients, test_coefficients = (
    get_spherical_expansion(structures, HYPERS, all_species,
                            split_by_central_specie=False)
    for structures in (data_ase, test_ase)
)

In [None]:
# Stack train and test coefficients along the first axis.
all_coefficients = np.concatenate((data_coefficients, test_coefficients))
print(all_coefficients.shape)

In [None]:
# Fit NICE on a random subset of at most 10000 rows of the pooled coefficients.
# NOTE(review): the permutation is drawn from the unseeded global numpy RNG,
# so the fitted model differs between runs.
indices = np.random.permutation(len(all_coefficients))[:10000]
nice = get_nice()
nice.fit(all_coefficients[indices])

In [None]:
# Disabled code kept as a bare string literal (it is never executed): a timed,
# single-call version of the NICE transform. The batched cells that follow do
# the same transform in chunks of 1000 rows.
'''import time
begin = time.time()
data_features = nice.transform(data_coefficients, return_only_invariants = True)
test_features = nice.transform(test_coefficients, return_only_invariants = True)
print(time.time() - begin)'''

In [None]:
# Transform the training coefficients in chunks of `batch_size` rows,
# keeping only the invariant features of each chunk.
batch_size = 1000
data_features_raw = []
for start in tqdm(range(0, data_coefficients.shape[0], batch_size)):
    chunk = data_coefficients[start:start + batch_size]
    data_features_raw.append(nice.transform(chunk, return_only_invariants=True))


In [None]:
# Each chunk result is a dict of arrays; join the chunks per key along axis 0.
data_features = {
    key: np.concatenate([chunk[key] for chunk in data_features_raw], axis=0)
    for key in data_features_raw[0].keys()
}

# Report the merged shapes, the raw coefficient shape, then the shapes again.
for key, merged in data_features.items():
    print(key, merged.shape)
print(data_coefficients.shape)

for key, merged in data_features.items():
    print(key, merged.shape)
    

In [None]:
# Release the raw training coefficients; nothing below this cell reads them.
del data_coefficients

In [None]:
# np.max propagates NaN, so isnan(max(x)) is True when x holds any NaN.
# Report the NaN status (first row, whole array), zero out every row that
# contains a NaN, then report again.
print(np.isnan(np.max(test_coefficients[0])))
print(np.isnan(np.max(test_coefficients)))
for environment in test_coefficients:
    # `environment` is a view into test_coefficients, so this writes in place.
    if np.isnan(environment.max()):
        environment[...] = 0.0

print(np.isnan(np.max(test_coefficients[0])))
print(np.isnan(np.max(test_coefficients)))

In [None]:
# Rows that are (numerically) all zero get a tiny non-zero entry in their
# first coefficient, so no row has zero squared norm.
for environment in test_coefficients:
    if np.sum(environment ** 2) < 1e-10:
        # `environment` is a view, so the write lands in test_coefficients.
        environment[0, 0, 0] = 1e-5

In [None]:
# Largest coefficient in the first row and in the whole test array.
print(np.max(test_coefficients[0]), np.max(test_coefficients), sep='\n')

In [None]:
# Scratch check: assigning a scalar to a row broadcasts it over the whole row.
a = np.array([[1, 2, 3],
              [3, 4, 5]])
a[1, :] = 0
print(a)

In [None]:
# Same chunked transform as for the training set, applied to the test rows.
batch_size = 1000
test_features_raw = []
for start in tqdm(range(0, test_coefficients.shape[0], batch_size)):
    chunk = test_coefficients[start:start + batch_size]
    test_features_raw.append(nice.transform(chunk, return_only_invariants=True))

# Join the per-chunk dicts into one array per key.
test_features = {
    key: np.concatenate([chunk[key] for chunk in test_features_raw], axis=0)
    for key in test_features_raw[0].keys()
}

# Report the merged shapes, the raw coefficient shape, then the shapes again.
for key, merged in test_features.items():
    print(key, merged.shape)
print(test_coefficients.shape)

for key, merged in test_features.items():
    print(key, merged.shape)

In [None]:
# Release the raw test coefficients; nothing below this cell reads them.
del test_coefficients

In [None]:
# Keep a float copy of each graph's original node features.
# NOTE: graph.x is overwritten below, so re-running this cell would treat the
# already-augmented matrix as the "initial" features.
for graph in data_graphs:
    graph.initial_features = torch.FloatTensor(graph.x)

# data_features[2] holds one row per node, laid out graph after graph; walk it
# with a running offset and append each slice to that graph's node features.
# NOTE(review): key 2 is presumably the body-order-2 invariants — confirm.
data_invariants = data_features[2]
now = 0
for graph in data_graphs:
    num_nodes = graph.x.shape[0]
    features_now = torch.FloatTensor(data_invariants[now:now + num_nodes])
    graph.x = torch.cat([graph.initial_features, features_now], dim=1)
    now += num_nodes

# The final offset should equal the number of feature rows.
print(data_graphs[0].x.shape)
print(data_invariants.shape, now)

In [None]:
# Keep a float copy of each test graph's original node features.
# NOTE: graph.x is overwritten below, so re-running this cell would treat the
# already-augmented matrix as the "initial" features.
for graph in test_graphs:
    graph.initial_features = torch.FloatTensor(graph.x)

# test_features[2] holds one row per node, laid out graph after graph; walk it
# with a running offset and append each slice to that graph's node features.
test_invariants = test_features[2]
now = 0
for graph in test_graphs:
    num_nodes = graph.x.shape[0]
    features_now = torch.FloatTensor(test_invariants[now:now + num_nodes])
    graph.x = torch.cat([graph.initial_features, features_now], dim=1)
    now += num_nodes

# The final offset should equal the number of feature rows.
print(test_graphs[0].x.shape)
print(test_invariants.shape, now)

## engine

In [None]:
from utils import *

## model

In [None]:
from models import *

## args

In [None]:
# data split args
n_splits, random_state = 8, 42

# positive objects rebalance args
num_pos_repeats, pos_weight = 1, 12.0

# model args
num_features, width, depth = 60, 128, 2
device = torch.device('cuda:0')

# fit args
batch_size, num_workers = 128, 8
lr, num_epochs = 5e-4, 64

# name for logs and checkpoints
name = 'test'

## train

In [None]:
# Train one model per stratified CV fold and collect, per fold, the selected
# threshold, the adaptive F1 at the best epoch, and the full F1 history.
thrs, f1s, f1s_histories = [], [], []

# With a fixed integer random_state the splitter yields the same folds on every
# call, so it is built once and its folds are enumerated in order.
cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
fold_iterator = cv.split(data['Smiles'].values, data['Active'].values.astype(np.int64))

for index, (train_indices, val_indices) in enumerate(fold_iterator):
    # Graphs flagged as empty are left out of both subsets.
    train_graphs = [data_graphs[i] for i in train_indices if not data_graphs[i].empty]
    val_graphs = [data_graphs[i] for i in val_indices if not data_graphs[i].empty]

    # Optional oversampling: repeat the positive graphs (num_pos_repeats - 1) more times.
    positive_train_graphs = [graph for graph in train_graphs if graph.y]
    if num_pos_repeats > 1:
        train_graphs = train_graphs + positive_train_graphs * (num_pos_repeats - 1)

    loader_kwargs = dict(batch_size=batch_size, num_workers=num_workers)
    train_loader = DataLoader(train_graphs, shuffle=True, **loader_kwargs)
    val_loader = DataLoader(val_graphs, shuffle=False, **loader_kwargs)
    # Not consumed inside this loop; the inference cell builds its own loader.
    test_loader = DataLoader(test_graphs, shuffle=False, **loader_kwargs)

    # GCN is defined in models.py.
    model = GCN(num_features, width=width, depth=depth)
    model.to(device)

    opt = torch.optim.Adam(model.parameters(), lr=lr)

    exp_name = name + '_' + str(index) + '_cv_split'

    # Trainer is defined in utils.py.
    trainer = Trainer(model, opt, None, train_loader, val_loader, num_epochs,
                      weight=pos_weight, step='step', backup_by='all',
                      logs_path='./logs', path_to_save='./ckpt', exp_name=exp_name, verbose=0)
    trainer.run()

    thrs.append(trainer.thr)
    # best_epoch is used as a 1-based index into the per-epoch F1 history.
    f1s.append(trainer.adaptive_f1s[trainer.best_epoch - 1])
    f1s_histories.append(trainer.adaptive_f1s)
    print(trainer.adaptive_f1s)
# raise Exception('done!')

In [None]:
# Stack the per-fold F1 histories into one array (rows: folds, columns: epochs).
f1s_histories = np.asarray(f1s_histories)
print(f1s_histories.shape)

In [None]:
# Mean adaptive F1 over all folds, from epoch index 48 onwards.
print(f1s_histories[:, 48:].mean())

In [None]:
# Zero-based epoch with the highest adaptive F1, printed once per fold.
# Iterating over the rows (instead of a hard-coded range(8)) keeps this cell
# correct when n_splits is changed in the args cell.
for fold_history in f1s_histories:
    print(np.argmax(fold_history))

In [None]:
# One model instance is reused; each fold's checkpoint is loaded into it in turn.
model = GCN(num_features, width=width, depth=depth)
model.to(device)
test_loader = DataLoader(test_graphs, shuffle=False, batch_size=batch_size, num_workers=num_workers)

# (random_state, fold index) -> (unused slot, epoch of the checkpoint to load).
selection_dict = {
    (42, fold): (0, chosen_epoch)
    for fold, chosen_epoch in enumerate((63, 36, 42, 39, 53, 23, 32, 31))
}

outputs_list = []
for index in tqdm(range(n_splits)):
    _, epoch = selection_dict[(random_state, index)]
    exp_name = f'{name}_{index}_cv_split_{epoch}.pth'
    state_dict = torch.load(os.path.join('./ckpt', exp_name))
    model.load_state_dict(state_dict)

    # inference is defined in utils.py
    outputs_list.append(inference(model, test_loader))

# Stack into a single array with one row of model outputs per fold.
outputs_list = np.asarray(outputs_list)

In [None]:
# Persist the per-fold test outputs so that later cells can be re-run
# without repeating inference.
with open("outputs_list_second_run.pickle", 'wb') as pickle_file:
    pickle.dump(outputs_list, pickle_file)

In [None]:
# Rows: one per CV split, plus a final row for the mean (ensemble) output.
tmp = np.vstack([outputs_list, np.mean(outputs_list, axis=0)[None]])
# Sigmoid turns the raw outputs into probabilities.
tmp = 1.0 / (1.0 + np.exp(-tmp))
# Binarise each row: the 57 highest-probability test objects become 1, the rest 0.
for index in range(len(tmp)):
    indices = np.argsort(tmp[index])[::-1][:57]
    tmp[index] *= 0.0
    tmp[index, indices] = 1.0

# Pairwise agreement: the fraction of test objects on which two rows assign the
# same binary label. `np.float` was removed in NumPy 1.24, so the builtin
# `float` (the same float64 dtype) is used instead.
corrs = np.zeros((len(tmp), len(tmp)), dtype=float)
for index in range(len(tmp)):
    for jndex in range(index, len(tmp)):
        corrs[index, jndex] = corrs[jndex, index] = np.mean(tmp[index] == tmp[jndex])

plt.figure(figsize=(10, 10))
labels = ['split ' + str(index) for index in range(n_splits)] + ['ensemble']
sns.heatmap(corrs, xticklabels=labels, yticklabels=labels, square=True, annot=True, fmt='.3f')
# The matrix is label agreement on binarised predictions, not a cosine
# similarity of logits, so the title states what is actually plotted.
plt.title('fraction of matching top-57 predictions')
plt.show()

In [None]:
# Ensemble probability: sigmoid of the mean output across folds.
probs = 1.0 / (1.0 + np.exp(-np.mean(outputs_list, axis=0)))

# Positions (row order of the test set) of the 59 most probable objects.
# NOTE(review): 59 here vs. 57 in the agreement heatmap cell — confirm which
# cutoff is intended.
pos_labels = np.argsort(probs)[::-1][0:59]

# Build the column positionally and assign it in one step. The chained form
# `test['Active'][pos_labels] = True` looks the positions up as index labels
# and is a silent no-op under pandas Copy-on-Write.
active = np.zeros(len(test), dtype=bool)
active[pos_labels] = True
test['Active'] = active

test['Active'].to_csv('submission_nice_new.csv')
test['Active']

In [None]:
# The 100 largest ensemble probabilities, in ascending order.
top_probs = np.sort(probs)[-100:]
print(top_probs)