In [1]:
"""
Test DISCOVER algorithm.

- create distance matrix
- apply densMAP
- create clusters via HDBSCAN*
- search for interesting materials, for example:
     - high-target/low-density
     - materials with high-target surrounded by materials with low targets
     - high mean cluster target/high fraction of validation points within cluster

Run using discover environment.

# Stick with ~10k elasticity datapoints
# Perform UMAP and HDBSCAN
for cluster in clusters:
    for i in [0, 1, 5, 10]:
        n = max(len(cluster) - i, 0)
        clustertmp = clusters
        remove cluster[0:n] from clustertmp
        train crabnet
        predict on removed cluster, store MAE
        calculate MAE for targets above some threshold
        test distance to pareto front for "kept" (?) cluster points for various metrics
        calculate the distribution of distances to the pareto front
# FIGURE: cluster-wise distributions of target values (6 clusters with highest (mean?) target value) 2x3 tile
# TODO: parameter - highest mean vs. highest single target value
# SUPP-FIGURE: cluster-wise distributions of target values for all clusters (Nx3)
# FIGURE: cluster-wise cross-val parity plots (6 clusters with highest (mean?) target value) - 2x3 tile
# SUPP-FIGURE: cluster-wise cross-val parity plots for all clusters (Nx3)

Created on Mon Sep 6 23:15:27 2021.

@author: sterg
"""
# %% Setup
# imports
from os.path import join
import pandas as pd
from sklearn.model_selection import train_test_split
from mat_discover_ import Discover
from mat_discover.utils.Timer import Timer

# Run configuration.
# dummy_run=True switches to a small fixed subset (see the split further down).
dummy_run = False
# Seed for the train/validation split so a fresh kernel reproduces the same split.
split_seed = 42
disc = Discover(dummy_run=dummy_run)

# load validation data
# HACK: path is relative to the current working directory while still working
# out the dependency structure
data_dir = join("CrabNet", "data", "materials_data", "elasticity")
name = "train.csv"  # "example_materials_property_val_output.csv", #elasticity_val_output.csv"
fpath = join(data_dir, name)
df = pd.read_csv(fpath)

# df = df.groupby(by="formula", as_index=False).mean()
# if there are two compounds with the same formula, we're more interested in the higher GPa
# group_filter is the aggregation applied to "target" within each formula group.
group_filter = "max"  # "mean"
grp_df = (
    df.reset_index()
    .groupby(by="formula")
    # keep the original row indices of every duplicate as a tuple, and collapse
    # the duplicated targets with the configured aggregation
    .agg({"index": lambda x: tuple(x), "target": group_filter})
    .reset_index()
)

# REVIEW: drop pure elements here?

# take small subset
if dummy_run:
    # first n grouped rows for training, the next n2 rows for validation
    n = 100
    n2 = 10
    train_df = grp_df.iloc[:n, :]
    val_df = grp_df.iloc[n : n + n2, :]
else:
    # random 80/20 split, seeded for reproducibility
    train_df, val_df = train_test_split(
        grp_df, train_size=0.8, random_state=split_seed
    )

In [5]:
# Stack the training and validation rows back into a single frame and hand it
# to disc.group_cross_val, timing the call under "DISCOVER-group-cross-val".
cat_df = pd.concat([train_df, val_df])
with Timer("DISCOVER-group-cross-val"):
    disc.group_cross_val(cat_df)
# scaled_error is read from disc after the cross-validation call above
print("scaled test error = ", disc.scaled_error)

Fitting mod_petti kernel matrix
Constructing distances
[fit-wasserstein]
Elapsed: 17.36815




using precomputed metric; transform will be unavailable for new data and inverse_transform will be unavailable for all data



[fit-UMAP]
Elapsed: 26.00846

[HDBSCAN*]
Elapsed: 0.1486

loading data with up to 6 elements in the formula
training with batchsize 512 (2**9.000)
loading data with up to 6 elements in the formula
stepping every 200 training passes, cycling lr every 10 epochs
checkin at 20 epochs to match lr scheduler
Epoch: 0/40 --- train mae: 53.8 val mae: 51.9
Epoch: 19/40 --- train mae: 12.9 val mae: 15.3
Epoch: 39/40 --- train mae: 9.94 val mae: 14.1
Saving network (test-property) to models/trained_models/test-property.pth
loading data with up to 6 elements in the formula
loading data with up to 6 elements in the formula
loading data with up to 6 elements in the formula
training with batchsize 512 (2**9.000)
loading data with up to 6 elements in the formula
stepping every 200 training passes, cycling lr every 10 epochs
checkin at 20 epochs to match lr scheduler
Epoch: 0/40 --- train mae: 54 val mae: 45.2
Epoch: 19/40 --- train mae: 11.9 val mae: 15
Epoch: 39/40 --- train mae: 9.13 val mae: 13.6
Sa

In [6]:
# from os.path import join, expanduser
# "~",
# "Documents",
# "GitHub",
# "sparks-baird",
# "ElM2D",

In [8]:
# Rebind `disc` to a new Discover instance (same dummy_run setting) instead of
# reusing the instance that group_cross_val was called on earlier.
disc = Discover(dummy_run=dummy_run)

In [9]:
# Fit on the training split only; Timer prints the elapsed time under the
# "DISCOVER-fit" label.
# slower if umap_random_state is not None
# NOTE(review): umap_random_state is not set anywhere in this notebook;
# presumably it is a Discover option — confirm.
with Timer("DISCOVER-fit"):
    disc.fit(train_df)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy




Model architecture: out_dims, d_model, N, heads
3, 512, 3, 4
Running on compute device: cuda:0
Model size: 11987206 parameters



Generating EDM: 100%|██████████| 8572/8572 [00:00<00:00, 159073.24formulae/s]


loading data with up to 6 elements in the formula
training with batchsize 512 (2**9.000)
stepping every 170 training passes, cycling lr every 10 epochs
checkin at 20 epochs to match lr scheduler
Epoch: 0/40 --- train mae: 53.8 val mae: 53.8
Epoch: 19/40 --- train mae: 12.5 val mae: 12.5
Epoch: 39/40 --- train mae: 9.4 val mae: 9.4
Saving network (test-property) to models/trained_models/test-property.pth
[train-CrabNet]
Elapsed: 88.98014

[DISCOVER-fit]
Elapsed: 88.98014

