# Active-learning tutorial: Using committee MACE models to study protonated water clusters

1. Load all the modules
2. Read the training pool
3. Select a random initial training set of `n_init_train` structures from the pool (can be done with `np.random`) --> later exclude these from the pool
4. Train a committee (just to check that we can train 2 models)
5. Predict on the training pool and sort by maximum energy error
6. Then repeat these steps in a for loop.

## To Do 

- for loop everywhere
- avoid using scripts for MACE
- fix E0s

In [None]:
from IPython.display import Image, display

# Show the picture stored next to the zundel dataset
zundel_picture = Image(filename='../initial-datasets/zundel/zundel.png')
display(zundel_picture)

## Import modules

In [3]:
import os, sys
import multiprocessing
from pathlib import Path
from tqdm.notebook import tqdm

import numpy as np
import matplotlib.pyplot as plt

from ase.io import read, write # read and write structures
# from ase.visualize import view # visualize structures (optional)

# import functions to run this tutorial
from myfunctions import train_mace     # train MACE model
from myfunctions import eval_mace      # evaluate MACE model
from myfunctions import extxyz2energy  # extract energy from extxyz file
from myfunctions import run_qbc        # run Query by Committee

In [4]:
# Seed NumPy's global random generator so the random draws below are repeatable
np.random.seed(0)
# Plot style shipped alongside the notebook
plt.style.use('notebook.mplstyle')

# Create the working folders (nothing happens if they already exist)
for folder in ('config', 'models', 'structures'):
    os.makedirs(folder, exist_ok=True)

In [5]:
n_init_train = 20  # number of structures in the initial training set
n_test = 50  # number of structures in the test set
n_committee = 4  # number of committee members (one config file and one model each)
parallel = False  # train the committee members in parallel (True) or one after the other (False)
qbc_folder = "qbc-work" # if you modify this, add the new folder to .gitignore

## Select initial training structures

In [6]:
# Load every structure of the pool from file (index=':' selects all frames)
structures = read('../initial-datasets/zundel/train.extxyz', index=':')
n_structures = len(structures)
print(f'Total number of structures: {n_structures}')
# view(structures)  # Opens an interactive GUI window to visualize the structures

Total number of structures: 1000


In [7]:
# Draw the training and test indices in a single call without replacement,
# so that the two sets cannot share a structure
n_selected = n_init_train + n_test
selected_indices = np.random.choice(len(structures), size=n_selected, replace=False)
# Every index that was not drawn stays in the candidate pool
remaining_candidate_idcs = np.delete(np.arange(len(structures)), selected_indices)

# The first n_init_train drawn indices form the training set, the rest the test set
indices_train = selected_indices[:n_init_train]
indices_test = selected_indices[n_init_train:]
assert len(indices_train) == n_init_train
assert len(indices_test) == n_test

print(f'\nSelected indices for training: {indices_train}')
print(f'\nSelected indices for test: {indices_test}')

initial_training_set = [structures[idx] for idx in indices_train]
test_set = [structures[idx] for idx in indices_test]
remaining_structures = [structures[idx] for idx in remaining_candidate_idcs]

# Save the three subsets, announcing each file before it is written
subsets_to_save = (
    ('initial training set', 'structures/init.train.extxyz', initial_training_set),
    ('test set', 'structures/test.extxyz', test_set),
    ('remaining structures', 'structures/remaining.extxyz', remaining_structures),
)
for label, path, subset in subsets_to_save:
    print(f"\nSaving the {label} to '{path}'")
    write(path, subset, format='extxyz')


Selected indices for training: [993 859 298 553 672 971  27 231 306 706 496 558 784 239 578  55 906 175
  14  77]

Selected indices for test: [ 31 481 310 311 883 788  45 103 760   1 823 710 614 790 408 736 957 366
 918 267 230 996 635 698 251 783 819 141 316 587 331 295 262 432 862 582
 272 270 987 319 569 643 142 202 413 196 264 531 252 576]

Saving the initial training set to 'structures/init.train.extxyz'

Saving the test set to 'structures/test.extxyz'

Saving the remaining structures to 'structures/remaining.extxyz'


## Initial Training

Hyperparameters for the committee members

In [8]:
# Write one YAML config file per committee member.
# The files are identical except for the model name and the random seed.
os.makedirs('config', exist_ok=True)
seeds = np.random.randint(0, 2**32 - 1, size=n_committee, dtype=np.uint32)
for i, seed in enumerate(seeds):
    name = f"mace.com={i}"
    filename = f"config/config.{i}.yml"

    config_text = f"""
# You can modify the following parameters
num_channels: 16
max_L: 0            # take it larger but not smaller
max_ell: 1          # take it larger but not smaller
correlation: 1      # take it larger but not smaller
num_interactions: 2 # take it larger but not smaller

# ... but you can also modify these ones
r_max: 4.0
batch_size: 4
max_num_epochs: 100

# But please, do not modify these parameters!
model: "MACE"
name: "{name}"
model_dir: "models"
log_dir: "log"
checkpoints_dir: "checkpoints"
results_dir: "results"
train_file: "structures/init.train.extxyz"
energy_key: "REF_energy"
forces_key: "REF_forces"
E0s: "average" # to be fixed
device: cpu
swa: true
seed: {seed}
restart_latest: False
"""

    with open(filename, "w") as config_file:
        config_file.write(config_text)

    print(f"Wrote {filename}")

Wrote config/config.0.yml
Wrote config/config.1.yml
Wrote config/config.2.yml
Wrote config/config.3.yml


In [None]:
# Train a committee of MACE models, one model per config file.
# Whether the members are trained in parallel is decided by the `parallel`
# flag of the settings cell; it is deliberately NOT overridden here.
os.makedirs('models', exist_ok=True)

if parallel: # parallel version: it should take around 25s

    def train_single_model(n):
        """Train committee member `n` from its config file 'config/config.<n>.yml'."""
        config_path = f"config/config.{n}.yml"
        train_mace(config_path)

    # There are only n_committee tasks, so never start more workers than that.
    # NOTE(review): a function defined in a notebook cell can only be sent to the
    # workers with the 'fork' start method - confirm on macOS/Windows.
    n_workers = max(1, min(n_committee, multiprocessing.cpu_count()))
    with multiprocessing.Pool(processes=n_workers) as pool:
        pool.map(train_single_model, range(n_committee))

else: # serial version: it should take around 1m
    for n in range(n_committee):
        train_mace(f"config/config.{n}.yml")

Train a committee of MACE models.

In [None]:
# Remove the training by-products that are not used in the rest of the notebook
# and give the final model of each committee member a short name.

# debug logs (skipped when the 'log' folder does not exist)
if os.path.isdir('log'):
    for filename in os.listdir('log'):
        if filename.endswith('_debug.log'):
            os.remove(os.path.join('log', filename))

for n in range(n_committee):

    # intermediate models
    filenames = [f"models/mace.com={n}.model",
                 f"models/mace.com={n}_compiled.model",
                 f"models/mace.com={n}_stagetwo.model"]
    for filename in filenames:
        if os.path.exists(filename):
            os.remove(filename)

    # keep only the compiled stage-two model, renamed to 'models/mace.n=<n>.model';
    # os.replace overwrites a model left over from a previous run on every platform
    stagetwo_compiled = f"models/mace.com={n}_stagetwo_compiled.model"
    if os.path.exists(stagetwo_compiled):
        os.replace(stagetwo_compiled, f"models/mace.n={n}.model")

# text results and stage-one plots (skipped when the 'results' folder does not exist)
if os.path.isdir('results'):
    for filename in os.listdir('results'):
        if filename.endswith(('.txt', 'stage_one.png')):
            os.remove(os.path.join('results', filename))

## Evaluation

In [None]:
# Evaluate every committee member on the full pool of structures.
# The third argument is the file from which the energies are read back afterwards.
for n in tqdm(range(n_committee)):
    model_file = f'models/mace.n={n:d}.model'
    output_file = f'eval_train_{n:02d}.extxyz'
    eval_mace(model_file, '../initial-datasets/zundel/train.extxyz', output_file)

In [None]:
# Gather the predicted energies; the first axis indexes the committee member
energies_per_model = []
for n in tqdm(range(n_committee)):
    energies_per_model.append(extxyz2energy(f'eval_train_{n:02d}.extxyz'))
energies = np.array(energies_per_model)

In [None]:
# Committee mean and standard deviation, taken over the committee members (axis 0)
avg_energy = np.mean(energies, axis=0)
disagreement = np.std(energies, axis=0)

In [None]:
# Overlay the energies predicted by each committee member and the committee average
for n, e in enumerate(energies):
    # the doubled braces put the whole index inside the LaTeX subscript,
    # so that e.g. n = 10 is rendered as E_{10} and not as E_1 followed by 0
    plt.plot(e, label=rf'$E_{{{n:d}}}$', alpha=0.5)
plt.plot(avg_energy, label=r'$\overline{E}$', color='k')
plt.legend()
plt.xlabel('Data point index')
plt.ylabel('Energy [eV]');

In [None]:
# Standard deviation of the committee predictions for every data point
ax = plt.gca()
ax.plot(disagreement)
ax.set_xlabel('Data point index')
ax.set_ylabel(r'$\sigma(E)$ [eV]');

# Select relevant training data via Query by Committee (QbC)

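In each QbC iteration the committee predicts the energies of the candidate structures, and the candidates on which the committee members disagree the most are added to the training set.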

In [9]:
# Write one YAML config file per committee member for the QbC runs.
# All output folders and the training file live inside `qbc_folder`.
# TODO: make this simpler - the only thing we need to change is the name of the training extxyz file.
# TODO: implement retraining using the refinement workflow using `foundation_model`
os.makedirs(qbc_folder, exist_ok=True)
os.makedirs(f'{qbc_folder}/config', exist_ok=True)
seeds = np.random.randint(0, 2**32 - 1, size=n_committee, dtype=np.uint32)
for i, seed in enumerate(seeds):
    name = f"mace.com={i}"
    filename = f"{qbc_folder}/config/config.{i}.yml"

    config_text = f"""
# You can modify the following parameters
num_channels: 16
max_L: 0            # take it larger but not smaller
max_ell: 1          # take it larger but not smaller
correlation: 1      # take it larger but not smaller
num_interactions: 2 # take it larger but not smaller

# ... but you can also modify these ones
r_max: 4.0
batch_size: 4
max_num_epochs: 100

# But please, do not modify these parameters!
model: "MACE"
name: "{name}"

model_dir      : "{qbc_folder}/models"
log_dir        : "{qbc_folder}/log"
checkpoints_dir: "{qbc_folder}/checkpoints"
results_dir    : "{qbc_folder}/results"

train_file: "{qbc_folder}/train-iter.extxyz"
energy_key: "REF_energy"
forces_key: "REF_forces"

E0s: "average" # to be fixed
device: cpu
swa: true
seed: {seed}
restart_latest: True

"""

    with open(filename, "w") as config_file:
        config_file.write(config_text)

    print(f"Wrote {filename}")

Wrote qbc-work/config/config.0.yml
Wrote qbc-work/config/config.1.yml
Wrote qbc-work/config/config.2.yml
Wrote qbc-work/config/config.3.yml


In [None]:
# Model files of the committee trained above
fns_committee = list(map('models/mace.n={:d}.model'.format, range(n_committee)))

# Settings of the QbC run, collected in one place
qbc_settings = dict(
    fns_committee=fns_committee,                  # list of MACE models
    fn_candidates='structures/remaining.extxyz',  # candidate structures
    # fn_train_init='structures/init.train.extxyz', # initial training set
    n_iter=5,                                     # number of QBC iterations
    config=f'{qbc_folder}/config',                # folder with config files
    ofolder=qbc_folder,                           # folder to save the QBC results
    n_add_iter=10,                                # number of structures to add in each iteration
    recalculate_selected=False,                   # whether to recalculate the selected structures
)
run_qbc(**qbc_settings);

Starting QbC.
5 iterations will be done in total and 10 will be added every iteration.


  0%|          | 0/5 [00:00<?, ?it/s]

Predicting committee disagreement across the candidate pool.


  model = torch.load(f=args.model, map_location=args.device)


In [None]:
# Load the disagreement table and transpose it so that each original column becomes a row
# NOTE(review): the file is read from the current folder, not from `qbc_folder` - confirm where run_qbc writes it
sigma = np.transpose(np.loadtxt('disagreement.txt'))

In [None]:
# Disagreement of the selected and of the candidate structures at every QbC iteration
plt.plot(sigma[0], '-o', label='Selected')
plt.plot(sigma[1], '-o', label='Candidates')
plt.legend()
plt.xlabel('QbC iteration')
# keep the unit outside math mode (as in the other disagreement plot) so it is typeset as plain text;
# the trailing ';' hides the repr of the returned Text object
plt.ylabel(r'$\sigma(E)$ [eV]');

## Run FHI-aims

In [None]:
from myfunctions import run_aims

In [None]:
# Take the first four structures of the pool; they are passed to run_aims below
to_run  = structures[:4]

In [None]:
%%capture
run_aims(
    structures=to_run,
    folder='aims',
    command=f"mpirun -n 4 /home/stoccoel/codes/FHIaims-polarization/build/polarization-debug/aims.250131.scalapack.mpi.x",
    control="../aims/control.in"
)